In [1]:
import os
import scanpy as sc
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
import warnings
import anndata as ad

In [2]:
class Concat:
    def __init__(self, base_path, output_file, dataset_name):
        self.base_path = base_path
        self.output_file = output_file
        self.dataset_name = dataset_name
        self.ann_data_list = []
        self.process_directories()
        
    def process_directories(self):
        highly_variable_genes = None 
        for root, dirs, files in os.walk(self.base_path):
            for file in files:
                if file.endswith("_output_preprocessed.h5ad"):
                    file_path = os.path.join(root, file)
                    print(f"Loading Anndata: {file_path}")
                    adata = self.load_anndata(file_path)
                    print("Adding Metadata")
                    self.add_metadata(adata, root)
                    # print('Calculating HVGs')
                    # self.calculate_HVGs(adata)
                    self.ann_data_list.append(adata)
        print(self.ann_data_list)            
        print('Concatinating Datasets')
        self.concat_and_save()

    def load_anndata(self, file_path):
        return sc.read_h5ad(file_path)

    def add_metadata(self, adata, directory_name):
        print('Changing Var Index')
        adata.var['1'] = adata.var['1'].astype(str)
        adata.var.index = adata.var['1']
        adata.var_names_make_unique()
        # Extracting directory name from the path
        directory_name = os.path.basename(directory_name)
        # Adding columns to adata.obs
        adata.obs['ID'] = directory_name
        adata.obs['Dataset'] = self.dataset_name

    # def calculate_HVGs(self, adata):
    #     sc.pp.normalize_total(adata, target_sum=1e4)
    #     sc.pp.log1p(adata)
    #     sc.pp.highly_variable_genes(adata, n_top_genes=3000)
    #     adata._inplace_subset_var(adata.var['highly_variable'])
    
    def concat_and_save(self):
        if self.ann_data_list:
            concatenated_anndata = ad.concat(self.ann_data_list, join='outer', fill_value=0)
            concatenated_anndata.write_h5ad(self.output_file)


In [56]:
Simeone = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone', 
                 output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/Simeone_concat.h5ad', dataset_name='Simeone')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204125_P17/GSM6204125_P17_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204130_P22/GSM6204130_P22_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204112_P04/GSM6204112_P04_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204111_P03/GSM6204111_P03_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204123_P15/GSM6204123_P15_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn

  utils.warn_names_duplicates("obs")


In [60]:
Lee = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee', 
             output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad', dataset_name='Lee')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/PBMC-P4_filtered_feature_bc_matrix/PBMC-P4_filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/PBMC-VM_filtered_feature_bc_matrix/PBMC-VM_filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/LiM_filtered_feature_bc_matrix/LiM_filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/P1_filtered_feature_bc_matrix/P1_filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/PBMC-P3_filte

  utils.warn_names_duplicates("obs")


In [3]:
adata = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [4]:
adata

AnnData object with n_obs × n_vars = 45447 × 21161
    obs: 'n_counts', 'log_counts', 'n_genes', 'mt_frac', 'ID', 'Dataset'

In [61]:
Steele = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele', 
             output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad', dataset_name='Steele')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/PDAC_TISSUE_2/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/PDAC_PBMC_4/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/PDAC_TISSUE_4/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/PDAC_TISSUE_10/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Healthy_PBMC_1/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var I

  utils.warn_names_duplicates("obs")


In [45]:
adata = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [47]:
adata = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zenodo/Data/pk_all_preprocessed.h5ad')

In [49]:
adata.obs.columns

Index(['CELL', 'Patient', 'Type', 'Cell_type', 'celltype0', 'celltype1',
       'celltype2', 'celltype3', 'Patient2', 'nCount_RNA', 'nFeature_RNA',
       'Project', 'orig.ident', 'Stage', 'Grade', 'Gender', 'Age',
       'Percent_mito', 'Percent_ribo', 'Percent_hemo', 'percent.mt',
       'predicted.id', 'prediction.score.Fibroblast.cell',
       'prediction.score.Stellate.cell', 'prediction.score.Macrophage.cell',
       'prediction.score.Endothelial.cell', 'prediction.score.T.cell',
       'prediction.score.B.cell', 'prediction.score.Ductal.cell.type.2',
       'prediction.score.Endocrine.cell',
       'prediction.score.Ductal.cell.type.1', 'prediction.score.Acinar.cell',
       'prediction.score.max', 'classical_score1', 'basal_score1',
       'classical_score21', 'basal_score21', 'endocrine_score_1',
       'immune_score_1', 'exocrine_score_1', 'activated_stroma_score_1',
       'histone_score_1', 'normal_stroma_score_1', 'olfactory_score_1',
       'na_score_1', 'n_counts', 'log_

In [46]:
lee = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [47]:
lee.var_names

Index(['NOC2L', 'ISG15', 'C1orf159', 'TNFRSF18', 'TNFRSF4', 'SDF4', 'B3GALT6',
       'UBE2J2', 'ACAP3', 'PUSL1',
       ...
       'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4',
       'MT-ND5', 'MT-ND6', 'MT-CYB'],
      dtype='object', name='1', length=8426)

In [58]:
simeone = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/Simeone_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [59]:
simeone

AnnData object with n_obs × n_vars = 213711 × 26053
    obs: 'n_counts', 'log_counts', 'n_genes', 'mt_frac', 'ID', 'Dataset'

In [2]:
steele = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [4]:
steele.obs

Unnamed: 0_level_0,n_counts,log_counts,n_genes,mt_frac,ID,Dataset
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAACCCAAGGTAATCA-1,938,6.843750,637,0.100213,PDAC_TISSUE_2,Steele
AAACCCACATAAGCAA-1,2022,7.611842,994,0.064787,PDAC_TISSUE_2,Steele
AAACGAACAAATTGGA-1,11113,9.315871,3096,0.056510,PDAC_TISSUE_2,Steele
AAACGAACATGTAACC-1,8834,9.086363,2645,0.015395,PDAC_TISSUE_2,Steele
AAACGCTAGAGAGCGG-1,1161,7.057037,655,0.013781,PDAC_TISSUE_2,Steele
...,...,...,...,...,...,...
TTTGGTTGTTAAAGTG-1,4814,8.479284,1640,0.045908,PDAC_PBMC_15,Steele
TTTGGTTTCATCGCAA-1,3975,8.287780,1400,0.087044,PDAC_PBMC_15,Steele
TTTGGTTTCTGCTTTA-1,4311,8.368925,1755,0.044073,PDAC_PBMC_15,Steele
TTTGTTGAGAAGGTAG-1,4692,8.453614,1641,0.036658,PDAC_PBMC_15,Steele


In [5]:
Caronni = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni', 
             output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/Caronni_concat.h5ad', dataset_name='Caronni')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727542_LPDAC/GSM6727542_LPDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727551_PDAC/GSM6727551_PDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727548_PDAC/GSM6727548_PDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727547_PDAC/GSM6727547_PDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727545_LPDAC/GSM6727545_LPDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/l

  utils.warn_names_duplicates("obs")


In [6]:
Zhang = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang', 
             output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/Zhang_concat.h5ad', dataset_name='Zhang')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910788_Case2-ZY_LM/GSM5910788_Case2-ZY_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910785_Case1-ZY_LM/GSM5910785_Case1-ZY_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910790_Case3-ZY_LM/GSM5910790_Case3-ZY_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910789_Case3-YF_PDAC/GSM5910789_Case3-YF_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910786_Case2-ZC_NPT/GSM5910786_Case2-ZC_output_preprocessed.h5ad
Adding Metadata
Changi

  utils.warn_names_duplicates("obs")


In [7]:
zhang = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/Zhang_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [9]:
zhang.var

A1BG
A1BG-AS1
A1CF
A2M
A2M-AS1
...
ZXDB
ZXDC
ZYG11B
ZYX
ZZEF1


In [None]:
caronni = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/Caronni_concat.h5ad', dataset_name='Caronn