In [None]:
import os
import scanpy as sc
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
import warnings
import anndata as ad

In [2]:
class Concat:
    def __init__(self, base_path, output_file, dataset_name):
        self.base_path = base_path
        self.output_file = output_file
        self.dataset_name = dataset_name
        self.ann_data_list = []
        self.process_directories()
        
    def process_directories(self):
        highly_variable_genes = None 
        for root, dirs, files in os.walk(self.base_path):
            for file in files:
                if file.endswith("_output_preprocessed.h5ad"):
                    file_path = os.path.join(root, file)
                    print(f"Loading Anndata: {file_path}")
                    adata = self.load_anndata(file_path)
                    print("Adding Metadata")
                    self.add_metadata(adata, root)
                    # print('Calculating HVGs')
                    # self.calculate_HVGs(adata)
                    self.ann_data_list.append(adata)
        print(self.ann_data_list)            
        print('Concatinating Datasets')
        self.concat_and_save()

    def load_anndata(self, file_path):
        return sc.read_h5ad(file_path)

    def add_metadata(self, adata, directory_name):
        print('Changing Var Index')
        adata.var['1'] = adata.var['1'].astype(str)
        adata.var.index = adata.var['1']
        adata.var_names_make_unique()
        # Extracting directory name from the path
        directory_name = os.path.basename(directory_name)
        # Adding columns to adata.obs
        adata.obs['ID'] = directory_name
        adata.obs['Dataset'] = self.dataset_name

    # def calculate_HVGs(self, adata):
    #     sc.pp.normalize_total(adata, target_sum=1e4)
    #     sc.pp.log1p(adata)
    #     sc.pp.highly_variable_genes(adata, n_top_genes=3000)
    #     adata._inplace_subset_var(adata.var['highly_variable'])
    
    def concat_and_save(self):
        if self.ann_data_list:
            concatenated_anndata = ad.concat(self.ann_data_list, join='outer', fill_value=0)
            concatenated_anndata.write_h5ad(self.output_file)


In [56]:
Simeone = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone', 
                 output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/Simeone_concat.h5ad', dataset_name='Simeone')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204125_P17/GSM6204125_P17_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204130_P22/GSM6204130_P22_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204112_P04/GSM6204112_P04_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204111_P03/GSM6204111_P03_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/GSM6204123_P15/GSM6204123_P15_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn

  utils.warn_names_duplicates("obs")


In [60]:
Lee = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee', 
             output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad', dataset_name='Lee')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/PBMC-P4_filtered_feature_bc_matrix/PBMC-P4_filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/PBMC-VM_filtered_feature_bc_matrix/PBMC-VM_filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/LiM_filtered_feature_bc_matrix/LiM_filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/P1_filtered_feature_bc_matrix/P1_filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/PBMC-P3_filte

  utils.warn_names_duplicates("obs")


In [3]:
adata = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [61]:
Steele = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele', 
             output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad', dataset_name='Steele')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/PDAC_TISSUE_2/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/PDAC_PBMC_4/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/PDAC_TISSUE_4/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/PDAC_TISSUE_10/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Healthy_PBMC_1/filtered_feature_bc_matrix_output_preprocessed.h5ad
Adding Metadata
Changing Var I

  utils.warn_names_duplicates("obs")


In [45]:
adata = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [47]:
adata = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zenodo/Data/pk_all_preprocessed.h5ad')

In [46]:
lee = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [58]:
simeone = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/Simeone_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [2]:
steele = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [5]:
Caronni = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni', 
             output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/Caronni_concat.h5ad', dataset_name='Caronni')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727542_LPDAC/GSM6727542_LPDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727551_PDAC/GSM6727551_PDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727548_PDAC/GSM6727548_PDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727547_PDAC/GSM6727547_PDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/GSM6727545_LPDAC/GSM6727545_LPDAC_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/l

  utils.warn_names_duplicates("obs")


In [6]:
Zhang = Concat(base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang', 
             output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/Zhang_concat.h5ad', dataset_name='Zhang')

Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910788_Case2-ZY_LM/GSM5910788_Case2-ZY_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910785_Case1-ZY_LM/GSM5910785_Case1-ZY_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910790_Case3-ZY_LM/GSM5910790_Case3-ZY_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910789_Case3-YF_PDAC/GSM5910789_Case3-YF_output_preprocessed.h5ad
Adding Metadata
Changing Var Index
Loading Anndata: /dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/GSM5910786_Case2-ZC_NPT/GSM5910786_Case2-ZC_output_preprocessed.h5ad
Adding Metadata
Changi

  utils.warn_names_duplicates("obs")


In [7]:
zhang = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/Zhang_concat.h5ad')

  utils.warn_names_duplicates("obs")


In [None]:
caronni = sc.read_h5ad('/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/Caronni_concat.h5ad')

In [None]:
#TODO: Is it necessary to read the object after concatenation all the time? if not remove