In [None]:
import os
import scanpy as sc
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
import warnings
import anndata as ad

In [None]:
class Concat:
    """Collect per-sample AnnData files below a directory and concatenate them.

    Constructing an instance runs the whole pipeline: ``base_path`` is walked
    recursively, every file whose name ends in ``_output_preprocessed.h5ad`` is
    loaded and annotated, and the outer-joined concatenation is written to
    ``output_file``.

    Parameters
    ----------
    base_path : str
        Root directory that is searched recursively for input files.
    output_file : str
        Destination path of the concatenated ``.h5ad`` file.
    dataset_name : str
        Value stored in ``obs['Dataset']`` of every loaded object.

    Attributes
    ----------
    ann_data_list : list
        The loaded objects, in (sorted) discovery order.
    """

    # File-name suffix that marks a per-sample input file.
    FILE_SUFFIX = "_output_preprocessed.h5ad"

    def __init__(self, base_path, output_file, dataset_name):
        self.base_path = base_path
        self.output_file = output_file
        self.dataset_name = dataset_name
        self.ann_data_list = []
        self.process_directories()

    def process_directories(self):
        """Load and annotate every matching file, then concatenate and save."""
        for root, dirs, files in os.walk(self.base_path):
            # os.walk yields entries in arbitrary filesystem order. Sorting
            # `dirs` in place steers the (top-down) descent, and sorting the
            # file names fixes the order within a directory, so the row order
            # of the concatenated output is reproducible across runs/machines.
            dirs.sort()
            for file in sorted(files):
                if file.endswith(self.FILE_SUFFIX):
                    file_path = os.path.join(root, file)
                    print(f"Loading Anndata: {file_path}")
                    adata = self.load_anndata(file_path)
                    print("Adding Metadata")
                    self.add_metadata(adata, root)
                    # Per-sample HVG selection is currently disabled
                    # (see the commented-out calculate_HVGs below).
                    self.ann_data_list.append(adata)
        print(self.ann_data_list)
        print('Concatinating Datasets')
        self.concat_and_save()

    def load_anndata(self, file_path):
        """Read a single ``.h5ad`` file and return the loaded object."""
        return sc.read_h5ad(file_path)

    def add_metadata(self, adata, directory_name):
        """Re-index ``adata.var`` and tag ``adata.obs`` in place.

        The ``var`` column ``'1'`` is cast to ``str`` and becomes the variable
        index (made unique afterwards). ``obs['ID']`` receives the base name of
        ``directory_name`` and ``obs['Dataset']`` receives ``self.dataset_name``.

        Raises
        ------
        KeyError
            If ``adata.var`` has no column named ``'1'``.
        """
        print('Changing Var Index')
        if '1' not in adata.var.columns:
            # Same exception type as the plain lookup would raise, but with a
            # message that says which table and column are affected.
            raise KeyError(
                "adata.var has no column '1' to use as the variable index"
            )
        adata.var['1'] = adata.var['1'].astype(str)
        adata.var.index = adata.var['1']
        adata.var_names_make_unique()
        # Only the last path component (the containing directory) is the ID.
        directory_name = os.path.basename(directory_name)
        adata.obs['ID'] = directory_name
        adata.obs['Dataset'] = self.dataset_name

    # Disabled per-sample HVG selection, kept for reference:
    # def calculate_HVGs(self, adata):
    #     sc.pp.normalize_total(adata, target_sum=1e4)
    #     sc.pp.log1p(adata)
    #     sc.pp.highly_variable_genes(adata, n_top_genes=3000)
    #     adata._inplace_subset_var(adata.var['highly_variable'])

    def concat_and_save(self):
        """Outer-join all loaded objects and write them to ``output_file``.

        Variables missing from an object are filled with 0. When nothing was
        loaded no file is written; a ``UserWarning`` is emitted so that an
        empty or mistyped ``base_path`` does not go unnoticed.
        """
        if not self.ann_data_list:
            warnings.warn(
                f"No '*{self.FILE_SUFFIX}' files found under "
                f"{self.base_path!r}; {self.output_file!r} was not written.",
                stacklevel=2,
            )
            return
        # NOTE(review): obs names are not made unique across samples here; if
        # the same barcode can occur in several samples, consider ad.concat's
        # `keys` / `index_unique` options - confirm against downstream code.
        concatenated_anndata = ad.concat(self.ann_data_list, join='outer', fill_value=0)
        concatenated_anndata.write_h5ad(self.output_file)


In [None]:
# Constructing Concat runs the whole pipeline: base_path is scanned, the
# matching files are concatenated and the result is written to output_file.
Simeone = Concat(
    base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone',
    output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/Simeone_concat.h5ad',
    dataset_name='Simeone',
)

In [None]:
# Constructing Concat runs the whole pipeline: base_path is scanned, the
# matching files are concatenated and the result is written to output_file.
Lee = Concat(
    base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee',
    output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad',
    dataset_name='Lee',
)

In [None]:
# Read the concatenated Lee file from disk; this rebinds the shared name
# `adata` to the freshly loaded object.
adata = sc.read_h5ad(
    filename='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad',
)

In [None]:
# Constructing Concat runs the whole pipeline: base_path is scanned, the
# matching files are concatenated and the result is written to output_file.
Steele = Concat(
    base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele',
    output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad',
    dataset_name='Steele',
)

In [None]:
# Read the concatenated Steele file from disk; this rebinds the shared name
# `adata` to the freshly loaded object.
adata = sc.read_h5ad(
    filename='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad',
)

In [None]:
# Read the Zenodo file pk_all_preprocessed.h5ad from disk; this rebinds the
# shared name `adata` to the freshly loaded object.
adata = sc.read_h5ad(
    filename='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zenodo/Data/pk_all_preprocessed.h5ad',
)

In [None]:
# Load the concatenated Lee file from disk under its own name.
lee = sc.read_h5ad(
    filename='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Lee/Lee_concat.h5ad',
)

In [None]:
# Load the concatenated Simeone file from disk under its own name.
simeone = sc.read_h5ad(
    filename='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Simeone/Simeone_concat.h5ad',
)

In [None]:
# Load the concatenated Steele file from disk under its own name.
steele = sc.read_h5ad(
    filename='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Steele/Steele_concat.h5ad',
)

In [None]:
# Constructing Concat runs the whole pipeline: base_path is scanned, the
# matching files are concatenated and the result is written to output_file.
Caronni = Concat(
    base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni',
    output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/Caronni_concat.h5ad',
    dataset_name='Caronni',
)

In [None]:
# Constructing Concat runs the whole pipeline: base_path is scanned, the
# matching files are concatenated and the result is written to output_file.
Zhang = Concat(
    base_path='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang',
    output_file='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/Zhang_concat.h5ad',
    dataset_name='Zhang',
)

In [None]:
# Load the concatenated Zhang file from disk under its own name.
zhang = sc.read_h5ad(
    filename='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Zhang/Zhang_concat.h5ad',
)

In [None]:
# Load the concatenated Caronni file from disk under its own name.
caronni = sc.read_h5ad(
    filename='/dss/dssfs02/lwp-dss-0001/pn36po/pn36po-dss-0001/di35nod/PDAC_data/raw_data/Caronni/Caronni_concat.h5ad',
)

In [None]:
# TODO: Is it necessary to re-read each object from disk after every concatenation? If not, remove the reload cells.