In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
import gc
import anndata as ad
# Relative paths used later in this notebook (e.g. 'All_genes/...', 'Healthy/...') resolve against this directory.
os.chdir('/home/aih/shrey.parikh/PDAC/PDAC/processed_datasets/')

In [None]:
# Concat Lin

In [None]:
class Concat:
    """Collect every ``.h5ad`` file below a directory and concatenate them.

    Instantiating the class runs the whole pipeline immediately: walk
    ``base_path``, load each ``.h5ad`` file, annotate it, concatenate all
    loaded objects and write the result to ``output_file``.

    Parameters
    ----------
    base_path : str
        Root directory that is searched recursively for ``.h5ad`` files.
    output_file : str
        Path the concatenated object is written to.
    dataset_name : str
        Value stored in ``obs['Dataset']`` of every loaded object.
    """

    def __init__(self, base_path, output_file, dataset_name):
        self.base_path = base_path
        self.output_file = output_file
        self.dataset_name = dataset_name
        self.ann_data_list = []
        self.process_directories()

    def process_directories(self):
        """Load and annotate every ``.h5ad`` file under ``base_path``, then concatenate.

        Directories and files are visited in sorted order so the concatenation
        order is reproducible. A file that resolves to ``output_file`` is
        skipped, so an earlier result stored inside ``base_path`` is not fed
        back into the concatenation on a re-run.
        """
        output_path = os.path.realpath(self.output_file)
        for root, dirs, files in os.walk(self.base_path):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".h5ad"):
                    continue
                file_path = os.path.join(root, file)
                if os.path.realpath(file_path) == output_path:
                    print(f"Skipping previous output file: {file_path}")
                    continue
                print(f"Loading Anndata: {file_path}")
                adata = self.load_anndata(file_path)
                print("Adding Metadata")
                self.add_metadata(adata, root)
                # print('Calculating HVGs')
                # self.calculate_HVGs(adata)
                self.ann_data_list.append(adata)
        print(self.ann_data_list)
        print('Concatinating Datasets')
        self.concat_and_save()

    def load_anndata(self, file_path):
        """Read one ``.h5ad`` file and return the loaded object."""
        return sc.read_h5ad(file_path)

    def add_metadata(self, adata, directory_name):
        """Re-index the genes and tag the cells of ``adata`` in place.

        The ``var['1']`` column is cast to ``str`` and becomes the var index
        (made unique afterwards). ``obs['ID']`` is set to the last path
        component of ``directory_name`` and ``obs['Dataset']`` to
        ``self.dataset_name``.
        """
        print('Changing Var Index')
        # NOTE(review): presumably column '1' holds the gene identifiers - confirm against the input files
        adata.var['1'] = adata.var['1'].astype(str)
        adata.var.index = adata.var['1']
        adata.var_names_make_unique()
        # Only the innermost directory name is used as the ID
        directory_name = os.path.basename(directory_name)
        adata.obs['ID'] = directory_name
        adata.obs['Dataset'] = self.dataset_name

    # def calculate_HVGs(self, adata):
    #     sc.pp.normalize_total(adata, target_sum=1e4)
    #     sc.pp.log1p(adata)
    #     sc.pp.highly_variable_genes(adata, n_top_genes=3000)
    #     adata._inplace_subset_var(adata.var['highly_variable'])

    def concat_and_save(self):
        """Outer-join all collected objects (missing values filled with 0) and write them to ``output_file``."""
        if not self.ann_data_list:
            # Make the no-op visible instead of returning silently
            print(f"No .h5ad files found under {self.base_path}; nothing written")
            return
        concatenated_anndata = ad.concat(self.ann_data_list, join='outer', fill_value=0)
        concatenated_anndata.write_h5ad(self.output_file)

#TODO: remove commented code if not necessary

In [None]:
# Run the Concat pipeline on the Lin raw data.
# NOTE(review): output_file is located inside base_path.
Lin = Concat(
    base_path='/lustre/groups/ml01/workspace/shrey.parikh/PDAC/raw_data/Lin',
    output_file='/lustre/groups/ml01/workspace/shrey.parikh/PDAC/raw_data/Lin/Lin_concat.h5ad',
    dataset_name='Lin',
)

In [None]:
# Datasets read from the 'All_genes/' folder (paths are relative to the working directory)
peng_PDAC = sc.read_h5ad('All_genes/Peng_PDAC.h5ad')
ding_PDAC = sc.read_h5ad('All_genes/Ding_PDAC.h5ad')
Lee_PDAC = sc.read_h5ad('All_genes/Lee_concat_PDAC.h5ad')
regev_PDAC = sc.read_h5ad('All_genes/Regev_PDAC.h5ad')
simeone_PDAC = sc.read_h5ad('All_genes/Simeone_concat.h5ad')
steele_PDAC = sc.read_h5ad('All_genes/Steele_concat_PDAC.h5ad')
caronni_PDAC = sc.read_h5ad('All_genes/Caronni_concat.h5ad')
zhang_PDAC = sc.read_h5ad('All_genes/Zhang_concat_PDAC.h5ad')
zenodo_PDAC = sc.read_h5ad('All_genes/zenodo_PDAC_new.h5ad')
lin_PDAC = sc.read_h5ad('All_genes/Lin_concat.h5ad')
schlesinger_PDAC = sc.read_h5ad('All_genes/Schlesinger.h5ad')

# Datasets read from the 'Healthy/' folder
steele_adj_norm = sc.read_h5ad('Healthy/Steele_adj_norm.h5ad')
peng_normal = sc.read_h5ad('Healthy/Peng_Normal.h5ad')

# The Schlesinger object gets its 'ID' and 'Dataset' columns assigned here
schlesinger_PDAC.obs['ID'] = 'GSM4293555'
schlesinger_PDAC.obs['Dataset'] = 'Schlesinger'

# Order matters: the label lists used in later cells are zipped against this list
adata_list = [
    ding_PDAC,
    Lee_PDAC,
    regev_PDAC,
    simeone_PDAC,
    steele_PDAC,
    peng_PDAC,
    caronni_PDAC,
    zhang_PDAC,
    zenodo_PDAC,
    lin_PDAC,
    schlesinger_PDAC,
    steele_adj_norm,
    peng_normal,
]

In [None]:
# Store a copy of each dataset's X matrix in layers['raw'].
for adata in adata_list:
    adata.layers['raw'] = adata.X.copy()

In [None]:
# Labels for adata_list, in the same order as the list
keys=['Ding', 'Lee', 'Regev', 'Simeone', 'Steele', 'Peng', 'Caronni', 'Zhang', 'Zenodo', 'Lin', 'Schlesinger', 'Steel_Adj_Norm', 'Peng_Normal']
for adata,key in zip(adata_list, keys):
    print(f'Accounting for {key}')
    # Store a copy: assigning adata.X itself would make the layer share its matrix with X,
    # so any later in-place change to X would also change the 'raw' layer.
    adata.layers['raw'] = adata.X.copy()
    raw_layer = adata.layers['raw']
    # Densify for the checks below; a layer that is already dense has no .toarray()
    raw_counts = raw_layer.toarray() if hasattr(raw_layer, 'toarray') else np.asarray(raw_layer)

    # Raw counts are expected to survive an integer cast unchanged
    print(f"Are raw counts integers? {np.all(raw_counts.astype(int) == raw_counts)}")

    # Check some basic statistics
    print(f"Mean raw counts: {np.mean(raw_counts)}")
    print(f"Range of raw counts: {np.min(raw_counts)} to {np.max(raw_counts)}")
    print(f"Percentage of zero counts: {np.mean(raw_counts == 0) * 100:.2f}%")
    print("-" * 50)

    # Release the dense array before the next dataset is densified, so that
    # two dense matrices are never held in memory at the same time.
    del raw_counts
    gc.collect()

# Regev's layers['raw'] (copied from X) does not actually contain raw counts

In [None]:
# Run the raw-count checks on the 'counts' layer of the Regev dataset
raw_counts = regev_PDAC.layers['counts'].toarray()

# Integer-valued data survives an integer cast unchanged
print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")

# Summary statistics of the dense matrix
print(f"Mean raw counts: {raw_counts.mean()}")
print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
print(f"Percentage of zero counts: {(raw_counts == 0).mean() * 100:.2f}%")
print("-" * 50)

In [None]:
# Replace the Regev 'raw' layer with an independent copy of its 'counts' layer.
regev_PDAC.layers['raw'] = regev_PDAC.layers['counts'].copy()

# add the steele_adj_norm raw counts

In [None]:
# Take the 'raw' layer from the separately stored file and attach it to steele_adj_norm.
# NOTE(review): this assumes both objects list cells and genes in the same order - confirm.
steele_adj_norm_raw = sc.read_h5ad('Healthy/Steele_adj_norm_raw.h5ad')
steele_adj_norm.layers['raw'] = steele_adj_norm_raw.layers['raw'].copy()

raw_counts = steele_adj_norm.layers['raw'].toarray()

# Integer-valued data survives an integer cast unchanged
print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")

# Summary statistics of the dense matrix
print(f"Mean raw counts: {raw_counts.mean()}")
print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
print(f"Percentage of zero counts: {(raw_counts == 0).mean() * 100:.2f}%")
print("-" * 50)

# add the Peng_normal raw counts

In [None]:
# Take the 'raw' layer from the separately stored file and attach it to peng_normal.
# NOTE(review): this assumes both objects list cells and genes in the same order - confirm.
peng_normal_raw = sc.read_h5ad('../processed_datasets/Healthy/Peng_Normal_raw.h5ad')
peng_normal.layers['raw'] = peng_normal_raw.layers['raw'].copy()

raw_counts = peng_normal.layers['raw'].toarray()

# Integer-valued data survives an integer cast unchanged
print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")

# Summary statistics of the dense matrix
print(f"Mean raw counts: {raw_counts.mean()}")
print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
print(f"Percentage of zero counts: {(raw_counts == 0).mean() * 100:.2f}%")
print("-" * 50)

# Run the same check on the original Zenodo dataset

In [None]:
# Load the original Zenodo object (absolute path, so independent of the working directory).
zenodo = sc.read_h5ad('/ictstr01/groups/ml01/workspace/shrey.parikh/PDAC/raw_data/Zenodo/Data/pk_all.h5ad')

# checked in the rds file, no raw counts anywhere

In [None]:
# Run the raw-count checks on X of the original Zenodo object
raw_counts = zenodo.X.toarray()

# Integer-valued data survives an integer cast unchanged
print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")

# Summary statistics of the dense matrix
print(f"Mean raw counts: {raw_counts.mean()}")
print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
print(f"Percentage of zero counts: {(raw_counts == 0).mean() * 100:.2f}%")
print("-" * 50)

In [None]:
# Labels for adata_list, in the same order as the list
keys = [
    'Ding', 'Lee', 'Regev', 'Simeone', 'Steele', 'Peng', 'Caronni',
    'Zhang', 'Zenodo', 'Lin', 'Schlesinger', 'Steel_Adj_Norm', 'Peng_Normal',
]
for key, adata in zip(keys, adata_list):
    print(f'Accounting for {key}')
    # This time the existing 'raw' layer is inspected without reassigning it
    raw_counts = adata.layers['raw'].toarray()

    # Integer-valued data survives an integer cast unchanged
    print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")

    # Summary statistics of the dense matrix
    print(f"Mean raw counts: {raw_counts.mean()}")
    print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
    print(f"Percentage of zero counts: {(raw_counts == 0).mean() * 100:.2f}%")
    print("-" * 50)

    # Unbind the dense array and the loop variable, then run the garbage collector
    del raw_counts
    del adata
    gc.collect()

# removing zenodo for now

In [None]:
# Same datasets and order as before, with zenodo_PDAC left out
adata_list = [
    ding_PDAC,
    Lee_PDAC,
    regev_PDAC,
    simeone_PDAC,
    steele_PDAC,
    peng_PDAC,
    caronni_PDAC,
    zhang_PDAC,
    lin_PDAC,
    schlesinger_PDAC,
    steele_adj_norm,
    peng_normal,
]

In [None]:
# Stack all datasets along the cell axis, keeping the union of genes (missing entries become 0).
# Each cell's source is recorded in obs['batch'] using the labels in `keys`, which must
# follow the order of adata_list. With index_unique=None the cell names are left unchanged.
adata_concat = ad.concat(
    adata_list,
    keys=['Ding', 'Lee', 'Regev', 'Simeone', 'Steele', 'Peng', 'Caronni', 'Zhang', 'Lin', 'Schlesinger', 'Steele_Adj_Norm', 'Peng_Norm'],
    label='batch',
    index_unique=None,
    join='outer',
    fill_value=0,
    axis=0,
)


In [None]:
# Persist the concatenated object as an .h5ad file (path relative to the working directory)
adata_concat.write_h5ad('All_genes/Concat_All_Genes.h5ad')

In [None]:
# Check the 'raw' layer on a 10% subsample of the concatenated object;
# the mean and the zero fraction are not printed in this cell.
adata_subset = sc.pp.subsample(adata_concat, fraction=0.1, copy=True)
raw_counts = adata_subset.layers['raw'].toarray()

print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")
print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
print("-" * 50)

In [None]:
# Rebind adata_concat to the object stored in 'All_genes/Concat_All_Genes.h5ad'.
adata_concat = sc.read_h5ad('All_genes/Concat_All_Genes.h5ad')

In [None]:
# Filter genes with min_cells=5; the call modifies adata_concat in place (its return value is not used).
sc.pp.filter_genes(adata_concat, min_cells=5)

# Cell output: the gene names that contain the substring 'class'.
adata_concat.var_names[adata_concat.var_names.str.contains('class')]

In [None]:
# Labels for adata_list in its current order. adata_list no longer contains the Zenodo
# dataset, so there must be no 'Zenodo' label here: an extra entry would shift every
# following label by one position when the two lists are zipped together.
keys=['Ding', 'Lee', 'Regev', 'Simeone', 'Steele', 'Peng', 'Caronni', 'Zhang', 'Lin', 'Schlesinger', 'Steele_Adj_Norm', 'Peng_Norm']

In [None]:
# Print the label of every dataset that has at least one gene name containing 'class'
for key, adata in zip(keys, adata_list):
    if np.any(adata.var_names.str.contains('class')):
        print(key)

# The '< class >' entry in var_names (whatever it is) is also present in the Peng raw data, so just remove it

In [None]:
# Keep only the genes whose name does not contain 'class'.
# NOTE(review): slicing an AnnData presumably yields a view of the original object rather than a copy - confirm.
adata_concat = adata_concat[:, ~adata_concat.var_names.str.contains('class')]

In [None]:
# Persist the filtered object as an .h5ad file (path relative to the working directory)
adata_concat.write_h5ad('All_genes/Concat_All_Genes_filtered.h5ad')

In [None]:
# Rebind adata_concat to the object stored in 'All_genes/Concat_All_Genes_filtered.h5ad'.
adata_concat = sc.read_h5ad('All_genes/Concat_All_Genes_filtered.h5ad')

In [None]:
# Check the 'raw' layer on a 10% subsample of the filtered object
adata_subset = sc.pp.subsample(adata_concat, fraction=0.1, copy=True)
raw_counts = adata_subset.layers['raw'].toarray()

# Integer-valued data survives an integer cast unchanged
print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")

# Summary statistics of the dense matrix
print(f"Mean raw counts: {raw_counts.mean()}")
print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
print(f"Percentage of zero counts: {(raw_counts == 0).mean() * 100:.2f}%")
print("-" * 50)

# Drop the subsample and run the garbage collector
del adata_subset
gc.collect()

In [None]:
# adata_filtered is only bound in a later cell (as another name for adata_concat), so
# referencing it here fails on a fresh kernel; plot from adata_concat directly instead.
# The plot also needs an embedding in obsm['X_umap'], so skip it when none is stored.
if 'X_umap' in adata_concat.obsm:
    sc.pl.umap(adata_concat, color='Dataset')
else:
    print("No UMAP embedding in adata_concat.obsm['X_umap']; skipping the UMAP plot")

In [None]:
from scipy.stats import median_abs_deviation
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Bind a second name to the same object: this is not a copy, so changes made through
# adata_filtered are also visible through adata_concat.
adata_filtered = adata_concat

In [None]:
# Compute QC metrics from the 'raw' layer and store them on adata_filtered (inplace=True).
sc.pp.calculate_qc_metrics(adata_filtered, layer='raw', inplace=True)

In [None]:
# Density of log1p(total counts) per cell, drawn as one line per dataset.
dataset_labels = adata_filtered.obs['Dataset']
datasets = dataset_labels.unique()
counts = {
    ds: adata_filtered.obs.loc[dataset_labels == ds, "log1p_total_counts"]
    for ds in datasets
}

# Re-use the stored palette only when it can be matched to the categories one-to-one.
# scanpy keeps '<key>_colors' in category order, which can differ from the order of
# appearance returned by unique(), so colours are looked up by dataset name.
# Without a usable palette matplotlib's default colour cycle is used.
palette = adata_filtered.uns.get('Dataset_colors')
if (
    palette is not None
    and isinstance(dataset_labels.dtype, pd.CategoricalDtype)
    and len(palette) == len(dataset_labels.cat.categories)
):
    color_by_dataset = dict(zip(dataset_labels.cat.categories, palette))
else:
    color_by_dataset = {}

fig, ax = plt.subplots(figsize=(10, 6))

for ds, count_data in counts.items():
    # Histogram normalised to a density, plotted at the bin centres
    hist_data, bin_edges = np.histogram(count_data, bins=30, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    line_kwargs = {'color': color_by_dataset[ds]} if ds in color_by_dataset else {}
    ax.plot(bin_centers, hist_data, label=ds, **line_kwargs)

# The plotted quantity is the log1p-transformed total count
ax.set_xlabel('log1p(Total Counts per Cell)')
ax.set_ylabel('Density')
ax.set_title('Normalized Count Distribution by Dataset')
ax.legend(title='Dataset')

plt.show()

In [None]:
# Density of log1p(number of detected genes) per cell, drawn as one line per dataset.
dataset_labels = adata_filtered.obs['Dataset']
datasets = dataset_labels.unique()
counts = {
    ds: adata_filtered.obs.loc[dataset_labels == ds, "log1p_n_genes_by_counts"]
    for ds in datasets
}

# Re-use the stored palette only when it can be matched to the categories one-to-one.
# scanpy keeps '<key>_colors' in category order, which can differ from the order of
# appearance returned by unique(), so colours are looked up by dataset name.
# Without a usable palette matplotlib's default colour cycle is used.
palette = adata_filtered.uns.get('Dataset_colors')
if (
    palette is not None
    and isinstance(dataset_labels.dtype, pd.CategoricalDtype)
    and len(palette) == len(dataset_labels.cat.categories)
):
    color_by_dataset = dict(zip(dataset_labels.cat.categories, palette))
else:
    color_by_dataset = {}

fig, ax = plt.subplots(figsize=(10, 6))

for ds, count_data in counts.items():
    # Histogram normalised to a density, plotted at the bin centres
    hist_data, bin_edges = np.histogram(count_data, bins=30, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    line_kwargs = {'color': color_by_dataset[ds]} if ds in color_by_dataset else {}
    ax.plot(bin_centers, hist_data, label=ds, **line_kwargs)

# Labels describe the plotted column (detected genes, not total counts)
ax.set_xlabel('log1p(Genes Detected per Cell)')
ax.set_ylabel('Density')
ax.set_title('Normalized Detected-Gene Distribution by Dataset')
ax.legend(title='Dataset')

plt.show()

In [None]:
# One histogram of total raw counts per cell for every dataset, three panels per row.
datasets = adata_filtered.obs['Dataset'].unique()
num_datasets = len(datasets)

ncols = 3
# At least one row, so the figure can still be created when there are no datasets
nrows = max(1, int(np.ceil(num_datasets / ncols)))
# squeeze=False always returns a 2-D array of axes, whatever the grid shape
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 5 * nrows), squeeze=False)
axes = axes.flatten()

for i, dataset in enumerate(datasets):
    dataset_mask = adata_filtered.obs['Dataset'] == dataset
    raw_layer = adata_filtered[dataset_mask].layers['raw']
    # Sum the rows on the stored matrix instead of densifying it first; np.asarray(...).ravel()
    # turns both the sparse result (a column matrix) and the dense result into a flat array.
    total_counts_per_cell = np.asarray(raw_layer.sum(axis=1)).ravel()
    axes[i].hist(total_counts_per_cell, bins=50, color='blue', alpha=0.7)
    axes[i].set_title(f'{dataset} - Raw Counts')
    axes[i].set_xlabel('Total Counts per Cell')
    axes[i].set_ylabel('Frequency')

# Remove the unused panels at the end of the grid
for j in range(num_datasets, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

In [None]:
import scanpy as sc
import numpy as np
#TODO: is this necessary?

In [None]:
# Load 'Concat_HVG_filtered.h5ad' into adata_hcg (path relative to the working directory).
adata_hcg = sc.read_h5ad('../../PDAC/processed_datasets/Concat_HVG_filtered.h5ad')

In [None]:
# Sample IDs whose cells are relabelled as 'Ding_snRNA-seq'
ding_sn_map = [
    'HT224P1', 'HT231P1', 'HT232P1', 'HT242P1',
    'HT259P1', 'HT264P1', 'HT270P1', 'HT284P1',
    'HT288P1', 'HT306P1', 'HT412P1',
]
# Cells with one of these IDs get the new dataset name; all others keep their current one
adata_hcg.obs['Dataset'] = np.where(
    adata_hcg.obs['ID'].isin(ding_sn_map),
    'Ding_snRNA-seq',
    adata_hcg.obs['Dataset'],
)

In [None]:
# batch_covariate is '<Dataset>_<Condition>'; the doubled label
# 'Ding_snRNA-seq_snRNA-seq' is collapsed to 'Ding_snRNA-seq'.
adata_hcg.obs['batch_covariate'] = (
    adata_hcg.obs['Dataset'].astype(str) + '_' + adata_hcg.obs['Condition'].astype(str)
).replace('Ding_snRNA-seq_snRNA-seq', 'Ding_snRNA-seq')

In [None]:
# Integer check and value range for X of adata_hcg
raw_counts = adata_hcg.X.toarray()
print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")
print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
print("-" * 50)

In [None]:
# Integer check and value range for the 'raw' layer of adata_hcg
raw_counts = adata_hcg.layers['raw'].toarray()
print(f"Are raw counts integers? {(raw_counts.astype(int) == raw_counts).all()}")
print(f"Range of raw counts: {raw_counts.min()} to {raw_counts.max()}")
print("-" * 50)

In [None]:
# Persist adata_hcg as an .h5ad file (path relative to the working directory)
adata_hcg.write_h5ad('../../PDAC/drvi/adata_drvi.h5ad')