In [None]:
import scanpy as sc
import pandas as pd
import numpy as np

In [None]:
pwd

In [None]:
# Move into the project's Downstream directory. The path is relative to the
# kernel's current directory, so this cell is not idempotent: re-running it
# without a kernel restart resolves the path from the new location.
import os
os.chdir('../../PDAC_Work_Dir/PDAC_Final/Downstream/')

In [None]:
# Figure resolution for scanpy plots; show every column when displaying DataFrames.
sc.set_figure_params(dpi=100)
pd.set_option('display.max_columns', None)

In [None]:
# Load the Steele atlas; the path is relative to the directory entered above.
steele_atlas = sc.read_h5ad('../../Steele_Atlas_counts.h5ad')

In [None]:
# Sanity-check the atlas matrix on a 1% random sample of cells.
subset = sc.pp.subsample(steele_atlas, fraction=0.01, copy=True)

# Densify only when X is sparse (sparse matrices expose .toarray()).
X = subset.X
if hasattr(X, "toarray"):
    X = X.toarray()

print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")

# Raw counts survive a round trip through int unchanged; normalised values do not.
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Show the AnnData summary.
steele_atlas

In [None]:
# UMAP coloured by cluster label and by the two study-identifier columns.
sc.pl.umap(steele_atlas, color=['Clusters', 'GSE.SRA..Study.', 'Study..Citation..PMID.'], frameon=False, wspace=0.75)

In [None]:
# Cast every obs column to str; missing values become the literal string 'nan'.
steele_atlas.obs = steele_atlas.obs.astype(str)

In [None]:
# UMAP coloured by metastatic location, cluster, treatment and disease-state columns.
sc.pl.umap(steele_atlas, color=['If.metastatic..location', 'Clusters', 'Treatment', 'DiseaseState', 'TreatmentType', 'seurat_clusters'], frameon=False, wspace=0.75)

In [None]:
# Cell counts per metastatic location (rows) and disease state (columns).
steele_atlas.obs.groupby(['If.metastatic..location', 'DiseaseState']).size().unstack()

In [None]:
# Rename the atlas obs columns to Dataset / Donor_ID / Condition / Location (in place).
steele_atlas.obs.rename(columns={'GSE.SRA..Study.': 'Dataset', 'Name': 'Donor_ID', 'DiseaseState':'Condition', 'If.metastatic..location': 'Location'}, inplace=True)

In [None]:
# Study accessions, as they appear in the atlas 'Dataset' column, to extract.
needed = [
    'EGAS00001002543',
    'GSE158356',
    'GSE194247',
    'GSE211644',
    'GSE229413',
    'phs001840.v1.p1',
]

In [None]:
# steele_atlas.obs.drop('nCount_RNA', 'nFeature_RNA', 'percent.mt', 'seurat_clusters'

In [None]:
# Number of cells per dataset.
steele_atlas.obs.groupby('Dataset').size()

In [None]:
# Restrict to the datasets listed in `needed`. Slicing without .copy() gives a view of steele_atlas.
steele_atlas_needed = steele_atlas[steele_atlas.obs['Dataset'].isin(needed)]

In [None]:
# (n_cells, n_genes) of the selection.
steele_atlas_needed.shape

In [None]:
# Cells per dataset (rows) and condition (columns).
steele_atlas_needed.obs.groupby(['Dataset', 'Condition']).size().unstack()

In [None]:
# Cells per dataset (rows) and cluster label (columns).
steele_atlas_needed.obs.groupby(['Dataset', 'Clusters']).size().unstack()

In [None]:
import os

# Enter the Extension directory, where the per-dataset files below are written.
# Guarded so that re-running this cell (or running it after another cell has
# already changed into Extension) does not try to enter Extension/Extension.
if os.path.basename(os.getcwd()) != 'Extension':
    os.chdir('Extension')

In [None]:
# For every requested dataset: check the matrix on a 1% sample, keep the current X
# in layers['raw'], normalise + log-transform X, keep that in layers['log_norm'],
# and write one .h5ad file per dataset into the current working directory.
for dataset in needed:
    # File name: dots in the accession are replaced by underscores.
    dataset_name = dataset.replace('.','_') + '.h5ad'
    print(f'Saving: {dataset_name}')
    # Explicit copy so the in-place normalisation below does not touch steele_atlas.
    adata_temp = steele_atlas[steele_atlas.obs['Dataset'] == dataset].copy()
    subset = sc.pp.subsample(adata_temp, fraction=0.01, copy=True)
    # Densify only when X is sparse.
    X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X
    print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")
    # Check if all values are integers (raw count hint)
    is_integer = np.allclose(X, X.astype(int))
    print(f"All values are integers: {is_integer}")
    # Snapshot X before it is overwritten by the normalised values.
    adata_temp.layers['raw'] = adata_temp.X.copy()
    sc.pp.normalize_total(adata_temp, target_sum=1e4)
    sc.pp.log1p(adata_temp)
    adata_temp.layers['log_norm'] = adata_temp.X.copy()
    adata_temp.write(dataset_name)
    print('_'*50)

In [None]:
# Re-run the value check on adata_temp, i.e. the last dataset processed by the loop above.
subset = sc.pp.subsample(adata_temp, fraction=0.01, copy=True)
X = subset.X
if hasattr(X, "toarray"):
    # Sparse matrix: convert to a dense array for the statistics below.
    X = X.toarray()
print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")
# Integer-valued data hints at raw counts.
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

# Load Other Datasets

In [None]:
# Load the existing atlas object; the path is relative to the current working directory.
adata = sc.read_h5ad('../../Binned_Data/adata_scpoli_final_all_genes.h5ad')

In [None]:
# Show the AnnData summary.
adata

In [None]:
# Number of cells per dataset.
adata.obs.groupby('Dataset').size()

# Zhang

In [None]:
# Cells whose Dataset label contains 'Zhang' (a view of adata, not a copy).
zhang = adata[adata.obs.Dataset.str.contains('Zhang')]

In [None]:
# Distinct batch-covariate IDs present in the Zhang subset.
zhang.obs.ID_batch_covariate.unique().tolist()

In [None]:
# UMAP of the Zhang subset coloured by refined level-1 annotation and by tissue.
sc.pl.umap(zhang, color=['Level_1_refined', 'Tissue'], frameon=False, wspace=0.5)

## Zhang: all donors

In [None]:
# Load the concatenated Zhang data.
# NOTE(review): absolute cluster path — this cell only runs where that filesystem is mounted.
zhang_all = sc.read_h5ad('/lustre/groups/ml01/workspace/shrey.parikh/PDAC/raw_data/Zhang/Zhang_concat.h5ad')

In [None]:
# Map each Zhang sample ID to its sampling site. The original literal listed
# 'GSM5910784_Case1-YF_PDAC' twice; the duplicate key is removed (same value, so
# the mapping is unchanged). The spelling 'Liver Metastatses' is kept on purpose
# because later cells replace that exact string.
zhang_loc = {
    'GSM5910784_Case1-YF_PDAC': 'Tumour',
    'GSM5910785_Case1-ZY_LM': 'Liver Metastatses',
    'GSM5910786_Case2-ZC_NPT': 'Adjacent Normal',
    'GSM5910787_Case2-YF_PDAC': 'Tumour',
    'GSM5910788_Case2-ZY_LM': 'Liver Metastatses',
    'GSM5910789_Case3-YF_PDAC': 'Tumour',
    'GSM5910790_Case3-ZY_LM': 'Liver Metastatses',
    'GSM5910791_Case4-ZY_LM': 'Liver Metastatses',
}
zhang_all.obs['Location'] = zhang_all.obs.ID.map(zhang_loc)

# IDs missing from the mapping become NaN silently; report them instead of hiding them.
unmapped_ids = zhang_all.obs.loc[zhang_all.obs['Location'].isna(), 'ID'].unique().tolist()
if unmapped_ids:
    print(f'WARNING: no Location mapping for IDs: {unmapped_ids}')

In [None]:
# Number of cells per sample ID.
zhang_all.obs.groupby('ID').size()

In [None]:
# Value check on a 1% random sample of cells (densified only if X is sparse).
subset = sc.pp.subsample(zhang_all, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X

print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")

# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Preview the first rows of the obs table.
zhang_all.obs.head()

In [None]:
# Keep the current X in layers['raw'] (expected to be raw counts — see the check
# above), normalise each cell to a total of 1e4, log1p-transform, keep the result
# in layers['log_norm'], and write the object to the current working directory.
# Not idempotent: running this cell twice normalises already-normalised data.
zhang_all.layers['raw'] = zhang_all.X.copy()
sc.pp.normalize_total(zhang_all, target_sum=1e4)
sc.pp.log1p(zhang_all)
zhang_all.layers['log_norm'] = zhang_all.X.copy()
zhang_all.write('Zhang_GSE197177.h5ad')

In [None]:
# Show the AnnData summary.
zhang_all

In [None]:
# (n_cells, n_genes)
zhang_all.shape

In [None]:
# Repeat the value check after normalisation; X now holds the log-normalised values.
subset = sc.pp.subsample(zhang_all, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X
print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")
# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

# Lin MET

In [None]:
# Cells whose Dataset label contains 'Lin' (a view of adata, not a copy).
lin_metastasis = adata[adata.obs.Dataset.str.contains('Lin')]

In [None]:
# Number of cells per tissue.
lin_metastasis.obs.groupby('Tissue').size()

In [None]:
# Keep only cells with Tissue == 'Liver'. Take an explicit copy: later cells add
# layers to this object and normalise it in place, and doing that on a view makes
# AnnData materialise a copy implicitly (with an ImplicitModificationWarning).
lin_metastasis = lin_metastasis[lin_metastasis.obs.Tissue == 'Liver'].copy()

In [None]:
# (n_cells, n_genes) after the liver filter.
lin_metastasis.shape

In [None]:
# Distinct Unique_ID values remaining.
lin_metastasis.obs.Unique_ID.unique()

In [None]:
# Value check on a 1% random sample of cells (densified only if X is sparse).
subset = sc.pp.subsample(lin_metastasis, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X
print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")
# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Keep the current X in layers['raw'], normalise each cell to a total of 1e4,
# log1p-transform, and keep the result in layers['log_norm'].
# Not idempotent: running this cell twice normalises already-normalised data.
lin_metastasis.layers['raw'] = lin_metastasis.X.copy()
sc.pp.normalize_total(lin_metastasis, target_sum=1e4)
sc.pp.log1p(lin_metastasis)
lin_metastasis.layers['log_norm'] = lin_metastasis.X.copy()

In [None]:
# Repeat the value check after normalisation; X now holds the log-normalised values.
subset = sc.pp.subsample(lin_metastasis, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X
print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")
# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Write to the current working directory.
lin_metastasis.write('Lin_MET_GSE154778.h5ad')

In [None]:
# Show the AnnData summary.
lin_metastasis

# Simeone MET

In [None]:
# Cells whose Dataset label contains 'Simeon' (a view of adata, not a copy).
simeone_metastasis = adata[adata.obs.Dataset.str.contains('Simeon')]

In [None]:
# Number of cells per tissue.
simeone_metastasis.obs.groupby('Tissue').size()

In [None]:
# Keep only cells with Tissue == 'Liver'. Take an explicit copy: later cells add
# layers to this object and normalise it in place, and doing that on a view makes
# AnnData materialise a copy implicitly (with an ImplicitModificationWarning).
simeone_metastasis = simeone_metastasis[simeone_metastasis.obs.Tissue == 'Liver'].copy()

In [None]:
# (n_cells, n_genes) after the liver filter.
simeone_metastasis.shape

In [None]:
# Value check on a 1% random sample of cells (densified only if X is sparse).
subset = sc.pp.subsample(simeone_metastasis, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X
print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")
# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Keep the current X in layers['raw'], normalise each cell to a total of 1e4,
# log1p-transform, and keep the result in layers['log_norm'].
# Not idempotent: running this cell twice normalises already-normalised data.
simeone_metastasis.layers['raw'] = simeone_metastasis.X.copy()
sc.pp.normalize_total(simeone_metastasis, target_sum=1e4)
sc.pp.log1p(simeone_metastasis)
simeone_metastasis.layers['log_norm'] = simeone_metastasis.X.copy()

In [None]:
# Repeat the value check after normalisation; X now holds the log-normalised values.
subset = sc.pp.subsample(simeone_metastasis, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X
print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")
# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Write to the current working directory.
simeone_metastasis.write('Simeone_MET_GSE205013.h5ad')

In [None]:
# Show the AnnData summary.
simeone_metastasis

In [None]:
# Total number of cells after adding the extension datasets.
# NOTE(review): 726054 is presumably the cell count of the existing atlas — confirm.
N_EXISTING_CELLS = 726054

# The original cell also contained a bare copy of this sum; it was not the last
# expression of the cell, so it was never displayed, and it is dropped here.
n_extension_cells = sum(
    dataset.shape[0]
    for dataset in (simeone_metastasis, lin_metastasis, zhang_all, steele_atlas_needed)
)
print(n_extension_cells + N_EXISTING_CELLS)

In [None]:
pwd

In [None]:
# Reload the Simeone metastasis object from disk. The file is written into the
# current working directory, which is already Extension when the notebook runs
# top to bottom; fall back to the Extension/ prefix when the kernel is one level up.
simeone_path = 'Simeone_MET_GSE205013.h5ad'
if not os.path.exists(simeone_path):
    simeone_path = os.path.join('Extension', simeone_path)
simeone_metastasis = sc.read_h5ad(simeone_path)

In [None]:
# Preview the first rows of the reloaded obs table.
simeone_metastasis.obs.head()

# Lee MET

In [None]:
# Load the concatenated Lee data.
# NOTE(review): absolute cluster path — this cell only runs where that filesystem is mounted.
lee_all = sc.read_h5ad('/lustre/groups/ml01/workspace/shrey.parikh/PDAC/raw_data/Lee/Lee_concat.h5ad')

In [None]:
# Keep the three samples whose IDs start with LiM / VM / LuM. Take an explicit
# copy: later cells add an obs column and layers to this object and normalise it
# in place, and doing that on a view makes AnnData materialise a copy implicitly
# (with an ImplicitModificationWarning).
lee_metastasis_ids = [
    'LiM_filtered_feature_bc_matrix',
    'VM_filtered_feature_bc_matrix',
    'LuM_filtered_feature_bc_matrix',
]
lee_all_met = lee_all[lee_all.obs.ID.isin(lee_metastasis_ids)].copy()

In [None]:
# Number of cells per sample ID.
lee_all_met.obs.groupby('ID').size()

In [None]:
# Value check on a 1% random sample of cells (densified only if X is sparse).
subset = sc.pp.subsample(lee_all_met, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X

print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")

# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Derive the Location label from the sample ID.
lee_all_met.obs['Location'] = lee_all_met.obs.ID.map({'LiM_filtered_feature_bc_matrix': 'Liver',  
                                                      'VM_filtered_feature_bc_matrix': 'Vaginal Apex', 
                                                      'LuM_filtered_feature_bc_matrix': 'Lung'})

In [None]:
# Number of cells per location.
lee_all_met.obs.groupby(['Location']).size()

In [None]:
# Keep the current X in layers['raw'], normalise each cell to a total of 1e4,
# log1p-transform, and keep the result in layers['log_norm'].
# Not idempotent: running this cell twice normalises already-normalised data.
lee_all_met.layers['raw'] = lee_all_met.X.copy()
sc.pp.normalize_total(lee_all_met, target_sum=1e4)
sc.pp.log1p(lee_all_met)
lee_all_met.layers['log_norm'] = lee_all_met.X.copy()

In [None]:
# Repeat the value check after normalisation; X now holds the log-normalised values.
subset = sc.pp.subsample(lee_all_met, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X

print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")

# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Show the AnnData summary.
lee_all_met

In [None]:
# Write to the current working directory.
lee_all_met.write('Lee_MET_GSE156405.h5ad')

# Reload all datasets and concat

In [None]:
import os

# Enter the Extension directory that holds the per-dataset .h5ad files.
# Guarded: an earlier cell already changes into Extension, so an unconditional
# chdir here would look for Extension/Extension on a top-to-bottom run.
if os.path.basename(os.getcwd()) != 'Extension':
    os.chdir('Extension')

In [None]:
# Print the part of every file name before the first dot; the loading cell below
# uses the same expression as the Dataset label.
for file in os.listdir():
    print(file.split('.')[0])

In [None]:
# Load every per-dataset .h5ad file in the current directory, keyed by file name.
# - sorted(): os.listdir() returns entries in arbitrary order, which would make
#   the order of the concatenated object vary between runs.
# - endswith('.h5ad'): the original substring test also matched names that merely
#   contain 'h5ad', and "Loading" was printed for files that were then skipped.
# - A previously written combined file is skipped so it is not merged into itself.
adatas = {}
for file in sorted(os.listdir()):
    if not file.endswith('.h5ad') or 'Extension_Datasets_Combined' in file:
        continue
    print(f'Loading: {file}')
    adatas[file] = sc.read_h5ad(file)
    # Dataset label = file name up to the first dot (e.g. 'phs001840_v1_p1').
    adatas[file].obs['Dataset'] = file.split('.')[0]

In [None]:
# Print the obs columns and layers of every loaded dataset.
# The loop variable is deliberately not called `adata`: that name holds the atlas
# loaded earlier in the notebook, and reusing it here would silently rebind it to
# the last extension dataset.
for key, adata_loaded in adatas.items():
    print('-'*50)
    print(f'Columns in {key}: {adata_loaded.obs.columns}')
    print(f'Layers: {adata_loaded.layers}')
    print('-'*50)

In [None]:
# Concatenate all loaded datasets along the cell axis. join='outer' keeps the
# union of genes; the source file name of each cell is recorded in obs['batch'].
adata_combined = sc.concat(adatas.values(), label='batch', keys=adatas.keys(), join='outer')

In [None]:
# Show the AnnData summary.
adata_combined

In [None]:
# For every dataset in the combined object, verify on a 1% sample of cells that
# layers['raw'] holds integer-valued (raw count) data.
for dataset in adata_combined.obs.Dataset.unique():
    print(f'{dataset}')
    adata_temp = adata_combined[adata_combined.obs.Dataset == dataset]
    subset = sc.pp.subsample(adata_temp, fraction=0.01, copy=True)

    # Inspect the raw layer in both branches. The original tested subset.X for
    # sparsity and fell back to subset.X (the log-normalised matrix) when dense,
    # so the dense case never looked at the raw counts.
    raw_layer = subset.layers['raw']
    X = raw_layer.toarray() if hasattr(raw_layer, "toarray") else raw_layer

    print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")

    # Check if all values are integers (raw count hint)
    is_integer = np.allclose(X, X.astype(int))
    print(f"All values are integers: {is_integer}")
    print('-'*50)

In [None]:
# Relabel 'Liver Metastatses' (the spelling used in the Zhang mapping) as 'Liver'.
adata_combined.obs.Location = adata_combined.obs.Location.replace('Liver Metastatses', 'Liver')

In [None]:
# Relabel the literal string 'nan' as 'NA'; real missing values are not affected.
adata_combined.obs.Location = adata_combined.obs.Location.replace('nan', 'NA')

In [None]:
# Cells per dataset (rows) and location (columns).
adata_combined.obs.groupby(['Dataset', 'Location']).size().unstack()

In [None]:
# Relabel 'Adjacent normal' as 'Healthy'.
adata_combined.obs.Condition = adata_combined.obs.Condition.replace('Adjacent normal', 'Healthy')

In [None]:
# Cells per dataset (rows) and condition (columns).
adata_combined.obs.groupby(['Dataset', 'Condition']).size().unstack()

In [None]:
# location: pancreas,

# Fix GSE229413

In [None]:
# Label every GSE229413 cell as 'Healthy'.
# Cast Condition to plain strings first: if the column is categorical (it may be,
# after reloading from .h5ad and concatenating — not verifiable from here), then
# assigning a label that is not an existing category raises a TypeError.
adata_combined.obs['Condition'] = adata_combined.obs['Condition'].astype(str)
mask = adata_combined.obs.Dataset == 'GSE229413'
adata_combined.obs.loc[mask, 'Condition'] = 'Healthy'

In [None]:
# Store Condition as plain strings; missing values become the literal string 'nan'.
adata_combined.obs.Condition = adata_combined.obs.Condition.astype(str)

# Fix Lin, Simeone, and Lee

In [None]:
# Every cell from the Lin, Simeone and Lee datasets is a metastatic lesion.
# Datasets are matched by substring of the Dataset label, one tag at a time.
for dataset_tag in ('Lin', 'Simeone', 'Lee'):
    mask = adata_combined.obs.Dataset.str.contains(dataset_tag)
    adata_combined.obs.loc[mask, 'Condition'] = 'Metastatic Lesion'

In [None]:
# Unify capitalisation of the metastatic label.
adata_combined.obs.Condition = adata_combined.obs.Condition.replace('Metastatic lesion', 'Metastatic Lesion')

In [None]:
# Cells per dataset (rows) and condition (columns).
adata_combined.obs.groupby(['Dataset', 'Condition']).size().unstack()

In [None]:
# For Zhang cells, take the Condition from the Location column; the resulting
# labels are mapped to condition names in the following cells.
mask = adata_combined.obs.Dataset.str.contains('Zhang') #.obs.Donor_ID.unique().tolist()
adata_combined.obs.loc[mask, 'Condition'] = adata_combined.obs.loc[mask, 'Location']

In [None]:
# Translate the location-style labels copied into Condition into condition names.
adata_combined.obs.Condition = adata_combined.obs.Condition.replace('Tumour', 'Primary Tumor')
adata_combined.obs.Condition = adata_combined.obs.Condition.replace('Liver Metastatses', 'Metastatic Lesion')
# The original wrote the misspelt label 'Healhty' here; write the correct label directly.
adata_combined.obs.Condition = adata_combined.obs.Condition.replace('Adjacent Normal', 'Healthy')

In [None]:
# Normalise any misspelt 'Healhty' label and unify the capitalisation of 'Primary Tumor'.
adata_combined.obs.Condition = adata_combined.obs.Condition.replace('Healhty', 'Healthy')
adata_combined.obs.Condition = adata_combined.obs.Condition.replace('Primary tumor', 'Primary Tumor')

In [None]:
# A Condition of 'Liver' (a location label) is relabelled as a metastatic lesion.
adata_combined.obs.Condition = adata_combined.obs.Condition.replace('Liver', 'Metastatic Lesion')

In [None]:
# Cells per dataset (rows) and condition (columns).
adata_combined.obs.groupby(['Dataset', 'Condition']).size().unstack()

In [None]:
# Number of cells per condition.
adata_combined.obs.groupby(['Condition']).size()

In [None]:
# Cast every obs column to str; missing values become the literal string 'nan'.
adata_combined.obs = adata_combined.obs.astype(str)

In [None]:
# Turn the literal 'nan' strings back into real missing values so fillna works below.
adata_combined.obs.replace("nan", np.nan, inplace=True)

In [None]:
# Unified cell-type label: Level_1_refined where present, otherwise Clusters.
adata_combined.obs['Atlas_Extension_CellType'] = adata_combined.obs['Level_1_refined'].fillna(adata_combined.obs['Clusters'])

In [None]:
# List the obs columns.
adata_combined.obs.columns

In [None]:
# Per-dataset annotation, QC and clustering columns that are not carried into the
# combined object. Raises KeyError if any listed column is absent.
obs_columns_to_drop = [
    'Level_1', 'Level_1_refined',
    'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'seurat_clusters', 'Count',
    'Study..Citation..PMID.', 'Clusters',
    'n_genes', 'n_counts', 'log_counts', 'mt_frac',
    'n_genes_by_counts', 'log1p_n_genes_by_counts',
    'total_counts', 'log1p_total_counts',
    'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito',
    'leiden', 'batch',
    'leiden_0.2', 'leiden_0.2_annotation', 'leiden_subcluster',
    'level0_leiden_subcluster', 'leiden_0.5', 'Level_0',
]
adata_combined.obs.drop(columns=obs_columns_to_drop, inplace=True)

In [None]:
# Preview the first rows of the obs table after dropping columns.
adata_combined.obs.head()

# Fix Location

In [None]:
# Cells per dataset (rows) and location (columns) before the fixes below.
adata_combined.obs.groupby(['Dataset', 'Location']).size().unstack()

In [None]:
# Set Location to 'Pancreas' for every GSE229413 cell.
mask = adata_combined.obs.Dataset == 'GSE229413' #.obs.Donor_ID.unique().tolist()
adata_combined.obs.loc[mask, 'Location'] = 'Pancreas'

In [None]:
# Set Location to 'Pancreas' for every GSE194247 cell.
mask = adata_combined.obs.Dataset == 'GSE194247' #.obs.Donor_ID.unique().tolist()
adata_combined.obs.loc[mask, 'Location'] = 'Pancreas'

In [None]:
# Set Location to 'Pancreas' for every GSE211644 cell.
mask = adata_combined.obs.Dataset == 'GSE211644' #.obs.Donor_ID.unique().tolist()
adata_combined.obs.loc[mask, 'Location'] = 'Pancreas'

In [None]:
# Set Location to 'Pancreas' for every phs001840 cell.
# The export loop writes this dataset as 'phs001840_v1_p1.h5ad' (dots replaced by
# underscores) and the Dataset label is rebuilt from the file name on reload, so
# the label in adata_combined is 'phs001840_v1_p1'. Matching only the dotted
# accession selected no cells; accept both spellings.
mask = adata_combined.obs.Dataset.isin(['phs001840.v1.p1', 'phs001840_v1_p1'])
adata_combined.obs.loc[mask, 'Location'] = 'Pancreas'

In [None]:
# Set Location to 'Pancreas' for every EGAS00001002543 cell.
mask = adata_combined.obs.Dataset == 'EGAS00001002543' #.obs.Donor_ID.unique().tolist()
adata_combined.obs.loc[mask, 'Location'] = 'Pancreas'

In [None]:
# Intended to relabel Zhang primary-tumour cells ('Tumour') as 'Pancreas'.
# NOTE(review): Dataset labels are rebuilt from file names on reload, and the Zhang
# file is written as 'Zhang_GSE197177.h5ad', so an exact match on 'Zhang' looks
# like it selects no rows — confirm (other cells use .str.contains('Zhang')).
mask = (adata_combined.obs.Dataset == 'Zhang') & (adata_combined.obs.Location == 'Tumour')
adata_combined.obs.loc[mask, 'Location'] = 'Pancreas'

In [None]:
# Relabel Zhang primary-tumour cells as 'Pancreas' BEFORE dropping the 'Tumour'
# category. Removing a category sets every cell that still carries it to NaN, and
# the Zhang Dataset label is derived from the file name ('Zhang_GSE197177'), so an
# exact match on 'Zhang' leaves those cells labelled 'Tumour' and they would end
# up with a missing Location.
location = adata_combined.obs['Location'].astype(object)
zhang_tumour = (
    adata_combined.obs['Dataset'].astype(str).str.contains('Zhang')
    & (location == 'Tumour')
)
location.loc[zhang_tumour] = 'Pancreas'

# Convert to categorical if it's not already
location = location.astype('category')
# Remove the unwanted category only if it is still present; remove_categories
# raises ValueError for a category that does not exist.
if 'Tumour' in location.cat.categories:
    location = location.cat.remove_categories(['Tumour'])
adata_combined.obs['Location'] = location

In [None]:
# Adjacent-normal samples are relabelled with the location 'Pancreas'.
adata_combined.obs.Location = adata_combined.obs.Location.replace('Adjacent Normal', 'Pancreas')

In [None]:
# Cells whose Location is the placeholder string 'NA' are relabelled 'Pancreas'.
# Real missing values (NaN) are not matched by this replace.
adata_combined.obs.Location = adata_combined.obs.Location.replace('NA', 'Pancreas')

In [None]:
# Cells per dataset (rows) and location (columns) after the fixes.
adata_combined.obs.groupby(['Dataset', 'Location']).size().unstack()

In [None]:
# Number of cells per location.
adata_combined.obs.groupby(['Location']).size()

In [None]:
# Make X the raw layer again (X held the log-normalised values after loading).
adata_combined.X = adata_combined.layers['raw'].copy()

In [None]:
# Value check on a 1% random sample of cells; X should now be integer-valued.
subset = sc.pp.subsample(adata_combined, fraction=0.01, copy=True)
X = subset.X.toarray() if hasattr(subset.X, "toarray") else subset.X

print(f"Min: {X.min()}, Max: {X.max()}, Mean: {X.mean()}")

# Check if all values are integers (raw count hint)
is_integer = np.allclose(X, X.astype(int))
print(f"All values are integers: {is_integer}")

In [None]:
# Write the combined object to the current working directory. The loading cell
# skips files with this name so a re-run does not merge the output into itself.
adata_combined.write('Extension_Datasets_Combined.h5ad')