In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import scipy.sparse as sp
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sb
# Show every column when a DataFrame is displayed (None disables the column limit).
pd.set_option('display.max_columns', None)

# Load everything

In [None]:
print("Loading gene lists...")

# --- gene lists: each source is flattened into a plain Python list ---
mofa_genes_df = pd.read_csv('../Expimap/expimap_10_factors_selected_norepeatinggenes/var_names.csv')
mofa_genes = mofa_genes_df.values.flatten().tolist()

broad_markers = pd.read_csv('broad_markers.csv', index_col='Unnamed: 0')
# Missing table cells turn into the string 'nan' after astype(str); drop them, then de-duplicate.
marker_values = broad_markers.values.flatten().astype(str)
broad_markers_list = list(set(marker_values[marker_values != 'nan']))

# NOTE(review): read_pickle is called on a path ending in .csv — confirm the file really is a
# pickle. Unpickling can execute arbitrary code, so only load files from a trusted source.
de_genes_df = pd.read_pickle('de_genes_to_be_added.csv')
de_genes = de_genes_df.values.flatten().tolist()

xenium_df = pd.read_csv('pdac_xenium_panel.csv')
xenium_genes = list(set(xenium_df.Gene.tolist()))

print(f"Length of MOFA genes: {len(mofa_genes)}")
print(f"Length of broad marker genes: {len(broad_markers_list)}")
print(f"Length of DE genes: {len(de_genes)}")
print(f"Length of xenium panel genes: {len(xenium_genes)}")

# Union of all four sources, duplicates removed.
all_genes = list(set().union(mofa_genes, broad_markers_list, de_genes, xenium_genes))
print(f"Total unique genes combined: {len(all_genes)}")

# --- AnnData objects: single-cell and single-nucleus ---
adata_sc = sc.read_h5ad('../single_cell_int/adata_sc_int_cnv.h5ad')
adata_sn = sc.read_h5ad('../single_nuc_int/adata_nuc_int_outlier_genes.h5ad')

# Concatenate the anndata

In [None]:
# Combine the single-cell and single-nucleus objects with an outer join.
# NOTE(review): AnnData.concatenate is, as far as I know, deprecated in favour of anndata.concat.
# A later cell renames var columns carrying "-0"/"-1" suffixes, so verify that a replacement
# still produces those suffixes before migrating.
adata = adata_sc.concatenate(adata_sn, join='outer')

# Load the zarr and add the attributes to concatenated adata

In [None]:
# Read the AnnData object stored in the finalized zarr directory.
zarr_scpoli = ad.read_zarr('../Finalized/adata_scpoli_final.zarr/')

In [None]:
# Display the dimensions of the stored X_scpoli embedding.
zarr_scpoli.obsm['X_scpoli'].shape

In [None]:
# Copy the X_scpoli and X_umap embeddings, the obsp mapping and the uns mapping from the zarr
# object onto the concatenated object.
# NOTE(review): this presumes zarr_scpoli holds the same cells in the same order as adata —
# TODO confirm (e.g. compare obs_names) before trusting the transferred values.
adata.obsm['X_scpoli'] = zarr_scpoli.obsm['X_scpoli'].copy()
adata.obsm['X_umap'] = zarr_scpoli.obsm['X_umap'].copy()
adata.obsp = zarr_scpoli.obsp.copy()
# adata.varm = zarr_scpoli.varm.copy()
adata.uns = zarr_scpoli.uns.copy()

In [None]:
# Replace the per-cell annotation table with a copy of the one stored in the zarr object.
# NOTE(review): presumably both objects list the same cells in the same order — TODO confirm.
adata.obs = zarr_scpoli.obs.copy()

In [None]:
# Make Level_0 consistent with Level_1_refined: every Level_1_refined label is assigned the
# Level_0 label that is most frequent among its cells, and cells that disagree are corrected.
#
# .copy() gives an independent frame, so adding a column below does not write into a slice
# of adata.obs (which triggers SettingWithCopyWarning and is ambiguous under copy-on-write).
df = adata.obs[['Level_0', 'Level_1_refined']].copy()

# Rows: Level_1_refined, columns: Level_0, values: cell counts; idxmax picks the dominant Level_0.
# observed=False spells out the long-standing default for categorical keys (all category
# combinations are kept), which newer pandas otherwise warns about.
dominant_mapping = (
    df.groupby(['Level_1_refined', 'Level_0'], observed=False)
    .size()
    .unstack(fill_value=0)
    .idxmax(axis=1)
)

df['Corrected_Level_0'] = df['Level_1_refined'].map(dominant_mapping)
misclassified = df[df['Level_0'] != df['Corrected_Level_0']]
# Only the disagreeing cells are rewritten in adata.obs.
adata.obs.loc[misclassified.index, 'Level_0'] = misclassified['Corrected_Level_0']

In [None]:
# UMAP coloured by Level_0 and Level_1_refined, one panel per row.
sc.pl.umap(adata, color=['Level_0','Level_1_refined'], frameon=False, ncols=1)

# Mark the manual genes in Manual_Genes Column

In [None]:
# Genes of the combined list that are present in each object's var_names.
valid_genes_sc = [gene for gene in all_genes if gene in adata_sc.var_names]
valid_genes_sn = [gene for gene in all_genes if gene in adata_sn.var_names]
# Label each count with its object: the two messages used to be identical (and misspelled),
# which made the single-cell and single-nucleus numbers indistinguishable in the output.
print(f"Total valid genes in adata_sc: {len(valid_genes_sc)}")
print(f"Total valid genes in adata_sn: {len(valid_genes_sn)}")

In [None]:
# Keep only the genes found in both the single-cell and the single-nucleus object.
all_genes = list(set(valid_genes_sc).intersection(valid_genes_sn))

In [None]:
# Boolean flag per gene: True when the gene name is contained in all_genes.
adata.var['Manual_Genes'] = adata.var.index.isin(all_genes)

# Refine Obs Cols

In [None]:
# Side-by-side table of the obs column names of both objects, aligned by position.
# The default left join keeps one row per adata_sc column.
df1 = pd.DataFrame({"Column": list(adata_sc.obs.columns)})
df2 = pd.DataFrame({"Column": list(adata_sn.obs.columns)})
df = df1.join(df2, lsuffix="_adata_sc", rsuffix="_adata_sn")

In [None]:
# Collect the obs columns in which every value is missing, then report each of them.
cols_removed = [col for col in adata.obs.columns if adata.obs[col].isna().all()]
for col in cols_removed:
    print(f"Column '{col}' contains only NaN values.")

In [None]:
# NOTE(review): this list is not referenced by any cell visible in this part of the notebook —
# confirm whether 'total_counts' was meant to be dropped from adata.obs.
obs_cols_removed= ['total_counts']

# Var columns are suffixed with -0 (from adata_sc) and -1 (from adata_sn) because the var values differ and were calculated separately for SC and SN

In [None]:
# Rename the suffixes to indicate the dataset explicitly.
# The patterns are anchored at the end of the name ("$") so that only a trailing "-0"/"-1"
# suffix is rewritten. A plain substring replace also rewrites "-0"/"-1" occurring inside a
# column name (for example "x-10" contains "-1").
adata.var.columns = (
    adata.var.columns
    .str.replace(r"-0$", "_adata_sc", regex=True)
    .str.replace(r"-1$", "_adata_sn", regex=True)
)

In [None]:
# Embeddings that are dropped from the combined object.
obsm_removed = ['X_harmony', 'X_pca', 'X_cnv', 'X_cnv_pca', 'X_cnv_umap']
for obsm_key in obsm_removed:
    # Raises KeyError when the entry is absent (e.g. when this cell is run a second time).
    del adata.obsm[obsm_key]

In [None]:
# Remove the n_counts column from the per-cell annotations.
adata.obs = adata.obs.drop(columns='n_counts')

# Filter empty droplets

In [None]:
# Store the log-normalised MALAT1 expression of every cell as an obs column.
# The layer slice has shape (n_obs, 1); it is flattened to 1-D so the column assignment does
# not depend on pandas accepting a single-column 2-D array. The issparse guard lets the cell
# also work when the layer is a dense array (which has no .toarray()).
malat1_expr = adata[:, 'MALAT1'].layers['log_norm']
if sp.issparse(malat1_expr):
    malat1_expr = malat1_expr.toarray()
adata.obs['MALAT1_lognorm'] = np.asarray(malat1_expr).ravel()

In [None]:
# One density plot of MALAT1_lognorm per Condition value.
for condition in adata.obs.Condition.unique():
    print(f"Condition: {condition}")
    # Read the column straight from obs; subsetting the whole AnnData object just to access
    # one obs column builds an unnecessary view.
    malat1_values = adata.obs.loc[adata.obs.Condition == condition, 'MALAT1_lognorm']
    plt.figure(figsize=(8, 6))
    # fill= replaces the deprecated shade= argument of seaborn.kdeplot.
    sb.kdeplot(malat1_values, fill=True, color='blue', label='MALAT1_lognorm')
    plt.title('Density Plot of MALAT1_lognorm')
    plt.xlabel('MALAT1_lognorm Values')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# A cell is kept (not an empty droplet) when its MALAT1_lognorm passes the threshold of its
# technology: strictly above 3.5 for 'scRNA-seq', at least 4 for every other Condition.
# Everything else is flagged as an empty droplet.
adata.obs['empty_droplet'] = ~(
    ((adata.obs['Condition'] == 'scRNA-seq') & (adata.obs['MALAT1_lognorm'] > 3.5))
    | ((adata.obs['Condition'] != 'scRNA-seq') & (adata.obs['MALAT1_lognorm'] >= 4))
).to_numpy()


In [None]:
# Density of the number of detected genes per cell, restricted to cells with at most 2000 genes.
plt.figure(figsize=(8, 6))
# fill= replaces the deprecated shade= argument of seaborn.kdeplot; the values are read
# directly from obs instead of going through an AnnData view.
sb.kdeplot(
    adata.obs.loc[adata.obs.n_genes_by_counts <= 2000, 'n_genes_by_counts'],
    fill=True,
    color='blue',
    label='n_genes_by_counts',
)
plt.title('Density Plot of detected_genes')
# The x axis shows n_genes_by_counts (the label used to be copied from the MALAT1 plot).
plt.xlabel('n_genes_by_counts Values')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Keep only the cells not flagged as empty droplets; .copy() makes the result independent of
# the original object instead of a view onto it.
adata = adata[~adata.obs.empty_droplet].copy()

In [None]:
# Filter cells in place with a minimum of 400 genes per cell.
sc.pp.filter_cells(adata, min_genes=400)

# Adjust patient ID and add Metadata

In [None]:
# Patient metadata table; the first CSV column becomes the index.
# (Later cells select rows by comparing this index with a dataset name.)
metadata = pd.read_csv('metadata_human.csv', index_col=0)

## Adjust PID

In [None]:
# 'Condition' becomes 'Technology' and 'ID' becomes 'Unique_ID'.
adata.obs = adata.obs.rename(columns={'Condition': 'Technology', 'ID': 'Unique_ID'})
# ID_harmonised starts out as a copy of Unique_ID and is rewritten per dataset afterwards.
adata.obs["ID_harmonised"] = adata.obs["Unique_ID"].copy()

In [None]:
#rename Regev
# The dataset label 'Regev' is replaced by 'Hwang'.
adata.obs['Dataset'] = adata.obs.Dataset.replace('Regev', 'Hwang')
# Rows whose ID_batch_covariate contains 'Hwang' get ID_harmonised rewritten as
# "PDAC_<first character>_<remaining characters>"; all other rows keep their value.
# The list comprehension builds a candidate for every row; np.where picks it only for Hwang rows.
adata.obs['ID_harmonised'] = np.where(
    adata.obs['ID_batch_covariate'].str.contains('Hwang'),
    [f"PDAC_{cell[0]}_{cell[1:]}" for cell in adata.obs['ID_harmonised']],
    adata.obs['ID_harmonised']
)
# For the same rows, ID_batch_covariate is rebuilt from the already harmonised ID as
# "<ID_harmonised>_Hwang_snRNA-seq". The mask is evaluated before the column is overwritten.
adata.obs['ID_batch_covariate'] = np.where(
    adata.obs['ID_batch_covariate'].str.contains('Hwang'),
    [f"{cell}_Hwang_snRNA-seq" for cell in adata.obs['ID_harmonised']],
    adata.obs['ID_batch_covariate']
)

In [None]:
#rename Lee
# Strip the "_filtered_feature_bc_matrix" suffix from the five Lee sample ids.
# The replacement is applied to ID_harmonised itself. Rebuilding the column from Unique_ID
# (as was done before) resets every row to its raw id and silently discards the harmonised
# ids that earlier cells already wrote into ID_harmonised.
adata.obs['ID_harmonised'] = adata.obs['ID_harmonised'].replace({
    'P1_filtered_feature_bc_matrix': "P1",
    'P2_filtered_feature_bc_matrix': "P2",
    'P3_filtered_feature_bc_matrix': "P3",
    'P4_filtered_feature_bc_matrix': "P4",
    'P5_filtered_feature_bc_matrix': "P5",
    })
# For batch covariates containing 'Lee', keep the first two characters and the last fourteen
# characters and drop everything in between.
adata.obs['ID_batch_covariate'] = [ID[:2] + ID[-14:] if 'Lee' in ID else ID for ID in adata.obs.ID_batch_covariate]

In [None]:
#rename Simeone
def _strip_gsm620_prefix(sample_id):
    """Return ``sample_id`` from its first 'P' onwards when it contains 'GSM620', else unchanged."""
    if 'GSM620' in sample_id:
        return sample_id[sample_id.find('P'):]
    return sample_id


# The same rule is applied to both id columns.
adata.obs['ID_harmonised'] = list(map(_strip_gsm620_prefix, adata.obs.ID_harmonised))
adata.obs['ID_batch_covariate'] = list(map(_strip_gsm620_prefix, adata.obs.ID_batch_covariate))


In [None]:
#rename Caronni
# GEO sample prefix -> patient id used for the Caronni samples.
sample_mapping = {
    'GSM6727546_PDAC': 'PDAC47',
    'GSM6727547_PDAC': 'PDAC48',
    'GSM6727548_PDAC': 'PDAC50',
    'GSM6727549_PDAC': 'PDAC51',
    'GSM6727550_PDAC': 'PDAC55',
    'GSM6727551_PDAC': 'PDAC60',
    'GSM6727542_LPDAC': 'LPDAC15',
    'GSM6727543_LPDAC': 'LPDAC25',
    'GSM6727544_LPDAC': 'LPDAC26',
    'GSM6727545_LPDAC': 'LPDAC30'
}


def _harmonise_caronni_batch_id(batch_id, mapping=sample_mapping):
    """Rewrite '<sample>_Caronni...' as '<patient>_Caronni_scRNA-seq'; leave other ids unchanged.

    The id is split at the first '_Caronni', the same marker that decides whether the id is
    rewritten at all. Prefixes missing from ``mapping`` are kept as they are (like
    ``Series.replace`` does for ID_harmonised), so running the cell a second time on already
    renamed ids is a no-op instead of a KeyError.
    """
    prefix, marker, _ = batch_id.partition('_Caronni')
    if not marker:
        return batch_id
    return mapping.get(prefix, prefix) + '_Caronni_scRNA-seq'


adata.obs['ID_harmonised'] = adata.obs['ID_harmonised'].replace(sample_mapping)
adata.obs['ID_batch_covariate'] = [_harmonise_caronni_batch_id(ID) for ID in adata.obs.ID_batch_covariate]

In [None]:
#rename Lin
# GEO sample accession -> patient id used for the Lin samples.
sample_mapping = {
    'GSM4679532': 'P01',
    'GSM4679533': 'P02',
    'GSM4679534': 'P03',
    'GSM4679535': 'P04',
    'GSM4679536': 'P05',
    'GSM4679537': 'P06',
    'GSM4679538': 'P07',
    'GSM4679539': 'P08',
    'GSM4679540': 'P09',
    'GSM4679541': 'P10',
    'GSM4679542': 'M01',
    'GSM4679543': 'M02',
    'GSM4679544': 'M03',
    'GSM4679545': 'M04',
    'GSM4679546': 'M05',
    'GSM4679547': 'M06'
}


def _harmonise_lin_batch_id(batch_id, mapping=sample_mapping):
    """Rewrite '<sample>_Lin...' as '<patient>_Lin_scRNA-seq'; leave other ids unchanged.

    The id is split at the first '_Lin', and that same marker decides whether the id is
    rewritten. Testing for the bare substring 'Lin' while slicing at '_Lin' would mis-slice
    any id that contains 'Lin' without the underscore. Prefixes missing from ``mapping`` are
    kept as they are, so running the cell a second time on already renamed ids is a no-op
    instead of a KeyError.
    """
    prefix, marker, _ = batch_id.partition('_Lin')
    if not marker:
        return batch_id
    return mapping.get(prefix, prefix) + '_Lin_scRNA-seq'


adata.obs['ID_harmonised'] = adata.obs['ID_harmonised'].replace(sample_mapping)
adata.obs['ID_batch_covariate'] = [_harmonise_lin_batch_id(ID) for ID in adata.obs.ID_batch_covariate]

In [None]:
# Collapse dataset labels that carry a technology/condition suffix onto a single dataset name;
# labels not listed here are copied unchanged into Dataset_unique.
mapping = {
    'Ding_snRNA-seq': 'Ding',
    'Peng_Normal': 'Peng',
    'Steele_Adj_Norm': 'Steele',
}
adata.obs['Dataset_unique'] = adata.obs.Dataset.replace(mapping)

In [None]:
def check_overlap(adata, metadata, dataset):
    """Print the patients listed in ``metadata`` for ``dataset`` that have no cell in ``adata``.

    Metadata rows are selected by comparing the ``metadata`` index with ``dataset``; cells are
    selected by comparing ``adata.obs.Dataset_unique`` with ``dataset``. Patients are matched
    on ``metadata.Patient`` versus ``adata.obs.ID_harmonised``. Nothing is returned.
    """
    expected_patients = set(metadata.loc[metadata.index == dataset, 'Patient'])
    observed_patients = set(adata.obs.loc[adata.obs.Dataset_unique == dataset, 'ID_harmonised'])
    missing_patients = expected_patients.difference(observed_patients)
    print(f"Missing patients in {dataset}: {missing_patients}")

In [None]:
# Report, for every dataset name in Dataset_unique, the metadata patients without cells.
for dataset in adata.obs.Dataset_unique.unique():
    check_overlap(adata, metadata, dataset)

## cleanup metadata

In [None]:
# Free-text treatment description (as spelled in the metadata table, including typos and an
# embedded "\r\n") -> harmonised treatment label. Several spellings map to the same label.
# The np.nan key turns a missing treatment entry into 'Untreated'.
treatment_mapping = {
    'neoadjuvant gem/abraxane': 'Neoadjuvant Gem/Abraxane',
    'Neoadjuvant gemcitabine and nab-\r\npaclitaxel': 'Neoadjuvant Gem/Abraxane',
    's/p mFOLFIRINOX -> chemoradiation with gemzar + radiation': 'FOLFIRINOX + Chemoradiation',
    's/p gemcitabine/paclitaxel': 'Gem/Abraxane',
    'FOLFOXIRI -> nab-paclitaxel/gemcitabine-> Whipple': 'FOLFIRINOX + Gem/Abraxane + Whipple',
    'gemcitabine/paclitaxel': 'Gem/Abraxane',
    '5-fluorouracil/liposmal irinotecan': '5-FU/Liposomal Irinotecan',
    'evofosfamide/ipilimumab': 'Evofosfamide/Ipilimumab',
    'capecitabine': 'Capecitabine',
    'FOLFIRINOX + radiotherapy with concurrent capecitabine or 5-FU': 'FOLFIRINOX + Radiotherapy + Capecitabine/5-FU',
    'FOLFIRINOX + losartan + radiotherapy with concurrent capecitabine or 5-FU': 'FOLFIRINOX + Losartan + Radiotherapy + Capecitabine/5-FU',
    'FOLFIRINOX + stereotactic body radiotherapy + nivolumab': 'FOLFIRINOX + SBRT + Nivolumab',
    'FOLFIRINOX + stereotactic body radiotherapy + losartan + nivolumab; Other, treatment regimen consisting of chemotherapy and/or radiotherapy combination not otherwise specified': 'FOLFIRINOX + SBRT + Losartan + Nivolumab',
    'Other': 'Other',
    'FOLFIRINOX': 'FOLFIRINOX',
    # NOTE(review): 'Chemo-RT' is mapped to a FOLFIRINOX label — confirm against the source study.
    'Chemo-RT': 'FOLFIRINOX + RT',
    'Gem_abrax': 'Gem/Abraxane',
    'FOLFIRINOX then Gem_abrax': 'FOLFIRINOX + Gem/Abraxane',
    'FOLIRINOX, Gem/Abraxane': 'FOLFIRINOX + Gem/Abraxane',
    'FOLFIRINOX, SBRT': 'FOLFIRINOX + SBRT',
    'mFOLFIRINOX, FOLFIRI': 'FOLFIRINOX',
    'FOLFIRONOX, Gemcitabine': 'FOLFIRINOX + Gemcitabine',
    'Folfirinox-based': 'FOLFIRINOX',
    'Gemcitabine/Abraxane': 'Gem/Abraxane',
    'PAXG': 'Abraxane',
    'GEMCITABINA + NAB PACLITAXEL': 'Gem/Abraxane',
    np.nan: 'Untreated'
}

In [None]:
def map_age_to_decade(age):
    """Bin an age into its decade label, e.g. ``67`` or ``"67"`` -> ``"60s"``.

    Parameters
    ----------
    age : anything accepted by ``int()`` (int, float, numeric string), or any other value.

    Returns
    -------
    str
        ``"<decade>s"`` when ``age`` can be converted with ``int()``.
    object
        ``age`` itself, unchanged, when it cannot be converted: non-numeric text, NaN,
        infinity, ``None``, ``pd.NA`` and the like. Callers can then handle missing values.
    """
    try:
        decade = int(age) // 10 * 10
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None / pd.NA;
        # OverflowError: +/- infinity. All are passed through untouched.
        return age
    return f"{decade}s"

In [None]:
# Missing sex and diabetes entries become 'Unknown'.
metadata['Gender/sex'] = metadata['Gender/sex'].replace(np.nan, 'Unknown')
metadata['Diabetes'] = metadata['Diabetes'].replace(np.nan, 'Unknown')
# Ages are binned with map_age_to_decade; entries that are still NaN afterwards become 'Unknown'.
metadata['Age'] = metadata['Age'].apply(map_age_to_decade).replace(np.nan, 'Unknown')
# Capitalise the three listed tissue names; other values are left as they are.
metadata['tissue'] = metadata['tissue'].replace(
    {
        'pancreas': 'Pancreas',
        'duodenum': 'Duodenum',
        'common bile duct': 'Common Bile Duct',
    }
)
# Harmonise the free-text treatment descriptions with treatment_mapping.
metadata['Which treatment/radiation'] = metadata['Which treatment/radiation'].replace(treatment_mapping)

In [None]:
def map_metadata(adata, metadata, field, obs_name, Dataset):
    """Copy one per-patient metadata field into ``adata.obs`` for the cells of one dataset.

    Parameters
    ----------
    adata : object with an ``obs`` DataFrame holding ``ID_harmonised`` and a dataset column.
    metadata : DataFrame whose index is the dataset name, with a ``Patient`` column and ``field``.
    field : name of the ``metadata`` column to read.
    obs_name : name of the ``adata.obs`` column to write (created when it does not exist).
    Dataset : dataset name, matched against the ``metadata`` index and the obs dataset column.

    Cells of the dataset whose patient has no metadata row, or a missing value, receive
    'Unknown'. Cells of other datasets are not modified. ``adata.obs`` is changed in place
    and nothing is returned.
    """
    # Callers pass names taken from Dataset_unique, and the metadata index uses the same
    # names, so cells have to be selected on that column. Selecting on the raw Dataset column
    # misses every cell whose raw label differs from its unique name. Fall back to Dataset
    # when Dataset_unique has not been created.
    dataset_col = 'Dataset_unique' if 'Dataset_unique' in adata.obs.columns else 'Dataset'
    in_dataset = adata.obs[dataset_col] == Dataset

    dataset_rows = metadata[metadata.index == Dataset]
    mapping = dict(zip(dataset_rows.Patient, dataset_rows[field]))

    # Default first; this also creates the column when it is missing.
    adata.obs.loc[in_dataset, obs_name] = 'Unknown'
    mapped = adata.obs.loc[in_dataset, 'ID_harmonised'].map(mapping).astype(object)
    # Keep the default for unmapped patients instead of overwriting it with NaN.
    adata.obs.loc[in_dataset, obs_name] = mapped.where(mapped.notna(), 'Unknown')

In [None]:
# obs column to create -> metadata column it is filled from.
metadata_fields = {
    'Tissue': 'tissue',
    'Age': 'Age',
    'Sex': 'Gender/sex',
    'Diabetes': 'Diabetes',
    'Treatment': 'Which treatment/radiation',
}

for obs_name, field in metadata_fields.items():
    # Fill the column dataset by dataset ...
    for dataset in adata.obs.Dataset_unique.unique():
        map_metadata(adata, metadata, field, obs_name, dataset)
    # ... then label whatever is still missing. The result is assigned back explicitly:
    # `adata.obs.<col>.fillna(..., inplace=True)` acts on a Series obtained by chained access
    # and does not update the DataFrame under pandas copy-on-write.
    adata.obs[obs_name] = adata.obs[obs_name].fillna('Unknown')

# Save object

In [None]:
# Write the processed object to disk with gzip compression.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
adata.write_h5ad('/mnt/storage/Daniele/atlases/human/adata_all_genes_scpoli_final.h5ad', compression='gzip')

# Check metadata

In [None]:
# Reload the object from the file written by the save cell (same path).
adata = sc.read_h5ad('/mnt/storage/Daniele/atlases/human/adata_all_genes_scpoli_final.h5ad')

In [None]:
# obs columns to colour the UMAP by.
# NOTE(review): this rebinds `metadata`, which an earlier cell uses for the patient metadata
# DataFrame; earlier cells re-run after this one will see the list instead.
metadata = ['Tissue', 'Age', 'Sex', 'Diabetes', 'Treatment']

In [None]:
# UMAP coloured by each listed metadata column, three panels per row.
sc.pl.umap(adata, color = metadata, ncols=3, wspace = .3)