In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import os

# read

In [None]:
# Root directory holding the atlas .h5ad files. The original absolute path is
# kept as the default; set the PDAC_ATLAS_DIR environment variable to run the
# notebook on another machine without editing this cell.
base_path = os.environ.get('PDAC_ATLAS_DIR', '/mnt/storage/Daniele/atlases')

In [None]:
# Load the human and mouse atlases from <base_path>/<species>/ as AnnData objects.
human = sc.read_h5ad(os.path.join(base_path, 'human', 'Core_Extension_All_Genes.h5ad'))
mouse = sc.read_h5ad(os.path.join(base_path, 'mouse', '15_mouse_final_integration.h5ad'))

In [None]:
# Directory that receives the obs CSV dumps and holds supplementary_data/.
# The original absolute path is kept as the default; set PDAC_REPO_DIR to
# point the notebook at a different checkout.
base_path_csv = os.environ.get('PDAC_REPO_DIR', '/home/daniele/Code/github_synced/PDAC/')

In [None]:
# Snapshot the untouched obs tables to CSV (index written as the first column).
human.obs.to_csv(os.path.join(base_path_csv, 'human_obs.csv'))
mouse.obs.to_csv(os.path.join(base_path_csv, 'mouse_obs.csv'))

In [None]:
# Reload the obs tables from the CSV snapshots, using the first column as index.
# NOTE(review): the following cell rebinds human_obs / mouse_obs to copies of the
# in-memory .obs, so when the notebook is run top to bottom these CSV-loaded
# frames are discarded. Presumably kept as a shortcut for working without the
# .h5ad files — confirm, or remove.
human_obs = pd.read_csv(os.path.join(base_path_csv, 'human_obs.csv'), index_col=0)
mouse_obs = pd.read_csv(os.path.join(base_path_csv, 'mouse_obs.csv'), index_col=0)

In [None]:
# Work on copies of the obs tables; the AnnData objects only receive the
# harmonised tables when they are assigned back later in the notebook.
human_obs = human.obs.copy()
mouse_obs = mouse.obs.copy()

# Human

### datasets harmonization

In [None]:
# Inspect the raw dataset labels before harmonising them.
human_obs.Dataset.unique()

In [None]:
# Raw Dataset label -> harmonised '<FirstAuthor>_<Year>' label.
# Several raw labels (primary / metastasis / normal / snRNA-seq subsets, or
# accession numbers) collapse onto the same study label.
mapping = {
    'Ding': 'Ding_2022',
    'Ding_snRNA-seq': 'Ding_2022',
    'Lee': 'Lee_2021',
    'Lee_MET_GSE156405': 'Lee_2021',
    'Simeone': 'Simeone_2023',
    'Simeone_MET_GSE205013': 'Simeone_2023',
    'Steele': 'Steele_2020',
    'Steele_Adj_Norm': 'Steele_2020',
    'Peng': 'Peng_2019',
    'Peng_Normal': 'Peng_2019',
    'Lin': 'Lin_2020',
    'Lin_MET_GSE154778': 'Lin_2020',
    'Hwang': 'Hwang_2022',
    'Schlesinger': 'Schlesinger_2020',
    'Caronni': 'Caronni_2023',
    'GSE211644': 'Schalck_2022',
    'GSE158356': 'Kemp_2020',
    'GSE194247': 'Kim_2022',
    'GSE229413': 'Carpenter_2023',
    'phs001840_v1_p1': 'Elyada_2019', 
    'Zhang_GSE197177': 'Zhang_2023',
    'EGAS00001002543': 'Yue_2020'
}

In [None]:
# Apply the dataset label mapping; labels without an entry are left unchanged.
human_obs['Dataset'] = human_obs['Dataset'].replace(mapping)

In [None]:
# Drop the Dataset_unique column now that Dataset holds the harmonised labels.
del human_obs['Dataset_unique']

### ID harmonization

In [None]:
# Single sample identifier: Unique_ID where present, otherwise ID_batch_covariate.
human_obs['Sample_ID'] = human_obs['Unique_ID'].fillna(human_obs['ID_batch_covariate'])

In [None]:
# Identifier columns made redundant by Sample_ID; removed in the next cell.
columns = ['ID_batch_covariate','Unique_ID', 'ID_harmonised', 'Donor_ID', 'ID']

In [None]:
# Remove all superseded identifier columns in one call.
human_obs.drop(columns=columns, inplace=True)

### nans

In [None]:
# Rows with no recorded technology are labelled 'scRNA-seq'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column accessor: that chained in-place form emits a FutureWarning on
# pandas >= 2.2 and silently does nothing once Copy-on-Write is active.
human_obs['Technology'] = human_obs['Technology'].fillna('scRNA-seq')

In [None]:
# Missing ages become the explicit label 'Unknown'.
# Assigned back (not chained inplace=True) so the fill also takes effect under
# pandas Copy-on-Write, where the chained in-place form is a silent no-op.
human_obs['Age'] = human_obs['Age'].fillna('Unknown')

In [None]:
# Missing sex becomes the explicit label 'Unknown'.
# Assigned back (not chained inplace=True) so the fill also takes effect under
# pandas Copy-on-Write, where the chained in-place form is a silent no-op.
human_obs['Sex'] = human_obs['Sex'].fillna('Unknown')

In [None]:
# Missing diabetes status becomes the explicit label 'Unknown'.
# Assigned back (not chained inplace=True) so the fill also takes effect under
# pandas Copy-on-Write, where the chained in-place form is a silent no-op.
human_obs['Diabetes'] = human_obs['Diabetes'].fillna('Unknown')

In [None]:
# Rename Location -> Tissue: pop() removes the old column and returns its
# values, which are then stored under the new name (appended as last column).
human_obs['Tissue'] = human_obs.pop('Location')

### remove superfluous columns

In [None]:
# Inspect the current column set before pruning.
human_obs.columns

In [None]:
# Expose the final level-4 annotation under the shorter name Level_4.
# NOTE(review): the same assignment is repeated a few cells below, where
# Level_4_Final is also dropped; this earlier copy looks redundant.
human_obs['Level_4'] = human_obs['Level_4_Final'].copy()

In [None]:
# Columns to discard: a fixed list plus every column whose name contains
# 'leiden_0.75'.
leiden_075_cols = [name for name in human_obs.columns if 'leiden_0.75' in name]
columns_to_remove = [
    'Barcode',
    'Level_0',
    'Atlas_Extension_CellType',
    'combo',
    'Suspicious_Normal',
]
columns_to_remove.extend(leiden_075_cols)

In [None]:
# Drop the superfluous columns (raises KeyError if any of them is absent).
human_obs = human_obs.drop(columns=columns_to_remove)

In [None]:
# Inspect the columns that remain after pruning.
human_obs.columns

In [None]:
# Move the final level-4 annotation into Level_4 and remove Level_4_Final in
# one step (pop() returns the column and deletes it from the frame).
human_obs['Level_4'] = human_obs.pop('Level_4_Final')

### Treatment

In [None]:
# Inspect the columns before harmonising the treatment fields.
human_obs.columns

In [None]:
# Treatment-related columns present in the human obs table.
# NOTE(review): this list is not referenced again in the visible notebook.
treatment_columns = ['Treatment', 'Treatment_Harmonized', 'TreatmentType', 'Treatment_Category']

In [None]:
# Count missing values in the raw and the previously harmonised treatment columns.
human_obs['Treatment'].isna().sum(), human_obs['Treatment_Harmonized'].isna().sum()

In [None]:
# Frequency of each raw treatment label (missing values are not counted).
human_obs['Treatment'].value_counts()

In [None]:
# Scratch template: one empty-string entry per raw treatment label.
# NOTE(review): immediately overwritten by the hand-written mapping in the next
# cell, so this value is never used when the notebook is run in order.
mapping_treatment = {k:'' for k in human_obs['Treatment'].unique()}
    

In [None]:
# Raw treatment label -> harmonised treatment label.
mapping_treatment = {
    # Clear assignments
    'FOLFIRINOX': 'FOLFIRINOX',
    'Untreated': 'Untreated',
    # 'naïve' appears both correctly encoded and as the mis-decoded (mojibake)
    # form 'na√Øve'; both spellings are mapped so the label is caught either way.
    'Treatment naïve': 'Untreated',
    'Treatment na√Øve': 'Untreated',
    'N_A': 'Unknown',
    np.nan: 'Unknown',
    'FOLFIRINOX + RT': 'FOLFIRINOX + Radiotherapy',
    'FOLFIRINOX + Radiotherapy + Capecitabine/5-FU': 'FOLFIRINOX + Radiotherapy + Capecitabine/5-FU',
    'FOLFIRINOX + Losartan + Radiotherapy + Capecitabine/5-FU': 'FOLFIRINOX + Radiotherapy + Capecitabine/5-FU + Losartan',
    'FOLFIRINOX + SBRT + Nivolumab': 'FOLFIRINOX + Radiotherapy + Nivolumab',
    'FOLFIRINOX + SBRT + Losartan + Nivolumab': 'FOLFIRINOX + Radiotherapy + Nivolumab + Losartan',
    'FOLFIRINOX + Gem/Abraxane': 'FOLFIRINOX + Gemcitabine/Abraxane',

    # Abraxane or Gem/Abraxane
    'Gem/Abraxane': 'Gemcitabine/Abraxane',
    'Abraxane': 'Abraxane',

    # Other or unclear
    'Treatment': 'Other',
    'Other': 'Other'
}

In [None]:
# Split the table into core-atlas rows and everything else (extension rows,
# including any row whose Is_Core is missing); each half is an independent copy.
is_core_row = human_obs['Is_Core'] == 'Core'
_core = human_obs.loc[is_core_row].copy()
_exte = human_obs.loc[~is_core_row].copy()

In [None]:
# Core rows: a missing treatment means 'Untreated'; then harmonise the labels.
# The fill is chained and assigned back instead of fillna(inplace=True) on the
# column accessor: under pandas Copy-on-Write the in-place form is a silent
# no-op, and the NaN entries would then fall through to the np.nan -> 'Unknown'
# entry of mapping_treatment instead of becoming 'Untreated'.
_core['Treatment'] = _core['Treatment'].fillna('Untreated').replace(mapping_treatment)

In [None]:
# Non-core rows: a missing treatment means 'Unknown'; then harmonise the labels.
# Chained and assigned back instead of fillna(inplace=True) on the column
# accessor, which warns on pandas >= 2.2 and is a no-op under Copy-on-Write.
_exte['Treatment'] = _exte['Treatment'].fillna('Unknown').replace(mapping_treatment)


In [None]:
# Stack the harmonised Treatment values of both halves back into one Series
# (core rows first, then the rest; original row labels are kept as index).
treated = pd.concat([_core['Treatment'], _exte['Treatment']])

In [None]:
# Sanity check (displayed, not enforced): value_counts() skips missing values,
# so the sum equals the row count only if every row received a treatment label.
treated.value_counts().sum() == human_obs.shape[0]

In [None]:
# Write the harmonised labels back; pandas aligns `treated` to human_obs by
# row label, so the core-first ordering of the concatenation does not matter.
human_obs['Treatment'] = treated

In [None]:
# Harmonised treatment label -> broad treatment class.
# Written class-first (class -> member regimens) and then inverted into the
# regimen -> class lookup that .replace() needs.
_macro_class_members = {
    'Chemotherapy': [
        'FOLFIRINOX',
        'Gemcitabine/Abraxane',
        'Abraxane',
        'FOLFIRINOX + Gemcitabine/Abraxane',
    ],
    'Untreated': ['Untreated'],
    'Chemotherapy + Radiotherapy': [
        'FOLFIRINOX + Radiotherapy',
        'FOLFIRINOX + Radiotherapy + Capecitabine/5-FU',
        'FOLFIRINOX + Radiotherapy + Capecitabine/5-FU + Losartan',
    ],
    'Chemotherapy + Radiotherapy + Immunotherapy': [
        'FOLFIRINOX + Radiotherapy + Nivolumab',
        'FOLFIRINOX + Radiotherapy + Nivolumab + Losartan',
    ],
    'Other': ['Other'],
    'Unknown': ['Unknown'],
}
treatment_macro_mapping = {
    regimen: macro_class
    for macro_class, regimens in _macro_class_members.items()
    for regimen in regimens
}

In [None]:
# Derive the broad treatment class from the harmonised treatment label;
# labels without an entry in the mapping pass through unchanged.
human_obs['TreatmentType'] = human_obs['Treatment'].replace(treatment_macro_mapping)

In [None]:
# Broad treatment class -> treated / untreated / unknown status.
# Every class that involves any therapy collapses to 'Treated'.
treatment_status_mapping = {
    **dict.fromkeys(
        [
            'Chemotherapy',
            'Chemotherapy + Radiotherapy',
            'Other',
            'Chemotherapy + Radiotherapy + Immunotherapy',
        ],
        'Treated',
    ),
    'Untreated': 'Untreated',
    'Unknown': 'Unknown',
}

In [None]:
# Derive the treated/untreated status from the broad treatment class.
human_obs['TreatmentStatus'] = human_obs['TreatmentType'].replace(treatment_status_mapping)

In [None]:
# Older treatment columns superseded by Treatment / TreatmentType / TreatmentStatus.
columns_to_remove = ['Treatment_Harmonized', 'Treatment_Category', 'treatment_status']
human_obs = human_obs.drop(columns=columns_to_remove)

In [None]:
# Inspect the columns before fixing their final order.
human_obs.columns

In [None]:
# Final human column order, assembled from four thematic groups.

# 1. Sample-level metadata
_metadata_cols = [
    'Sample_ID', 'Condition', 'Treatment', 'TreatmentType', 'TreatmentStatus',
    'Tissue', 'Age', 'Sex', 'Diabetes',
    'Is_Core', 'Dataset', 'Technology',
]

# 2. Cell-type annotation levels
_annotation_cols = ['Level_1', 'Level_2', 'Level_3', 'Level_4']

# 3. QC and technical metrics
_qc_cols = [
    'n_genes', 'n_counts', 'log_counts', 'mt_frac', 'n_genes_by_counts',
    'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
    'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito',
    'is_outlier_total_counts', 'outlier', 'infercnv_score_malignant',
    'infercnv_score_malignant_refined', 'cnv_score_abs', 'MALAT1_lognorm',
    'empty_droplet', 'EMT category', 'EMT score', 'EMT_score_DL',
    '_scvi_batch', '_scvi_labels', 'batch',
]

# 4. Miscellaneous clustering results
_misc_cols = [
    'Global_Leiden', 'leiden', 'leiden_0.2', 'leiden_0.2_annotation', 'leiden_subcluster',
    'level0_leiden_subcluster', 'leiden_0.5',
]

columns_order = _metadata_cols + _annotation_cols + _qc_cols + _misc_cols

In [None]:
# Keep exactly the listed columns in the listed order. Columns not in
# columns_order are dropped; a listed column that is missing raises KeyError.
human_obs = human_obs[columns_order]

# Mouse

### dataset and sample ID mapping

In [None]:
# Rows with no Sample_unique value need their sample/dataset fields rebuilt;
# work on a copy so the edits can be checked before being written back.
to_resolve = mouse_obs[mouse_obs.Sample_unique.isna()].copy()

In [None]:
# Inspect the rows that need resolving.
to_resolve

In [None]:
# The donor id is whatever follows the first 19 characters of the row label.
# NOTE(review): 19 presumably is the length of a fixed barcode prefix — confirm.
label_prefix_len = 19
to_resolve['donor_id'] = [row_label[label_prefix_len:] for row_label in to_resolve.index]

In [None]:
# The sample name is the part of donor_id before its first '-'
# (the whole donor_id when it contains no '-').
to_resolve['Sample'] = [donor.partition('-')[0] for donor in to_resolve['donor_id']]

In [None]:
# Every unresolved row is assigned to the Donahue_2024 public dataset.
to_resolve['Dataset'] = 'Donahue_2024'
to_resolve['provenance'] = 'public_datasets'

In [None]:
# Write the resolved rows back using the same missing-Sample_unique mask that
# selected them.
# NOTE(review): assumes 'donor_id', 'Sample', 'Dataset' and 'provenance' already
# exist as columns of mouse_obs and accept the new values — confirm.
mouse_obs[mouse_obs.Sample_unique.isna()] = to_resolve

In [None]:
# Number of rows still lacking a Sample value after the write-back.
mouse_obs['Sample'].isna().sum()

In [None]:
# Number of distinct Sample values (unique() includes NaN as a value if present).
len(mouse_obs['Sample'].unique())

### match human column names

In [None]:
# Reference: the human column names the mouse table should be aligned to.
human_obs.columns

In [None]:
# Mouse obs column -> harmonised column name.
mouse_to_human_column_mapping = {
    # Renamed to the column names used in the human table
    'Sample': 'Sample_ID',
    'disease': 'Condition',
    'treatment': 'Treatment',
    'Dataset': 'Dataset',
    'technology': 'Technology',
    'sex': 'Sex',
    # The remaining targets do not appear in the human columns_order list;
    # they are only capitalised to follow the same naming style.
    'model': 'Model',
    'barcoded': 'Barcoded',
    'strain': 'Strain',
    'genotype': 'Genotype',
    'cell_filtering': 'Cell_filtering',
    'larry_positive': 'Larry_positive',
    'provenance': 'Provenance',
}
# Keys that are not present as columns are ignored by rename().
mouse_obs = mouse_obs.rename(columns=mouse_to_human_column_mapping)



In [None]:
# Independent copy of the rows that come from public datasets.
mouse_obs_public = mouse_obs[mouse_obs['Provenance'] == 'public_datasets'].copy()

### public metadata

In [None]:
# Curated per-mouse metadata for the public datasets ('metadata_final' sheet).
metadata_public = pd.read_excel(os.path.join(base_path_csv, 'supplementary_data', 'mouse', 'mouse_public_metadata.xlsx'), sheet_name='metadata_final')

In [None]:
# Distinct, non-missing identifiers on each side of the join:
# mouse_ids come from the metadata sheet, sample_ids from the obs table.
mouse_ids = metadata_public['Mouse_id'].dropna().unique()
sample_ids = mouse_obs_public['Sample_ID'].dropna().unique()

In [None]:
from rapidfuzz import process, fuzz

# Fuzzy-match every obs Sample_ID to its closest metadata Mouse_id, then
# rewrite the metadata ids so they equal the obs Sample_IDs.
matches = []
for sample_id in sample_ids:
    best = process.extractOne(
        sample_id,
        mouse_ids,
        scorer=fuzz.WRatio
    )
    # extractOne returns None when there is nothing to choose from; skip the
    # sample instead of failing on tuple unpacking.
    if best is None:
        continue
    match, score, _ = best
    matches.append((sample_id, match, score))

# Several samples can pick the same metadata id. Insert in ascending score
# order so the highest-scoring sample is written last and therefore wins,
# rather than whichever sample happened to be iterated last. sorted() is
# stable, so ties resolve exactly as before.
match_dict = {
    match: sample
    for sample, match, score in sorted(matches, key=lambda item: item[2])
}
metadata_public['Mouse_id'] = metadata_public['Mouse_id'].replace(match_dict)


In [None]:
# For each harmonised obs column, build a Mouse_id -> value lookup from the
# corresponding column of the public metadata sheet.
_public_source_columns = {
    'Genotype': 'genotype',
    'Condition': 'Healthy/Cancer tissue',
    'Treatment': 'Which treatment/radiation',
    'Sex': 'Gender/sex',
    'Strain': 'mouse background',
    'Model': 'Model *orthovsgemm',
}
metadata_public_maps = {
    obs_column: dict(zip(metadata_public['Mouse_id'], metadata_public[sheet_column]))
    for obs_column, sheet_column in _public_source_columns.items()
}

In [None]:
# Fill each metadata-derived column by looking up the row's Sample_ID.
# .map() is used instead of .replace(): replace() leaves values without a
# dictionary entry untouched, so an unmatched Sample_ID would be copied
# verbatim into Genotype / Condition / Treatment / ... ; map() yields NaN for
# those rows, which the later harmonisation steps can handle explicitly.
# The object cast makes the lookup independent of whether Sample_ID is stored
# as a categorical.
for col_name, id_to_value in metadata_public_maps.items():
    mouse_obs_public[col_name] = mouse_obs_public['Sample_ID'].astype(object).map(id_to_value)

In [None]:
# Write the enriched public rows back into the full table, then set Tissue to
# 'Pancreas' for every mouse row (public and in-house alike).
mouse_obs[mouse_obs['Provenance'] == 'public_datasets'] = mouse_obs_public
mouse_obs['Tissue'] = 'Pancreas'

### in house metadata

In [None]:
# Metadata sheet for the in-house samples ('scRNA-seq' sheet).
metadata_house = pd.read_excel(os.path.join(base_path_csv, 'supplementary_data', 'mouse', 'metadata_inhouse.xlsx'), sheet_name='scRNA-seq')

In [None]:
# Keep only the samples flagged 'yes' in the 'include atlas' column.
metadata_house = metadata_house[metadata_house['include atlas'] == 'yes'].copy()

In [None]:
# Independent copy of the rows that come from in-house datasets.
mouse_obs_inhouse = mouse_obs[mouse_obs['Provenance'] == 'in_house_datasets'].copy()

In [None]:
# Distinct, non-missing identifiers on each side, normalised to plain strings:
# mouse_ids from the metadata sheet, sample_ids from the obs 'sampleID' column.
mouse_ids = list(map(str, metadata_house['sample ID'].dropna().unique()))
sample_ids = list(map(str, mouse_obs_inhouse['sampleID'].dropna().unique()))

In [None]:
# Number of distinct in-house sample ids to be matched.
len(sample_ids)

In [None]:
# Fuzzy-match every in-house obs sample id to its closest metadata sample id,
# then rewrite the metadata ids so they equal the obs ids.
matches = []
for sample_id in sample_ids:
    best = process.extractOne(
        sample_id,
        mouse_ids,
        scorer=fuzz.WRatio
    )
    # extractOne returns None when there is nothing to choose from; skip the
    # sample instead of failing on tuple unpacking.
    if best is None:
        continue
    match, score, _ = best
    matches.append((sample_id, match, score))

# When several samples pick the same metadata id, the highest-scoring one wins
# (ascending insertion order; sorted() is stable so ties resolve as before).
match_dict = {
    match: sample
    for sample, match, score in sorted(matches, key=lambda item: item[2])
}

# The candidate ids were stringified before matching, so the keys of
# match_dict are strings. Compare against the string form of the sheet values
# as well; otherwise numeric ids in the sheet would never be replaced.
# Missing values are left as they are.
sample_id_text = metadata_house['sample ID'].map(
    lambda value: value if pd.isna(value) else str(value)
)
metadata_house['sample ID'] = sample_id_text.replace(match_dict)


In [None]:
# Inspect which metadata fields are available for the in-house samples.
metadata_house.columns

In [None]:
# Per-column lookups from in-house sample id to metadata value; only the
# genotype (column 'genotype_common') is transferred.
metadata_house_maps = {
    'Genotype': dict(zip(metadata_house['sample ID'], metadata_house['genotype_common'])),
}

In [None]:
# Fill each metadata-derived column by looking up the row's Sample_ID.
# .map() is used instead of .replace(): replace() leaves values without a
# dictionary entry untouched, so an unmatched Sample_ID would end up as the
# "genotype" and the isna() check below could never flag it; map() yields NaN
# for those rows instead.
# NOTE(review): the fuzzy matching above was keyed on the obs column
# 'sampleID', while this lookup uses 'Sample_ID' — confirm the two columns
# hold the same identifiers for in-house rows.
for col_name, id_to_value in metadata_house_maps.items():
    mouse_obs_inhouse[col_name] = mouse_obs_inhouse['Sample_ID'].astype(object).map(id_to_value)

In [None]:
# Number of in-house rows whose Genotype is missing after the lookup.
mouse_obs_inhouse['Genotype'].isna().sum()

In [None]:
# Copy the in-house genotypes back into the full table.
# The target rows are selected with the same condition that produced
# mouse_obs_inhouse (Provenance == 'in_house_datasets'). Selecting
# "not public" instead would also include rows with any other or missing
# Provenance, making the positional assignment fail on a length mismatch.
in_house_rows = mouse_obs['Provenance'] == 'in_house_datasets'
mouse_obs.loc[in_house_rows, 'Genotype'] = mouse_obs_inhouse['Genotype'].values


In [None]:
# Rows that still have no genotype get the explicit label 'Unknown'.
# Assigned back (not chained inplace=True) so the fill also takes effect under
# pandas Copy-on-Write, where the chained in-place form is a silent no-op.
mouse_obs['Genotype'] = mouse_obs['Genotype'].fillna('Unknown')

## harmonize fields

### condition

In [None]:
# Compare the condition vocabularies of the two species side by side.
mouse_obs.Condition.unique(), human_obs.Condition.unique()

In [None]:
# Mouse condition label -> label applied to the mouse Condition column.
mapping_condition = {
    'PDAC': 'Primary Tumour',
    'Cancer': 'Primary Tumour',
    'Normal': 'Healthy'
}

In [None]:
# Harmonise the mouse condition labels; unmapped labels pass through unchanged.
mouse_obs['Condition'] = mouse_obs['Condition'].replace(mapping_condition)

### treatment

In [None]:
# Inspect the mouse columns before harmonising treatment.
mouse_obs.columns

In [None]:
# Compare the treatment vocabularies of the two species side by side.
mouse_obs.Treatment.unique(), human_obs.Treatment.unique()

In [None]:
# Raw mouse treatment label -> harmonised treatment label.
# Control / vehicle arms and missing values are all treated as 'Untreated'.
treatment_mapping = {
    'ctrl': 'Untreated',
    'Control': 'Untreated',
    np.nan: 'Untreated',
    'ganciclovir': 'Ganciclovir',
    'T/N': 'Trametinib + Nintedanib',
    'AVID200': 'AVID200',
    'vehicle control': 'Untreated',
    'T/N+a-PDL1': 'Trametinib + Nintedanib + Anti-PD-L1',
    'D-CAN': 'D-CAN',
    'SUMOi TAK-981 (Subasumstat)': 'Subasumstat',
    'Vehicle (Cremophor)': 'Untreated',
    'G1-D265A + AVID200': 'Anti-PD-1 + AVID200',
    'G1-D265A': 'Anti-PD-1'
    }

In [None]:
# Harmonise the mouse treatment labels; unmapped labels pass through unchanged.
mouse_obs['Treatment'] = mouse_obs['Treatment'].replace(treatment_mapping)

In [None]:
# Harmonised mouse treatment label -> broad treatment class.
# Written class-first (class -> member treatments) and then inverted into the
# treatment -> class lookup that .replace() needs.
_category_members = {
    'Untreated': ['Untreated'],
    'Chemotherapy': [
        'Ganciclovir',
        'Trametinib + Nintedanib',
        'AVID200',
        'D-CAN',
        'Subasumstat',
    ],
    'Chemotherapy + Immunotherapy': [
        'Trametinib + Nintedanib + Anti-PD-L1',
        'Anti-PD-1 + AVID200',
    ],
    'Immunotherapy': ['Anti-PD-1'],
}
treatment_categories_mapping = {
    treatment: category
    for category, treatments in _category_members.items()
    for treatment in treatments
}

In [None]:
# Derive the broad treatment class from the harmonised mouse treatment label.
mouse_obs['TreatmentType'] = mouse_obs['Treatment'].replace(treatment_categories_mapping)

In [None]:
# Check which treatment classes resulted; anything outside the mapping's
# values indicates an unmapped treatment label.
mouse_obs['TreatmentType'].unique()

In [None]:
# Broad mouse treatment class -> treated / untreated status.
# Rebinds the name used earlier for the human status mapping.
treatment_status_mapping = {
    'Untreated': 'Untreated',
    'Chemotherapy': 'Treated',
    'Chemotherapy + Immunotherapy': 'Treated',
    'Immunotherapy': 'Treated'
}

In [None]:
# Derive the treated/untreated status from the broad treatment class.
mouse_obs['TreatmentStatus'] = mouse_obs['TreatmentType'].replace(treatment_status_mapping)

### Sex

In [None]:
# Raw mouse sex label -> 'Female' / 'Male' / 'Unknown' (missing values).
sex_mapping = {
    'F': 'Female',
    'female': 'Female',
    'male': 'Male',
    np.nan: 'Unknown'
}

In [None]:
# Harmonise the mouse sex labels.
# Assigned back instead of replace(inplace=True) on the column accessor: that
# chained in-place form warns on pandas >= 2.2 and is a silent no-op under
# Copy-on-Write, which would leave the Sex column unharmonised.
mouse_obs['Sex'] = mouse_obs['Sex'].replace(sex_mapping)

In [None]:
# Inspect the mouse columns after harmonising the fields.
mouse_obs.columns

In [None]:
# Number of mouse rows without a Dataset label.
mouse_obs.Dataset.isna().sum()

In [None]:
# Human columns, for side-by-side comparison with the mouse columns below.
human_obs.columns

In [None]:
# Mouse columns, for side-by-side comparison with the human columns above.
mouse_obs.columns

### remove redundant columns 

In [None]:
# Promote each '*_final' annotation to the plain Level_N name: pop() removes
# the source column and returns its values, which are stored under the target.
for final_col, level_col in (
    ('Level_1_final', 'Level_1'),
    ('Level_2_final', 'Level_2'),
    ('Level_3_final', 'Level_3'),
    ('Level_4_final', 'Level_4'),
):
    mouse_obs[level_col] = mouse_obs.pop(final_col)

# Intermediate annotation columns that are no longer needed.
for stale_col in (
    'Level_1_refined',
    'Level_4_knn',
    'Level_3_knn',
    'Level_1_label_transfer',
):
    del mouse_obs[stale_col]

In [None]:
# Inspect the mouse columns after promoting the final annotation levels.
mouse_obs.columns

In [None]:
# Helper / bookkeeping columns that are not part of the harmonised table.
columns_to_remove = ['donor_id','sampleID', 'backup', 'Sample_unique', 'Level_0']
mouse_obs = mouse_obs.drop(columns=columns_to_remove)


# Check unique values

In [None]:
# For every column present in both tables with a text-like dtype on both
# sides, report the values that occur in only one of the two species.
common_cols = set(human_obs.columns) & set(mouse_obs.columns)
for col in common_cols:
    dtype_names = (human_obs[col].dtype.name, mouse_obs[col].dtype.name)
    if not all(name in ['object', 'category'] for name in dtype_names):
        continue  # numeric / other columns are not compared

    human_vals = set(human_obs[col].dropna().unique())
    mouse_vals = set(mouse_obs[col].dropna().unique())
    only_in_human = human_vals - mouse_vals
    only_in_mouse = mouse_vals - human_vals
    if not (only_in_human or only_in_mouse):
        continue  # identical vocabularies, nothing to report

    print(f"Column: {col}")
    for message, extra_values in (
        ("  Only in human_obs:", only_in_human),
        ("  Only in mouse_obs:", only_in_mouse),
    ):
        if extra_values:
            print(message, extra_values)

# modify anndatas

In [None]:
# Guard: the harmonised table must have exactly the same row labels, in the
# same order, as the AnnData it is about to replace.
if not human.obs.index.equals(human_obs.index):
    raise ValueError("The index order of adata.obs and human_obs do not match.")


In [None]:
# Guard: the harmonised mouse table must have exactly the same row labels, in
# the same order, as the mouse AnnData it is about to replace.
if not mouse.obs.index.equals(mouse_obs.index):
    # The message names mouse_obs (it previously said human_obs, copied from
    # the human check, which pointed at the wrong table when this fired).
    raise ValueError("The index order of adata.obs and mouse_obs do not match.")


In [None]:
# Install copies of the harmonised tables as the AnnData obs.
human.obs = human_obs.copy()
mouse.obs = mouse_obs.copy()

### reorder columns

In [None]:
# Lower-case substrings that mark a column as QC / technical / clustering output.
qc_keywords = [
    'n_genes', 'n_counts', 'log_counts', 'log1p_', 'total_counts', 'pct_counts',
    'mt_frac', 'is_outlier', 'outlier', 'empty_droplet', 'score', 'cnv',
    'malat', '_scvi', 'leiden', 'batch', 'filtering'
]

def is_qc(col):
    """Return True if the lower-cased column name contains any QC keyword."""
    lowered = col.lower()
    for keyword in qc_keywords:
        if keyword in lowered:
            return True
    return False

human_cols = human.obs.columns
mouse_cols = mouse.obs.columns

# QC / misc columns of each species, in their current order
human_qc = list(filter(is_qc, human_cols))
mouse_qc = list(filter(is_qc, mouse_cols))

# Non-QC (metadata) columns of each species, in their current order
_human_meta = [name for name in human_cols if not is_qc(name)]
_mouse_meta = [name for name in mouse_cols if not is_qc(name)]

# Metadata present in both species (exact name match), in human order
shared_cols = [name for name in _human_meta if name in mouse_cols]

# Metadata found in only one species
human_only = [name for name in _human_meta if name not in mouse_cols]
mouse_only = [name for name in _mouse_meta if name not in human_cols]

# Final layout: shared metadata, species-specific metadata, then QC / misc
human_order = shared_cols + human_only + human_qc
mouse_order = shared_cols + mouse_only + mouse_qc

In [None]:
# Apply the computed layouts; both start with the same shared columns.
human.obs = human.obs[human_order]
mouse.obs = mouse.obs[mouse_order]

### move qc to obsm

In [None]:
# Table shape and number of QC columns about to be moved out of human.obs.
human.obs.shape, len(human_qc)

In [None]:
# Table shape and number of QC columns about to be moved out of mouse.obs.
mouse.obs.shape, len(mouse_qc)

In [None]:
# Move every QC column out of obs and into obsm, one obsm entry per column,
# keyed by the column name and holding that column's values.
# NOTE(review): each entry is a one-dimensional array; confirm the installed
# anndata version accepts 1-D values in obsm and writes them to .h5ad.
for col in human_qc:
    human.obsm[col] = human.obs[col].values
    del human.obs[col]
for col in mouse_qc:
    mouse.obsm[col] = mouse.obs[col].values
    del mouse.obs[col]

### fix human var

In [None]:
# Strip everything from the first '-0' onwards in each var column name.
# partition() returns the whole name when '-0' is absent; the previous
# slice-by-find() used index -1 in that case and silently chopped off the
# last character of such names.
column_names = [name.partition('-0')[0] for name in human.var.columns]

In [None]:
# Length check before assigning the new names.
# NOTE(review): column_names is built with one entry per var column, so this
# is True by construction; it does not detect duplicate names after stripping.
len(column_names) == human.var.shape[1]

In [None]:
# Install the cleaned names as the var column labels.
human.var.columns = column_names

In [None]:
def clean_obs(obs_df):
    """Convert every column of ``obs_df`` to a categorical of strings.

    Each non-missing value is replaced by its ``str()`` form and the column is
    stored with ``category`` dtype. Missing values stay missing: they are not
    turned into a literal ``'nan'`` / ``'None'`` category, which is what a
    plain ``astype(str)`` would do.

    Parameters
    ----------
    obs_df : pandas.DataFrame
        Table to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``obs_df`` object, for convenient reassignment.
    """
    for col in obs_df.columns:
        series = obs_df[col]
        # Stringify, then blank out the positions that were missing originally
        # so they remain NaN in the resulting categorical.
        as_text = series.astype(str).where(series.notna())
        obs_df[col] = as_text.astype('category')
    return obs_df

In [None]:
def clean_obsm(adata):
    """Stringify object-typed content stored in ``adata.obsm``.

    DataFrame entries have each object-dtype column converted to ``str``;
    other entries whose ``dtype`` equals ``'object'`` are converted as a whole.
    A message is printed for every conversion. ``adata`` is modified in place
    and also returned.
    """
    # Snapshot the items first because entries are reassigned while looping.
    for key, val in list(adata.obsm.items()):
        if isinstance(val, pd.DataFrame):
            object_columns = [name for name in val.columns if val[name].dtype == 'object']
            for name in object_columns:
                print(f"Converting '{key}.{name}' to string")
                val[name] = val[name].astype(str)
            adata.obsm[key] = val
            continue

        if getattr(val, 'dtype', None) == 'object':
            print(f"Converting '{key}' to string array")
            adata.obsm[key] = val.astype(str)

    return adata


In [None]:
# Convert every remaining obs column of both atlases with clean_obs().
human.obs = clean_obs(human.obs)
mouse.obs = clean_obs(mouse.obs)

In [None]:
# Stringify object-typed obsm entries of both atlases with clean_obsm().
human = clean_obsm(human)
mouse = clean_obsm(mouse)

In [None]:
# Write the harmonised atlases next to the input files under base_path.
human.write_h5ad(os.path.join(base_path, 'human', 'Human_Atlas_Harmonised.h5ad'))
mouse.write_h5ad(os.path.join(base_path, 'mouse', 'Mouse_Atlas_Harmonised.h5ad'))


In [None]:
# Independent copy of the human atlas restricted to rows with Is_Core == 'Core'.
human_core = human[human.obs['Is_Core'] == 'Core'].copy()

In [None]:
# Write the core-only human atlas alongside the full harmonised file.
human_core.write_h5ad(os.path.join(base_path, 'human', 'Human_Atlas_Harmonised_Core.h5ad'))
