# Overlaying pathways of interest on DC from my atlas - using only primary cancer samples

### import package and set directory

In [1]:
import os
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#show current directory 
os.getcwd()
os.chdir('/scratch/user/s4436039/scdata/Python_Integration_Sep')
os.getcwd()

'/scratch/user/s4436039/scdata/Python_Integration_Sep'

# Read in the data

In [3]:
# read in data
data = sc.read_h5ad('NRclean_clustered2.h5ad')

In [4]:
data_P = data[data.obs["sample_type_major2"] == "primary tumour"]

data_DC1 = data_P[data_P.obs["NR_annotations_simple"] == "cDC1"]
data_DC2 = data_P[data_P.obs["NR_annotations_simple"] == "cDC2"]
data_mregDC = data_P[data_P.obs["NR_annotations_simple"] == "mregDC"]

# Import KEGG sets

In [5]:
os.chdir('/scratch/user/s4436039/scdata/Pathway-sets')
K_sets = pd.read_excel("kegg_sets.xlsx", sheet_name=None)
os.chdir('/scratch/user/s4436039/scdata/Python_Integration_Sep')

In [6]:
# Print the sheet names
print(K_sets.keys())

dict_keys(['Sheet1', 'Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6', 'Sheet7', 'Sheet8', 'Sheet9', 'Sheet10', 'Sheet11', 'Sheet12', 'Sheet13', 'Sheet14', 'Sheet15', 'Sheet16', 'Sheet17', 'Sheet18', 'Sheet19', 'Sheet20', 'Sheet21', 'Sheet22', 'Sheet23', 'Sheet24', 'Sheet25', 'Sheet26', 'Sheet27', 'Sheet28', 'Sheet29', 'Sheet30', 'Sheet31', 'Sheet32', 'Sheet33', 'Sheet34', 'Sheet35', 'Sheet36', 'Sheet37', 'Sheet38', 'Sheet39', 'Sheet40', 'Sheet41', 'Sheet42', 'Sheet43', 'Sheet44', 'Sheet45', 'Sheet46', 'Sheet47', 'Sheet48', 'Sheet49', 'Sheet50', 'Sheet51', 'Sheet52', 'Sheet53', 'Sheet54', 'Sheet55', 'Sheet56', 'Sheet57', 'Sheet58', 'Sheet59', 'Sheet60', 'Sheet61', 'Sheet62', 'Sheet63', 'Sheet64', 'Sheet65', 'Sheet66', 'Sheet67', 'Sheet68', 'Sheet69', 'Sheet70', 'Sheet71', 'Sheet72', 'Sheet73', 'Sheet74', 'Sheet75', 'Sheet76', 'Sheet77', 'Sheet78', 'Sheet79', 'Sheet80', 'Sheet81', 'Sheet82', 'Sheet83', 'Sheet84', 'Sheet85', 'Sheet86', 'Sheet87', 'Sheet88', 'Sheet89', 'Sheet90', 'Sheet91

In [7]:
# Change keys to be gene set name (the first value in the 'gs_name' column) instead of Sheet1 etc 
K_sets2 = {df["gs_name"].iloc[0]: df for _, df in K_sets.items()}
K_sets = K_sets2

# Result
print(K_sets.keys())

dict_keys(['KEGG_ABC_TRANSPORTERS', 'KEGG_ACUTE_MYELOID_LEUKEMIA', 'KEGG_ADHERENS_JUNCTION', 'KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY', 'KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM', 'KEGG_ALDOSTERONE_REGULATED_SODIUM_REABSORPTION', 'KEGG_ALLOGRAFT_REJECTION', 'KEGG_ALPHA_LINOLENIC_ACID_METABOLISM', 'KEGG_ALZHEIMERS_DISEASE', 'KEGG_AMINO_SUGAR_AND_NUCLEOTIDE_SUGAR_METABOLISM', 'KEGG_AMINOACYL_TRNA_BIOSYNTHESIS', 'KEGG_AMYOTROPHIC_LATERAL_SCLEROSIS_ALS', 'KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION', 'KEGG_APOPTOSIS', 'KEGG_ARACHIDONIC_ACID_METABOLISM', 'KEGG_ARGININE_AND_PROLINE_METABOLISM', 'KEGG_ARRHYTHMOGENIC_RIGHT_VENTRICULAR_CARDIOMYOPATHY_ARVC', 'KEGG_ASCORBATE_AND_ALDARATE_METABOLISM', 'KEGG_ASTHMA', 'KEGG_AUTOIMMUNE_THYROID_DISEASE', 'KEGG_AXON_GUIDANCE', 'KEGG_B_CELL_RECEPTOR_SIGNALING_PATHWAY', 'KEGG_BASAL_CELL_CARCINOMA', 'KEGG_BASAL_TRANSCRIPTION_FACTORS', 'KEGG_BASE_EXCISION_REPAIR', 'KEGG_BETA_ALANINE_METABOLISM', 'KEGG_BIOSYNTHESIS_OF_UNSATURATED_FATTY_ACIDS', 'KEGG_BLADDE

In [8]:
# Keys to include in new dictionary
keys_include = ["KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY",
"KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION",
"KEGG_APOPTOSIS",
"KEGG_CALCIUM_SIGNALING_PATHWAY",
"KEGG_CELL_ADHESION_MOLECULES_CAMS",
"KEGG_CHEMOKINE_SIGNALING_PATHWAY",
"KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION",
"KEGG_CYTOSOLIC_DNA_SENSING_PATHWAY",
"KEGG_ENDOCYTOSIS",
"KEGG_FATTY_ACID_METABOLISM",
"KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS",
"KEGG_GLYCOLYSIS_GLUCONEOGENESIS",
"KEGG_JAK_STAT_SIGNALING_PATHWAY",
"KEGG_MAPK_SIGNALING_PATHWAY",
"KEGG_NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY",
"KEGG_NOTCH_SIGNALING_PATHWAY",
"KEGG_OXIDATIVE_PHOSPHORYLATION",
"KEGG_PROTEASOME",
"KEGG_PROTEIN_EXPORT",
"KEGG_TGF_BETA_SIGNALING_PATHWAY",
"KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY",
"KEGG_TYROSINE_METABOLISM",
"KEGG_UBIQUITIN_MEDIATED_PROTEOLYSIS",
"KEGG_VEGF_SIGNALING_PATHWAY"]

In [9]:
# create a new dictionary with select keys (pathway sets)
K_sets_sub = {key: K_sets[key] for key in keys_include if key in K_sets}

In [10]:
print(K_sets_sub.keys())

dict_keys(['KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY', 'KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION', 'KEGG_APOPTOSIS', 'KEGG_CALCIUM_SIGNALING_PATHWAY', 'KEGG_CELL_ADHESION_MOLECULES_CAMS', 'KEGG_CHEMOKINE_SIGNALING_PATHWAY', 'KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION', 'KEGG_CYTOSOLIC_DNA_SENSING_PATHWAY', 'KEGG_ENDOCYTOSIS', 'KEGG_FATTY_ACID_METABOLISM', 'KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS', 'KEGG_GLYCOLYSIS_GLUCONEOGENESIS', 'KEGG_JAK_STAT_SIGNALING_PATHWAY', 'KEGG_MAPK_SIGNALING_PATHWAY', 'KEGG_NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY', 'KEGG_NOTCH_SIGNALING_PATHWAY', 'KEGG_OXIDATIVE_PHOSPHORYLATION', 'KEGG_PROTEASOME', 'KEGG_PROTEIN_EXPORT', 'KEGG_TGF_BETA_SIGNALING_PATHWAY', 'KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY', 'KEGG_TYROSINE_METABOLISM', 'KEGG_UBIQUITIN_MEDIATED_PROTEOLYSIS', 'KEGG_VEGF_SIGNALING_PATHWAY'])


In [11]:
custom_order = ['HGSOC','BC','CRC','GAC','GBM','HCC','HNSCC','MEL','NPC','NSCLC','PDAC','iCCA']

In [13]:
#set current directory 
os.chdir('/scratch/user/s4436039/scdata/Pathway4Prism')
os.getcwd()

'/scratch/user/s4436039/scdata/Pathway4Prism'

# loop through all pathways and export an excel containing scores per sample, grouped by cancer type, for each DC subset

In [14]:
for set_name, gene_set_df in K_sets_sub.items():

    gene_set = gene_set_df['human_gene_symbol'].tolist()

    #score DC for gene set:
    sc.tl.score_genes(data_DC1, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_DC2, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_mregDC, gene_list=gene_set, score_name=f"{set_name}_score")

    score_column = f"{set_name}_score"

    #DC1: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_DC1 = score_DC1.merge(
        data_DC1.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
    pivoted_df_DC1 = score_DC1.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC1 = pivoted_df_DC1.reindex(custom_order)
    pivoted_df_DC1.shape

    #DC2: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_DC2 = score_DC2.merge(
        data_DC2.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
    pivoted_df_DC2 = score_DC2.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC2 = pivoted_df_DC2.reindex(custom_order)
    pivoted_df_DC2.shape

    #mregDC:
    # Group by 'integration_id' and calculate the mean score for each group
    score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_mregDC = score_mregDC.merge(
        data_mregDC.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()
    pivoted_df_mregDC = score_mregDC.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_mregDC = pivoted_df_mregDC.reindex(custom_order)
    pivoted_df_mregDC.shape

    save_name = f"{set_name}.xlsx"

    with pd.ExcelWriter(save_name) as writer:
        pivoted_df_DC1.to_excel(writer, sheet_name='DC1')
        pivoted_df_DC2.to_excel(writer, sheet_name='DC2')
        pivoted_df_mregDC.to_excel(writer, sheet_name='mregDC')


       'PRKAG3'],
      dtype='object')


       'PRKAG3'],
      dtype='object')


  adata.obs[score_name] = pd.Series(


       'PRKAG3'],
      dtype='object')


  adata.obs[score_name] = pd.Series(
  adata.obs[score_name] = pd.Series(
  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'KIR2DL2', 'KIR2DL5A', 'KIR2DS1', 'KIR2DS3', 'KIR2DS4',
       'KIR2DS5'],
      dtype='object')
       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'KIR2DL2', 'KIR2DL5A', 'KIR2DS1', 'KIR2DS3', 'KIR2DS4',
       'KIR2DS5'],
      dtype='object')
       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'KIR2DL2', 'KIR2DL5A', 'KIR2DS1', 'KIR2DS3', 'KIR2DS4',
       'KIR2DS5'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'AVPR1B', 'BDKRB1', 'CACNA1B', 'CACNA1E', 'CACNA1S', 'CAMK2A', 'CCKAR',
       'CCKBR', 'CHP2', 'CHRM1', 'CHRM2', 'CHRM5', 'DRD1', 'DRD5', 'GRM1',
       'GRM5', 'GRPR', 'HTR2A', 'HTR2C', 'HTR4', 'HTR5A', 'HTR6', 'LHCGR',
       'MYLK3', 'P2RX3', 'PLCZ1', 'PLN', 'PPP3R2', 'PRKACG', 'PTGFR',
       'SLC25A31', 'TACR3', 'TRHR', 'VDAC2P5'],
      dtype='object')
       'AVPR1B', 'BDKRB1', 'CACNA1B', 'CACNA1E', 'CACNA1S', 'CAMK2A', 'CCKAR',
       'CCKBR', 'CHP2', 'CHRM1', 'CHRM2', 'CHRM5', 'DRD1', 'DRD5', 'GRM1',
       'GRM5', 'GRPR', 'HTR2A', 'HTR2C', 'HTR4', 'HTR5A', 'HTR6', 'LHCGR',
       'MYLK3', 'P2RX3', 'PLCZ1', 'PLN', 'PPP3R2', 'PRKACG', 'PTGFR',
       'SLC25A31', 'TACR3', 'TRHR', 'VDAC2P5'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'AVPR1B', 'BDKRB1', 'CACNA1B', 'CACNA1E', 'CACNA1S', 'CAMK2A', 'CCKAR',
       'CCKBR', 'CHP2', 'CHRM1', 'CHRM2', 'CHRM5', 'DRD1', 'DRD5', 'GRM1',
       'GRM5', 'GRPR', 'HTR2A', 'HTR2C', 'HTR4', 'HTR5A', 'HTR6', 'LHCGR',
       'MYLK3', 'P2RX3', 'PLCZ1', 'PLN', 'PPP3R2', 'PRKACG', 'PTGFR',
       'SLC25A31', 'TACR3', 'TRHR', 'VDAC2P5'],
      dtype='object')
       'CLDN8', 'HLA-DRB3', 'HLA-DRB4', 'NECTIN1', 'NECTIN2', 'NECTIN3',
       'PECAM1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'CLDN8', 'HLA-DRB3', 'HLA-DRB4', 'NECTIN1', 'NECTIN2', 'NECTIN3',
       'PECAM1'],
      dtype='object')
       'CLDN8', 'HLA-DRB3', 'HLA-DRB4', 'NECTIN1', 'NECTIN2', 'NECTIN3',
       'PECAM1'],
      dtype='object')
       'CCL3L3', 'CXCL8', 'GNG13', 'GRK1', 'GRK2', 'GRK3', 'PPBPP1', 'PRKACG'],
      dtype='object')
       'CCL3L3', 'CXCL8', 'GNG13', 'GRK1', 'GRK2', 'GRK3', 'PPBPP1', 'PRKACG'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'CCL3L3', 'CXCL8', 'GNG13', 'GRK1', 'GRK2', 'GRK3', 'PPBPP1', 'PRKACG'],
      dtype='object')
       'EPO', 'GH2', 'IFNA1', 'IFNA10', 'IFNA13', 'IFNA14', 'IFNA16', 'IFNA17',
       'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'IFNK',
       'IFNL2', 'IFNL3', 'IFNW1', 'IL17B', 'IL25', 'IL3', 'IL4', 'IL5', 'IL9',
       'INHBC', 'LEP', 'PPBPP1', 'PRL', 'TGFB1', 'TNFRSF6B', 'VEGFD'],
      dtype='object')
       'EPO', 'GH2', 'IFNA1', 'IFNA10', 'IFNA13', 'IFNA14', 'IFNA16', 'IFNA17',
       'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'IFNK',
       'IFNL2', 'IFNL3', 'IFNW1', 'IL17B', 'IL25', 'IL3', 'IL4', 'IL5', 'IL9',
       'INHBC', 'LEP', 'PPBPP1', 'PRL', 'TGFB1', 'TNFRSF6B', 'VEGFD'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'EPO', 'GH2', 'IFNA1', 'IFNA10', 'IFNA13', 'IFNA14', 'IFNA16', 'IFNA17',
       'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'IFNK',
       'IFNL2', 'IFNL3', 'IFNW1', 'IL17B', 'IL25', 'IL3', 'IL4', 'IL5', 'IL9',
       'INHBC', 'LEP', 'PPBPP1', 'PRL', 'TGFB1', 'TNFRSF6B', 'VEGFD'],
      dtype='object')
       'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'STING1'],
      dtype='object')
       'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'STING1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'STING1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'G6PC2', 'LDHAL6B', 'PCK1', 'PDHA2', 'PGAM4', 'PGK2', 'PKLR'],
      dtype='object')
       'G6PC2', 'LDHAL6B', 'PCK1', 'PDHA2', 'PGAM4', 'PGK2', 'PKLR'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'G6PC2', 'LDHAL6B', 'PCK1', 'PDHA2', 'PGAM4', 'PGK2', 'PKLR'],
      dtype='object')
       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'IFNK', 'IFNL2', 'IFNL3', 'IFNW1', 'IL3', 'IL4', 'IL5', 'IL9',
       'LEP', 'PRL'],
      dtype='object')
       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'IFNK', 'IFNL2', 'IFNL3', 'IFNW1', 'IL3', 'IL4', 'IL5', 'IL9',
       'LEP', 'PRL'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'IFNK', 'IFNL2', 'IFNL3', 'IFNW1', 'IL3', 'IL4', 'IL5', 'IL9',
       'LEP', 'PRL'],
      dtype='object')
       'CACNG6', 'CACNG7', 'CHP2', 'FGF10', 'FGF16', 'FGF20', 'FGF21', 'FGF23',
       'FGF3', 'FGF4', 'FGF5', 'FGF6', 'FLNC', 'JMJD7-PLA2G4B', 'MAP3K14',
       'MAP3K20', 'MOS', 'NGF', 'PLA2G12B', 'PLA2G2A', 'PLA2G2C', 'PLA2G2E',
       'PLA2G2F', 'PPP3R2', 'PRKACG', 'PTPN5', 'TGFB1'],
      dtype='object')
       'CACNG6', 'CACNG7', 'CHP2', 'FGF10', 'FGF16', 'FGF20', 'FGF21', 'FGF23',
       'FGF3', 'FGF4', 'FGF5', 'FGF6', 'FLNC', 'JMJD7-PLA2G4B', 'MAP3K14',
       'MAP3K20', 'MOS', 'NGF', 'PLA2G12B', 'PLA2G2A', 'PLA2G2C', 'PLA2G2E',
       'PLA2G2F', 'PPP3R2', 'PRKACG', 'PTPN5', 'TGFB1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'CACNG6', 'CACNG7', 'CHP2', 'FGF10', 'FGF16', 'FGF20', 'FGF21', 'FGF23',
       'FGF3', 'FGF4', 'FGF5', 'FGF6', 'FLNC', 'JMJD7-PLA2G4B', 'MAP3K14',
       'MAP3K20', 'MOS', 'NGF', 'PLA2G12B', 'PLA2G2A', 'PLA2G2C', 'PLA2G2E',
       'PLA2G2F', 'PPP3R2', 'PRKACG', 'PTPN5', 'TGFB1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'ATP5MC1', 'ATP5MC1P5', 'ATP5MC2', 'ATP5MC3', 'ATP5ME', 'ATP5MF',
       'ATP5MG', 'ATP5PB', 'ATP5PD', 'ATP5PF', 'ATP5PO', 'ATP6V1G3', 'COX6A2',
       'COX6CP3', 'COX7B2', 'COX8C', 'UQCR10P1', 'UQCRHL'],
      dtype='object')
       'ATP5MC1', 'ATP5MC1P5', 'ATP5MC2', 'ATP5MC3', 'ATP5ME', 'ATP5MF',
       'ATP5MG', 'ATP5PB', 'ATP5PD', 'ATP5PF', 'ATP5PO', 'ATP6V1G3', 'COX6A2',
       'COX6CP3', 'COX7B2', 'COX8C', 'UQCR10P1', 'UQCRHL'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'ATP5MC1', 'ATP5MC1P5', 'ATP5MC2', 'ATP5MC3', 'ATP5ME', 'ATP5MF',
       'ATP5MG', 'ATP5PB', 'ATP5PD', 'ATP5PF', 'ATP5PO', 'ATP6V1G3', 'COX6A2',
       'COX6CP3', 'COX7B2', 'COX8C', 'UQCR10P1', 'UQCRHL'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'TGFB1'],
      dtype='object')
       'TGFB1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'TGFB1'],
      dtype='object')
       'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'LBP'],
      dtype='object')
       'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'LBP'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7', 'IFNA8', 'LBP'],
      dtype='object')
       'NAA80', 'TAT', 'TH', 'TYR', 'TYRP1'],
      dtype='object')
       'NAA80', 'TAT', 'TH', 'TYR', 'TYRP1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'NAA80', 'TAT', 'TH', 'TYR', 'TYRP1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'PLA2G2F', 'PPP3R2'],
      dtype='object')
       'PLA2G2F', 'PPP3R2'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'PLA2G2F', 'PPP3R2'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


# Repeat for hallmark sets

### Import Hallmark sets
* will be imported as a dictionary where each key is a sheet name, and each value is a dataframe (gene set)

In [5]:
os.chdir('/scratch/user/s4436039/scdata/Pathway-sets')
H_sets = pd.read_excel("hallmark_sets.xlsx", sheet_name=None)
os.chdir('/scratch/user/s4436039/scdata/Python_Integration_Sep')

In [6]:
# Print the sheet names
print(H_sets.keys())

dict_keys(['Sheet1', 'Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6', 'Sheet7', 'Sheet8', 'Sheet9', 'Sheet10', 'Sheet11', 'Sheet12', 'Sheet13', 'Sheet14', 'Sheet15', 'Sheet16', 'Sheet17', 'Sheet18', 'Sheet19', 'Sheet20', 'Sheet21', 'Sheet22', 'Sheet23', 'Sheet24', 'Sheet25', 'Sheet26', 'Sheet27', 'Sheet28', 'Sheet29', 'Sheet30', 'Sheet31', 'Sheet32', 'Sheet33', 'Sheet34', 'Sheet35', 'Sheet36', 'Sheet37', 'Sheet38', 'Sheet39', 'Sheet40', 'Sheet41', 'Sheet42', 'Sheet43', 'Sheet44', 'Sheet45', 'Sheet46', 'Sheet47', 'Sheet48', 'Sheet49', 'Sheet50'])


In [7]:
# Change keys to be gene set name (the first value in the 'gs_name' column) instead of Sheet1 etc 
H_sets2 = {df["gs_name"].iloc[0]: df for _, df in H_sets.items()}
H_sets = H_sets2

# Result
print(H_sets.keys())

dict_keys(['HALLMARK_ADIPOGENESIS', 'HALLMARK_ALLOGRAFT_REJECTION', 'HALLMARK_ANDROGEN_RESPONSE', 'HALLMARK_ANGIOGENESIS', 'HALLMARK_APICAL_JUNCTION', 'HALLMARK_APICAL_SURFACE', 'HALLMARK_APOPTOSIS', 'HALLMARK_BILE_ACID_METABOLISM', 'HALLMARK_CHOLESTEROL_HOMEOSTASIS', 'HALLMARK_COAGULATION', 'HALLMARK_COMPLEMENT', 'HALLMARK_DNA_REPAIR', 'HALLMARK_E2F_TARGETS', 'HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION', 'HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLMARK_ESTROGEN_RESPONSE_LATE', 'HALLMARK_FATTY_ACID_METABOLISM', 'HALLMARK_G2M_CHECKPOINT', 'HALLMARK_GLYCOLYSIS', 'HALLMARK_HEDGEHOG_SIGNALING', 'HALLMARK_HEME_METABOLISM', 'HALLMARK_HYPOXIA', 'HALLMARK_IL2_STAT5_SIGNALING', 'HALLMARK_IL6_JAK_STAT3_SIGNALING', 'HALLMARK_INFLAMMATORY_RESPONSE', 'HALLMARK_INTERFERON_ALPHA_RESPONSE', 'HALLMARK_INTERFERON_GAMMA_RESPONSE', 'HALLMARK_KRAS_SIGNALING_DN', 'HALLMARK_KRAS_SIGNALING_UP', 'HALLMARK_MITOTIC_SPINDLE', 'HALLMARK_MTORC1_SIGNALING', 'HALLMARK_MYC_TARGETS_V1', 'HALLMARK_MYC_TARGETS_V2', 'HALLMAR

In [18]:
# Keys to include in new dictionary
keys_include = ["HALLMARK_APOPTOSIS",
                "HALLMARK_CHOLESTEROL_HOMEOSTASIS",
                "HALLMARK_DNA_REPAIR",
                "HALLMARK_FATTY_ACID_METABOLISM",
                "HALLMARK_GLYCOLYSIS",
                "HALLMARK_IL2_STAT5_SIGNALING",
                "HALLMARK_IL6_JAK_STAT3_SIGNALING",
                "HALLMARK_INFLAMMATORY_RESPONSE",
                "HALLMARK_INTERFERON_ALPHA_RESPONSE",
                "HALLMARK_INTERFERON_GAMMA_RESPONSE",
                "HALLMARK_KRAS_SIGNALING_DN",
                "HALLMARK_KRAS_SIGNALING_UP",
                "HALLMARK_MYC_TARGETS_V1",
                "HALLMARK_MYC_TARGETS_V2",
                "HALLMARK_NOTCH_SIGNALING",
                "HALLMARK_OXIDATIVE_PHOSPHORYLATION",
                "HALLMARK_PI3K_AKT_MTOR_SIGNALING",
                "HALLMARK_PROTEIN_SECRETION",
                "HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY",
                "HALLMARK_TGF_BETA_SIGNALING",
                "HALLMARK_TNFA_SIGNALING_VIA_NFKB"
                ]

In [19]:
# create a new dictionary with select keys (pathway sets)
H_sets_sub = {key: H_sets[key] for key in keys_include if key in H_sets}

In [20]:
print(H_sets_sub.keys())

dict_keys(['HALLMARK_APOPTOSIS', 'HALLMARK_CHOLESTEROL_HOMEOSTASIS', 'HALLMARK_DNA_REPAIR', 'HALLMARK_FATTY_ACID_METABOLISM', 'HALLMARK_GLYCOLYSIS', 'HALLMARK_IL2_STAT5_SIGNALING', 'HALLMARK_IL6_JAK_STAT3_SIGNALING', 'HALLMARK_INFLAMMATORY_RESPONSE', 'HALLMARK_INTERFERON_ALPHA_RESPONSE', 'HALLMARK_INTERFERON_GAMMA_RESPONSE', 'HALLMARK_KRAS_SIGNALING_DN', 'HALLMARK_KRAS_SIGNALING_UP', 'HALLMARK_MYC_TARGETS_V1', 'HALLMARK_MYC_TARGETS_V2', 'HALLMARK_NOTCH_SIGNALING', 'HALLMARK_OXIDATIVE_PHOSPHORYLATION', 'HALLMARK_PI3K_AKT_MTOR_SIGNALING', 'HALLMARK_PROTEIN_SECRETION', 'HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY', 'HALLMARK_TGF_BETA_SIGNALING', 'HALLMARK_TNFA_SIGNALING_VIA_NFKB'])


In [21]:
custom_order = ['HGSOC','BC','CRC','GAC','GBM','HCC','HNSCC','MEL','NPC','NSCLC','PDAC','iCCA']

In [22]:
#set current directory 
os.chdir('/scratch/user/s4436039/scdata/Pathway4Prism')
os.getcwd()

'/scratch/user/s4436039/scdata/Pathway4Prism'

# loop through all pathways and export an excel containing scores per sample, grouped by cancer type, for each DC subset

In [23]:
for set_name, gene_set_df in H_sets_sub.items():

    gene_set = gene_set_df['human_gene_symbol'].tolist()

    #score DC for gene set:
    sc.tl.score_genes(data_DC1, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_DC2, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_mregDC, gene_list=gene_set, score_name=f"{set_name}_score")

    score_column = f"{set_name}_score"

    #DC1: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_DC1 = score_DC1.merge(
        data_DC1.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
    pivoted_df_DC1 = score_DC1.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC1 = pivoted_df_DC1.reindex(custom_order)
    pivoted_df_DC1.shape

    #DC2: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_DC2 = score_DC2.merge(
        data_DC2.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
    pivoted_df_DC2 = score_DC2.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC2 = pivoted_df_DC2.reindex(custom_order)
    pivoted_df_DC2.shape

    #mregDC:
    # Group by 'integration_id' and calculate the mean score for each group
    score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_mregDC = score_mregDC.merge(
        data_mregDC.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()
    pivoted_df_mregDC = score_mregDC.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_mregDC = pivoted_df_mregDC.reindex(custom_order)
    pivoted_df_mregDC.shape

    save_name = f"{set_name}.xlsx"

    with pd.ExcelWriter(save_name) as writer:
        pivoted_df_DC1.to_excel(writer, sheet_name='DC1')
        pivoted_df_DC2.to_excel(writer, sheet_name='DC2')
        pivoted_df_mregDC.to_excel(writer, sheet_name='mregDC')




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'GAD2', 'GAPDHS', 'H2AZ1', 'HAO2', 'HMGCS2', 'KMT5A', 'MIX23', 'XIST'],
      dtype='object')
       'GAD2', 'GAPDHS', 'H2AZ1', 'HAO2', 'HMGCS2', 'KMT5A', 'MIX23', 'XIST'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'GAD2', 'GAPDHS', 'H2AZ1', 'HAO2', 'HMGCS2', 'KMT5A', 'MIX23', 'XIST'],
      dtype='object')
       'PGLS', 'RARS1'],
      dtype='object')
       'PGLS', 'RARS1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'PGLS', 'RARS1'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'OPRK1', 'ROS1', 'SELENOS', 'SLC28A2', 'TACR3', 'VIP'],
      dtype='object')
       'OPRK1', 'ROS1', 'SELENOS', 'SLC28A2', 'TACR3', 'VIP'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'OPRK1', 'ROS1', 'SELENOS', 'SLC28A2', 'TACR3', 'VIP'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'CHRNG', 'CLDN8', 'CLPS', 'COQ8A', 'CPB1', 'CYP11B2', 'EPHA5', 'FGF16',
       'FSHB', 'GDNF', 'GP2', 'GRID2', 'HTR1B', 'HTR1D', 'IL5', 'INSL5',
       'IRS4', 'ITIH3', 'KLK7', 'MACROH2A2', 'MYH7', 'NGB', 'NPY4R', 'NRIP2',
       'P2RY4', 'PAX4', 'PCDHB1', 'PRKN', 'PROP1', 'PTGFR', 'SCGB1A1',
       'SCN10A', 'SELENOP', 'SERPINA10', 'SLC38A3', 'SMPX', 'SSTR4', 'TCL1A',
       'TENT5C', 'TEX15', 'TFAP2B', 'TFF2', 'TLX1', 'TSHB', 'UGT2B17',
       'VPS50'],
      dtype='object')
       'CHRNG', 'CLDN8', 'CLPS', 'COQ8A', 'CPB1', 'CYP11B2', 'EPHA5', 'FGF16',
       'FSHB', 'GDNF', 'GP2', 'GRID2', 'HTR1B', 'HTR1D', 'IL5', 'INSL5',
       'IRS4', 'ITIH3', 'KLK7', 'MACROH2A2', 'MYH7', 'NGB', 'NPY4R', 'NRIP2',
       'P2RY4', 'PAX4', 'PCDHB1', 'PRKN', 'PROP1', 'PTGFR', 'SCGB1A1',
       'SCN10A', 'SELENOP', 'SERPINA10', 'SLC38A3', 'SMPX', 'SSTR4', 'TCL1A',
       'TENT5C', 'TEX15', 'TFAP2B', 'TFF2', 'TLX1', 'TSHB', 'UGT2B17',
       'VPS50'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'CHRNG', 'CLDN8', 'CLPS', 'COQ8A', 'CPB1', 'CYP11B2', 'EPHA5', 'FGF16',
       'FSHB', 'GDNF', 'GP2', 'GRID2', 'HTR1B', 'HTR1D', 'IL5', 'INSL5',
       'IRS4', 'ITIH3', 'KLK7', 'MACROH2A2', 'MYH7', 'NGB', 'NPY4R', 'NRIP2',
       'P2RY4', 'PAX4', 'PCDHB1', 'PRKN', 'PROP1', 'PTGFR', 'SCGB1A1',
       'SCN10A', 'SELENOP', 'SERPINA10', 'SLC38A3', 'SMPX', 'SSTR4', 'TCL1A',
       'TENT5C', 'TEX15', 'TFAP2B', 'TFF2', 'TLX1', 'TSHB', 'UGT2B17',
       'VPS50'],
      dtype='object')
       'GUCY1A1', 'H2BC3', 'HOXD11', 'IGF2', 'NGF', 'NR0B2', 'NR1H4', 'PCP4',
       'PECAM1', 'PEG3', 'PRELID3B', 'RBP4', 'RELN', 'SEMA3B', 'SNAP91',
       'SPON1', 'TMEM100', 'USH1C'],
      dtype='object')
       'GUCY1A1', 'H2BC3', 'HOXD11', 'IGF2', 'NGF', 'NR0B2', 'NR1H4', 'PCP4',
       'PECAM1', 'PEG3', 'PRELID3B', 'RBP4', 'RELN', 'SEMA3B', 'SNAP91',
       'SPON1', 'TMEM100', 'USH1C'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'GUCY1A1', 'H2BC3', 'HOXD11', 'IGF2', 'NGF', 'NR0B2', 'NR1H4', 'PCP4',
       'PECAM1', 'PEG3', 'PRELID3B', 'RBP4', 'RELN', 'SEMA3B', 'SNAP91',
       'SPON1', 'TMEM100', 'USH1C'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'ATP5MC2', 'ATP5MC3', 'ATP5ME', 'ATP5MF', 'ATP5MG', 'ATP5PB', 'ATP5PD',
       'ATP5PF', 'ATP5PO', 'TOMM70'],
      dtype='object')
       'ATP5MC2', 'ATP5MC3', 'ATP5ME', 'ATP5MF', 'ATP5MG', 'ATP5PB', 'ATP5PD',
       'ATP5PF', 'ATP5PO', 'TOMM70'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'ATP5MC2', 'ATP5MC3', 'ATP5ME', 'ATP5MF', 'ATP5MG', 'ATP5PB', 'ATP5PD',
       'ATP5PF', 'ATP5PO', 'TOMM70'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


# 15 Jan 2025: Using 3 more KEGG sets as they are enriched in in vitro sig

In [8]:
# Keys to include in new dictionary
keys_include = ["KEGG_RIBOSOME",
"KEGG_AUTOIMMUNE_THYROID_DISEASE",
"KEGG_GRAFT_VERSUS_HOST_DISEASE"]

In [9]:
# create a new dictionary with select keys (pathway sets)
K_sets_sub = {key: K_sets[key] for key in keys_include if key in K_sets}

In [10]:
print(K_sets_sub.keys())

dict_keys(['KEGG_RIBOSOME', 'KEGG_AUTOIMMUNE_THYROID_DISEASE', 'KEGG_GRAFT_VERSUS_HOST_DISEASE'])


In [11]:
custom_order = ['HGSOC','BC','CRC','GAC','GBM','HCC','HNSCC','MEL','NPC','NSCLC','PDAC','iCCA']

In [12]:
#set current directory 
os.chdir('/scratch/user/s4436039/scdata/Pathway4Prism')
os.getcwd()

'/scratch/user/s4436039/scdata/Pathway4Prism'

In [13]:
for set_name, gene_set_df in K_sets_sub.items():

    gene_set = gene_set_df['human_gene_symbol'].tolist()

    #score DC for gene set:
    sc.tl.score_genes(data_DC1, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_DC2, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_mregDC, gene_list=gene_set, score_name=f"{set_name}_score")

    score_column = f"{set_name}_score"

    #DC1: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_DC1 = score_DC1.merge(
        data_DC1.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
    pivoted_df_DC1 = score_DC1.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC1 = pivoted_df_DC1.reindex(custom_order)
    pivoted_df_DC1.shape

    #DC2: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_DC2 = score_DC2.merge(
        data_DC2.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
    pivoted_df_DC2 = score_DC2.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC2 = pivoted_df_DC2.reindex(custom_order)
    pivoted_df_DC2.shape

    #mregDC:
    # Group by 'integration_id' and calculate the mean score for each group
    score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_mregDC = score_mregDC.merge(
        data_mregDC.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()
    pivoted_df_mregDC = score_mregDC.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_mregDC = pivoted_df_mregDC.reindex(custom_order)
    pivoted_df_mregDC.shape

    save_name = f"{set_name}.xlsx"

    with pd.ExcelWriter(save_name) as writer:
        pivoted_df_DC1.to_excel(writer, sheet_name='DC1')
        pivoted_df_DC2.to_excel(writer, sheet_name='DC2')
        pivoted_df_mregDC.to_excel(writer, sheet_name='mregDC')




  adata.obs[score_name] = pd.Series(




  adata.obs[score_name] = pd.Series(
  adata.obs[score_name] = pd.Series(


       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'IL4', 'IL5', 'TSHB'],
      dtype='object')
       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'IL4', 'IL5', 'TSHB'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


       'IFNA17', 'IFNA2', 'IFNA21', 'IFNA4', 'IFNA5', 'IFNA6', 'IFNA7',
       'IFNA8', 'IL4', 'IL5', 'TSHB'],
      dtype='object')


  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()




  score_DC1 = data_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


# 23 Feb 2025: Checking if hallmark IFNg is also up in healthy ovary 

In [4]:
data_H = data[data.obs["sample_type_major2"] == "healthy"]

data_H_DC1 = data_H[data_H.obs["NR_annotations_simple"] == "cDC1"]
data_H_DC2 = data_H[data_H.obs["NR_annotations_simple"] == "cDC2"]
data_H_mregDC = data_H[data_H.obs["NR_annotations_simple"] == "mregDC"]

data_M = data[data.obs["sample_type_major2"] == "metastatic tumour"]

data_M_DC1 = data_M[data_M.obs["NR_annotations_simple"] == "cDC1"]
data_M_DC2 = data_M[data_M.obs["NR_annotations_simple"] == "cDC2"]
data_M_mregDC = data_M[data_M.obs["NR_annotations_simple"] == "mregDC"]

In [8]:
# Keys to include in new dictionary
keys_include_IFNg = ["HALLMARK_INTERFERON_GAMMA_RESPONSE"
                ]

In [9]:
# create a new dictionary with select keys (pathway sets)
H_sets_IFNg = {key: H_sets[key] for key in keys_include_IFNg if key in H_sets}

In [10]:
print(H_sets_IFNg.keys())

dict_keys(['HALLMARK_INTERFERON_GAMMA_RESPONSE'])


In [11]:
custom_order = ['HGSOC','BC','CRC','GAC','GBM','HCC','HNSCC','MEL','NPC','NSCLC','PDAC','iCCA']

In [12]:
#set current directory 
os.chdir('/scratch/user/s4436039/scdata/Pathway4Prism')
os.getcwd()

'/scratch/user/s4436039/scdata/Pathway4Prism'

## loop through all pathways and export an excel containing scores per sample, grouped by cancer type, for each DC subset - Mets

In [13]:
for set_name, gene_set_df in H_sets_IFNg.items():

    gene_set = gene_set_df['human_gene_symbol'].tolist()

    #score DC for gene set:
    sc.tl.score_genes(data_M_DC1, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_M_DC2, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_M_mregDC, gene_list=gene_set, score_name=f"{set_name}_score")

    score_column = f"{set_name}_score"

    #DC1: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC1 = data_M_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_DC1 = score_DC1.merge(
        data_M_DC1.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
    pivoted_df_DC1 = score_DC1.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC1 = pivoted_df_DC1.reindex(custom_order)
    pivoted_df_DC1.shape

    #DC2: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC2 = data_M_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_DC2 = score_DC2.merge(
        data_M_DC2.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
    pivoted_df_DC2 = score_DC2.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC2 = pivoted_df_DC2.reindex(custom_order)
    pivoted_df_DC2.shape

    #mregDC:
    # Group by 'integration_id' and calculate the mean score for each group
    score_mregDC = data_M_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'cancer_broadest' by matching the 'integration_id'
    score_mregDC = score_mregDC.merge(
        data_M_mregDC.obs[["integration_id", "cancer_broadest", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()
    pivoted_df_mregDC = score_mregDC.pivot(index='cancer_broadest', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_mregDC = pivoted_df_mregDC.reindex(custom_order)
    pivoted_df_mregDC.shape

    save_name = f"{set_name}_METASTATIC.xlsx"

    with pd.ExcelWriter(save_name) as writer:
        pivoted_df_DC1.to_excel(writer, sheet_name='DC1_METS')
        pivoted_df_DC2.to_excel(writer, sheet_name='DC2_METS')
        pivoted_df_mregDC.to_excel(writer, sheet_name='mregDC_METS')




  adata.obs[score_name] = pd.Series(
  adata.obs[score_name] = pd.Series(
  adata.obs[score_name] = pd.Series(
  score_DC1 = data_M_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('cancer_broadest').cumcount()
  score_DC2 = data_M_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('cancer_broadest').cumcount()
  score_mregDC = data_M_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('cancer_broadest').cumcount()


## repeat for healthy:

In [14]:
custom_order = ['ovary','breast','colon','liver','lung','lymph node']

In [15]:
for set_name, gene_set_df in H_sets_IFNg.items():

    gene_set = gene_set_df['human_gene_symbol'].tolist()

    #score DC for gene set:
    sc.tl.score_genes(data_H_DC1, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_H_DC2, gene_list=gene_set, score_name=f"{set_name}_score")
    sc.tl.score_genes(data_H_mregDC, gene_list=gene_set, score_name=f"{set_name}_score")

    score_column = f"{set_name}_score"

    #DC1: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC1 = data_H_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'site' by matching the 'integration_id'
    score_DC1 = score_DC1.merge(
        data_H_DC1.obs[["integration_id", "site", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC1['replicate'] = score_DC1.groupby('site').cumcount()
    pivoted_df_DC1 = score_DC1.pivot(index='site', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC1 = pivoted_df_DC1.reindex(custom_order)
    pivoted_df_DC1.shape

    #DC2: 

    # Group by 'integration_id' and calculate the mean score for each group
    score_DC2 = data_H_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'site' by matching the 'integration_id'
    score_DC2 = score_DC2.merge(
        data_H_DC2.obs[["integration_id", "site", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_DC2['replicate'] = score_DC2.groupby('site').cumcount()
    pivoted_df_DC2 = score_DC2.pivot(index='site', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_DC2 = pivoted_df_DC2.reindex(custom_order)
    pivoted_df_DC2.shape

    #mregDC:
    # Group by 'integration_id' and calculate the mean score for each group
    score_mregDC = data_H_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()

    # Merge with 'site' by matching the 'integration_id'
    score_mregDC = score_mregDC.merge(
        data_H_mregDC.obs[["integration_id", "site", "NR_annotations_simple"]].drop_duplicates(),
        on="integration_id",
        how="left"
    )

    # convert to layout for prism (cancer types as rows with replicates as columns)
    score_mregDC['replicate'] = score_mregDC.groupby('site').cumcount()
    pivoted_df_mregDC = score_mregDC.pivot(index='site', columns='replicate', values=score_column)
    # Reindex the rows based on custom order
    pivoted_df_mregDC = pivoted_df_mregDC.reindex(custom_order)
    pivoted_df_mregDC.shape

    save_name = f"{set_name}_HEALTHY.xlsx"

    with pd.ExcelWriter(save_name) as writer:
        pivoted_df_DC1.to_excel(writer, sheet_name='DC1_HEALTHY')
        pivoted_df_DC2.to_excel(writer, sheet_name='DC2_HEALTHY')
        pivoted_df_mregDC.to_excel(writer, sheet_name='mregDC_HEALTHY')




  adata.obs[score_name] = pd.Series(
  adata.obs[score_name] = pd.Series(
  adata.obs[score_name] = pd.Series(
  score_DC1 = data_H_DC1.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC1['replicate'] = score_DC1.groupby('site').cumcount()
  score_DC2 = data_H_DC2.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_DC2['replicate'] = score_DC2.groupby('site').cumcount()
  score_mregDC = data_H_mregDC.obs.groupby("integration_id")[score_column].mean().reset_index()
  score_mregDC['replicate'] = score_mregDC.groupby('site').cumcount()
