In [None]:
import os

import pandas as pd
import scanpy as sc
import seaborn as sns
from scipy import stats
from scipy.stats import normaltest

In [None]:
# Report the starting working directory, switch to the project directory and
# confirm the change. A notebook only echoes the last expression of a cell, so
# the starting directory is printed explicitly instead of being discarded.
print(os.getcwd())
os.chdir('/scratch/user/s4436039/scdata/Python_Integration_Sep')
os.getcwd()

In [None]:
# Load the AnnData object from the .h5ad file; the relative path is resolved
# against the current working directory.
h5ad_file = 'NRclean_clustered.h5ad'
data = sc.read_h5ad(h5ad_file)

In [None]:
# Keep only observations annotated as primary tumour. This is left as a view:
# it is only used here to derive the per-annotation subsets below.
data_P = data[data.obs["sample_type_major2"] == "primary tumour"]

# One independent AnnData object per annotation of interest. `.copy()`
# materialises each subset so that later in-place writes to `.obs` (e.g. by
# `sc.tl.score_genes`) land on a real object rather than on a view, which
# would otherwise be copied implicitly by anndata with a warning.
cell_type = data_P.obs["NR_annotations_simple"]
data_DC1 = data_P[cell_type == "cDC1"].copy()
data_DC2 = data_P[cell_type == "cDC2"].copy()
data_mregDC = data_P[cell_type == "mregDC"].copy()

# Import KEGG sets

In [None]:
# Read every sheet of the KEGG workbook into a dict of DataFrames
# (sheet_name=None returns {sheet name: DataFrame}).
# The working directory is switched to the pathway folder only for the read;
# the `finally` clause guarantees it is switched back to the project directory
# even if the workbook is missing or unreadable, so later relative paths are
# not resolved against the wrong folder after a failure.
os.chdir('/scratch/user/s4436039/scdata/Pathway-sets')
try:
    K_sets = pd.read_excel("kegg_sets.xlsx", sheet_name=None)
finally:
    os.chdir('/scratch/user/s4436039/scdata/Python_Integration_Sep')

In [None]:
# Show which sheets were loaded from the workbook
sheet_names = K_sets.keys()
print(sheet_names)

In [None]:
# Re-key the dictionary by gene set name: each DataFrame is stored under the
# first value of its own 'gs_name' column instead of its sheet name (Sheet1 etc).
K_sets2 = dict(
    (sheet["gs_name"].iloc[0], sheet)
    for sheet in K_sets.values()
)
K_sets = K_sets2

# Result: the keys are now gene set names
print(K_sets.keys())

In [None]:
# Gene set names to carry over into the sub-dictionary of pathway sets
keys_include = [
    "KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY",
    "KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION",
    "KEGG_APOPTOSIS",
    "KEGG_CALCIUM_SIGNALING_PATHWAY",
    "KEGG_CELL_ADHESION_MOLECULES_CAMS",
    "KEGG_CHEMOKINE_SIGNALING_PATHWAY",
    "KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION",
    "KEGG_CYTOSOLIC_DNA_SENSING_PATHWAY",
    "KEGG_ENDOCYTOSIS",
    "KEGG_FATTY_ACID_METABOLISM",
    "KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS",
    "KEGG_GLYCOLYSIS_GLUCONEOGENESIS",
    "KEGG_JAK_STAT_SIGNALING_PATHWAY",
    "KEGG_MAPK_SIGNALING_PATHWAY",
    "KEGG_NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY",
    "KEGG_NOTCH_SIGNALING_PATHWAY",
    "KEGG_OXIDATIVE_PHOSPHORYLATION",
    "KEGG_PROTEASOME",
    "KEGG_PROTEIN_EXPORT",
    "KEGG_TGF_BETA_SIGNALING_PATHWAY",
    "KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY",
    "KEGG_TYROSINE_METABOLISM",
    "KEGG_UBIQUITIN_MEDIATED_PROTEOLYSIS",
    "KEGG_VEGF_SIGNALING_PATHWAY",
]

In [None]:
# Create a new dictionary holding only the selected pathway sets.
# Requested names that are absent from K_sets are still skipped, but they are
# reported here instead of being dropped silently, so a typo in keys_include
# or a missing sheet is visible at this point rather than as a KeyError later.
missing_keys = [key for key in keys_include if key not in K_sets]
if missing_keys:
    print("Pathway sets not found in K_sets:", missing_keys)

K_sets_sub = {key: K_sets[key] for key in keys_include if key in K_sets}

# Display the new dictionary
K_sets_sub

In [None]:
# List the pathway sets that made it into the sub-dictionary
selected_pathways = K_sets_sub.keys()
print(selected_pathways)

In [None]:
# Pull the adipocytokine signalling gene set and flatten its
# 'human_gene_symbol' column into a plain Python list.
adipocytokine_df = K_sets_sub["KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY"]
adipocytokine_genes = adipocytokine_df.loc[:, "human_gene_symbol"]
adipocytokine_list = adipocytokine_genes.tolist()

In [None]:
# Score each DC subset for the adipocytokine gene set.
# sc.tl.score_genes writes the result into `.obs["adipocytokine_score"]` of the
# object it is given, so the same call is applied to every subset in turn
# rather than being repeated by hand.
for dc_subset in (data_DC1, data_DC2, data_mregDC):
    sc.tl.score_genes(
        dc_subset,
        gene_list=adipocytokine_list,
        score_name="adipocytokine_score",
    )

In [None]:
# Name of the .obs column holding the score to summarise
score_column = "adipocytokine_score"

# Lookup table: the 'cancer_type_broad' recorded for each 'integration_id'
sample_cancer_types = data_DC1.obs[["integration_id", "cancer_type_broad"]].drop_duplicates()

# Mean score per 'integration_id', annotated with its 'cancer_type_broad'.
# observed=True restricts the groups to ids actually present in data_DC1: if
# 'integration_id' is categorical, unused categories would otherwise show up
# as NaN rows (and pandas warns that the implicit default is changing). For a
# non-categorical key the argument has no effect.
sample_scores_df = (
    data_DC1.obs
    .groupby("integration_id", observed=True)[score_column]
    .mean()
    .reset_index()
    .merge(sample_cancer_types, on="integration_id", how="left")
)

# The resulting 'sample_scores_df' contains 'integration_id', 'adipocytokine_score', and 'cancer_type_broad'
print(sample_scores_df.head())

In [None]:
# Normality test on the per-sample mean scores; report its p-value
adipocytokine_normality = normaltest(sample_scores_df["adipocytokine_score"])
print("Adipocytokine: ", adipocytokine_normality.pvalue)