# OP2 - Genes ontology analysis

In this notebook, I perform gene ontology analysis on the differential expression data from the 'Open Problems - Single-Cell Perturbations' competition, which aims to predict how small molecule perturbations affect gene expression in different cell types.
In my analysis, I filter the differential expression p-values at 0.05 significance and identify the enriched genes for each cell type and compound treatment. I then analyze the gene lists using Enrichr to identify enriched gene ontology terms and pathways from the KEGG database.
I combine the results into a DataFrame containing the enriched terms, adjusted p-values, and associated genes for each cell type and compound pair.


In [1]:
!pip install --quiet gseapy

In [2]:
# Load libraries
import datetime
import pandas as pd
import gseapy as gp

In [3]:
# Load data
de_train =   pd.read_parquet("/kaggle/input/open-problems-single-cell-perturbations/de_train.parquet")
id_map = pd.read_csv("/kaggle/input/open-problems-single-cell-perturbations/id_map.csv")

In [4]:
# pivot to long
id_vars = ['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control']
df_long = pd.melt(de_train, id_vars=id_vars, var_name='Gene', value_name='p_value')
df_long.head()

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,Gene,p_value
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,A1BG,0.10472
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,A1BG,0.915953
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,A1BG,-0.387721
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,A1BG,0.232893
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,A1BG,4.290652


In [5]:
grouped = df_long.groupby(['cell_type', 'SMILES'])['Gene'].apply(list).reset_index()
grouped.head()

Unnamed: 0,cell_type,SMILES,Gene
0,B cells,CC#CCn1c(N2CCC[C@@H](N)C2)nc2c1c(=O)n(Cc1nc(C)...,"[A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ..."
1,B cells,CC(=O)c1c(C)c2cnc(Nc3ccc(N4CCNCC4)cn3)nc2n(C2C...,"[A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ..."
2,B cells,CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2...,"[A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ..."
3,B cells,CC(C)C[C@H](NC(=O)CNC(=O)c1cc(Cl)ccc1Cl)B(O)O,"[A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ..."
4,B cells,CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n...,"[A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ..."


In [6]:
grouped_genes = grouped.groupby(['cell_type', 'SMILES'])['Gene'].sum()

In [7]:
grouped_genes.head()

cell_type  SMILES                                                           
B cells    CC#CCn1c(N2CCC[C@@H](N)C2)nc2c1c(=O)n(Cc1nc(C)c3ccccc3n1)c(=O)n2C    [A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ...
           CC(=O)c1c(C)c2cnc(Nc3ccc(N4CCNCC4)cn3)nc2n(C2CCCC2)c1=O              [A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ...
           CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2F)c(-c2ccnc(N)n2)s1    [A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ...
           CC(C)C[C@H](NC(=O)CNC(=O)c1cc(Cl)ccc1Cl)B(O)O                        [A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ...
           CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n1-c1ccccc1             [A1BG, A1BG-AS1, A2M, A2M-AS1, A2MP1, A4GALT, ...
Name: Gene, dtype: object

In [8]:
results = []

for (cell_type, smiles), genes in grouped_genes.items():
    enrichment_result = gp.enrichr(gene_list=genes,  
                                   gene_sets='KEGG_2019_Human',  
                                   no_plot=True,
                                   cutoff=0.01)
    
    # Add cell_type and SMILES columns to the result DataFrame
    enrichment_result.res2d['cell_type'] = cell_type
    enrichment_result.res2d['SMILES'] = smiles

    results.append(enrichment_result.res2d)

# Concatenate results
combined_results = pd.concat(results, axis=0)

# Reorder columns to have 'cell_type' and 'SMILES' as the first columns
cols = ['cell_type', 'SMILES'] + [col for col in combined_results if col not in ['cell_type', 'SMILES']]
combined_results = combined_results[cols]

In [9]:
combined_results

Unnamed: 0,cell_type,SMILES,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,B cells,CC#CCn1c(N2CCC[C@@H](N)C2)nc2c1c(=O)n(Cc1nc(C)...,KEGG_2019_Human,Spliceosome,130/134,0.005755,0.999999,0,0,3.208479,1.654854e+01,TCERG1;EIF4A3;HNRNPU;EFTUD2;SNRPD2;SNRPD1;MAGO...
1,B cells,CC#CCn1c(N2CCC[C@@H](N)C2)nc2c1c(=O)n(Cc1nc(C)...,KEGG_2019_Human,Mitophagy,64/65,0.016596,0.999999,0,0,6.305836,2.584520e+01,CITED2;CALCOCO2;TBK1;FUNDC1;USP30;HRAS;USP8;GA...
2,B cells,CC#CCn1c(N2CCC[C@@H](N)C2)nc2c1c(=O)n(Cc1nc(C)...,KEGG_2019_Human,DNA replication,36/36,0.034166,0.999999,0,0,64404.000000,2.174625e+05,RNASEH2C;PRIM2;FEN1;RNASEH2B;RNASEH2A;PCNA;MCM...
3,B cells,CC#CCn1c(N2CCC[C@@H](N)C2)nc2c1c(=O)n(Cc1nc(C)...,KEGG_2019_Human,Legionellosis,54/55,0.036814,0.999999,0,0,5.317619,1.755819e+01,HBS1L;ARF1;CXCL8;ITGAM;ITGB2;CXCL1;CXCL3;CXCL2...
4,B cells,CC#CCn1c(N2CCC[C@@H](N)C2)nc2c1c(=O)n(Cc1nc(C)...,KEGG_2019_Human,B cell receptor signaling pathway,69/71,0.040972,0.999999,0,0,3.398275,1.085703e+01,GSK3B;IFITM1;CD81;INPPL1;PIK3CD;PIK3CB;IKBKB;P...
...,...,...,...,...,...,...,...,...,...,...,...,...
303,T regulatory cells,c1ccc2c(-c3cnn4cc(-c5ccc(N6CCNCC6)cc5)cnc34)cc...,KEGG_2019_Human,Renin-angiotensin system,12/23,0.999998,0.999999,0,0,0.106579,1.793597e-07,CTSA;NLN;MAS1;ACE;KLK1;MME;ANPEP;ATP6AP2;PREP;...
304,T regulatory cells,c1ccc2c(-c3cnn4cc(-c5ccc(N6CCNCC6)cc5)cnc34)cc...,KEGG_2019_Human,Maturity onset diabetes of the young,4/26,0.999998,0.999999,0,0,0.017646,2.936651e-08,NEUROD1;HHEX;HES1;PAX6
305,T regulatory cells,c1ccc2c(-c3cnn4cc(-c5ccc(N6CCNCC6)cc5)cnc34)cc...,KEGG_2019_Human,Nitrogen metabolism,8/17,0.999998,0.999999,0,0,0.086921,1.416149e-07,GLUD1;CPS1;CA5B;CA2;CA5A;CA6;GLUL;CA13
306,T regulatory cells,c1ccc2c(-c3cnn4cc(-c5ccc(N6CCNCC6)cc5)cnc34)cc...,KEGG_2019_Human,alpha-Linolenic acid metabolism,11/25,0.999998,0.999999,0,0,0.076629,1.246889e-07,FADS2;PLA2G16;PLA2G12A;PLA2G2D;ACOX1;PLA2G4C;P...


In [10]:
combined_results.to_csv("/kaggle/working/genes_ontology.csv")