In [5]:
import pandas as pd
import numpy as np
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
import os
import pyranges as pr
import gseapy as gp
from gseapy import barplot, dotplot
import json

In [6]:
# --- Run configuration -------------------------------------------------------
# Cell population whose results are loaded; used as a path component below.
cell_type = 'CD8'
# Differential-expression variant; used as a path component below. One of:
#   include_emergent_disappear, no_emergent_include_disappear, no_emergent_no_disappear
de_type = 'include_emergent_disappear'

# --- Derived paths -----------------------------------------------------------
fig_path = '/home/che/TRIM/git/tcr/figures/saliency'
parent_path = f'{fig_path}/{de_type}/{cell_type}/data'
geneset_path = f'{parent_path}/gene_sets'      # gene-set JSON files
csv_path = f'{parent_path}/diffexp_df.csv'     # differential-expression table read below
output_path = f'{parent_path}/compiled'
venn_path = f'{output_path}/venn/all_gradient'

# Create the output directories. exist_ok=True makes this safe to re-run and
# avoids the race between checking for the directory and creating it.
os.makedirs(output_path, exist_ok=True)
os.makedirs(venn_path, exist_ok=True)

print(output_path)
print(venn_path)

/home/che/TRIM/git/tcr/figures/saliency/include_emergent_disappear/CD8/data/compiled
/home/che/TRIM/git/tcr/figures/saliency/include_emergent_disappear/CD8/data/compiled/venn/all_gradient


In [7]:
# Load the complete table from csv_path. pandas labels a header-less first
# column 'Unnamed: 0'; relabel that column as 'Gene'.
df = pd.read_csv(csv_path).rename(columns={'Unnamed: 0': 'Gene'})
df

Unnamed: 0,Gene,lfc_post,lfc_pre,lfc_clone_pre_10,lfc_clone_pre_1,lfc_post_pvals_adj,lfc_pre_pvals_adj,lfc_clone_pre_10_pvals_adj,lfc_clone_pre_1_pvals_adj,salient_genes,salient_genes_rank,lfc_post_rank,lfc_pre_rank,lfc_clone_pre_10_rank,lfc_clone_pre_1_rank
0,A1BG,0.841133,-0.794820,-0.725561,-1.166667,9.641343e-08,0.00015,1.121982e-12,2.611040e-41,-0.000007,10791.0,1865.0,17173.0,16486.0,17158.0
1,A1BG-AS1,0.630395,-0.090250,-0.484623,0.291439,1.000000e+00,1.00000,1.000000e+00,1.000000e+00,0.000031,2010.0,2357.0,13270.0,15601.0,6076.0
2,A2M,-1.642448,2.252429,-0.178504,-0.586164,1.000000e+00,1.00000,1.000000e+00,1.000000e+00,0.000051,658.0,18341.0,454.0,13850.0,13951.0
3,A2M-AS1,0.501576,0.178894,0.893563,0.957310,1.000000e+00,1.00000,1.000000e+00,1.000000e+00,-0.000027,15070.0,2735.0,8739.0,3887.0,2239.0
4,A4GALT,-1.497684,2.251524,0.973174,-0.587068,,,,,0.000005,6178.0,17228.0,1452.5,2731.5,14951.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19251,ZXDC,0.052903,-0.004820,0.134014,-0.018931,1.000000e+00,1.00000,1.000000e+00,1.000000e+00,0.000010,5125.0,6347.0,11984.0,9515.0,10352.0
19252,ZYG11A,-1.497684,0.396684,2.828014,1.267772,,1.00000,1.000000e+00,1.000000e+00,-0.000029,15255.0,17228.0,6003.0,359.0,1522.0
19253,ZYG11B,0.317693,-0.155747,-0.495282,-0.672724,1.000000e+00,1.00000,1.000000e+00,1.000000e+00,-0.000004,10006.0,3636.0,14064.0,15651.0,16224.0
19254,ZYX,-0.018861,-0.026237,0.016083,0.089319,1.000000e+00,1.00000,1.000000e+00,1.000000e+00,-0.000028,15111.0,7555.0,12323.0,11443.0,8987.0


In [8]:
# Full hand-written gene sets, keyed by functional category name.
# Each value is a list of gene symbols; a symbol may appear in several categories.
relevant_genes = {
    't_cell_activation_and_immune_response': [
        # A comma is required after 'GNLY': without it Python concatenates the
        # adjacent literals into the single bogus symbol 'GNLYGZMA'.
        'CCR7', 'CD2', 'CD24', 'CD81', 'CD93', 'CTLA4', 'FOXO1', 'GNLY',
        'GZMA', 'GZMH', 'HCST', 'HOPX', 'IRF4', 'NKG7', 'NRP1', 'SH2B3', 'TNFRSF9',

        'CD27', 'CD28', 'CD8A', 'CD8B', 'CD8B2',
        'CX3CR1', 'IL7R', 'IL10RA', 'IL18R1', 'IL18RAP', 'IL2RG',
        'FOXP1', 'LAG3', 'TCF7', 'TBX21',
        'TMIGD2', 'TNFRSF4',
        'CD247', 'CLEC2B', 'ICOS',
        "CD37", "CD38", "CD3D", "CD69", "GZMB", "GZMK", "FOXP3",
    ],
    't_cell_proliferation_and_cell_cycle_regulation': [
        'AURKA', 'IRF4', 'ZBTB32', 'CENPU',

        'UBE2C', 'SATB1', 'ID2', 'EOMES', 'LEF1',
        'MYC', 'CDKN2A', 'CDKN2D',
        'PIM2', 'PIMREG',
        "MKI67", "CCNA2", "CCNB2", "CDC20", "CDC45", "CDK1", "AURKB",
    ],
    'cytotoxic_and_effector_functions': [
        'CST7', 'GNLY', 'GZMA', 'GZMH', 'LTA', 'NKG7', 'PRF1', 'S100A4',
        'GZMB', 'GZMM', 'CTSW', 'TYROBP',
        'FGFBP2', 'NCR1', 'FCGR3A', 'FCER1G',
        'ANXA1', "GZMK",
    ],
    'regulation_of_immune_pathways': [
        'IRF4', 'KLRD1', 'KLRG1', 'NRP1', 'SH2B3', 'SHC4', 'ZBTB10', 'ZBTB12',

        'KLRB1', 'KIR2DL1', 'KIR2DL3', 'KIR3DL1', 'KIR3DL2', 'TOX', 'BATF',
        'IRF1', 'BACH2', 'DUSP2', 'NFKB1', 'NFKB2',
        # NOTE(review): 'BACH2' and 'IRF1' below repeat entries from the line
        # above; kept as-is so the saved list is unchanged -- confirm whether
        # the duplicates are intended.
        'BCL11B', 'BACH2', 'DOK2', 'FOXP1', 'IRF1', 'ISG20',
        'NDFIP1', 'SH2D2A', 'SIRPG', "KLRC4"
    ],
    'metabolism_and_signaling_pathways': [
        'ACKR3', 'EFHD2', 'FOXO1', 'KCNN3', 'OXCT2', 'PASK', 'QPCT', 'VDAC2',

        'MAP3K8', 'FOXP1', 'FYN', 'GNG2', 'ARF6',
        'LDHA', 'RGS9', 'G6PD',
        'ABHD17A', 'ADSL',
        'APMAP', "AREL1"

    ],
    'cytoskeletal_dynamics_and_migration': [
        'ARHGAP10', 'ARHGEF5', 'MARCKS', 'NEDD9', 'RHOC', 'S100A4', 'WASF3',

        'ITGAL', 'ITGB2', 'LCP1', 'MSN', 'CXCR3', 'VASP',
        'ABLIM1', 'ARPC2', 'ARPC5L', "STMN1"
    ],
    'immune_checkpoint_molecules_and_regulation': [
        'CTLA4', 'IL6R', 'TGFBR1', 'TNFRSF9',
        'CD160', 'CD300A', 'LAG3',
        'TIGIT', 'NT5E', 'TGFBR2', 'BTLA', "KIR3DL1"
    ]
}

# The gene-set directory is not created anywhere else in the visible cells, so
# make sure it exists before writing (no-op when it is already there).
os.makedirs(geneset_path, exist_ok=True)

# Save to a JSON file (pretty-printed)
with open(f"{geneset_path}/relevant_genes.json", "w") as f:
    json.dump(relevant_genes, f, indent=4)