# Looking at the L/R interactinos enriched in particular pairs of cell types

CellphoneDB

Validation cohort
18.02.2021

This code uses DEGs computed for each cluster to identify relevant L/R interactions between the cells in a microenviroment

Code from Luz rewritten in python

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

In [2]:
# Define cutoff variables
filter_int_user_curated = True # Use only user_curated interactions?
per_cutoff = 0.1 # min % of cells in the cluster required  with expression > 0 for the gene
pval_cutoff = 0.05 # max adjusted p-value requeired to consider a gene as DEG

# as of 18.03.2021, not using the logFC cutoff at all! --> 0 here
logFC_cutoff = 0 # min logFC to consider a gene as DEG

## Load cellphone database


In [3]:
# Gene names
genes_cpDB = pd.read_csv('/home/jovyan/notebooks/Vento_Lab/CVID/202009_new_analysis_revision/CITE_all_samples_analysis/CVID/scTranscriptomics_CITE/cellphonedb_analysis/hsa_uniprot.txt', 
                         sep = '\t')

In [4]:
genes_cpDB

Unnamed: 0,uniprot,Entry,gene_name
0,P01611,KVD12_HUMAN,IGKV1D-12
1,P01615,KVD28_HUMAN,IGKV2D-28
2,Q15334,L2GL1_HUMAN,LLGL1
3,Q6ZP29,LAAT1_HUMAN,PQLC2
4,Q9GZZ8,LACRT_HUMAN,LACRT
...,...,...,...
20311,Q9H900,ZWILC_HUMAN,ZWILCH
20312,P98169,ZXDB_HUMAN,ZXDB
20313,Q2QGD7,ZXDC_HUMAN,ZXDC
20314,Q15942,ZYX_HUMAN,ZYX


In [5]:
# Complexes members
com_cpDB = pd.read_csv('/home/jovyan/notebooks/Vento_Lab/CVID/202009_new_analysis_revision/CITE_all_samples_analysis/CVID/scTranscriptomics_CITE/cellphonedb_analysis/database_20210218/complex_generated.csv')
com_cpDB['complex_name'] = ['complex:' + complex_name for complex_name in com_cpDB['complex_name']]

In [6]:
com_cpDB

Unnamed: 0,complex_name,uniprot_1,uniprot_2,uniprot_3,uniprot_4,transmembrane,peripheral,secreted,secreted_desc,secreted_highlight,receptor,receptor_desc,integrin,other,other_desc,pdb_id,pdb_structure,stoichiometry,comments_complex
0,complex:contactin complex II,Q12860,Q92823,,,True,False,False,,False,False,,False,False,,,FALSE,,NRCAM bind in cis and in trans to contactin-1
1,complex:IL6 receptor,P08887,P40189,,,True,False,False,,False,True,Cytokine receptor IL6 family,False,False,,1p9m,binding,IL6;IL6;IL6R;IL6R;IL6ST;IL6ST,Signal activation necessitate an association w...
2,complex:AT8B4CC50B complex,Q8TF62,Q3MIR4,,,True,False,False,,False,False,,False,False,,,FALSE,,Interacts with beta subunits TMEM30A and TMEM30B
3,complex:KCNV1KCNB2 complex,Q6PIU1,Q92953,,,True,False,False,,False,False,,False,False,,,FALSE,,Has to be associated with another potassium ch...
4,complex:LRFN3LRFN5 complex,Q9BTN0,Q96NI6,,,True,False,False,,False,False,,False,False,,,FALSE,,"Can form heteromeric complexes with LRFN1, LRF..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614,complex:FZD8_LRP6,O75581,Q9H461,,,True,False,False,,False,False,,False,False,,,False,,
615,complex:FZD9_LRP5,O75197,O00144,,,True,False,False,,False,False,,False,False,,,False,,
616,complex:FZD9_LRP6,O75581,O00144,,,True,False,False,,False,False,,False,False,,,False,,
617,complex:FZD10_LRP5,O75197,Q9ULW2,,,True,False,False,,False,False,,False,False,,,False,,


In [7]:
#'complex:FZD8_LRP6'[8:]

In [8]:
#com_cpDB[(com_cpDB['complex_name'] == 'complex:IL6 receptor')].loc[:, ['uniprot_1', 'uniprot_2', 'uniprot_3', 'uniprot_4']].values

In [9]:
#genes_cpDB[genes_cpDB['uniprot'].isin(['P08887','P40189'])]['gene_name']

In [10]:
# Generate complexes2gene symbol dictionary
Com2Gene = {}


for complex_name in np.unique(com_cpDB['complex_name']):
    #print(complex_name)
    
    # getting rid of 'complex:' in the beginning
    #complex_name = complex_name[8:]
    #print(complex_name)
    
    curr_complex_proteins = list(com_cpDB[(com_cpDB['complex_name'] == complex_name)].loc[:, ['uniprot_1', 'uniprot_2', 'uniprot_3', 'uniprot_4']].values.tolist())
    # flatten list 
    curr_complex_proteins = [item for sublist in curr_complex_proteins for item in sublist]
    #print(curr_complex_proteins)
    # remove nans
    curr_complex_proteins = [x for x in curr_complex_proteins if str(x) != 'nan']
    #print('after removing nans:', curr_complex_proteins)
    
    # getting corresponding gene names from the gene table
    Com2Gene[complex_name] = list(genes_cpDB[genes_cpDB['uniprot'].isin(curr_complex_proteins)]['gene_name'])
    

In [11]:
list(Com2Gene.items())[:10]

[('complex:12oxoLeukotrieneB4_byPTGR1', ['PTGR1']),
 ('complex:17aHydroxyprogesterone_byCYP17A1', ['CYP17A1']),
 ('complex:22Hydroxycholesterol_byCYP11A1', ['CYP11A1']),
 ('complex:22Hydroxycholesterol_byCYP3A4', ['CYP3A4']),
 ('complex:2arachidonoylglycerol_byDAGLA', ['DAGLA']),
 ('complex:2arachidonoylglycerol_byDAGLB', ['DAGLB']),
 ('complex:5-alpha-Dihydroprogesterone_byDHRS9', ['DHRS9']),
 ('complex:5HT3C5HT3A complex', ['HTR3A', 'HTR3C']),
 ('complex:5HT3C5HT3A_complex', ['HTR3A', 'HTR3C']),
 ('complex:5HT3D receptor', ['HTR3A', 'HTR3D'])]

In [12]:
# Load interactions from cellphoneDB/out/means.txt output file                   
int_cpDB = pd.read_csv('/home/jovyan/notebooks/Vento_Lab/CVID/202009_new_analysis_revision/CITE_all_samples_analysis/CVID/scTranscriptomics_CITE/cellphonedb_analysis/out/means.txt',
                      sep='\t')

# disregarding pairwise average expression values
int_cpDB = int_cpDB.loc[:, list(int_cpDB.columns)[:11]]
int_cpDB

Unnamed: 0,id_cp_interaction,interacting_pair,partner_a,partner_b,gene_a,gene_b,secreted,receptor_a,receptor_b,annotation_strategy,is_integrin
0,CPI-SS0A7B487D4,KLRG2_WNT11,simple:A4D1S0,simple:O96014,KLRG2,WNT11,True,True,False,InnateDB-All,False
1,CPI-CS0481C1F9A,FZD1_LRP5_WNT11,complex:FZD1_LRP5,simple:O96014,,WNT11,True,False,False,user_curated,False
2,CPI-CS0F29C6285,FZD1_LRP6_WNT11,complex:FZD1_LRP6,simple:O96014,,WNT11,True,False,False,user_curated,False
3,CPI-CS0372FC240,FZD2_LRP5_WNT11,complex:FZD2_LRP5,simple:O96014,,WNT11,True,False,False,user_curated,False
4,CPI-CS031A2034E,FZD2_LRP6_WNT11,complex:FZD2_LRP6,simple:O96014,,WNT11,True,False,False,user_curated,False
...,...,...,...,...,...,...,...,...,...,...,...
1351,CPI-SC047CEF2DD,CRLF2_TSLPR,simple:Q9HC73,complex:TSLPR,CRLF2,,True,True,True,user_curated,False
1352,CPI-SS04C672963,ESAM_ESAM,simple:Q96AP7,simple:Q96AP7,ESAM,ESAM,False,False,False,user_curated,False
1353,CPI-SC001AFA16D,NRTN_RET receptor 2,simple:Q99748,complex:RET receptor 2,NRTN,,True,False,True,curated,False
1354,CPI-SC060C69786,NRTN_RET_receptor_2,simple:Q99748,complex:RET_receptor_2,NRTN,,True,False,True,user_curated,False


In [13]:
np.unique(int_cpDB['annotation_strategy'], return_counts=True)

(array(['I2D', 'I2D,IMEx,InnateDB,InnateDB-All,IntAct,MINT',
        'I2D,IMEx,InnateDB,IntAct', 'I2D,IMEx,InnateDB-All,IntAct',
        'I2D,IMEx,InnateDB-All,IntAct,MINT', 'I2D,IMEx,InnateDB-All,MINT',
        'I2D,InnateDB', 'I2D,InnateDB-All', 'I2D,InnateDB-All,IntAct',
        'I2D,IntAct', 'IMEx', 'IMEx,InnateDB-All,IntAct',
        'IMEx,InnateDB-All,IntAct,MatrixDB', 'IMEx,InnateDB-All,MINT',
        'IMEx,InnateDB-All,UniProt', 'IMEx,IntAct', 'IMEx,MINT',
        'InnateDB', 'InnateDB-All', 'InnateDB-All,MINT', 'curated',
        'guidetopharmacology.org', 'user_curated'], dtype=object),
 array([ 42,   1,   1,   4,   1,   1,   2,  21,   1,   2,   2,   4,   1,
          6,   1,  26,   2,   2,  67,   2, 279,  63, 825]))

In [14]:
# MANDATORY: remove "curated" because we have cleaned and renamed them (this is a long story, just do it)
# these interactions have either been renamed or excluded so best not to use them
int_cpDB = int_cpDB[int_cpDB['annotation_strategy'] != 'curated']

In [15]:
# OPTIONAL: Use only user_curated interactions?
if filter_int_user_curated:
    int_cpDB = int_cpDB[int_cpDB['annotation_strategy'] == 'user_curated']

In [16]:
int_cpDB

Unnamed: 0,id_cp_interaction,interacting_pair,partner_a,partner_b,gene_a,gene_b,secreted,receptor_a,receptor_b,annotation_strategy,is_integrin
1,CPI-CS0481C1F9A,FZD1_LRP5_WNT11,complex:FZD1_LRP5,simple:O96014,,WNT11,True,False,False,user_curated,False
2,CPI-CS0F29C6285,FZD1_LRP6_WNT11,complex:FZD1_LRP6,simple:O96014,,WNT11,True,False,False,user_curated,False
3,CPI-CS0372FC240,FZD2_LRP5_WNT11,complex:FZD2_LRP5,simple:O96014,,WNT11,True,False,False,user_curated,False
4,CPI-CS031A2034E,FZD2_LRP6_WNT11,complex:FZD2_LRP6,simple:O96014,,WNT11,True,False,False,user_curated,False
5,CPI-CS02643715E,FZD3_LRP5_WNT11,complex:FZD3_LRP5,simple:O96014,,WNT11,True,False,False,user_curated,False
...,...,...,...,...,...,...,...,...,...,...,...
1350,CPI-SC090068F7B,TSLP_TSLPR,simple:Q969D9,complex:TSLPR,TSLP,,True,False,True,user_curated,False
1351,CPI-SC047CEF2DD,CRLF2_TSLPR,simple:Q9HC73,complex:TSLPR,CRLF2,,True,True,True,user_curated,False
1352,CPI-SS04C672963,ESAM_ESAM,simple:Q96AP7,simple:Q96AP7,ESAM,ESAM,False,False,False,user_curated,False
1354,CPI-SC060C69786,NRTN_RET_receptor_2,simple:Q99748,complex:RET_receptor_2,NRTN,,True,False,True,user_curated,False


In [17]:
int_cpDB.loc[1,:]

id_cp_interaction        CPI-CS0481C1F9A
interacting_pair         FZD1_LRP5_WNT11
partner_a              complex:FZD1_LRP5
partner_b                  simple:O96014
gene_a                               NaN
gene_b                             WNT11
secreted                            True
receptor_a                         False
receptor_b                         False
annotation_strategy         user_curated
is_integrin                        False
Name: 1, dtype: object

In [18]:
'complex:FZD1_LRP5' in list(Com2Gene.keys())

True

In [19]:
list(Com2Gene.keys())[:5]

['complex:12oxoLeukotrieneB4_byPTGR1',
 'complex:17aHydroxyprogesterone_byCYP17A1',
 'complex:22Hydroxycholesterol_byCYP11A1',
 'complex:22Hydroxycholesterol_byCYP3A4',
 'complex:2arachidonoylglycerol_byDAGLA']

In [20]:
'complex:FZD1_LRP5' in list(com_cpDB['complex_name'])

True

In [21]:
# Generate Int2Gene dictionary
Int2Gene = {}

for i in int_cpDB.index:
    #print('row number:', i)
    curr_df_row = int_cpDB.loc[i,:]
    #print('row:', curr_df_row)
    
    # if partnerA is complex (aka np.isnan(table['gene_a']) == True), then retrieve members from dictionary
    # complex name will be in table['partner_a']
    if str(curr_df_row['gene_a']) == 'nan':
        partner_A = Com2Gene[curr_df_row['partner_a']]
    else:
        # if it's not a complex, then get the partner A from table['gene_a']
        partner_A = [curr_df_row['gene_a']]
        
    # if partnerB is complex, then retrieve members from dictionary
    # complex name will be in table['partner_b']
    if str(curr_df_row['gene_b']) == 'nan':
        partner_B = Com2Gene[curr_df_row['partner_b']]
    else:
        # if it's not a complex, then get the partner B from table['gene_b']
        partner_B = [curr_df_row['gene_b']]
        
    interaction_id = curr_df_row['interacting_pair']

    Int2Gene[interaction_id] = {'partner_a': partner_A,
                  'partner_b': partner_B}

In [22]:
list(Int2Gene.items())[:10]

[('FZD1_LRP5_WNT11', {'partner_a': ['LRP5', 'FZD1'], 'partner_b': ['WNT11']}),
 ('FZD1_LRP6_WNT11', {'partner_a': ['FZD1', 'LRP6'], 'partner_b': ['WNT11']}),
 ('FZD2_LRP5_WNT11', {'partner_a': ['LRP5', 'FZD2'], 'partner_b': ['WNT11']}),
 ('FZD2_LRP6_WNT11', {'partner_a': ['FZD2', 'LRP6'], 'partner_b': ['WNT11']}),
 ('FZD3_LRP5_WNT11', {'partner_a': ['LRP5', 'FZD3'], 'partner_b': ['WNT11']}),
 ('FZD3_LRP6_WNT11', {'partner_a': ['FZD3', 'LRP6'], 'partner_b': ['WNT11']}),
 ('FZD4_LRP5_WNT11', {'partner_a': ['LRP5', 'FZD4'], 'partner_b': ['WNT11']}),
 ('FZD4_LRP6_WNT11', {'partner_a': ['FZD4', 'LRP6'], 'partner_b': ['WNT11']}),
 ('FZD5_LRP5_WNT11', {'partner_a': ['LRP5', 'FZD5'], 'partner_b': ['WNT11']}),
 ('FZD5_LRP6_WNT11', {'partner_a': ['FZD5', 'LRP6'], 'partner_b': ['WNT11']})]

## Load cluster's gene percentage expression

Prepared in S2 notebook

In [23]:
# Load percentage expression info
# Matrix of genes (rows) per celltypes (columns) containing the proportion [0-1] of cells 
# in a celltype expressing the gene
path_Exp = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202009_CVID_revision/PercentExpressed_for_cellphone_20210218.csv'

# Load matrix
Per_df = pd.read_csv(path_Exp, index_col=0) 

# Dictionary of celltypes2expressed genes
genes_expr_per_cell_type = {} 

for ct in Per_df.columns:
    print(ct)
    curr_table = pd.DataFrame(Per_df.loc[:, ct])
    
    # only leave in genes expressed in this cell type according to a threshold declared in the beginnning of this notebook
    mask = (curr_table[ct] > per_cutoff)
    genes_expr_per_cell_type[ct] = list(curr_table[ct][mask].index)

B_cells_memory
B_cells_memory_activated
B_cells_naive
B_cells_naive_activated
MAIT_cells
Macrophages
Monocytes_classical
Monocytes_intermediate
Monocytes_non-classical
NK_CD16_bright
NK_CD16_bright_activated
NK_CD56_bright
NK_CD56_bright_activated
Plasma_cells
Precursor_cells
T4_activated
T4_memory
T4_naive
T8_activated
T8_naive
TCM_CD8+
TEM_CD8+
TMRA_CD8+
T_gd
T_regs
cDC1
cDC2
iNKT_cells
pDC


In [24]:
Per_df

Unnamed: 0_level_0,B_cells_memory,B_cells_memory_activated,B_cells_naive,B_cells_naive_activated,MAIT_cells,Macrophages,Monocytes_classical,Monocytes_intermediate,Monocytes_non-classical,NK_CD16_bright,...,T8_naive,TCM_CD8+,TEM_CD8+,TMRA_CD8+,T_gd,T_regs,cDC1,cDC2,iNKT_cells,pDC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RP11-34P13.7,0.000000,0.000378,0.000590,0.000000,0.000000,0.000526,0.000183,0.000238,0.000000,0.000000,...,0.000321,0.000305,0.000000,0.000000,0.000721,0.000139,0.000847,0.000000,0.000177,0.000000
FO538757.2,0.118336,0.242155,0.086777,0.072884,0.071611,0.256835,0.220497,0.168689,0.205003,0.065168,...,0.068678,0.102259,0.056984,0.076870,0.075703,0.166805,0.197018,0.099831,0.091022,0.067669
AP006222.2,0.001951,0.003403,0.000590,0.000784,0.000853,0.006046,0.004202,0.004759,0.002441,0.000378,...,0.000642,0.001221,0.000485,0.000693,0.000721,0.001249,0.002710,0.000000,0.000532,0.000000
RP4-669L17.10,0.001951,0.002836,0.002361,0.000784,0.000853,0.001577,0.000365,0.000714,0.002441,0.000189,...,0.000963,0.000305,0.000485,0.002078,0.002163,0.001804,0.001355,0.000000,0.001065,0.000000
RP5-857K21.4,0.000650,0.000000,0.000000,0.000000,0.000000,0.000000,0.000548,0.000000,0.000000,0.000000,...,0.000321,0.000305,0.000000,0.000000,0.000000,0.000278,0.000339,0.000000,0.000177,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTD-2541M15.3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
THEGL,0.000000,0.000000,0.000000,0.000000,0.000000,0.000263,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000169,0.000000,0.000000,0.000000
KIAA1644,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000610,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000508,0.006768,0.000000,0.000000
RP11-132A1.3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000139,0.000000,0.000000,0.000355,0.000000


In [25]:
len(genes_expr_per_cell_type['B_cells_memory'])

4038

## Load DE expression info

In [26]:
path_DE = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202009_CVID_revision/joint_DEGs_list_all_cell_types_for_cellphone_20210218.csv'
DE_df = pd.read_csv(path_DE, #row.names = 0
                )

DE_df


Unnamed: 0.1,Unnamed: 0,Gene,logFC,P.Value,adj.P.Val,AveExpr_cluster,AveExpr_rest,percentExpr_cluster,percentExpr_rest,cluster
0,0,CD73(Ecto-5'-nucleotidase),-0.972167,1.004392e-34,1.603813e-30,1.075824,2.047991,0.532946,0.735178,B_cells_memory
1,1,IgD,0.892972,2.104098e-32,1.679912e-28,2.001557,1.108585,0.783915,0.565217,B_cells_memory
2,2,CD32,0.858412,1.258348e-22,6.697768e-19,4.160845,3.302433,0.936047,0.849802,B_cells_memory
3,3,IGHG3,0.405874,3.555500e-22,1.419356e-18,0.599779,0.193905,0.371124,0.128458,B_cells_memory
4,4,IGHM,0.439509,9.379230e-17,2.995351e-13,1.090960,0.651451,0.611434,0.371542,B_cells_memory
...,...,...,...,...,...,...,...,...,...,...
465692,11554,ABCF3,-0.000026,9.993354e-01,9.996097e-01,0.024162,0.024187,0.014085,0.032258,pDC
465693,11555,DDX31,0.000032,9.993502e-01,9.996097e-01,0.038317,0.038285,0.028169,0.032258,pDC
465694,11556,CACUL1,0.000019,9.997470e-01,9.998807e-01,0.087572,0.087553,0.056338,0.080645,pDC
465695,11557,NUCB1,-0.000024,9.998530e-01,9.998807e-01,0.490545,0.490569,0.323944,0.338710,pDC


In [27]:
'PTPRC' in list(DE_df['Gene'])

True

In [28]:
logFC_cutoff

0

In [29]:
pval_cutoff

0.05

In [30]:
per_cutoff

0.1

In [31]:
# filter the DE table according to cutoffs declared in the beginning of the notebook
# separately for upreg and downreg genes

DE_df = DE_df[(abs(DE_df['logFC']) > logFC_cutoff)
             & (DE_df['adj.P.Val'] < pval_cutoff)
             & (DE_df['percentExpr_cluster'] > per_cutoff)]

DE_df_upreg = DE_df[DE_df['logFC'] > 0]
DE_df_downreg = DE_df[DE_df['logFC'] < 0]

In [32]:
# without logFC filtering at all
print(DE_df.shape)
print(DE_df_upreg.shape)
print(DE_df_downreg.shape)

(11027, 10)
(8299, 10)
(2728, 10)


In [33]:
np.unique(DE_df_upreg['cluster'])

array(['B_cells_memory', 'B_cells_memory_activated', 'B_cells_naive',
       'B_cells_naive_activated', 'MAIT_cells', 'Macrophages',
       'Monocytes_classical', 'Monocytes_intermediate',
       'Monocytes_non-classical', 'NK_CD16_bright',
       'NK_CD16_bright_activated', 'NK_CD56_bright_activated',
       'Plasma_cells', 'Precursor_cells', 'T4_activated', 'T4_memory',
       'T4_naive', 'T8_activated', 'T8_naive', 'TCM_CD8+', 'TEM_CD8+',
       'TMRA_CD8+', 'T_gd', 'T_regs', 'cDC1', 'cDC2', 'iNKT_cells'],
      dtype=object)

In [34]:
np.unique(DE_df_downreg['cluster'])

array(['B_cells_memory', 'B_cells_memory_activated', 'B_cells_naive',
       'B_cells_naive_activated', 'MAIT_cells', 'Macrophages',
       'Monocytes_classical', 'Monocytes_intermediate',
       'Monocytes_non-classical', 'NK_CD16_bright',
       'NK_CD16_bright_activated', 'NK_CD56_bright_activated',
       'Plasma_cells', 'T4_activated', 'T4_memory', 'T4_naive',
       'T8_activated', 'T8_naive', 'TCM_CD8+', 'TEM_CD8+', 'TMRA_CD8+',
       'T_gd', 'T_regs', 'cDC1', 'iNKT_cells'], dtype=object)

In [35]:
# Build dictionary clusters2DE_genes
# separately for upreg and downreg genes

clusters_upreg = list(np.unique(DE_df_upreg['cluster']))
clusters_downreg = list(np.unique(DE_df_downreg['cluster']))

is_DE_upreg = {}
is_DE_downreg = {}

for cluster in clusters_upreg:
    is_DE_upreg[cluster] = list(DE_df_upreg[DE_df_upreg['cluster'] == cluster]['Gene'])
    
for cluster in clusters_downreg:
    is_DE_downreg[cluster] = list(DE_df_downreg[DE_df_downreg['cluster'] == cluster]['Gene'])

In [36]:
len(is_DE_upreg['B_cells_memory_activated'])

193

In [37]:
'CD40' in is_DE_downreg['B_cells_memory_activated']

True

In [38]:
len(is_DE_downreg['B_cells_memory'])

15

In [39]:
for ct in list(is_DE_upreg.keys()):
    print(ct)
    print(len(is_DE_upreg[ct]), '\n')

B_cells_memory
77 

B_cells_memory_activated
193 

B_cells_naive
22 

B_cells_naive_activated
9 

MAIT_cells
6 

Macrophages
435 

Monocytes_classical
399 

Monocytes_intermediate
126 

Monocytes_non-classical
93 

NK_CD16_bright
89 

NK_CD16_bright_activated
21 

NK_CD56_bright_activated
1 

Plasma_cells
10 

Precursor_cells
1 

T4_activated
1144 

T4_memory
227 

T4_naive
67 

T8_activated
430 

T8_naive
28 

TCM_CD8+
226 

TEM_CD8+
57 

TMRA_CD8+
49 

T_gd
11 

T_regs
1226 

cDC1
280 

cDC2
3 

iNKT_cells
3069 



In [40]:
for ct in list(is_DE_downreg.keys()):
    print(ct)
    print(len(is_DE_downreg[ct]), '\n')

B_cells_memory
15 

B_cells_memory_activated
529 

B_cells_naive
8 

B_cells_naive_activated
8 

MAIT_cells
7 

Macrophages
131 

Monocytes_classical
113 

Monocytes_intermediate
140 

Monocytes_non-classical
85 

NK_CD16_bright
157 

NK_CD16_bright_activated
9 

NK_CD56_bright_activated
1 

Plasma_cells
5 

T4_activated
162 

T4_memory
19 

T4_naive
17 

T8_activated
867 

T8_naive
7 

TCM_CD8+
15 

TEM_CD8+
16 

TMRA_CD8+
6 

T_gd
9 

T_regs
34 

cDC1
227 

iNKT_cells
141 



In [41]:
is_DE_upreg['B_cells_naive_activated']

['CD32',
 'MT-ND6',
 'IgD',
 'RPS4Y1',
 'CD82-1',
 'MT-CO3',
 'CD45RA',
 'CD1c',
 'CD19-1']

In [42]:
is_DE_downreg['B_cells_naive_activated']

["CD73(Ecto-5'-nucleotidase)",
 'HLA-DQB1',
 'RPS10',
 'ARID5B',
 'CD38-1',
 'HLA-B',
 'HLA-DQA1',
 'DUSP2']

## Define cell pairs to test

In [43]:
len(list(genes_expr_per_cell_type.keys()))

29

In [44]:
list(genes_expr_per_cell_type.keys())

['B_cells_memory',
 'B_cells_memory_activated',
 'B_cells_naive',
 'B_cells_naive_activated',
 'MAIT_cells',
 'Macrophages',
 'Monocytes_classical',
 'Monocytes_intermediate',
 'Monocytes_non-classical',
 'NK_CD16_bright',
 'NK_CD16_bright_activated',
 'NK_CD56_bright',
 'NK_CD56_bright_activated',
 'Plasma_cells',
 'Precursor_cells',
 'T4_activated',
 'T4_memory',
 'T4_naive',
 'T8_activated',
 'T8_naive',
 'TCM_CD8+',
 'TEM_CD8+',
 'TMRA_CD8+',
 'T_gd',
 'T_regs',
 'cDC1',
 'cDC2',
 'iNKT_cells',
 'pDC']

In [45]:
# Get all pairwise cluster combinations: A--B, B--A
# Consider the reverse interaction too: A--B but B--A as well
pairwise_cluster_combinations = list(itertools.permutations(list(genes_expr_per_cell_type.keys()), 2))
len(pairwise_cluster_combinations)


812

In [46]:
pairwise_cluster_combinations[:5]

[('B_cells_memory', 'B_cells_memory_activated'),
 ('B_cells_memory', 'B_cells_naive'),
 ('B_cells_memory', 'B_cells_naive_activated'),
 ('B_cells_memory', 'MAIT_cells'),
 ('B_cells_memory', 'Macrophages')]

In [47]:
# add self interactions
self_inter_combinations = [(ct, ct) for ct in list(genes_expr_per_cell_type.keys())]
pairwise_cluster_combinations = pairwise_cluster_combinations + self_inter_combinations
len(pairwise_cluster_combinations)


841

In [48]:
len(is_DE_upreg.keys())

27

In [49]:
len(is_DE_downreg.keys())

25

In [53]:
# We only want to test pairs including at least one celltype in the DE folder

pairwise_cluster_combinations_upreg = [elem for elem in pairwise_cluster_combinations if elem[0] in list(is_DE_upreg.keys()) or elem[1] in list(is_DE_upreg.keys())]
pairwise_cluster_combinations_downreg = [elem for elem in pairwise_cluster_combinations if elem[0] in list(is_DE_downreg.keys()) or elem[1] in list(is_DE_downreg.keys())]

In [54]:
len(pairwise_cluster_combinations_upreg)

837

In [55]:
len(pairwise_cluster_combinations_downreg)

825

In [56]:
# Make cluster pair labels: celltypeA--celltypeB             
cluster_combinations_labels_upreg = [comb[0] + '---' + comb[1] for comb in pairwise_cluster_combinations_upreg]
cluster_combinations_labels_downreg = [comb[0] + '---' + comb[1] for comb in pairwise_cluster_combinations_downreg]

In [57]:
len(cluster_combinations_labels_upreg)

837

In [58]:
len(cluster_combinations_labels_downreg)

825

# Retrieve CellphoneDB L/R interactions

A relevant interaction shoudl have

1. All their participants expressed in the corresponding celltypes
2. At least one participant is a DEG

In [59]:
len(Int2Gene.keys())

825

In [60]:
# Make scaffold matrix: L/R interactions (rows) x celltype pairs (columns)
df_Exrp_LR_in_celltype_pairs_upreg = pd.DataFrame(index = list(Int2Gene.keys()),
                                          columns = cluster_combinations_labels_upreg,
                                          data = np.zeros( (len(list(Int2Gene.keys())), 
                                                            len(cluster_combinations_labels_upreg)) )
                                         )

df_Exrp_LR_in_celltype_pairs_downreg = pd.DataFrame(index = list(Int2Gene.keys()),
                                          columns = cluster_combinations_labels_downreg,
                                          data = np.zeros( (len(list(Int2Gene.keys())), 
                                                            len(cluster_combinations_labels_downreg)) )
                                         )

In [61]:
df_Exrp_LR_in_celltype_pairs_upreg

Unnamed: 0,B_cells_memory---B_cells_memory_activated,B_cells_memory---B_cells_naive,B_cells_memory---B_cells_naive_activated,B_cells_memory---MAIT_cells,B_cells_memory---Macrophages,B_cells_memory---Monocytes_classical,B_cells_memory---Monocytes_intermediate,B_cells_memory---Monocytes_non-classical,B_cells_memory---NK_CD16_bright,B_cells_memory---NK_CD16_bright_activated,...,T8_activated---T8_activated,T8_naive---T8_naive,TCM_CD8+---TCM_CD8+,TEM_CD8+---TEM_CD8+,TMRA_CD8+---TMRA_CD8+,T_gd---T_gd,T_regs---T_regs,cDC1---cDC1,cDC2---cDC2,iNKT_cells---iNKT_cells
FZD1_LRP5_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FZD1_LRP6_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FZD2_LRP5_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FZD2_LRP6_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FZD3_LRP5_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSLP_TSLPR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CRLF2_TSLPR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ESAM_ESAM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NRTN_RET_receptor_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
df_Exrp_LR_in_celltype_pairs_downreg

Unnamed: 0,B_cells_memory---B_cells_memory_activated,B_cells_memory---B_cells_naive,B_cells_memory---B_cells_naive_activated,B_cells_memory---MAIT_cells,B_cells_memory---Macrophages,B_cells_memory---Monocytes_classical,B_cells_memory---Monocytes_intermediate,B_cells_memory---Monocytes_non-classical,B_cells_memory---NK_CD16_bright,B_cells_memory---NK_CD16_bright_activated,...,T4_naive---T4_naive,T8_activated---T8_activated,T8_naive---T8_naive,TCM_CD8+---TCM_CD8+,TEM_CD8+---TEM_CD8+,TMRA_CD8+---TMRA_CD8+,T_gd---T_gd,T_regs---T_regs,cDC1---cDC1,iNKT_cells---iNKT_cells
FZD1_LRP5_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FZD1_LRP6_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FZD2_LRP5_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FZD2_LRP6_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FZD3_LRP5_WNT11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSLP_TSLPR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CRLF2_TSLPR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ESAM_ESAM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NRTN_RET_receptor_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
%%time

# LR_pairs_celltype_pairs_df will be a binary matrix with 1 indicating that all the genes in the interaction are expressed in the corresponding celltype
# So, fill 1 if all genes are expressed in all clusters

for interaction in list(df_Exrp_LR_in_celltype_pairs_upreg.index):
    #print('interaction', interaction, list(df_Exrp_LR_in_celltype_pairs_upreg.index).index(interaction)+1, 
    #      'out of', len(list(df_Exrp_LR_in_celltype_pairs_upreg.index)))
    for ct_pair in list(df_Exrp_LR_in_celltype_pairs_upreg.columns):
        #print(ct_pair)
        
        ct_A = ct_pair.split('---')[0]
        ct_B = ct_pair.split('---')[1]
        
        partner_A_genes = Int2Gene[interaction]['partner_a']
        partner_B_genes = Int2Gene[interaction]['partner_b']
        
        # are all partner_A genes expressed in celltype_A and are all partner_B genes expressed in celltype_B?
        are_all_expressed = all(elem in genes_expr_per_cell_type[ct_A] for elem in partner_A_genes) & all(elem in genes_expr_per_cell_type[ct_B] for elem in partner_B_genes)
        
        if are_all_expressed:
            df_Exrp_LR_in_celltype_pairs_upreg.loc[interaction, ct_pair] = 1


CPU times: user 1min 20s, sys: 0 ns, total: 1min 20s
Wall time: 1min 20s


In [64]:
%%time

# LR_pairs_celltype_pairs_df will be a binary matrix with 1 indicating that all the genes in the interaction are expressed in the corresponding celltype
# So, fill 1 if all genes are expressed in all clusters

for interaction in list(df_Exrp_LR_in_celltype_pairs_downreg.index):
    #print('interaction', interaction, list(df_Exrp_LR_in_celltype_pairs_downreg.index).index(interaction)+1, 
    #      'out of', len(list(df_Exrp_LR_in_celltype_pairs_downreg.index)))
    for ct_pair in list(df_Exrp_LR_in_celltype_pairs_downreg.columns):
        #print(ct_pair)
        
        ct_A = ct_pair.split('---')[0]
        ct_B = ct_pair.split('---')[1]
        
        partner_A_genes = Int2Gene[interaction]['partner_a']
        partner_B_genes = Int2Gene[interaction]['partner_b']
        
        # are all partner_A genes expressed in celltype_A and are all partner_B genes expressed in celltype_B?
        are_all_expressed = all(elem in genes_expr_per_cell_type[ct_A] for elem in partner_A_genes) & all(elem in genes_expr_per_cell_type[ct_B] for elem in partner_B_genes)
        
        if are_all_expressed:
            df_Exrp_LR_in_celltype_pairs_downreg.loc[interaction, ct_pair] = 1


CPU times: user 1min 18s, sys: 0 ns, total: 1min 18s
Wall time: 1min 18s


In [65]:
np.unique(df_Exrp_LR_in_celltype_pairs_upreg.values, return_counts=True)

(array([0., 1.]), array([677891,  12634]))

In [66]:
np.unique(df_Exrp_LR_in_celltype_pairs_downreg.values, return_counts=True)

(array([0., 1.]), array([668150,  12475]))

In [67]:
np.unique(df_Exrp_LR_in_celltype_pairs_upreg.sum(axis=0))

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
       40.])

In [68]:
np.unique(df_Exrp_LR_in_celltype_pairs_downreg.sum(axis=0))

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
       40.])

In [69]:
df_Exrp_LR_in_celltype_pairs_upreg.shape

(825, 837)

In [70]:
df_Exrp_LR_in_celltype_pairs_downreg.shape

(825, 825)

In [71]:
# keep celltype pairs with at least one expressed interaction

df_Exrp_LR_in_celltype_pairs_upreg = df_Exrp_LR_in_celltype_pairs_upreg.loc[:, (df_Exrp_LR_in_celltype_pairs_upreg != 0).any(axis=0)]
print(df_Exrp_LR_in_celltype_pairs_upreg.shape)

df_Exrp_LR_in_celltype_pairs_downreg = df_Exrp_LR_in_celltype_pairs_downreg.loc[:, (df_Exrp_LR_in_celltype_pairs_downreg != 0).any(axis=0)]
print(df_Exrp_LR_in_celltype_pairs_downreg.shape)

(825, 837)
(825, 825)


In [72]:
# keep interactions with at least one celltype_pair
df_Exrp_LR_in_celltype_pairs_upreg = df_Exrp_LR_in_celltype_pairs_upreg.loc[(df_Exrp_LR_in_celltype_pairs_upreg != 0).any(axis=1), :]
print(df_Exrp_LR_in_celltype_pairs_upreg.shape)

df_Exrp_LR_in_celltype_pairs_downreg = df_Exrp_LR_in_celltype_pairs_downreg.loc[(df_Exrp_LR_in_celltype_pairs_downreg != 0).any(axis=1), :]
print(df_Exrp_LR_in_celltype_pairs_downreg.shape)

(131, 837)
(130, 825)


In [73]:
df_Exrp_LR_in_celltype_pairs_downreg.columns

Index(['B_cells_memory---B_cells_memory_activated',
       'B_cells_memory---B_cells_naive',
       'B_cells_memory---B_cells_naive_activated',
       'B_cells_memory---MAIT_cells', 'B_cells_memory---Macrophages',
       'B_cells_memory---Monocytes_classical',
       'B_cells_memory---Monocytes_intermediate',
       'B_cells_memory---Monocytes_non-classical',
       'B_cells_memory---NK_CD16_bright',
       'B_cells_memory---NK_CD16_bright_activated',
       ...
       'T4_naive---T4_naive', 'T8_activated---T8_activated',
       'T8_naive---T8_naive', 'TCM_CD8+---TCM_CD8+', 'TEM_CD8+---TEM_CD8+',
       'TMRA_CD8+---TMRA_CD8+', 'T_gd---T_gd', 'T_regs---T_regs',
       'cDC1---cDC1', 'iNKT_cells---iNKT_cells'],
      dtype='object', length=825)

In [74]:
df_Exrp_LR_in_celltype_pairs_downreg.loc[:,'B_cells_memory---B_cells_naive_activated']

PVR_CD96              0.0
PVR_CD226             0.0
PVR_TIGIT             0.0
NOTCH1_DLL3           0.0
NOTCH2_DLL3           0.0
                     ... 
CCR4_CCL17            0.0
LILRA4_BST2           0.0
CD47_SIRB1_complex    0.0
LAIR1_LILRB4          0.0
CLEC2B_KLRF1          0.0
Name: B_cells_memory---B_cells_naive_activated, Length: 130, dtype: float64

In [75]:
df_Exrp_LR_in_celltype_pairs_downreg

Unnamed: 0,B_cells_memory---B_cells_memory_activated,B_cells_memory---B_cells_naive,B_cells_memory---B_cells_naive_activated,B_cells_memory---MAIT_cells,B_cells_memory---Macrophages,B_cells_memory---Monocytes_classical,B_cells_memory---Monocytes_intermediate,B_cells_memory---Monocytes_non-classical,B_cells_memory---NK_CD16_bright,B_cells_memory---NK_CD16_bright_activated,...,T4_naive---T4_naive,T8_activated---T8_activated,T8_naive---T8_naive,TCM_CD8+---TCM_CD8+,TEM_CD8+---TEM_CD8+,TMRA_CD8+---TMRA_CD8+,T_gd---T_gd,T_regs---T_regs,cDC1---cDC1,iNKT_cells---iNKT_cells
PVR_CD96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PVR_CD226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PVR_TIGIT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOTCH1_DLL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOTCH2_DLL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCR4_CCL17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
LILRA4_BST2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD47_SIRB1_complex,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
LAIR1_LILRB4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [76]:
list(df_Exrp_LR_in_celltype_pairs_downreg.index) == list(df_Exrp_LR_in_celltype_pairs_upreg.index)

False

In [77]:
np.unique(df_Exrp_LR_in_celltype_pairs_upreg.values, return_counts=True)

(array([0., 1.]), array([97013, 12634]))

In [78]:
np.unique(df_Exrp_LR_in_celltype_pairs_downreg.values, return_counts=True)

(array([0., 1.]), array([94775, 12475]))

In [79]:
np.sum(df_Exrp_LR_in_celltype_pairs_upreg.values)

12634.0

In [80]:
np.sum(df_Exrp_LR_in_celltype_pairs_downreg.values)

12475.0

In [81]:
# Initialize DE matrix from LR_pairs_celltype_pairs_df and set all values to 0
# DE will be a binary matrix with 1 indicating that all the genes in the interaction are expressed in the corresponding celltype
# and one is a DE in the celltypes of interests
# Make scaffold matrix: L/R interactions (rows) x celltype pairs (columns)
df_Exrp_LR_in_celltype_pairs_upreg_DE = pd.DataFrame(index = list(df_Exrp_LR_in_celltype_pairs_upreg.index),
                                          columns = list(df_Exrp_LR_in_celltype_pairs_upreg.columns),
                                            data = np.zeros( (len(list(df_Exrp_LR_in_celltype_pairs_upreg.index)), 
                                                            len(list(df_Exrp_LR_in_celltype_pairs_upreg.columns))))
                                            )
df_Exrp_LR_in_celltype_pairs_downreg_DE = pd.DataFrame(index = list(df_Exrp_LR_in_celltype_pairs_downreg.index),
                                          columns = list(df_Exrp_LR_in_celltype_pairs_downreg.columns),
                                            data = np.zeros( (len(list(df_Exrp_LR_in_celltype_pairs_downreg.index)), 
                                                            len(list(df_Exrp_LR_in_celltype_pairs_downreg.columns))))
                                            )

In [88]:
# to find special cell types that do not have upreg DE genes

all_ct = list(genes_expr_per_cell_type.keys())

ct_with_upreg_DE = list(is_DE_upreg.keys())

ct_with_downreg_DE = list(is_DE_downreg.keys())


ct_with_no_upreg_DE_genes = set(all_ct) - set(ct_with_upreg_DE)
ct_with_no_downreg_DE_genes = set(all_ct) - set(ct_with_downreg_DE)


In [89]:
ct_with_no_upreg_DE_genes

{'NK_CD56_bright', 'pDC'}

In [90]:
ct_with_no_downreg_DE_genes

{'NK_CD56_bright', 'Precursor_cells', 'cDC2', 'pDC'}

In [92]:
%%time
# So, fill 1 if at least one gene in the interaction is DE
for interaction in list(df_Exrp_LR_in_celltype_pairs_upreg_DE.index):
    #print('interaction', interaction, list(df_Exrp_LR_in_celltype_pairs_upreg_DE.index).index(interaction)+1, 
    #      'out of', len(list(df_Exrp_LR_in_celltype_pairs_upreg_DE.index)))
    for ct_pair in list(df_Exrp_LR_in_celltype_pairs_upreg_DE.columns):
        #print(ct_pair)
        
        ct_A = ct_pair.split('---')[0]
        ct_B = ct_pair.split('---')[1]
        
        partner_A_genes = Int2Gene[interaction]['partner_a']
        partner_B_genes = Int2Gene[interaction]['partner_b']
        
        
        # so cell type T8_activated_2 will throw an error here because it doesn't have downreg DE genes, so needs a special if
        if ct_A in ct_with_no_upreg_DE_genes:
            # if ct_A doesn't have upreg DE, we only care about if partner_B_genes are upreg in this case
            are_any_DE = all(elem in is_DE_upreg[ct_B] for elem in partner_B_genes)
        elif ct_B in ct_with_no_upreg_DE_genes:
            # if ct_B doesn't have upreg DE, we only care about if partner_A_genes are upreg in this case
            are_any_DE = all(elem in is_DE_upreg[ct_A] for elem in partner_A_genes)
        else:
            # if both partners have upreg DE genes, proceed as normal
            # are partner_A genes DE in celltype_A OR are partner_B genes DE in celltype_B?
            are_any_DE = all(elem in is_DE_upreg[ct_A] for elem in partner_A_genes) | all(elem in is_DE_upreg[ct_B] for elem in partner_B_genes)
      
        if are_any_DE & (df_Exrp_LR_in_celltype_pairs_upreg.loc[interaction, ct_pair] == 1):
            df_Exrp_LR_in_celltype_pairs_upreg_DE.loc[interaction, ct_pair] = 1

CPU times: user 2.36 s, sys: 0 ns, total: 2.36 s
Wall time: 2.36 s


In [93]:
%%time
# So, fill 1 if at least one gene in the interaction is DE
for interaction in list(df_Exrp_LR_in_celltype_pairs_downreg_DE.index):
    #print('interaction', interaction, list(df_Exrp_LR_in_celltype_pairs_downreg_DE.index).index(interaction)+1, 
    #      'out of', len(list(df_Exrp_LR_in_celltype_pairs_downreg_DE.index)))
    for ct_pair in list(df_Exrp_LR_in_celltype_pairs_downreg_DE.columns):
        #print(ct_pair)
        
        ct_A = ct_pair.split('---')[0]
        ct_B = ct_pair.split('---')[1]
        
        partner_A_genes = Int2Gene[interaction]['partner_a']
        partner_B_genes = Int2Gene[interaction]['partner_b']
        
        # so cell type T8_activated_2 will throw an error here because it doesn't have downreg DE genes, so needs a special if
        if ct_A in ct_with_no_downreg_DE_genes:
            # if ct_A doesn't have downreg DE, we only care about if partner_B_genes are downreg in this case
            are_any_DE = all(elem in is_DE_downreg[ct_B] for elem in partner_B_genes)
        elif ct_B in ct_with_no_downreg_DE_genes:
            # if ct_B doesn't have downreg DE, we only care about if partner_A_genes are downreg in this case
            are_any_DE = all(elem in is_DE_downreg[ct_A] for elem in partner_A_genes)
        else:
            # if both partners have downreg DE genes, proceed as normal
            # are partner_A genes DE in celltype_A OR are partner_B genes DE in celltype_B?
            are_any_DE = all(elem in is_DE_downreg[ct_A] for elem in partner_A_genes) | all(elem in is_DE_downreg[ct_B] for elem in partner_B_genes)
   
        if are_any_DE & (df_Exrp_LR_in_celltype_pairs_downreg.loc[interaction, ct_pair] == 1):
            df_Exrp_LR_in_celltype_pairs_downreg_DE.loc[interaction, ct_pair] = 1

CPU times: user 1.5 s, sys: 54 µs, total: 1.5 s
Wall time: 1.51 s


In [94]:
df_Exrp_LR_in_celltype_pairs_upreg_DE

Unnamed: 0,B_cells_memory---B_cells_memory_activated,B_cells_memory---B_cells_naive,B_cells_memory---B_cells_naive_activated,B_cells_memory---MAIT_cells,B_cells_memory---Macrophages,B_cells_memory---Monocytes_classical,B_cells_memory---Monocytes_intermediate,B_cells_memory---Monocytes_non-classical,B_cells_memory---NK_CD16_bright,B_cells_memory---NK_CD16_bright_activated,...,T8_activated---T8_activated,T8_naive---T8_naive,TCM_CD8+---TCM_CD8+,TEM_CD8+---TEM_CD8+,TMRA_CD8+---TMRA_CD8+,T_gd---T_gd,T_regs---T_regs,cDC1---cDC1,cDC2---cDC2,iNKT_cells---iNKT_cells
PVR_CD96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PVR_CD226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PVR_TIGIT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOTCH1_DLL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOTCH2_DLL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LILRA4_BST2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD47_SIRB1_complex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LAIR1_LILRB4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CLEC2B_KLRF1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
df_Exrp_LR_in_celltype_pairs_downreg_DE

Unnamed: 0,B_cells_memory---B_cells_memory_activated,B_cells_memory---B_cells_naive,B_cells_memory---B_cells_naive_activated,B_cells_memory---MAIT_cells,B_cells_memory---Macrophages,B_cells_memory---Monocytes_classical,B_cells_memory---Monocytes_intermediate,B_cells_memory---Monocytes_non-classical,B_cells_memory---NK_CD16_bright,B_cells_memory---NK_CD16_bright_activated,...,T4_naive---T4_naive,T8_activated---T8_activated,T8_naive---T8_naive,TCM_CD8+---TCM_CD8+,TEM_CD8+---TEM_CD8+,TMRA_CD8+---TMRA_CD8+,T_gd---T_gd,T_regs---T_regs,cDC1---cDC1,iNKT_cells---iNKT_cells
PVR_CD96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PVR_CD226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PVR_TIGIT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOTCH1_DLL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOTCH2_DLL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCR4_CCL17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LILRA4_BST2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD47_SIRB1_complex,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
LAIR1_LILRB4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
# UPREG interactions

# keep interactions expressed in at least one celltype pair
df_Exrp_LR_in_celltype_pairs_upreg_DE = df_Exrp_LR_in_celltype_pairs_upreg_DE.loc[:, (df_Exrp_LR_in_celltype_pairs_upreg_DE != 0).any(axis=0)]
print('shape after filtering interactions')
print(df_Exrp_LR_in_celltype_pairs_upreg_DE.shape, '\n')

# keep celltypepairs with at least one interaction

df_Exrp_LR_in_celltype_pairs_upreg_DE = df_Exrp_LR_in_celltype_pairs_upreg_DE.loc[(df_Exrp_LR_in_celltype_pairs_upreg_DE != 0).any(axis=1),:]
print('shape after filtering cell type pairs')
print(df_Exrp_LR_in_celltype_pairs_upreg_DE.shape, '\n')

shape after filtering interactions
(131, 584) 

shape after filtering cell type pairs
(100, 584) 



In [97]:
# DOWNREG interactions

# keep interactions expressed in at least one celltype pair
df_Exrp_LR_in_celltype_pairs_downreg_DE = df_Exrp_LR_in_celltype_pairs_downreg_DE.loc[:, (df_Exrp_LR_in_celltype_pairs_downreg_DE != 0).any(axis=0)]
print('shape after filtering interactions')
print(df_Exrp_LR_in_celltype_pairs_downreg_DE.shape, '\n')

# keep celltypepairs with at least one interaction

df_Exrp_LR_in_celltype_pairs_downreg_DE = df_Exrp_LR_in_celltype_pairs_downreg_DE.loc[(df_Exrp_LR_in_celltype_pairs_downreg_DE != 0).any(axis=1),:]
print('shape after filtering cell type pairs')
print(df_Exrp_LR_in_celltype_pairs_downreg_DE.shape, '\n')

shape after filtering interactions
(130, 399) 

shape after filtering cell type pairs
(68, 399) 



In [98]:
np.unique(df_Exrp_LR_in_celltype_pairs_upreg_DE.values, return_counts=True)

(array([0., 1.]), array([55740,  2660]))

In [99]:
np.unique(df_Exrp_LR_in_celltype_pairs_downreg_DE.values, return_counts=True)

(array([0., 1.]), array([26022,  1110]))

In [100]:
df_Exrp_LR_in_celltype_pairs_upreg_DE

Unnamed: 0,B_cells_memory---B_cells_memory_activated,B_cells_memory---B_cells_naive,B_cells_memory---B_cells_naive_activated,B_cells_memory---Macrophages,B_cells_memory---Monocytes_classical,B_cells_memory---Monocytes_intermediate,B_cells_memory---Monocytes_non-classical,B_cells_memory---NK_CD16_bright,B_cells_memory---NK_CD16_bright_activated,B_cells_memory---NK_CD56_bright,...,NK_CD16_bright---NK_CD16_bright,T4_activated---T4_activated,T4_memory---T4_memory,T8_activated---T8_activated,TCM_CD8+---TCM_CD8+,TEM_CD8+---TEM_CD8+,T_gd---T_gd,T_regs---T_regs,cDC1---cDC1,iNKT_cells---iNKT_cells
PVR_CD96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PVR_CD226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PVR_TIGIT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOTCH2_DLL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LGALS9_HAVCR2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LTBR_LTB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CCR4_CCL17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
LILRA4_BST2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LAIR1_LILRB4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
df_Exrp_LR_in_celltype_pairs_downreg_DE

Unnamed: 0,B_cells_memory---B_cells_memory_activated,B_cells_memory---B_cells_naive_activated,B_cells_memory---Macrophages,B_cells_memory---Monocytes_non-classical,B_cells_memory---NK_CD16_bright,B_cells_memory---Plasma_cells,B_cells_memory---T4_activated,B_cells_memory---T8_activated,B_cells_memory---T_regs,B_cells_memory---cDC1,...,MAIT_cells---MAIT_cells,Macrophages---Macrophages,Monocytes_classical---Monocytes_classical,Monocytes_intermediate---Monocytes_intermediate,Monocytes_non-classical---Monocytes_non-classical,NK_CD16_bright---NK_CD16_bright,T4_activated---T4_activated,T8_activated---T8_activated,cDC1---cDC1,iNKT_cells---iNKT_cells
PVR_CD96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SIRPA_CD47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
PLAUR_integrin_a4b1_complex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
CD40LG_integrin_a5b1_complex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TGFB1_TGFBR3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TNFRSF8_TNFSF8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD52_SIGLEC10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LTBR_LTB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD47_SIRB1_complex,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Save results

In [102]:
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202009_CVID_revision/'

In [103]:
# So DE is our output matrix
# Filter it accordingly for visualization
df_Exrp_LR_in_celltype_pairs_upreg_DE.to_csv(save_path + '20210416_cellphone_interactions_upreg_in_CVID_validation_cohort_no_logFC_cutoff.csv')
df_Exrp_LR_in_celltype_pairs_downreg_DE.to_csv(save_path + '20210416_cellphone_interactions_downreg_in_CVID_validation_cohort_no_logFC_cutoff.csv')

## Save results in a more readable format

Gene by gene breakdown with added DEG stats

### Upreg interactions

In [104]:
faulty_index_count = 0

vec2_append_upreg = {}

# row count
curr_count = 0

for interaction in df_Exrp_LR_in_celltype_pairs_upreg_DE.index:
    
    print(interaction, list(df_Exrp_LR_in_celltype_pairs_upreg_DE.index).index(interaction) + 1, 
          'out of', len(df_Exrp_LR_in_celltype_pairs_upreg_DE.index))
    # current row
    curr_table = pd.DataFrame(df_Exrp_LR_in_celltype_pairs_upreg_DE.loc[interaction])
    curr_table = curr_table[curr_table[interaction] > 0]
    
    for celltype_pair in list(curr_table.index):
        
        # row by row
        vec2_append_upreg[str(curr_count)] = {}
        
        vec2_append_upreg[str(curr_count)]['interaction'] = interaction
        
        #print(celltype_pair)
        
        # getting genes, these are lists of length 1 for simple interactions and > 1 for complexes
        curr_partner_A_genes = Int2Gene[interaction]['partner_a']
        curr_partner_B_genes = Int2Gene[interaction]['partner_b']
        
        #print('curr partner A genes', curr_partner_A_genes, 'len:', len(curr_partner_A_genes))
        #print('curr partner B genes', curr_partner_B_genes, 'len:', len(curr_partner_B_genes))
        
        vec2_append_upreg[str(curr_count)]['partner_A_genes'] = curr_partner_A_genes
        vec2_append_upreg[str(curr_count)]['partner_B_genes'] = curr_partner_B_genes
        
        curr_celltype_A = celltype_pair.split('---')[0]
        curr_celltype_B = celltype_pair.split('---')[1]
        
        vec2_append_upreg[str(curr_count)]['celltype_A'] = curr_celltype_A
        vec2_append_upreg[str(curr_count)]['celltype_B'] = curr_celltype_B
        
        
        # are all partner_A genes DE in celltype_A and are all partner_B genes DE in celltype_B?
        # these DE table subsets have been filtered already according to cutoffs declared in the beginning of the notebook
        curr_celltype_A_DE_table_subset = DE_df_upreg[DE_df_upreg['cluster'] == curr_celltype_A]
        curr_celltype_A_DE_table_subset.set_index('Gene', inplace=True)
        
        # Per_df table for all genes, even not DE - to include % of cells expressing even non-DE partners
        curr_celltype_A_per_df_full = pd.DataFrame(Per_df.loc[:,curr_celltype_A])
        
        curr_celltype_B_DE_table_subset = DE_df_upreg[DE_df_upreg['cluster'] == curr_celltype_B]
        curr_celltype_B_DE_table_subset.set_index('Gene', inplace=True)
        
        # Per_df table for all genes, even not DE - to include % of cells expressing even non-DE partners
        curr_celltype_B_per_df_full = pd.DataFrame(Per_df.loc[:,curr_celltype_B])

        
        # if partner A is DE, add stats
        if all(elem in list(curr_celltype_A_DE_table_subset.index) for elem in curr_partner_A_genes):
            vec2_append_upreg[str(curr_count)]['is_partner_A_DE'] = True
            vec2_append_upreg[str(curr_count)]['logFC_gene_A'] = list(curr_celltype_A_DE_table_subset.loc[curr_partner_A_genes,'logFC'])
            vec2_append_upreg[str(curr_count)]['adj_pval_gene_A'] = list(curr_celltype_A_DE_table_subset.loc[curr_partner_A_genes,'adj.P.Val'])
            vec2_append_upreg[str(curr_count)]['percent_expr_gene_A'] = list(curr_celltype_A_DE_table_subset.loc[curr_partner_A_genes,'percentExpr_cluster'])
            
        else: # if not DE, add 'NA'
            vec2_append_upreg[str(curr_count)]['is_partner_A_DE'] = False
            vec2_append_upreg[str(curr_count)]['logFC_gene_A'] = 'NA'
            vec2_append_upreg[str(curr_count)]['adj_pval_gene_A'] = 'NA'
            # even if the partner is not DE, we still want to know the % of cells expressing it
            vec2_append_upreg[str(curr_count)]['percent_expr_gene_A'] = list(curr_celltype_A_per_df_full.loc[curr_partner_A_genes,curr_celltype_A])
            
            
            
        # if partner B is DE, add stats
        if all(elem in list(curr_celltype_B_DE_table_subset.index) for elem in curr_partner_B_genes):
            vec2_append_upreg[str(curr_count)]['is_partner_B_DE'] = True
            vec2_append_upreg[str(curr_count)]['logFC_gene_B'] = list(curr_celltype_B_DE_table_subset.loc[curr_partner_B_genes,'logFC'])
            vec2_append_upreg[str(curr_count)]['adj_pval_gene_B'] = list(curr_celltype_B_DE_table_subset.loc[curr_partner_B_genes,'adj.P.Val'])
            vec2_append_upreg[str(curr_count)]['percent_expr_gene_B'] = list(curr_celltype_B_DE_table_subset.loc[curr_partner_B_genes,'percentExpr_cluster'])
        else: # if not DE, add 'NA'
            vec2_append_upreg[str(curr_count)]['is_partner_B_DE'] = False
            vec2_append_upreg[str(curr_count)]['logFC_gene_B'] = 'NA'
            vec2_append_upreg[str(curr_count)]['adj_pval_gene_B'] = 'NA'
            # even if the partner is not DE, we still want to know the % of cells expressing it
            vec2_append_upreg[str(curr_count)]['percent_expr_gene_B'] = list(curr_celltype_B_per_df_full.loc[curr_partner_B_genes,curr_celltype_B])
            
        curr_count += 1
    
    

PVR_CD96 1 out of 100
PVR_CD226 2 out of 100
PVR_TIGIT 3 out of 100
NOTCH2_DLL3 4 out of 100
LGALS9_HAVCR2 5 out of 100
FN1_integrin_a4b1_complex 6 out of 100
SPP1_integrin_a4b1_complex 7 out of 100
PLAUR_integrin_a4b1_complex 8 out of 100
FN1_integrin_a4b7_complex 9 out of 100
FN1_integrin_a5b1_complex 10 out of 100
CD40LG_integrin_a5b1_complex 11 out of 100
FN1_integrin_aVb1_complex 12 out of 100
TGFB1_TGFBR3 13 out of 100
ICAM1_integrin_aMb2_complex 14 out of 100
ICAM1_integrin_aXb2_complex 15 out of 100
CXCR3_CXCL9 16 out of 100
DPP4_CXCL9 17 out of 100
CD8_receptor_LCK 18 out of 100
CD94:NKG2A_HLA-E 19 out of 100
CD94:NKG2C_HLA-E 20 out of 100
CD94:NKG2E_HLA-E 21 out of 100
TNFRSF13B_TNFSF13B 22 out of 100
TNFRSF17_TNFSF13B 23 out of 100
TNFRSF13C_TNFSF13B 24 out of 100
CD74_APP 25 out of 100
ICAM1_SPN 26 out of 100
ICAM1_ITGAL 27 out of 100
ICAM1_integrin_aLb2_complex 28 out of 100
ICAM2_integrin_aLb2_complex 29 out of 100
ICAM3_integrin_aLb2_complex 30 out of 100
F11R_integrin_a

In [105]:
# outlining the final table format
df_output_upreg = pd.DataFrame(columns = ['interaction',
                                     'partner_A_genes',
                                     'partner_B_genes',
                                     'celltype_A',
                                     'celltype_B',
                                     'is_partner_A_DE',
                                     'logFC_gene_A',
                                     'adj_pval_gene_A',
                                     'percent_expr_gene_A',
                                     'is_partner_B_DE',
                                     'logFC_gene_B',
                                     'adj_pval_gene_B',
                                     'percent_expr_gene_B'],
                              index = list(vec2_append_upreg.keys())
                            )
df_output_upreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,,,,,,,,,,,,,
2656,,,,,,,,,,,,,
2657,,,,,,,,,,,,,
2658,,,,,,,,,,,,,


In [106]:
list(df_output_upreg.columns) == list(vec2_append_upreg['0'].keys())

True

In [107]:
len(vec2_append_upreg.keys())

2660

In [108]:
vec2_append_upreg['0'].keys()

dict_keys(['interaction', 'partner_A_genes', 'partner_B_genes', 'celltype_A', 'celltype_B', 'is_partner_A_DE', 'logFC_gene_A', 'adj_pval_gene_A', 'percent_expr_gene_A', 'is_partner_B_DE', 'logFC_gene_B', 'adj_pval_gene_B', 'percent_expr_gene_B'])

In [109]:
df_output_upreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,,,,,,,,,,,,,
2656,,,,,,,,,,,,,
2657,,,,,,,,,,,,,
2658,,,,,,,,,,,,,


In [110]:
%%time

for i in list(vec2_append_upreg.keys()):
    #print(i)
    curr_keys = list(vec2_append_upreg[i].keys())
    for col in curr_keys:
        df_output_upreg.loc[i,col] = vec2_append_upreg[i][col]

CPU times: user 2.01 s, sys: 26 µs, total: 2.01 s
Wall time: 2.03 s


In [111]:
df_output_upreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,PVR_CD96,[PVR],[CD96],cDC2,B_cells_memory_activated,False,,,[0.1336717428087986],True,[0.062315989380357],[7.66358976792082e-09],[0.167733]
1,PVR_CD96,[PVR],[CD96],cDC2,T8_activated,False,,,[0.1336717428087986],True,[0.0792221910327131],[0.000220600879123],[0.228421]
2,PVR_CD226,[PVR],[CD226],cDC2,T4_activated,False,,,[0.1336717428087986],True,[0.0408807841632831],[3.33550702477858e-05],[0.18992]
3,PVR_CD226,[PVR],[CD226],cDC2,T8_activated,False,,,[0.1336717428087986],True,[0.064849917007213],[0.0001914751063181],[0.199869]
4,PVR_CD226,[PVR],[CD226],cDC2,iNKT_cells,False,,,[0.1336717428087986],True,[0.068239211621473],[0.0026646590073635],[0.146573]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,CLEC2B_KLRF1,[CLEC2B],[KLRF1],Macrophages,NK_CD56_bright,True,[0.0541176629755394],[0.0424214084646784],[0.4230609999999999],False,,,[0.2835443037974683]
2656,CLEC2B_KLRF1,[CLEC2B],[KLRF1],Monocytes_non-classical,NK_CD16_bright,True,[0.11336603310709],[0.0079254842524513],[0.388298],False,,,[0.2780506233471855]
2657,CLEC2B_KLRF1,[CLEC2B],[KLRF1],Monocytes_non-classical,NK_CD56_bright,True,[0.11336603310709],[0.0079254842524513],[0.388298],False,,,[0.2835443037974683]
2658,CLEC2B_KLRF1,[CLEC2B],[KLRF1],T8_activated,NK_CD16_bright,True,[0.075501869094107],[4.36707814080924e-06],[0.148999],False,,,[0.2780506233471855]


In [112]:
# getting rid of the square parentheses [] in all the values

cols2correct = ['partner_A_genes', 'partner_B_genes', 'logFC_gene_A', 'adj_pval_gene_A',
       'percent_expr_gene_A', 'logFC_gene_B',
       'adj_pval_gene_B', 'percent_expr_gene_B']

for row in list(df_output_upreg.index):
    #print('row', row)
    for col in cols2correct:
        #print('column', col)
        curr_value = df_output_upreg.loc[row, col] # with []
        #print(curr_value)
        if (curr_value != 'NA') & (len(curr_value) == 1): # ignoring NAs and lists of length > 1 - complex genes that is
        #if not isinstance(curr_value, list) & (curr_value != 'NA'): # ignoring NAs and lists of length > 1 - complex genes that is
            df_output_upreg.loc[row, col] = curr_value[0] # this just get the element - string if a gene, numerical value if it's a stat
            

In [113]:
df_output_upreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,PVR_CD96,PVR,CD96,cDC2,B_cells_memory_activated,False,,,0.133672,True,0.062316,0.0,0.167733
1,PVR_CD96,PVR,CD96,cDC2,T8_activated,False,,,0.133672,True,0.079222,0.000221,0.228421
2,PVR_CD226,PVR,CD226,cDC2,T4_activated,False,,,0.133672,True,0.040881,0.000033,0.18992
3,PVR_CD226,PVR,CD226,cDC2,T8_activated,False,,,0.133672,True,0.06485,0.000191,0.199869
4,PVR_CD226,PVR,CD226,cDC2,iNKT_cells,False,,,0.133672,True,0.068239,0.002665,0.146573
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,CLEC2B_KLRF1,CLEC2B,KLRF1,Macrophages,NK_CD56_bright,True,0.054118,0.042421,0.423061,False,,,0.283544
2656,CLEC2B_KLRF1,CLEC2B,KLRF1,Monocytes_non-classical,NK_CD16_bright,True,0.113366,0.007925,0.388298,False,,,0.278051
2657,CLEC2B_KLRF1,CLEC2B,KLRF1,Monocytes_non-classical,NK_CD56_bright,True,0.113366,0.007925,0.388298,False,,,0.283544
2658,CLEC2B_KLRF1,CLEC2B,KLRF1,T8_activated,NK_CD16_bright,True,0.075502,0.000004,0.148999,False,,,0.278051


In [114]:
df_output_upreg.columns

Index(['interaction', 'partner_A_genes', 'partner_B_genes', 'celltype_A',
       'celltype_B', 'is_partner_A_DE', 'logFC_gene_A', 'adj_pval_gene_A',
       'percent_expr_gene_A', 'is_partner_B_DE', 'logFC_gene_B',
       'adj_pval_gene_B', 'percent_expr_gene_B'],
      dtype='object')

### Splitting tables into 2 tables: simple interactions and complex interactions, latter being deconvoluted into pseudo-interactions for each subunit of a complex

In [115]:
# getting indices of complex interactions
complex_interaction_rows_upreg = []

for n_row in list(df_output_upreg.index):
    #print('row', n_row)
    
    curr_partner_A_genes = df_output_upreg.loc[n_row, 'partner_A_genes']
    curr_partner_B_genes = df_output_upreg.loc[n_row, 'partner_B_genes']
    
    if isinstance(curr_partner_A_genes, list) or isinstance(curr_partner_B_genes, list): # if partner A or B is a complex
        complex_interaction_rows_upreg.append(n_row)
        

In [116]:
len(complex_interaction_rows_upreg)

450

In [117]:
np.unique(complex_interaction_rows_upreg, return_counts=True)

(array(['100', '101', '102', '103', '104', '105', '106', '107', '108',
        '109', '110', '111', '112', '113', '126', '127', '128', '129',
        '130', '131', '132', '133', '134', '1356', '1357', '1358', '1359',
        '1360', '1361', '1362', '1363', '1364', '1365', '1366', '1367',
        '1368', '1369', '1370', '1371', '1372', '1373', '1374', '1375',
        '1376', '1377', '1378', '1379', '1380', '1381', '1382', '1383',
        '1384', '1385', '1386', '1387', '1388', '1389', '1390', '1391',
        '1392', '1393', '1394', '1395', '1396', '1397', '1398', '1399',
        '1400', '1401', '1402', '1403', '1404', '1405', '1406', '1407',
        '1408', '1409', '1410', '1411', '1412', '1413', '1414', '1415',
        '1416', '1417', '1418', '1419', '1420', '182', '183', '184', '185',
        '186', '187', '188', '189', '190', '191', '192', '193', '194',
        '1948', '1949', '195', '1950', '1951', '1952', '1953', '1954',
        '1955', '1956', '1957', '1958', '1959', '196', '1960'

In [118]:
# splitting simple and complex interactions into 2 separate tables
df_output_upreg_complex = df_output_upreg.loc[complex_interaction_rows_upreg,:]
df_output_upreg_simple = df_output_upreg.drop(complex_interaction_rows_upreg, axis=0)

In [119]:
df_output_upreg_simple.shape

(2210, 13)

In [120]:
df_output_upreg_complex.shape

(450, 13)

In [121]:
df_output_upreg.shape

(2660, 13)

In [122]:
df_output_upreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,PVR_CD96,PVR,CD96,cDC2,B_cells_memory_activated,False,,,0.133672,True,0.062316,0.0,0.167733
1,PVR_CD96,PVR,CD96,cDC2,T8_activated,False,,,0.133672,True,0.079222,0.000221,0.228421
2,PVR_CD226,PVR,CD226,cDC2,T4_activated,False,,,0.133672,True,0.040881,0.000033,0.18992
3,PVR_CD226,PVR,CD226,cDC2,T8_activated,False,,,0.133672,True,0.06485,0.000191,0.199869
4,PVR_CD226,PVR,CD226,cDC2,iNKT_cells,False,,,0.133672,True,0.068239,0.002665,0.146573
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,CLEC2B_KLRF1,CLEC2B,KLRF1,Macrophages,NK_CD56_bright,True,0.054118,0.042421,0.423061,False,,,0.283544
2656,CLEC2B_KLRF1,CLEC2B,KLRF1,Monocytes_non-classical,NK_CD16_bright,True,0.113366,0.007925,0.388298,False,,,0.278051
2657,CLEC2B_KLRF1,CLEC2B,KLRF1,Monocytes_non-classical,NK_CD56_bright,True,0.113366,0.007925,0.388298,False,,,0.283544
2658,CLEC2B_KLRF1,CLEC2B,KLRF1,T8_activated,NK_CD16_bright,True,0.075502,0.000004,0.148999,False,,,0.278051


In [123]:
df_output_upreg_complex

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
36,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Macrophages,T8_activated,False,,,0.133807,True,"[0.0859402254234798, 0.0947612832088071]","[2.11382087510341e-06, 4.96771168591477e-05]","[0.2156219999999999, 0.351493]"
37,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Macrophages,T_regs,False,,,0.133807,True,"[0.0463857472576402, 0.13518526418308]","[0.0115822540066519, 1.11954652107609e-14]","[0.307562, 0.461464]"
38,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Macrophages,iNKT_cells,False,,,0.133807,True,"[0.0706479330863757, 0.135036011748152]","[0.0078183539969285, 8.104892316104151e-05]","[0.1733869999999999, 0.350403]"
39,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Monocytes_classical,T8_activated,False,,,0.13555,True,"[0.0859402254234798, 0.0947612832088071]","[2.11382087510341e-06, 4.96771168591477e-05]","[0.2156219999999999, 0.351493]"
40,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Monocytes_classical,T_regs,False,,,0.13555,True,"[0.0463857472576402, 0.13518526418308]","[0.0115822540066519, 1.11954652107609e-14]","[0.307562, 0.461464]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014,IFNG_Type_II_IFNR,IFNG,"[IFNGR1, IFNGR2]",iNKT_cells,Precursor_cells,True,1.650786,0.0,0.838105,False,,,"[0.1213592233009708, 0.2548543689320388]"
2015,IFNG_Type_II_IFNR,IFNG,"[IFNGR1, IFNGR2]",iNKT_cells,cDC1,True,1.650786,0.0,0.838105,False,,,"[0.5653057767236999, 0.6173132305607318]"
2016,IFNG_Type_II_IFNR,IFNG,"[IFNGR1, IFNGR2]",iNKT_cells,cDC2,True,1.650786,0.0,0.838105,False,,,"[0.1962774957698815, 0.4263959390862944]"
2017,IFNG_Type_II_IFNR,IFNG,"[IFNGR1, IFNGR2]",iNKT_cells,pDC,True,1.650786,0.0,0.838105,False,,,"[0.1203007518796992, 0.1203007518796992]"


In [124]:
# checking whether any complexes interact with other complexes and how many subunits each complex contains here

n_subunits_upreg = []

for n_row in list(df_output_upreg.index):
    #print('row', n_row)
    
    curr_partner_A_genes = df_output_upreg.loc[n_row, 'partner_A_genes']
    curr_partner_B_genes = df_output_upreg.loc[n_row, 'partner_B_genes']
    
    if isinstance(curr_partner_A_genes, list) and isinstance(curr_partner_B_genes, list): # if partner A AND B is a complex
        print('row', n_row)
        print('both are complexes')
        
    if isinstance(curr_partner_A_genes, list): # if partner A is a complex
        print('row', n_row)
        print('curr_partner_A_genes',curr_partner_A_genes)
        print('partner A is a complex, len is:', len(curr_partner_A_genes))
        n_subunits_upreg.append(len(curr_partner_A_genes))
        
    if isinstance(curr_partner_B_genes, list): # if partner B is a complex
        print('row', n_row)
        print('curr_partner_B_genes',curr_partner_B_genes)
        print('partner B is a complex, len is:', len(curr_partner_B_genes))
        n_subunits_upreg.append(len(curr_partner_B_genes))

row 36
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 37
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 38
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 39
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 40
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 41
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 42
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 43
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 44
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 45
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 46
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 47
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 48
curr_partner_B_genes 

In [125]:
np.unique(n_subunits_upreg, return_counts=True)

(array([2, 3]), array([422,  28]))

#### So 1 more scenario to ignore: there are no interactions of a complex with a complex
#### And max complex size is 3 subunits - in case of IL2R (indeed, there are subunits: alpha, beta and gamma)

In [126]:
df_output_upreg_complex

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
36,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Macrophages,T8_activated,False,,,0.133807,True,"[0.0859402254234798, 0.0947612832088071]","[2.11382087510341e-06, 4.96771168591477e-05]","[0.2156219999999999, 0.351493]"
37,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Macrophages,T_regs,False,,,0.133807,True,"[0.0463857472576402, 0.13518526418308]","[0.0115822540066519, 1.11954652107609e-14]","[0.307562, 0.461464]"
38,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Macrophages,iNKT_cells,False,,,0.133807,True,"[0.0706479330863757, 0.135036011748152]","[0.0078183539969285, 8.104892316104151e-05]","[0.1733869999999999, 0.350403]"
39,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Monocytes_classical,T8_activated,False,,,0.13555,True,"[0.0859402254234798, 0.0947612832088071]","[2.11382087510341e-06, 4.96771168591477e-05]","[0.2156219999999999, 0.351493]"
40,FN1_integrin_a4b1_complex,FN1,"[ITGB1, ITGA4]",Monocytes_classical,T_regs,False,,,0.13555,True,"[0.0463857472576402, 0.13518526418308]","[0.0115822540066519, 1.11954652107609e-14]","[0.307562, 0.461464]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014,IFNG_Type_II_IFNR,IFNG,"[IFNGR1, IFNGR2]",iNKT_cells,Precursor_cells,True,1.650786,0.0,0.838105,False,,,"[0.1213592233009708, 0.2548543689320388]"
2015,IFNG_Type_II_IFNR,IFNG,"[IFNGR1, IFNGR2]",iNKT_cells,cDC1,True,1.650786,0.0,0.838105,False,,,"[0.5653057767236999, 0.6173132305607318]"
2016,IFNG_Type_II_IFNR,IFNG,"[IFNGR1, IFNGR2]",iNKT_cells,cDC2,True,1.650786,0.0,0.838105,False,,,"[0.1962774957698815, 0.4263959390862944]"
2017,IFNG_Type_II_IFNR,IFNG,"[IFNGR1, IFNGR2]",iNKT_cells,pDC,True,1.650786,0.0,0.838105,False,,,"[0.1203007518796992, 0.1203007518796992]"


In [127]:
# Duplicating the table and then choosing only 0th or 1st or 2nd values for the complexes
df_output_upreg_complex_member_1 = df_output_upreg_complex.copy()
df_output_upreg_complex_member_2 = df_output_upreg_complex.copy()
df_output_upreg_complex_member_3 = df_output_upreg_complex.copy()

In [128]:
# splitting complex interaction entries by subunits / members
# for any values in these tables, if it's a list, save the 0th for df_output_upreg_complex_member_1 and 1st for df_output_upreg_complex_member_2

# which rows contain interaction with a 3-subunit complex? to then subset df_output_downreg_complex_member_3
subunit_3_rows = []

for n_row in list(df_output_upreg_complex.index):
    #print('outside for loop, row', n_row)
    
    for col in df_output_upreg_complex.columns:
        #print('col', col)
        if isinstance(df_output_upreg_complex.loc[n_row, col], list):
            df_output_upreg_complex_member_1.loc[n_row, col] = df_output_upreg_complex.loc[n_row, col][0]
            df_output_upreg_complex_member_2.loc[n_row, col] = df_output_upreg_complex.loc[n_row, col][1]
            
            # additionally, if there are 3 subunits, separate into 3 entries
            if len(df_output_upreg_complex.loc[n_row, col]) == 3:
                df_output_upreg_complex_member_3.loc[n_row, col] = df_output_upreg_complex.loc[n_row, col][2]
                #print('3-subunit complex')
                #print('row', n_row)
                #print('adding to subunit_3_rows')
                subunit_3_rows.append(n_row)


In [129]:
# not sure why but it gets duplicated
np.unique(subunit_3_rows, return_counts=True)

(array(['1356', '1357', '1358', '1359', '1360', '1361', '1362', '1363',
        '1364', '1365', '1366', '1367', '1368', '1369', '1370', '1371',
        '1372', '1373', '1374', '1375', '1376', '1377', '1378', '1379',
        '1380', '1381', '1382', '1383'], dtype='<U4'),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2]))

In [130]:
# getting rid of duplicates
subunit_3_rows = list(set(subunit_3_rows))

In [131]:
np.unique(subunit_3_rows, return_counts=True)

(array(['1356', '1357', '1358', '1359', '1360', '1361', '1362', '1363',
        '1364', '1365', '1366', '1367', '1368', '1369', '1370', '1371',
        '1372', '1373', '1374', '1375', '1376', '1377', '1378', '1379',
        '1380', '1381', '1382', '1383'], dtype='<U4'),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1]))

In [132]:
len(subunit_3_rows)

28

In [133]:
subunit_3_rows

['1374',
 '1358',
 '1375',
 '1382',
 '1367',
 '1362',
 '1365',
 '1381',
 '1357',
 '1378',
 '1376',
 '1383',
 '1373',
 '1371',
 '1380',
 '1364',
 '1377',
 '1369',
 '1363',
 '1370',
 '1360',
 '1372',
 '1361',
 '1379',
 '1359',
 '1366',
 '1356',
 '1368']

In [134]:
# in df_output_downreg_complex_member_3 entries of 3rd members are correct
# removing other entries of interactions with less than 3 subunit complexes
df_output_upreg_complex_member_3 = df_output_upreg_complex_member_3.loc[subunit_3_rows,:]

In [135]:
df_output_upreg_complex_member_3

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
1374,IL2_receptor_HA_IL2,IL2RB,IL2,T8_activated,iNKT_cells,False,,,0.338487,True,0.114141,0.000207,0.23246
1358,IL2_receptor_HA_IL2,IL2RB,IL2,B_cells_naive_activated,T4_activated,False,,,0.119906,True,0.074279,0.000436,0.176771
1375,IL2_receptor_HA_IL2,IL2RB,IL2,TCM_CD8+,T4_activated,False,,,0.226496,True,0.074279,0.000436,0.176771
1382,IL2_receptor_HA_IL2,IL2RB,IL2,T4_activated,T4_activated,False,,,0.352941,True,0.074279,0.000436,0.176771
1367,IL2_receptor_HA_IL2,IL2RB,IL2,Plasma_cells,iNKT_cells,False,,,0.475191,True,0.114141,0.000207,0.23246
1362,IL2_receptor_HA_IL2,IL2RB,IL2,NK_CD16_bright_activated,T4_activated,False,,,0.322936,True,0.074279,0.000436,0.176771
1365,IL2_receptor_HA_IL2,IL2RB,IL2,NK_CD56_bright_activated,iNKT_cells,False,,,0.386482,True,0.114141,0.000207,0.23246
1381,IL2_receptor_HA_IL2,IL2RB,IL2,iNKT_cells,T4_activated,False,,,0.313165,True,0.074279,0.000436,0.176771
1357,IL2_receptor_HA_IL2,IL2RB,IL2,B_cells_memory_activated,iNKT_cells,False,,,0.176938,True,0.114141,0.000207,0.23246
1378,IL2_receptor_HA_IL2,IL2RB,IL2,T_gd,iNKT_cells,False,,,0.211247,True,0.114141,0.000207,0.23246


In [136]:
# making indices uniques for concatenantion later
df_output_upreg_complex_member_1.index = [idx + '_member_1' for idx in df_output_upreg_complex_member_1.index]
df_output_upreg_complex_member_2.index = [idx + '_member_2' for idx in df_output_upreg_complex_member_2.index]
df_output_upreg_complex_member_3.index = [idx + '_member_3' for idx in df_output_upreg_complex_member_3.index]

In [137]:
# getting all indices
idx_concat = list(df_output_upreg_complex_member_1.index) + list(df_output_upreg_complex_member_2.index) + list(df_output_upreg_complex_member_3.index)

# sorting by original index number, so that the order is: member 1, member 2 and (where applicable) member 3
idx_concat.sort(key = lambda x: x.split('_')[0])
idx_concat

['100_member_1',
 '100_member_2',
 '101_member_1',
 '101_member_2',
 '102_member_1',
 '102_member_2',
 '103_member_1',
 '103_member_2',
 '104_member_1',
 '104_member_2',
 '105_member_1',
 '105_member_2',
 '106_member_1',
 '106_member_2',
 '107_member_1',
 '107_member_2',
 '108_member_1',
 '108_member_2',
 '109_member_1',
 '109_member_2',
 '110_member_1',
 '110_member_2',
 '111_member_1',
 '111_member_2',
 '112_member_1',
 '112_member_2',
 '113_member_1',
 '113_member_2',
 '126_member_1',
 '126_member_2',
 '127_member_1',
 '127_member_2',
 '128_member_1',
 '128_member_2',
 '129_member_1',
 '129_member_2',
 '130_member_1',
 '130_member_2',
 '131_member_1',
 '131_member_2',
 '132_member_1',
 '132_member_2',
 '133_member_1',
 '133_member_2',
 '134_member_1',
 '134_member_2',
 '1356_member_1',
 '1356_member_2',
 '1356_member_3',
 '1357_member_1',
 '1357_member_2',
 '1357_member_3',
 '1358_member_1',
 '1358_member_2',
 '1358_member_3',
 '1359_member_1',
 '1359_member_2',
 '1359_member_3',
 '

In [138]:
df_output_upreg_complex_deconv = pd.concat([df_output_upreg_complex_member_1, df_output_upreg_complex_member_2, df_output_upreg_complex_member_3])

In [139]:
df_output_upreg_complex_member_1

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
36_member_1,FN1_integrin_a4b1_complex,FN1,ITGB1,Macrophages,T8_activated,False,,,0.133807,True,0.08594,0.000002,0.215622
37_member_1,FN1_integrin_a4b1_complex,FN1,ITGB1,Macrophages,T_regs,False,,,0.133807,True,0.046386,0.011582,0.307562
38_member_1,FN1_integrin_a4b1_complex,FN1,ITGB1,Macrophages,iNKT_cells,False,,,0.133807,True,0.070648,0.007818,0.173387
39_member_1,FN1_integrin_a4b1_complex,FN1,ITGB1,Monocytes_classical,T8_activated,False,,,0.13555,True,0.08594,0.000002,0.215622
40_member_1,FN1_integrin_a4b1_complex,FN1,ITGB1,Monocytes_classical,T_regs,False,,,0.13555,True,0.046386,0.011582,0.307562
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014_member_1,IFNG_Type_II_IFNR,IFNG,IFNGR1,iNKT_cells,Precursor_cells,True,1.650786,0.0,0.838105,False,,,0.121359
2015_member_1,IFNG_Type_II_IFNR,IFNG,IFNGR1,iNKT_cells,cDC1,True,1.650786,0.0,0.838105,False,,,0.565306
2016_member_1,IFNG_Type_II_IFNR,IFNG,IFNGR1,iNKT_cells,cDC2,True,1.650786,0.0,0.838105,False,,,0.196277
2017_member_1,IFNG_Type_II_IFNR,IFNG,IFNGR1,iNKT_cells,pDC,True,1.650786,0.0,0.838105,False,,,0.120301


In [140]:
# organising entries so that member 1 entry is followed by member 2 entry and then member 3 entry if applicable
df_output_upreg_complex_deconv = df_output_upreg_complex_deconv.loc[idx_concat,:]

In [141]:
df_output_upreg_complex_deconv

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
100_member_1,FN1_integrin_a4b7_complex,FN1,ITGB7,Monocytes_intermediate,T4_memory,True,0.059207,0.020902,0.153815,False,,,0.114091
100_member_2,FN1_integrin_a4b7_complex,FN1,ITGA4,Monocytes_intermediate,T4_memory,True,0.059207,0.020902,0.153815,False,,,0.33603
101_member_1,FN1_integrin_a4b7_complex,FN1,ITGB7,Monocytes_intermediate,T8_activated,True,0.059207,0.020902,0.153815,True,0.037493,0.016233,0.132918
101_member_2,FN1_integrin_a4b7_complex,FN1,ITGA4,Monocytes_intermediate,T8_activated,True,0.059207,0.020902,0.153815,True,0.094761,0.00005,0.351493
102_member_1,FN1_integrin_a4b7_complex,FN1,ITGB7,Monocytes_intermediate,TCM_CD8+,True,0.059207,0.020902,0.153815,False,,,0.102259
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97_member_2,FN1_integrin_a4b7_complex,FN1,ITGA4,Monocytes_intermediate,Monocytes_classical,True,0.059207,0.020902,0.153815,False,,,0.312751
98_member_1,FN1_integrin_a4b7_complex,FN1,ITGB7,Monocytes_intermediate,NK_CD56_bright_activated,True,0.059207,0.020902,0.153815,False,,,0.142114
98_member_2,FN1_integrin_a4b7_complex,FN1,ITGA4,Monocytes_intermediate,NK_CD56_bright_activated,True,0.059207,0.020902,0.153815,False,,,0.177643
99_member_1,FN1_integrin_a4b7_complex,FN1,ITGB7,Monocytes_intermediate,Plasma_cells,True,0.059207,0.020902,0.153815,False,,,0.379771


In [143]:
# saving these deconvoluted complex interactions
df_output_upreg_complex_deconv.to_csv(save_path + '20210416_cellphone_interactions_table_with_gene_stats_upreg_in_CVID_all_validation_cohort_no_logFC_cutoff_complexes_deconv_into_pseduinteractions.csv')

In [144]:
# saving the simple interactions table
df_output_upreg_simple.to_csv(save_path + '20210416_cellphone_interactions_table_with_gene_stats_upreg_in_CVID_all_validation_cohort_no_logFC_cutoff_simple_interactions.csv')

In [146]:
save_path

'/lustre/scratch117/cellgen/team292/aa22/adata_objects/202009_CVID_revision/'

In [147]:
save_path

'/lustre/scratch117/cellgen/team292/aa22/adata_objects/202009_CVID_revision/'

In [148]:
#df_output_upreg.to_csv(save_path + '20210318_cellphone_interactions_table_with_gene_stats_upreg_in_CVID_validation_cohort_no_logFC_cutoff.csv')

### Downreg interactions

In [149]:
faulty_index_count = 0

vec2_append_downreg = {}

# row count
curr_count = 0

for interaction in df_Exrp_LR_in_celltype_pairs_downreg_DE.index:
    
    print(interaction, list(df_Exrp_LR_in_celltype_pairs_downreg_DE.index).index(interaction) + 1, 
          'out of', len(df_Exrp_LR_in_celltype_pairs_downreg_DE.index))
    # current row
    curr_table = pd.DataFrame(df_Exrp_LR_in_celltype_pairs_downreg_DE.loc[interaction])
    curr_table = curr_table[curr_table[interaction] > 0]
    
    for celltype_pair in list(curr_table.index):
        
        #print('row', curr_count)
        
        #print('celltype pair', celltype_pair)
        
        # row by row
        vec2_append_downreg[str(curr_count)] = {}
        
        vec2_append_downreg[str(curr_count)]['interaction'] = interaction
        
        #print(celltype_pair)
        
        # getting genes, these are lists of length 1 for simple interactions and > 1 for complexes
        curr_partner_A_genes = Int2Gene[interaction]['partner_a']
        curr_partner_B_genes = Int2Gene[interaction]['partner_b']
        
        #print('curr partner A genes', curr_partner_A_genes, 'len:', len(curr_partner_A_genes))
        #print('curr partner B genes', curr_partner_B_genes, 'len:', len(curr_partner_B_genes))
        
        vec2_append_downreg[str(curr_count)]['partner_A_genes'] = curr_partner_A_genes
        vec2_append_downreg[str(curr_count)]['partner_B_genes'] = curr_partner_B_genes
        
        curr_celltype_A = celltype_pair.split('---')[0]
        curr_celltype_B = celltype_pair.split('---')[1]
        
        #print('curr partner A cell type', curr_celltype_A)
        #print('curr partner B cell type', curr_celltype_B)
        
        vec2_append_downreg[str(curr_count)]['celltype_A'] = curr_celltype_A
        vec2_append_downreg[str(curr_count)]['celltype_B'] = curr_celltype_B
        
        
        # are all partner_A genes DE in celltype_A and are all partner_B genes DE in celltype_B?
        # these DE table subsets have been filtered already according to cutoffs declared in the beginning of the notebook
        curr_celltype_A_DE_table_subset = DE_df_downreg[DE_df_downreg['cluster'] == curr_celltype_A]
        curr_celltype_A_DE_table_subset.set_index('Gene', inplace=True)
        
        # Per_df table for all genes, even not DE - to include % of cells expressing even non-DE partners
        curr_celltype_A_per_df_full = pd.DataFrame(Per_df.loc[:,curr_celltype_A])
        
        curr_celltype_B_DE_table_subset = DE_df_downreg[DE_df_downreg['cluster'] == curr_celltype_B]
        curr_celltype_B_DE_table_subset.set_index('Gene', inplace=True)
        
        # Per_df table for all genes, even not DE - to include % of cells expressing even non-DE partners
        curr_celltype_B_per_df_full = pd.DataFrame(Per_df.loc[:,curr_celltype_B])

        # if partner A is DE, add stats
        if all(elem in list(curr_celltype_A_DE_table_subset.index) for elem in curr_partner_A_genes):
            #print('curr partner A is DE')
            vec2_append_downreg[str(curr_count)]['is_partner_A_DE'] = True
            vec2_append_downreg[str(curr_count)]['logFC_gene_A'] = list(curr_celltype_A_DE_table_subset.loc[curr_partner_A_genes,'logFC'])
            vec2_append_downreg[str(curr_count)]['adj_pval_gene_A'] = list(curr_celltype_A_DE_table_subset.loc[curr_partner_A_genes,'adj.P.Val'])
            vec2_append_downreg[str(curr_count)]['percent_expr_gene_A'] = list(curr_celltype_A_DE_table_subset.loc[curr_partner_A_genes,'percentExpr_cluster'])
            
        else: # if not DE, add 'NA'
            #print('curr partner A is NOT DE')
            vec2_append_downreg[str(curr_count)]['is_partner_A_DE'] = False
            vec2_append_downreg[str(curr_count)]['logFC_gene_A'] = 'NA'
            vec2_append_downreg[str(curr_count)]['adj_pval_gene_A'] = 'NA'
            # even if the partner is not DE, we still want to know the % of cells expressing it
            vec2_append_downreg[str(curr_count)]['percent_expr_gene_A'] = list(curr_celltype_A_per_df_full.loc[curr_partner_A_genes,curr_celltype_A])
            
            
        # if partner B is DE, add stats
        if all(elem in list(curr_celltype_B_DE_table_subset.index) for elem in curr_partner_B_genes):
            #print('curr partner B is DE')
            vec2_append_downreg[str(curr_count)]['is_partner_B_DE'] = True
            vec2_append_downreg[str(curr_count)]['logFC_gene_B'] = list(curr_celltype_B_DE_table_subset.loc[curr_partner_B_genes,'logFC'])
            vec2_append_downreg[str(curr_count)]['adj_pval_gene_B'] = list(curr_celltype_B_DE_table_subset.loc[curr_partner_B_genes,'adj.P.Val'])
            vec2_append_downreg[str(curr_count)]['percent_expr_gene_B'] = list(curr_celltype_B_DE_table_subset.loc[curr_partner_B_genes,'percentExpr_cluster'])
        else: # if not DE, add 'NA'
            #print('curr partner B is NOT DE')
            vec2_append_downreg[str(curr_count)]['is_partner_B_DE'] = False
            vec2_append_downreg[str(curr_count)]['logFC_gene_B'] = 'NA'
            vec2_append_downreg[str(curr_count)]['adj_pval_gene_B'] = 'NA'
            # even if the partner is not DE, we still want to know the % of cells expressing it
            vec2_append_downreg[str(curr_count)]['percent_expr_gene_B'] = list(curr_celltype_B_per_df_full.loc[curr_partner_B_genes,curr_celltype_B])
            
        curr_count += 1
    
    

PVR_CD96 1 out of 68
SIRPA_CD47 2 out of 68
PLAUR_integrin_a4b1_complex 3 out of 68
CD40LG_integrin_a5b1_complex 4 out of 68
TGFB1_TGFBR3 5 out of 68
ICAM1_integrin_aMb2_complex 6 out of 68
C3_integrin_aMb2_complex 7 out of 68
ICAM1_integrin_aXb2_complex 8 out of 68
CXCR3_CXCL9 9 out of 68
DPP4_CXCL9 10 out of 68
CD8_receptor_LCK 11 out of 68
CD94:NKG2A_HLA-E 12 out of 68
CD94:NKG2C_HLA-E 13 out of 68
CD94:NKG2E_HLA-E 14 out of 68
CD74_APP 15 out of 68
ICAM1_SPN 16 out of 68
ICAM1_ITGAL 17 out of 68
ICAM1_integrin_aLb2_complex 18 out of 68
NRP1_VEGFB 19 out of 68
GMCSFR_CSF2 20 out of 68
HLA-A_KIR3DL1 21 out of 68
HLA-F_KIR3DL1 22 out of 68
HLA-F_KIR3DL2 23 out of 68
HLA-B_KIR3DL2 24 out of 68
HLA-F_LILRB2 25 out of 68
HLA-F_LILRB1 26 out of 68
CCL4_CCR5 27 out of 68
CCL5_CCR5 28 out of 68
KLRB1_CLEC2D 29 out of 68
TNF_TNFRSF1A 30 out of 68
LTA_TNFRSF1A 31 out of 68
TNF_TNFRSF1B 32 out of 68
LTA_TNFRSF1B 33 out of 68
CCR7_CCL19 34 out of 68
CD27_CD70 35 out of 68
CD40_CD40LG 36 out of 

In [150]:
# outlining the final table format
df_output_downreg = pd.DataFrame(columns = ['interaction',
                                     'partner_A_genes',
                                     'partner_B_genes',
                                     'celltype_A',
                                     'celltype_B',
                                     'is_partner_A_DE',
                                     'logFC_gene_A',
                                     'adj_pval_gene_A',
                                     'percent_expr_gene_A',
                                     'is_partner_B_DE',
                                     'logFC_gene_B',
                                     'adj_pval_gene_B',
                                     'percent_expr_gene_B'],
                              index = list(vec2_append_downreg.keys())
                            )
df_output_downreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,,,,,,,,,,,,,
1106,,,,,,,,,,,,,
1107,,,,,,,,,,,,,
1108,,,,,,,,,,,,,


In [151]:
list(df_output_downreg.columns) == list(vec2_append_downreg['0'].keys())

True

In [152]:
len(vec2_append_downreg.keys())

1110

In [153]:
vec2_append_downreg['0'].keys()

dict_keys(['interaction', 'partner_A_genes', 'partner_B_genes', 'celltype_A', 'celltype_B', 'is_partner_A_DE', 'logFC_gene_A', 'adj_pval_gene_A', 'percent_expr_gene_A', 'is_partner_B_DE', 'logFC_gene_B', 'adj_pval_gene_B', 'percent_expr_gene_B'])

In [154]:
df_output_downreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,,,,,,,,,,,,,
1106,,,,,,,,,,,,,
1107,,,,,,,,,,,,,
1108,,,,,,,,,,,,,


In [155]:
%%time

for i in list(vec2_append_downreg.keys()):
    #print(i)
    curr_keys = list(vec2_append_downreg[i].keys())
    for col in curr_keys:
        df_output_downreg.loc[i,col] = vec2_append_downreg[i][col]

CPU times: user 833 ms, sys: 0 ns, total: 833 ms
Wall time: 833 ms


In [156]:
df_output_downreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,PVR_CD96,[PVR],[CD96],cDC2,iNKT_cells,False,,,[0.1336717428087986],True,[-0.0816573197435213],[0.0091949260145585],[0.2044349999999999]
1,SIRPA_CD47,[SIRPA],[CD47],Macrophages,cDC1,False,,,[0.526813880126183],True,[-0.0631467073654104],[0.0414284823587023],[0.4900819999999999]
2,SIRPA_CD47,[SIRPA],[CD47],Monocytes_classical,cDC1,False,,,[0.3735842162952137],True,[-0.0631467073654104],[0.0414284823587023],[0.4900819999999999]
3,SIRPA_CD47,[SIRPA],[CD47],Monocytes_intermediate,cDC1,False,,,[0.2200808945990958],True,[-0.0631467073654104],[0.0414284823587023],[0.4900819999999999]
4,SIRPA_CD47,[SIRPA],[CD47],Monocytes_non-classical,cDC1,False,,,[0.3581452104942038],True,[-0.0631467073654104],[0.0414284823587023],[0.4900819999999999]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,CLEC2B_KLRF1,[CLEC2B],[KLRF1],T_gd,NK_CD16_bright,False,,,[0.3129055515501081],True,[-0.2136441721063],[9.141448745989339e-13],[0.229479]
1106,CLEC2B_KLRF1,[CLEC2B],[KLRF1],cDC1,NK_CD16_bright,False,,,[0.2812129425715737],True,[-0.2136441721063],[9.141448745989339e-13],[0.229479]
1107,CLEC2B_KLRF1,[CLEC2B],[KLRF1],iNKT_cells,NK_CD16_bright,False,,,[0.1133782824698367],True,[-0.2136441721063],[9.141448745989339e-13],[0.229479]
1108,CLEC2B_KLRF1,[CLEC2B],[KLRF1],pDC,NK_CD16_bright,False,,,[0.1428571428571428],True,[-0.2136441721063],[9.141448745989339e-13],[0.229479]


In [157]:
cols2correct = ['partner_A_genes', 'partner_B_genes', 'logFC_gene_A', 'adj_pval_gene_A',
       'percent_expr_gene_A', 'logFC_gene_B',
       'adj_pval_gene_B', 'percent_expr_gene_B']

for row in list(df_output_downreg.index):
    #print('row', row)
    for col in cols2correct:
        #print('column', col)
        curr_value = df_output_downreg.loc[row, col] # with []
        #print(curr_value)
        if (curr_value != 'NA') & (len(curr_value) == 1): # ignoring NAs and lists of length > 1 - complex genes that is
        #if not isinstance(curr_value, list) & (curr_value != 'NA'): # ignoring NAs and lists of length > 1 - complex genes that is
            df_output_downreg.loc[row, col] = curr_value[0] # this just get the element - string if a gene, numerical value if it's a stat
            

In [158]:
df_output_downreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,PVR_CD96,PVR,CD96,cDC2,iNKT_cells,False,,,0.133672,True,-0.081657,0.009195,0.204435
1,SIRPA_CD47,SIRPA,CD47,Macrophages,cDC1,False,,,0.526814,True,-0.063147,0.041428,0.490082
2,SIRPA_CD47,SIRPA,CD47,Monocytes_classical,cDC1,False,,,0.373584,True,-0.063147,0.041428,0.490082
3,SIRPA_CD47,SIRPA,CD47,Monocytes_intermediate,cDC1,False,,,0.220081,True,-0.063147,0.041428,0.490082
4,SIRPA_CD47,SIRPA,CD47,Monocytes_non-classical,cDC1,False,,,0.358145,True,-0.063147,0.041428,0.490082
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,CLEC2B_KLRF1,CLEC2B,KLRF1,T_gd,NK_CD16_bright,False,,,0.312906,True,-0.213644,0.0,0.229479
1106,CLEC2B_KLRF1,CLEC2B,KLRF1,cDC1,NK_CD16_bright,False,,,0.281213,True,-0.213644,0.0,0.229479
1107,CLEC2B_KLRF1,CLEC2B,KLRF1,iNKT_cells,NK_CD16_bright,False,,,0.113378,True,-0.213644,0.0,0.229479
1108,CLEC2B_KLRF1,CLEC2B,KLRF1,pDC,NK_CD16_bright,False,,,0.142857,True,-0.213644,0.0,0.229479


In [159]:
df_output_downreg.columns

Index(['interaction', 'partner_A_genes', 'partner_B_genes', 'celltype_A',
       'celltype_B', 'is_partner_A_DE', 'logFC_gene_A', 'adj_pval_gene_A',
       'percent_expr_gene_A', 'is_partner_B_DE', 'logFC_gene_B',
       'adj_pval_gene_B', 'percent_expr_gene_B'],
      dtype='object')

In [160]:
save_path

'/lustre/scratch117/cellgen/team292/aa22/adata_objects/202009_CVID_revision/'

In [161]:
#df_output_downreg.to_csv(save_path + '20210318_cellphone_interactions_table_with_gene_stats_downreg_in_CVID_validation_cohort_no_logFC_cutoff.csv')

### Splitting tables into 2 tables: simple interactions and complex interactions, latter being deconvoluted into pseudo-interactions for each subunit of a complex

In [162]:
# getting indices of complex interactions
complex_interaction_rows_downreg = []

for n_row in list(df_output_downreg.index):
    #print('row', n_row)
    
    curr_partner_A_genes = df_output_downreg.loc[n_row, 'partner_A_genes']
    curr_partner_B_genes = df_output_downreg.loc[n_row, 'partner_B_genes']
    
    if isinstance(curr_partner_A_genes, list) or isinstance(curr_partner_B_genes, list): # if partner A or B is a complex
        complex_interaction_rows_downreg.append(n_row)
        

In [163]:
len(complex_interaction_rows_downreg)

221

In [164]:
np.unique(complex_interaction_rows_downreg, return_counts=True)

(array(['10', '1023', '1024', '1025', '1026', '1027', '1028', '1029',
        '1030', '1031', '1032', '1033', '1034', '1035', '1036', '1037',
        '1038', '1039', '1040', '1041', '1042', '1043', '1044', '1045',
        '1046', '1047', '1048', '1049', '1050', '1051', '1052', '1053',
        '1054', '1055', '1056', '1057', '1058', '1059', '1060', '1061',
        '1062', '1063', '1064', '1065', '1066', '1067', '1068', '1069',
        '1070', '1071', '1072', '1073', '1074', '1075', '1076', '1077',
        '1078', '1079', '1080', '1081', '1082', '109', '11', '110', '111',
        '112', '113', '114', '115', '116', '117', '118', '119', '12',
        '120', '121', '122', '123', '124', '125', '13', '14', '15', '16',
        '17', '18', '180', '181', '182', '183', '184', '185', '186', '187',
        '188', '189', '19', '190', '191', '192', '193', '194', '195',
        '196', '197', '198', '199', '20', '200', '201', '208', '209', '21',
        '210', '211', '212', '22', '23', '24', '25', '26'

In [165]:
# splitting simple and complex interactions into 2 separate tables
df_output_downreg_complex = df_output_downreg.loc[complex_interaction_rows_downreg,:]
df_output_downreg_simple = df_output_downreg.drop(complex_interaction_rows_downreg, axis=0)

In [166]:
df_output_downreg_simple.shape

(889, 13)

In [167]:
df_output_downreg_complex.shape

(221, 13)

In [168]:
df_output_downreg.shape

(1110, 13)

In [169]:
df_output_downreg

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
0,PVR_CD96,PVR,CD96,cDC2,iNKT_cells,False,,,0.133672,True,-0.081657,0.009195,0.204435
1,SIRPA_CD47,SIRPA,CD47,Macrophages,cDC1,False,,,0.526814,True,-0.063147,0.041428,0.490082
2,SIRPA_CD47,SIRPA,CD47,Monocytes_classical,cDC1,False,,,0.373584,True,-0.063147,0.041428,0.490082
3,SIRPA_CD47,SIRPA,CD47,Monocytes_intermediate,cDC1,False,,,0.220081,True,-0.063147,0.041428,0.490082
4,SIRPA_CD47,SIRPA,CD47,Monocytes_non-classical,cDC1,False,,,0.358145,True,-0.063147,0.041428,0.490082
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,CLEC2B_KLRF1,CLEC2B,KLRF1,T_gd,NK_CD16_bright,False,,,0.312906,True,-0.213644,0.0,0.229479
1106,CLEC2B_KLRF1,CLEC2B,KLRF1,cDC1,NK_CD16_bright,False,,,0.281213,True,-0.213644,0.0,0.229479
1107,CLEC2B_KLRF1,CLEC2B,KLRF1,iNKT_cells,NK_CD16_bright,False,,,0.113378,True,-0.213644,0.0,0.229479
1108,CLEC2B_KLRF1,CLEC2B,KLRF1,pDC,NK_CD16_bright,False,,,0.142857,True,-0.213644,0.0,0.229479


In [170]:
df_output_downreg_complex

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
7,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,B_cells_memory,True,-0.08623,0.009779,0.561617,False,,,"[0.1996098829648894, 0.3446033810143042]"
8,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,B_cells_memory_activated,True,-0.08623,0.009779,0.561617,False,,,"[0.1988657844990548, 0.3986767485822306]"
9,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,MAIT_cells,True,-0.08623,0.009779,0.561617,False,,,"[0.124467178175618, 0.2557544757033248]"
10,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,Macrophages,True,-0.08623,0.009779,0.561617,False,,,"[0.5068349106203995, 0.224763406940063]"
11,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,Monocytes_intermediate,True,-0.08623,0.009779,0.561617,False,,,"[0.2307875327147275, 0.1453723530811325]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1078,CD47_SIRB1_complex,CD47,"[SIRPB1, TYROBP]",iNKT_cells,cDC1,False,,,0.346877,True,"[-0.0370663419936127, -0.0884798100710285]","[0.0122364390604355, 1.50842891999608e-05]","[0.130653, 0.985189]"
1079,CD47_SIRB1_complex,CD47,"[SIRPB1, TYROBP]",pDC,Macrophages,False,,,0.285714,True,"[-0.0425479309720542, -0.0842033434927592]","[0.0282921855917273, 0.001152148442965]","[0.192878, 0.989826]"
1080,CD47_SIRB1_complex,CD47,"[SIRPB1, TYROBP]",pDC,cDC1,False,,,0.285714,True,"[-0.0370663419936127, -0.0884798100710285]","[0.0122364390604355, 1.50842891999608e-05]","[0.130653, 0.985189]"
1081,CD47_SIRB1_complex,CD47,"[SIRPB1, TYROBP]",Macrophages,Macrophages,False,,,0.519716,True,"[-0.0425479309720542, -0.0842033434927592]","[0.0282921855917273, 0.001152148442965]","[0.192878, 0.989826]"


In [171]:
# checking whether any complexes interact with other complexes and how many subunits each complex contains here

n_subunits_downreg = []

for n_row in list(df_output_downreg.index):
    #print('row', n_row)
    
    curr_partner_A_genes = df_output_downreg.loc[n_row, 'partner_A_genes']
    curr_partner_B_genes = df_output_downreg.loc[n_row, 'partner_B_genes']
    
    if isinstance(curr_partner_A_genes, list) and isinstance(curr_partner_B_genes, list): # if partner A AND B is a complex
        print('row', n_row)
        print('both are complexes')
        
    if isinstance(curr_partner_A_genes, list): # if partner A is a complex
        print('row', n_row)
        print('curr_partner_A_genes',curr_partner_A_genes)
        print('partner A is a complex, len is:', len(curr_partner_A_genes))
        n_subunits_downreg.append(len(curr_partner_A_genes))
        
    if isinstance(curr_partner_B_genes, list): # if partner B is a complex
        print('row', n_row)
        print('curr_partner_B_genes',curr_partner_B_genes)
        print('partner B is a complex, len is:', len(curr_partner_B_genes))
        n_subunits_downreg.append(len(curr_partner_B_genes))

row 7
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 8
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 9
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 10
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 11
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 12
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 13
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 14
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 15
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 16
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 17
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 18
curr_partner_B_genes ['ITGB1', 'ITGA4']
partner B is a complex, len is: 2
row 19
curr_partner_B_genes ['I

In [172]:
np.unique(n_subunits_downreg, return_counts=True)

(array([2]), array([221]))

#### So 1 more scenario to ignore: there are no interactions of a complex with a complex
#### And max complex size is 2 subunits here

In [173]:
df_output_downreg_complex

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
7,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,B_cells_memory,True,-0.08623,0.009779,0.561617,False,,,"[0.1996098829648894, 0.3446033810143042]"
8,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,B_cells_memory_activated,True,-0.08623,0.009779,0.561617,False,,,"[0.1988657844990548, 0.3986767485822306]"
9,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,MAIT_cells,True,-0.08623,0.009779,0.561617,False,,,"[0.124467178175618, 0.2557544757033248]"
10,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,Macrophages,True,-0.08623,0.009779,0.561617,False,,,"[0.5068349106203995, 0.224763406940063]"
11,PLAUR_integrin_a4b1_complex,PLAUR,"[ITGB1, ITGA4]",Monocytes_classical,Monocytes_intermediate,True,-0.08623,0.009779,0.561617,False,,,"[0.2307875327147275, 0.1453723530811325]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1078,CD47_SIRB1_complex,CD47,"[SIRPB1, TYROBP]",iNKT_cells,cDC1,False,,,0.346877,True,"[-0.0370663419936127, -0.0884798100710285]","[0.0122364390604355, 1.50842891999608e-05]","[0.130653, 0.985189]"
1079,CD47_SIRB1_complex,CD47,"[SIRPB1, TYROBP]",pDC,Macrophages,False,,,0.285714,True,"[-0.0425479309720542, -0.0842033434927592]","[0.0282921855917273, 0.001152148442965]","[0.192878, 0.989826]"
1080,CD47_SIRB1_complex,CD47,"[SIRPB1, TYROBP]",pDC,cDC1,False,,,0.285714,True,"[-0.0370663419936127, -0.0884798100710285]","[0.0122364390604355, 1.50842891999608e-05]","[0.130653, 0.985189]"
1081,CD47_SIRB1_complex,CD47,"[SIRPB1, TYROBP]",Macrophages,Macrophages,False,,,0.519716,True,"[-0.0425479309720542, -0.0842033434927592]","[0.0282921855917273, 0.001152148442965]","[0.192878, 0.989826]"


In [174]:
# Duplicating the table and then choosing only 0th or 1st or 2nd values for the complexes
df_output_downreg_complex_member_1 = df_output_downreg_complex.copy()
df_output_downreg_complex_member_2 = df_output_downreg_complex.copy()
#df_output_downreg_complex_member_3 = df_output_downreg_complex.copy()

In [175]:
# splitting complex interaction entries by subunits / members
# for any values in these tables, if it's a list, save the 0th for df_output_upreg_complex_member_1 and 1st for df_output_upreg_complex_member_2

# which rows contain interaction with a 3-subunit complex? to then subset df_output_downreg_complex_member_3
#subunit_3_rows = []

for n_row in list(df_output_downreg_complex.index):
    #print('outside for loop, row', n_row)
    
    for col in df_output_downreg_complex.columns:
        #print('col', col)
        if isinstance(df_output_downreg_complex.loc[n_row, col], list):
            df_output_downreg_complex_member_1.loc[n_row, col] = df_output_downreg_complex.loc[n_row, col][0]
            df_output_downreg_complex_member_2.loc[n_row, col] = df_output_downreg_complex.loc[n_row, col][1]
            
            # additionally, if there are 3 subunits, separate into 3 entries
            #if len(df_output_upreg_complex.loc[n_row, col]) == 3:
                #df_output_upreg_complex_member_3.loc[n_row, col] = df_output_upreg_complex.loc[n_row, col][2]
                #print('3-subunit complex')
                #print('row', n_row)
                #print('adding to subunit_3_rows')
                #subunit_3_rows.append(n_row)


In [176]:
# making indices uniques for concatenantion later
df_output_downreg_complex_member_1.index = [idx + '_member_1' for idx in df_output_downreg_complex_member_1.index]
df_output_downreg_complex_member_2.index = [idx + '_member_2' for idx in df_output_downreg_complex_member_2.index]
#df_output_upreg_complex_member_3.index = [idx + '_member_3' for idx in df_output_upreg_complex_member_3.index]

In [177]:
# getting all indices
idx_concat = list(df_output_downreg_complex_member_1.index) + list(df_output_downreg_complex_member_2.index)

# sorting by original index number, so that the order is: member 1, member 2 and (where applicable) member 3
idx_concat.sort(key = lambda x: x.split('_')[0])
idx_concat

['10_member_1',
 '10_member_2',
 '1023_member_1',
 '1023_member_2',
 '1024_member_1',
 '1024_member_2',
 '1025_member_1',
 '1025_member_2',
 '1026_member_1',
 '1026_member_2',
 '1027_member_1',
 '1027_member_2',
 '1028_member_1',
 '1028_member_2',
 '1029_member_1',
 '1029_member_2',
 '1030_member_1',
 '1030_member_2',
 '1031_member_1',
 '1031_member_2',
 '1032_member_1',
 '1032_member_2',
 '1033_member_1',
 '1033_member_2',
 '1034_member_1',
 '1034_member_2',
 '1035_member_1',
 '1035_member_2',
 '1036_member_1',
 '1036_member_2',
 '1037_member_1',
 '1037_member_2',
 '1038_member_1',
 '1038_member_2',
 '1039_member_1',
 '1039_member_2',
 '1040_member_1',
 '1040_member_2',
 '1041_member_1',
 '1041_member_2',
 '1042_member_1',
 '1042_member_2',
 '1043_member_1',
 '1043_member_2',
 '1044_member_1',
 '1044_member_2',
 '1045_member_1',
 '1045_member_2',
 '1046_member_1',
 '1046_member_2',
 '1047_member_1',
 '1047_member_2',
 '1048_member_1',
 '1048_member_2',
 '1049_member_1',
 '1049_member_

In [178]:
df_output_downreg_complex_deconv = pd.concat([df_output_downreg_complex_member_1, df_output_downreg_complex_member_2])

In [179]:
df_output_downreg_complex_deconv

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
7_member_1,PLAUR_integrin_a4b1_complex,PLAUR,ITGB1,Monocytes_classical,B_cells_memory,True,-0.08623,0.009779,0.561617,False,,,0.19961
8_member_1,PLAUR_integrin_a4b1_complex,PLAUR,ITGB1,Monocytes_classical,B_cells_memory_activated,True,-0.08623,0.009779,0.561617,False,,,0.198866
9_member_1,PLAUR_integrin_a4b1_complex,PLAUR,ITGB1,Monocytes_classical,MAIT_cells,True,-0.08623,0.009779,0.561617,False,,,0.124467
10_member_1,PLAUR_integrin_a4b1_complex,PLAUR,ITGB1,Monocytes_classical,Macrophages,True,-0.08623,0.009779,0.561617,False,,,0.506835
11_member_1,PLAUR_integrin_a4b1_complex,PLAUR,ITGB1,Monocytes_classical,Monocytes_intermediate,True,-0.08623,0.009779,0.561617,False,,,0.230788
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1078_member_2,CD47_SIRB1_complex,CD47,TYROBP,iNKT_cells,cDC1,False,,,0.346877,True,-0.08848,0.000015,0.985189
1079_member_2,CD47_SIRB1_complex,CD47,TYROBP,pDC,Macrophages,False,,,0.285714,True,-0.084203,0.001152,0.989826
1080_member_2,CD47_SIRB1_complex,CD47,TYROBP,pDC,cDC1,False,,,0.285714,True,-0.08848,0.000015,0.985189
1081_member_2,CD47_SIRB1_complex,CD47,TYROBP,Macrophages,Macrophages,False,,,0.519716,True,-0.084203,0.001152,0.989826


In [180]:
# organising entries so that member 1 entry is followed by member 2 entry and then member 3 entry if applicable
df_output_downreg_complex_deconv = df_output_downreg_complex_deconv.loc[idx_concat,:]

In [181]:
df_output_downreg_complex_deconv

Unnamed: 0,interaction,partner_A_genes,partner_B_genes,celltype_A,celltype_B,is_partner_A_DE,logFC_gene_A,adj_pval_gene_A,percent_expr_gene_A,is_partner_B_DE,logFC_gene_B,adj_pval_gene_B,percent_expr_gene_B
10_member_1,PLAUR_integrin_a4b1_complex,PLAUR,ITGB1,Monocytes_classical,Macrophages,True,-0.08623,0.009779,0.561617,False,,,0.506835
10_member_2,PLAUR_integrin_a4b1_complex,PLAUR,ITGA4,Monocytes_classical,Macrophages,True,-0.08623,0.009779,0.561617,False,,,0.224763
1023_member_1,CD47_SIRB1_complex,CD47,SIRPB1,B_cells_memory,Macrophages,False,,,0.458388,True,-0.042548,0.028292,0.192878
1023_member_2,CD47_SIRB1_complex,CD47,TYROBP,B_cells_memory,Macrophages,False,,,0.458388,True,-0.084203,0.001152,0.989826
1024_member_1,CD47_SIRB1_complex,CD47,SIRPB1,B_cells_memory,cDC1,False,,,0.458388,True,-0.037066,0.012236,0.130653
...,...,...,...,...,...,...,...,...,...,...,...,...,...
761_member_2,IFNG_Type_II_IFNR,IFNG,IFNGR2,T_regs,pDC,True,-0.304348,0.0,0.849491,False,,,0.120301
8_member_1,PLAUR_integrin_a4b1_complex,PLAUR,ITGB1,Monocytes_classical,B_cells_memory_activated,True,-0.08623,0.009779,0.561617,False,,,0.198866
8_member_2,PLAUR_integrin_a4b1_complex,PLAUR,ITGA4,Monocytes_classical,B_cells_memory_activated,True,-0.08623,0.009779,0.561617,False,,,0.398677
9_member_1,PLAUR_integrin_a4b1_complex,PLAUR,ITGB1,Monocytes_classical,MAIT_cells,True,-0.08623,0.009779,0.561617,False,,,0.124467


In [184]:
# saving these deconvoluted complex interactions
df_output_downreg_complex_deconv.to_csv(save_path + '20210416_cellphone_interactions_table_with_gene_stats_downreg_in_CVID_all_validation_cohort_no_logFC_cutoff_complexes_deconv_into_pseduinteractions.csv')

In [183]:
# saving the simple interactions table
df_output_downreg_simple.to_csv(save_path + '20210416_cellphone_interactions_table_with_gene_stats_downreg_in_CVID_all_validation_cohort_no_logFC_cutoff_simple_interactions.csv')

In [181]:
save_path

'/lustre/scratch117/cellgen/team292/aa22/adata_objects/202009_CVID_revision/'

In [93]:
# manually checking some hits from previous version of analysis of twins
for interaction in list(df_Exrp_LR_in_celltype_pairs_upreg_DE.index):
    if 'CCL22' in interaction:
        print(interaction)
        curr_subset = pd.DataFrame(df_Exrp_LR_in_celltype_pairs_upreg_DE.loc[interaction])
        curr_subset_nonzero_interacting_celltype_pairs = list(curr_subset[curr_subset[interaction] > 0].index)
        print('this interaction is detected in following celltype pairs:', curr_subset_nonzero_interacting_celltype_pairs)

CCL22_CCR4
this interaction is detected in following celltype pairs: ['B_cells_memory---iNKT_cells', 'B_cells_memory_activated---iNKT_cells', 'B_cells_naive---iNKT_cells', 'B_cells_naive_activated---iNKT_cells', 'MAIT_cells---iNKT_cells', 'Macrophages---iNKT_cells', 'Monocytes_classical---iNKT_cells', 'Monocytes_intermediate---iNKT_cells', 'Monocytes_non-classical---iNKT_cells', 'NK_CD16_bright---iNKT_cells', 'NK_CD16_bright_activated---iNKT_cells', 'NK_CD56_bright_activated---iNKT_cells', 'Plasma_cells---iNKT_cells', 'Precursor_cells---iNKT_cells', 'T4_activated---iNKT_cells', 'T4_memory---iNKT_cells', 'T4_naive---iNKT_cells', 'T8_activated---iNKT_cells', 'T8_naive---iNKT_cells', 'TCM_CD8+---iNKT_cells', 'TEM_CD8+---iNKT_cells', 'TMRA_CD8+---iNKT_cells', 'T_gd---iNKT_cells', 'T_regs---iNKT_cells', 'cDC1---iNKT_cells', 'cDC2---iNKT_cells', 'iNKT_cells---B_cells_memory', 'iNKT_cells---B_cells_memory_activated', 'iNKT_cells---B_cells_naive', 'iNKT_cells---B_cells_naive_activated', 'iNKT_

In [90]:
curr_subset = pd.DataFrame(df_Exrp_LR_in_celltype_pairs_upreg_DE.loc['CCL22_CCR4'])
curr_subset_nonzero_interacting_celltype_pairs = list(curr_subset[curr_subset['CCL22_CCR4'] > 0].index)
curr_subset_nonzero_interacting_celltype_pairs

['B_cells_memory---iNKT_cells',
 'B_cells_memory_activated---iNKT_cells',
 'B_cells_naive---iNKT_cells',
 'B_cells_naive_activated---iNKT_cells',
 'MAIT_cells---iNKT_cells',
 'Macrophages---iNKT_cells',
 'Monocytes_classical---iNKT_cells',
 'Monocytes_intermediate---iNKT_cells',
 'Monocytes_non-classical---iNKT_cells',
 'NK_CD16_bright---iNKT_cells',
 'NK_CD16_bright_activated---iNKT_cells',
 'NK_CD56_bright_activated---iNKT_cells',
 'Plasma_cells---iNKT_cells',
 'Precursor_cells---iNKT_cells',
 'T4_activated---iNKT_cells',
 'T4_memory---iNKT_cells',
 'T4_naive---iNKT_cells',
 'T8_activated---iNKT_cells',
 'T8_naive---iNKT_cells',
 'TCM_CD8+---iNKT_cells',
 'TEM_CD8+---iNKT_cells',
 'TMRA_CD8+---iNKT_cells',
 'T_gd---iNKT_cells',
 'T_regs---iNKT_cells',
 'cDC1---iNKT_cells',
 'cDC2---iNKT_cells',
 'iNKT_cells---B_cells_memory',
 'iNKT_cells---B_cells_memory_activated',
 'iNKT_cells---B_cells_naive',
 'iNKT_cells---B_cells_naive_activated',
 'iNKT_cells---MAIT_cells',
 'iNKT_cells---Mac

In [89]:
df_Exrp_LR_in_celltype_pairs_upreg_DE.loc['CCL22_CCR4']

B_cells_memory---B_cells_memory_activated    0.0
B_cells_memory---B_cells_naive               0.0
B_cells_memory---B_cells_naive_activated     0.0
B_cells_memory---MAIT_cells                  0.0
B_cells_memory---Macrophages                 0.0
                                            ... 
TMRA_CD8+---TMRA_CD8+                        0.0
T_gd---T_gd                                  0.0
T_regs---T_regs                              0.0
cDC1---cDC1                                  0.0
iNKT_cells---iNKT_cells                      1.0
Name: CCL22_CCR4, Length: 619, dtype: float64

In [92]:
# manually checking some hits from previous version of analysis of twins
for interaction in list(df_Exrp_LR_in_celltype_pairs_downreg_DE.index):
    if 'CCL22' in interaction:
        print(interaction)
        curr_subset = pd.DataFrame(df_Exrp_LR_in_celltype_pairs_downreg_DE.loc[interaction])
        curr_subset_nonzero_interacting_celltype_pairs = list(curr_subset[curr_subset[interaction] > 0].index)
        print('this interaction is detected in following celltype pairs:', curr_subset_nonzero_interacting_celltype_pairs)

CCL22_CCR4
this interaction is detected in following celltype pairs: ['B_cells_memory_activated---B_cells_memory', 'B_cells_memory_activated---B_cells_naive', 'B_cells_memory_activated---B_cells_naive_activated', 'B_cells_memory_activated---MAIT_cells', 'B_cells_memory_activated---Macrophages', 'B_cells_memory_activated---Monocytes_classical', 'B_cells_memory_activated---Monocytes_intermediate', 'B_cells_memory_activated---Monocytes_non-classical', 'B_cells_memory_activated---NK_CD16_bright', 'B_cells_memory_activated---NK_CD16_bright_activated', 'B_cells_memory_activated---NK_CD56_bright_activated', 'B_cells_memory_activated---Plasma_cells', 'B_cells_memory_activated---T4_activated', 'B_cells_memory_activated---T4_memory', 'B_cells_memory_activated---T4_naive', 'B_cells_memory_activated---T8_activated', 'B_cells_memory_activated---T8_naive', 'B_cells_memory_activated---TCM_CD8+', 'B_cells_memory_activated---TEM_CD8+', 'B_cells_memory_activated---TMRA_CD8+', 'B_cells_memory_activated--

In [97]:
# manually checking some hits from previous version of analysis of twins
for interaction in list(df_Exrp_LR_in_celltype_pairs_upreg_DE.index):
    if 'CXCL10_CXCR3' in interaction:
        print(interaction)
        curr_subset = pd.DataFrame(df_Exrp_LR_in_celltype_pairs_upreg_DE.loc[interaction])
        curr_subset_nonzero_interacting_celltype_pairs = list(curr_subset[curr_subset[interaction] > 0].index)
        print('this interaction is detected in following celltype pairs:', curr_subset_nonzero_interacting_celltype_pairs)

CXCL10_CXCR3
this interaction is detected in following celltype pairs: ['B_cells_memory---iNKT_cells', 'B_cells_memory_activated---iNKT_cells', 'B_cells_naive---iNKT_cells', 'B_cells_naive_activated---iNKT_cells', 'MAIT_cells---iNKT_cells', 'Macrophages---iNKT_cells', 'Monocytes_classical---iNKT_cells', 'Monocytes_intermediate---iNKT_cells', 'Monocytes_non-classical---iNKT_cells', 'NK_CD16_bright---iNKT_cells', 'NK_CD16_bright_activated---iNKT_cells', 'NK_CD56_bright_activated---iNKT_cells', 'Plasma_cells---iNKT_cells', 'Precursor_cells---iNKT_cells', 'T4_activated---iNKT_cells', 'T4_memory---iNKT_cells', 'T4_naive---iNKT_cells', 'T8_activated---iNKT_cells', 'T8_naive---iNKT_cells', 'TCM_CD8+---iNKT_cells', 'TEM_CD8+---iNKT_cells', 'TMRA_CD8+---iNKT_cells', 'T_gd---iNKT_cells', 'T_regs---iNKT_cells', 'cDC1---iNKT_cells', 'cDC2---iNKT_cells', 'iNKT_cells---B_cells_memory', 'iNKT_cells---B_cells_memory_activated', 'iNKT_cells---B_cells_naive', 'iNKT_cells---B_cells_naive_activated', 'iNK

In [98]:
# manually checking some hits from previous version of analysis of twins
for interaction in list(df_Exrp_LR_in_celltype_pairs_downreg_DE.index):
    if 'CXCL10_CXCR3' in interaction:
        print(interaction)
        curr_subset = pd.DataFrame(df_Exrp_LR_in_celltype_pairs_downreg_DE.loc[interaction])
        curr_subset_nonzero_interacting_celltype_pairs = list(curr_subset[curr_subset[interaction] > 0].index)
        print('this interaction is detected in following celltype pairs:', curr_subset_nonzero_interacting_celltype_pairs)

CXCL10_CXCR3
this interaction is detected in following celltype pairs: ['B_cells_memory---NK_CD16_bright', 'B_cells_memory---TMRA_CD8+', 'B_cells_memory_activated---NK_CD16_bright', 'B_cells_memory_activated---TMRA_CD8+', 'B_cells_naive---NK_CD16_bright', 'B_cells_naive---TMRA_CD8+', 'B_cells_naive_activated---NK_CD16_bright', 'B_cells_naive_activated---TMRA_CD8+', 'MAIT_cells---NK_CD16_bright', 'MAIT_cells---TMRA_CD8+', 'Macrophages---NK_CD16_bright', 'Macrophages---TMRA_CD8+', 'Monocytes_classical---NK_CD16_bright', 'Monocytes_classical---TMRA_CD8+', 'Monocytes_intermediate---NK_CD16_bright', 'Monocytes_intermediate---TMRA_CD8+', 'Monocytes_non-classical---NK_CD16_bright', 'Monocytes_non-classical---TMRA_CD8+', 'NK_CD16_bright---TMRA_CD8+', 'NK_CD16_bright_activated---NK_CD16_bright', 'NK_CD16_bright_activated---TMRA_CD8+', 'NK_CD56_bright_activated---NK_CD16_bright', 'NK_CD56_bright_activated---TMRA_CD8+', 'Plasma_cells---NK_CD16_bright', 'Plasma_cells---TMRA_CD8+', 'T4_activated---

### Checking some stuff

In [29]:
# reading the user curated database starting files to see what interactions haven't made it here

path = '/home/jovyan/notebooks/Vento_Lab/CVID/202009_new_analysis_revision/CITE_all_samples_analysis/CVID/scTranscriptomics_CITE/cellphonedb_analysis/'

# saving them into .tsv files now
interactions_curated = pd.read_csv(path + 'interactions_curated_subset_notLuz.tsv', sep='\t', index_col=0)
complexes_curated = pd.read_csv(path + 'complex_curated.tsv', sep='\t', index_col=0)

In [30]:
interactions_curated.columns

Index(['partner_a', 'partner_b', 'protein_name_a', 'protein_name_b',
       'annotation_strategy', 'source', 'is_ppi', 'reactome_complex',
       'reactome_reaction', 'reactome_pathway', 'complexPortal_complex',
       'curator', 'comments'],
      dtype='object')

In [31]:
np.unique(interactions_curated['curator'], return_counts=True)

(array(['JRodriguezUbreva', 'RVentoTormo'], dtype=object), array([   1, 1339]))

In [32]:
np.unique(interactions_curated['annotation_strategy'], return_counts=True)

(array(['curated'], dtype=object), array([1340]))

In [65]:
interactions_curated#[interactions_curated['partner_a'] == 'Q92478']

Unnamed: 0_level_0,partner_a,partner_b,protein_name_a,protein_name_b,annotation_strategy,source,is_ppi,reactome_complex,reactome_reaction,reactome_pathway,complexPortal_complex,curator,comments
id_cp_interaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,Q9Y275,Q96RJ3,TN13B_HUMAN,TR13C_HUMAN,curated,uniprot;reactome,True,R-HSA-5676540,R-HSA-5676599,R-HSA-1280215,,JRodriguezUbreva,
CPI-CC0041E1D30,IL12,IL12_receptor,,,curated,uniprot,True,,,,,RVentoTormo,
CPI-CC0104F2A96,ACVR_1B2A_receptor,Activin_ligand_ab,,,curated,PMID:22710174;PMID:22991378,True,,,,,RVentoTormo,
CPI-CC045C36F28,ACVR_1A2A_receptor,Activin_ligand_ab,,,curated,less_common_binding;PMID:22710174;PMID:22991378_,True,,,,,RVentoTormo,
CPI-CC051643E98,IL23,IL23_receptor,,,curated,uniprot,True,,,,,RVentoTormo,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
,O14905,Q6FHJ7,WNT9B_HUMAN,SFRP4_HUMAN,curated,PMID:12775774,True,,,,,RVentoTormo,Inhibition WNT. Soluble proteins
,O14905,Q8N474,WNT9B_HUMAN,SFRP1_HUMAN,curated,PMID:12775774,True,,,,,RVentoTormo,Inhibition WNT. Soluble proteins
,O14905,Q92765,WNT9B_HUMAN,SFRP3_HUMAN,curated,PMID:12775774,True,,,,,RVentoTormo,Inhibition WNT. Soluble proteins
,O14905,Q96HF1,WNT9B_HUMAN,SFRP2_HUMAN,curated,PMID:12775774,True,,,,,RVentoTormo,Inhibition WNT. Soluble proteins


In [28]:
#for interaction in int_cpDB['interacting_pair']:
#    if 'IL' in interaction:
#        print(interaction)
#        print(int_cpDB[int_cpDB['interacting_pair'] == interaction])

In [68]:
interactions_curated.columns

Index(['partner_a', 'partner_b', 'protein_name_a', 'protein_name_b',
       'annotation_strategy', 'source', 'is_ppi', 'reactome_complex',
       'reactome_reaction', 'reactome_pathway', 'complexPortal_complex',
       'curator', 'comments'],
      dtype='object')

In [33]:
list(interactions_curated['partner_a'])[:10]

['Q9Y275',
 'IL12',
 'ACVR_1B2A_receptor',
 'ACVR_1A2A_receptor',
 'IL23',
 'ACVR_1B2B_receptor',
 'integrin_aMb2_complex',
 'ACVR_1C2A_receptor',
 'ACVR_1A2B_receptor',
 'IL27']

For example, IL12 and IL12_receptor interaction is in the initial table but is not in the final, going to see if it made it into the expr table

In [70]:
for compl in list(complexes_curated.index):
    if 'OSMR' in compl:
        print(compl)
        print(complexes_curated.loc[compl,:])

OSMR
uniprot_1                                                           Q99650
uniprot_2                                                           P40189
uniprot_3                                                              NaN
uniprot_4                                                              NaN
transmembrane                                                         True
peripheral                                                           False
secreted                                                             False
secreted_desc                                                          NaN
secreted_highlight                                                   False
receptor                                                              True
receptor_desc                                 Cytokine_receptor_IL6_family
integrin                                                             False
other                                                                False
other_desc          

In [24]:
# database generated from 1.3K odd interactions
database_file = '/home/jovyan/notebooks/Vento_Lab/CVID/202009_new_analysis_revision/CITE_all_samples_analysis/CVID/scTranscriptomics_CITE/cellphonedb_analysis/database_20210218/cellphonedb_user_2021-02-18-14_26.db'

import sqlite3

def importdb(file_path):
    conn = sqlite3.connect(file_path)
    c = conn.cursor()
    c.execute("SELECT name FROM sqlite_master WHERE type='table';")
    for table in c.fetchall():
        yield list(c.execute('SELECT * from ?;', (table[0],)))

In [26]:
database = importdb(database_file)

In [30]:
database

AttributeError: 'generator' object has no attribute 'keys'