# Process the zscore results from the 5 diseases


For figure 2f

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns

from IPython.display import display

# latex rendering of text in graphs
import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'sans-serif')
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

% matplotlib inline


In [3]:
import sys
sys.path.append('/Users/brinrosenthal/Google Drive/UCSD/cluster_code/pyvenn-master/')
import venn

# Load the PCnet interactome


In [4]:
Gint = nx.read_gpickle('/Users/brinrosenthal/Documents/CCBB_tickets_data/PCnet/G_PCnet.gpickle')
print(len(Gint.nodes()))
print(len(Gint.edges()))

19781
2724724


# Load the seed genes

In [5]:
ASD_HC = pd.read_csv('../../../data/ASD_HC.txt',names=['HC_genes'])

ASD_HC = ASD_HC['HC_genes'].tolist()
print(len(ASD_HC))


CHD_HC = pd.read_csv('../../../data/CHD_HC_new.txt',names=['HC_genes'])
CHD_HC = CHD_HC['HC_genes'].tolist()
print(len(CHD_HC))

65
66


# Load the zscore data

In [9]:
zthresh=2

num_reps=5000

# -------------- ASD and CHD results --------------
CHD_z = pd.read_csv('../../z_score_results/new_CHD_cohort/z_CHD_'+str(num_reps)+'_repsPCnet_degree_binning.tsv',sep='\t',names=['symbol','z'])
CHD_z.index=CHD_z['symbol']
CHD_z=CHD_z.sort_values('z',ascending=False)

ASD_z = pd.read_csv('../../z_score_results/new_CHD_cohort/z_ASD_'+str(num_reps)+'_repsPCnet_degree_binning.tsv',sep='\t',names=['symbol','z'])
ASD_z.index=ASD_z['symbol']
ASD_z=ASD_z.sort_values('z',ascending=False)



# -------------- DISGENET results --------------
print('\nPsoriasis:')
psoriasis_z = pd.read_csv('z_score_results/z_Psoriasis_5000_reps_degree_binning.tsv',sep='\t',names=['symbol','z'])
psoriasis_z.index=psoriasis_z['symbol']
psoriasis_z=psoriasis_z.sort_values('z',ascending=False)
seed_genes_temp = pd.read_csv('Psoriasis_seed_genes_200420.tsv',sep='\t',index_col='Unnamed: 0')
seed_genes_temp = [str(g[1:-1]).strip("'") for g in seed_genes_temp['seed_genes'].tolist()[0][1:-1].split(', ')]
print(len(seed_genes_temp))
print(len(psoriasis_z))
psoriasis_z=psoriasis_z.loc[list(np.setdiff1d(psoriasis_z.index.tolist(),seed_genes_temp))]
print(len(psoriasis_z))


print('\nAtherosclerosis:')
atherosclerosis_z = pd.read_csv('z_score_results/z_Atherosclerosis_5000_reps_degree_binning.tsv',sep='\t',names=['symbol','z'])
atherosclerosis_z.index=atherosclerosis_z['symbol']
atherosclerosis_z=atherosclerosis_z.sort_values('z',ascending=False)
# load seed genes and drop them
seed_genes_temp = pd.read_csv('Atherosclerosis_seed_genes_200420.tsv',sep='\t',index_col='Unnamed: 0')
seed_genes_temp = [str(g[1:-1]).strip("'") for g in seed_genes_temp['seed_genes'].tolist()[0][1:-1].split(', ')]
print(len(seed_genes_temp))
print(len(atherosclerosis_z))
atherosclerosis_z=atherosclerosis_z.loc[list(np.setdiff1d(atherosclerosis_z.index.tolist(),seed_genes_temp))]
print(len(atherosclerosis_z))


print('\nKartagener syndrome:')
kartagena_z = pd.read_csv('z_score_results/z_Kartagener_5000_reps_degree_binning.tsv',sep='\t',names=['symbol','z'])
kartagena_z.index=kartagena_z['symbol']
kartagena_z=kartagena_z.sort_values('z',ascending=False)
kartagena_z_genes = kartagena_z[kartagena_z['z']>zthresh].index.tolist()
print(len(kartagena_z_genes))
# load seed genes and drop them
seed_genes_temp = pd.read_csv('Kartagener Syndrome_seed_genes_200420.tsv',sep='\t',index_col='Unnamed: 0')
seed_genes_temp = [str(g[1:-1]).strip("'") for g in seed_genes_temp['seed_genes'].tolist()[0][1:-1].split(', ')]
print(len(seed_genes_temp))
print(len(kartagena_z))
kartagena_z=kartagena_z.loc[list(np.setdiff1d(kartagena_z.index.tolist(),seed_genes_temp))]
print(len(kartagena_z))



Psoriasis:
59
19781
19722

Atherosclerosis:
65
19781
19716

Kartagener syndrome:
962
38
19781
19744


# How much do the networks overlap?

- Plot the gene overlap in subgraphs

In [10]:
# function to return significant digits in exp form
def nsf(num, n=1):
    """n-Significant Figures"""
    numstr = ("{0:.%ie}" % (n-1)).format(num)
    return float(numstr)

def calculate_size_network_overlap(z1,z2,zthresh=3):
    z_merged = z1.join(z2['z'],lsuffix='_1',rsuffix='_2')
    z_combined = z_merged['z_1']*z_merged['z_2']*(z_merged['z_1']>0)*(z_merged['z_2']>0)
    high_z_genes = z_combined[z_combined>zthresh].index.tolist()
    
    return(len(high_z_genes))

def calculate_expected_overlap(d1,d2,z1,z2,plot=False,zthresh=4,numreps=1000):

    z_d1d2_size=calculate_size_network_overlap(z1,z2,zthresh=zthresh)

    high_z_rand = []
    for r in np.arange(numreps):
        # use permutation shuffling method instead of Fnew comparison
        d1_shuf_genes = z1.index.tolist()
        np.random.shuffle(d1_shuf_genes)
        d1_shuf=z1[:]
        d1_shuf.index=d1_shuf_genes

        d2_shuf_genes = z2.index.tolist()
        np.random.shuffle(d2_shuf_genes)
        d2_shuf=z2[:]
        d2_shuf.index=d2_shuf_genes

        high_z_temp = calculate_size_network_overlap(d1_shuf,d2_shuf,zthresh=zthresh)
        high_z_rand.append(high_z_temp)

    if plot==True:
        sns.distplot(high_z_rand,label='expected network intersection size')
        plt.plot([z_d1d2_size,z_d1d2_size],[0,0.015],label='observed '+d1+'-'+d2+' network intersection size')
        plt.xlabel('size of proximal subgraph, z>'+str(zthresh),fontsize=16)
        plt.legend(fontsize=12)
    return z_d1d2_size,high_z_rand

In [11]:
from scipy.stats import hypergeom
from scipy.stats import norm

z_dict = {'psoriasis':psoriasis_z,'atherosclerosis':atherosclerosis_z,'kartagener':kartagena_z,
          'CHD':CHD_z,'ASD':ASD_z}

# save the num overlap and overlap p-val in dataframes

focal_diseases = ['psoriasis','atherosclerosis','kartagener','CHD','ASD']

network_num_overlap = pd.DataFrame(np.zeros((len(focal_diseases),len(focal_diseases))),index=focal_diseases)
network_num_overlap.columns = focal_diseases

network_obs_exp = pd.DataFrame(np.zeros((len(focal_diseases),len(focal_diseases))),index=focal_diseases)
network_obs_exp.columns = focal_diseases

network_pval_overlap = pd.DataFrame(np.ones((len(focal_diseases),len(focal_diseases))),index=focal_diseases)
network_pval_overlap.columns = focal_diseases

for i in np.arange(len(focal_diseases)-1):
    for j in np.arange(1+i,len(focal_diseases)):
        d1=focal_diseases[i]
        d2=focal_diseases[j]
        z1=z_dict[d1]
        z2=z_dict[d2]

        # replace hypergeometric with permutation empirical p
        z_d1d2_size,high_z_rand=calculate_expected_overlap(d1,d2,z1,z2,plot=False,numreps=100,zthresh=3)
        ztemp = (z_d1d2_size-np.mean(high_z_rand))/np.std(high_z_rand)
        ptemp = norm.sf(ztemp)
        print(d1+' + '+d2)
        print(z_d1d2_size)
        obs_exp_temp = float(z_d1d2_size)/np.mean(high_z_rand)
        print(obs_exp_temp)
        print(ptemp)
        
        
        network_num_overlap.loc[d1][d2]=z_d1d2_size
        network_num_overlap.loc[d2][d1]=z_d1d2_size

        network_pval_overlap.loc[d1][d2]=ptemp
        network_pval_overlap.loc[d2][d1]=ptemp
        
        network_obs_exp.loc[d1][d2]=obs_exp_temp
        network_obs_exp.loc[d2][d1]=obs_exp_temp
        

psoriasis + atherosclerosis
1535
2.847549437910437
0.0
psoriasis + kartagener
110
0.4297210719587468
1.0
psoriasis + CHD
353
0.9279217706745176
0.9558876369044766
psoriasis + ASD
222
0.5294538516575245
1.0
atherosclerosis + kartagener
134
0.3745737127522782
1.0
atherosclerosis + CHD
558
1.0458447351651237
0.07584270725380017
atherosclerosis + ASD
257
0.45061630985569756
1.0
kartagener + CHD
188
0.7572096020621879
0.9999927487356509
kartagener + ASD
290
1.0421533043447013
0.21834377087481976
CHD + ASD
844
2.0737100737100738
1.172874125042493e-162


In [12]:
# make sure significant overlaps have >0 overlapping genes
network_pval_overlap = np.minimum(network_pval_overlap*(network_num_overlap>0)+(network_pval_overlap+1)*(network_num_overlap==0),1)

display(network_num_overlap)
display(network_pval_overlap)

# network_num_overlap.to_csv('network_num_overlap_z'+str(zthresh)+'.tsv',sep='\t')
# (-np.log(network_pval_overlap)).to_csv('network_pval_overlap_z'+str(zthresh)+'_ATH_PSO_KART.tsv',sep='\t')

Unnamed: 0,psoriasis,atherosclerosis,kartagener,CHD,ASD
psoriasis,0.0,1535.0,110.0,353.0,222.0
atherosclerosis,1535.0,0.0,134.0,558.0,257.0
kartagener,110.0,134.0,0.0,188.0,290.0
CHD,353.0,558.0,188.0,0.0,844.0
ASD,222.0,257.0,290.0,844.0,0.0


Unnamed: 0,psoriasis,atherosclerosis,kartagener,CHD,ASD
psoriasis,1.0,0.0,1.0,0.9558876,1.0
atherosclerosis,0.0,1.0,1.0,0.07584271,1.0
kartagener,1.0,1.0,1.0,0.9999927,0.2183438
CHD,0.955888,0.075843,0.999993,1.0,1.172874e-162
ASD,1.0,1.0,0.218344,1.172874e-162,1.0


In [13]:

combined_edges = pd.DataFrame(columns = ['d1','d2',
                                        'num_shared_network','neg_logP_shared_network','edge_label','obs_exp'])


for i in np.arange(len(focal_diseases)-1):
    for j in np.arange(i+1,len(focal_diseases)):
        d1 = focal_diseases[i]
        d2 = focal_diseases[j]
        num_shared_network_temp = network_num_overlap.loc[d1][d2]
        pval_shared_network_temp = network_pval_overlap.loc[d1][d2]
        edge_label_temp = str(nsf(pval_shared_network_temp,2))
        obs_exp_temp = network_obs_exp.loc[d1][d2]
        
        df_temp = pd.DataFrame({'d1':[d1],'d2':[d2],
                               'num_shared_network':[num_shared_network_temp],
                               'neg_logP_shared_network':[-np.log(pval_shared_network_temp)],
                               'edge_label':[edge_label_temp],
                               'obs_exp':[obs_exp_temp]})
        combined_edges = combined_edges.append(df_temp,ignore_index=True)
        

# combined_edges.to_csv('disease_combined_edges_ATH_PSO_KART.tsv',sep='\t')
combined_edges
    

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,d1,d2,edge_label,neg_logP_shared_network,num_shared_network,obs_exp
0,psoriasis,atherosclerosis,0.0,inf,1535.0,2.847549
1,psoriasis,kartagener,1.0,-0.0,110.0,0.429721
2,psoriasis,CHD,0.96,0.045115,353.0,0.927922
3,psoriasis,ASD,1.0,-0.0,222.0,0.529454
4,atherosclerosis,kartagener,1.0,-0.0,134.0,0.374574
5,atherosclerosis,CHD,0.076,2.579094,558.0,1.045845
6,atherosclerosis,ASD,1.0,-0.0,257.0,0.450616
7,kartagener,CHD,1.0,7e-06,188.0,0.75721
8,kartagener,ASD,0.22,1.521685,290.0,1.042153
9,CHD,ASD,1.2e-162,372.859328,844.0,2.07371
