# Integrate the results from various other notebooks (DDD, DECIPHER, DNVs), and prep for cytoscape visualization

In [1]:
import matplotlib
matplotlib.use('TkAgg')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
import random

import community

from scipy.stats import mannwhitneyu

import mygene
mg = mygene.MyGeneInfo()

# latex rendering of text in graphs
import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'serif')

sns.set_style('white')

import sys

import visJS2jupyter.visJS_module
import visJS2jupyter.visualizations


% matplotlib inline

# Load the DDD and DECIPHER results

In [2]:
DDD_df = pd.read_csv('../../manuscript/tables_18_02/DDD_in_ASD_CHD_interactome.tsv',sep='\t',names=['gene','num_DDD_patients'],
                    index_col='gene')
DDD_df.head()

Unnamed: 0_level_0,num_DDD_patients
gene,Unnamed: 1_level_1
ANKRD11,41
KMT2A,35
CTNNB1,18
MED13L,16
KAT6B,13


In [3]:
decipher_df = pd.read_excel('../../manuscript/tables_18_02/G_ASD_CHD_GIANT_p2nodes_180313.xlsx',sheetname='G_ASD_CHD_GIANT_p2nodes',
                           dtype={'gene':str},index_col='gene')

decipher_df.head()

Unnamed: 0_level_0,number shared phenotype patients with a mutation in this gene in decipher database (DNV),"number shared phenotype patients explained (with seeds, DNV)","num shared phenotype patients explained (no seeds, DNV)","number shared phenotype patients explained (with seeds, all variants)","num shared phenotype patients explained (no seeds, all variants)",ASD_CHD_DNV,ASD_HC,CHD_HC,z_ASD,z_CHD,z_both,xpos,ypos
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
CTTNBP2,,0.0,,1.0,,0,1,0,47.350731,1.195295,27.75349,-1.538416,-0.37744
USP45,,0.0,,2.0,,0,1,0,30.431117,0.066237,21.53968,-1.29382,-0.194478
SHANK3,,4.0,,9.0,,0,1,0,24.601011,2.211072,16.876845,-1.587422,0.25444
NAA15,,5.0,,11.0,,0,1,1,14.023218,9.630552,15.035266,-0.161088,1.30962
MYH6,,5.0,,12.0,,0,0,1,0.865649,18.513572,14.645288,1.73886,0.46992


In [4]:
combined_df = decipher_df[['number shared phenotype patients with a mutation in this gene in decipher database (DNV)',
                          'ASD_CHD_DNV','ASD_HC','CHD_HC','z_ASD','z_CHD','z_both']]
combined_df.columns=['num_decipher_patients','ASD_CHD_DNV','ASD_HC','CHD_HC','z_ASD','z_CHD','z_both'] # rename the columns so they are more manageable


combined_df.head()

Unnamed: 0_level_0,num_decipher_patients,ASD_CHD_DNV,ASD_HC,CHD_HC,z_ASD,z_CHD,z_both
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CTTNBP2,,0,1,0,47.350731,1.195295,27.75349
USP45,,0,1,0,30.431117,0.066237,21.53968
SHANK3,,0,1,0,24.601011,2.211072,16.876845
NAA15,,0,1,1,14.023218,9.630552,15.035266
MYH6,,0,0,1,0.865649,18.513572,14.645288


In [5]:
# add DDD colum
combined_df['DDD_variant']=np.zeros(len(combined_df))
combined_df['DDD_variant'].loc[DDD_df.index.tolist()]=DDD_df['num_DDD_patients']
combined_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,num_decipher_patients,ASD_CHD_DNV,ASD_HC,CHD_HC,z_ASD,z_CHD,z_both,DDD_variant
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CTTNBP2,,0,1,0,47.350731,1.195295,27.75349,0.0
USP45,,0,1,0,30.431117,0.066237,21.53968,0.0
SHANK3,,0,1,0,24.601011,2.211072,16.876845,0.0
NAA15,,0,1,1,14.023218,9.630552,15.035266,0.0
MYH6,,0,0,1,0.865649,18.513572,14.645288,0.0


In [6]:
sum(combined_df['DDD_variant'])

407.0

In [7]:
# add the piechart and circos plot data
combined_df['has_DDD_variant']=(combined_df['DDD_variant']>0)*1.0
combined_df['has_decipher_variant']=(combined_df['num_decipher_patients']>0)*1.0

# make a piechart for the DDD, decipher, and DNV columns
combined_df['piechart_data']=['piechart: attributelist="has_DDD_variant,has_decipher_variant,ASD_CHD_DNV" colorlist="red,yellow,blue" showlabels=false']*len(combined_df)

#combined_df.to_csv('combined_DDD_DECIPHER_ASD_CHD_node_table.csv')

combined_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,num_decipher_patients,ASD_CHD_DNV,ASD_HC,CHD_HC,z_ASD,z_CHD,z_both,DDD_variant,has_DDD_variant,has_decipher_variant,piechart_data
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
CTTNBP2,,0,1,0,47.350731,1.195295,27.75349,0.0,0.0,0.0,"piechart: attributelist=""has_DDD_variant,has_d..."
USP45,,0,1,0,30.431117,0.066237,21.53968,0.0,0.0,0.0,"piechart: attributelist=""has_DDD_variant,has_d..."
SHANK3,,0,1,0,24.601011,2.211072,16.876845,0.0,0.0,0.0,"piechart: attributelist=""has_DDD_variant,has_d..."
NAA15,,0,1,1,14.023218,9.630552,15.035266,0.0,0.0,0.0,"piechart: attributelist=""has_DDD_variant,has_d..."
MYH6,,0,0,1,0.865649,18.513572,14.645288,0.0,0.0,0.0,"piechart: attributelist=""has_DDD_variant,has_d..."


In [8]:
sum(combined_df['ASD_CHD_DNV'])

41

In [9]:
combined_df.dropna()[['num_decipher_patients','DDD_variant']].sort_values('DDD_variant',ascending=False).head(15)

Unnamed: 0_level_0,num_decipher_patients,DDD_variant
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ANKRD11,10.0,41.0
CTNNB1,0.0,18.0
MED13L,7.0,16.0
KAT6B,3.0,13.0
EP300,9.0,13.0
KAT6A,7.0,9.0
SATB2,3.0,9.0
KANSL1,7.0,9.0
SRRM2,0.0,5.0
TCF12,4.0,4.0


In [10]:
# print out the genes with 2 of DDD, Decipher, DNV and no seeds, for pathway enrichment
multi_evidence_genes = combined_df[(combined_df['has_DDD_variant']+combined_df['has_decipher_variant']+combined_df['ASD_CHD_DNV'])>1]

# remove seeds
multi_evidence_genes = multi_evidence_genes[(multi_evidence_genes['ASD_HC']+multi_evidence_genes['CHD_HC'])==0]

print(len(multi_evidence_genes))
multi_evidence_genes.head()

42


Unnamed: 0_level_0,num_decipher_patients,ASD_CHD_DNV,ASD_HC,CHD_HC,z_ASD,z_CHD,z_both,DDD_variant,has_DDD_variant,has_decipher_variant,piechart_data
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
EBF3,3.0,0,0,0,3.590142,3.357178,4.826389,3.0,1.0,1.0,"piechart: attributelist=""has_DDD_variant,has_d..."
EP300,9.0,1,0,0,3.261691,3.033605,4.33535,13.0,1.0,1.0,"piechart: attributelist=""has_DDD_variant,has_d..."
CTNNB1,0.0,1,0,0,3.087033,3.142416,4.288727,18.0,1.0,0.0,"piechart: attributelist=""has_DDD_variant,has_d..."
ZBTB20,1.0,0,0,0,3.046547,2.44815,4.016793,1.0,1.0,1.0,"piechart: attributelist=""has_DDD_variant,has_d..."
KANSL1L,1.0,0,0,0,3.206343,2.351072,3.774944,1.0,1.0,1.0,"piechart: attributelist=""has_DDD_variant,has_d..."


In [11]:
for g in multi_evidence_genes.index.tolist():
    print(g)

EBF3
EP300
CTNNB1
ZBTB20
KANSL1L
VPS13D
PCDHA12
ETS1
MED13L
SALL3
KANSL1
VEZF1
DMXL2
CRB1
ZMYM2
BPTF
CREBBP
BAZ2B
USP34
BRWD3
HECW2
SPEN
ANKHD1-EIF4EBP3
SATB2
SOS1
ZFC3H1
ZNF462
ANKRD11
KAT6A
BRD4
PPP1R12A
TCF12
CTBP2
UBR3
PCBP2
MEIS2
TLK2
KAT6B
SRRM2
SPRY1
NF1
SIPA1L1


In [52]:
#combined_df.dropna().to_csv('../combined_DDD_DECIPHER_ASD_CHD_node_table_no_seeds.csv')