In [3]:
import numpy as np
import pandas as pd
import networkx as nx
import scipy
from scipy import stats
import mygene
import math
import time

import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'serif')
% matplotlib inline

## Creating Background Network ###

#### scratch code

In [2]:
# load TR list from AnimalTFDB 
TR_db_m = pd.read_csv("Mus_musculus_transcription_factors_gene_list.txt", sep = "\t")
TR_db_h = pd.read_csv("Homo_sapiens_transcription_factors_gene_list.txt", sep = "\t")
TR_db = TR_db_m.append(TR_db_h)
TR_list_entrez = TR_db.Entrez_ID

In [3]:
len(TR_db_h)

1691

In [4]:
#translate TR list entrez to symbol
mg = mygene.MyGeneInfo()
translated_DF = mg.getgenes(set(TR_list_entrez), as_dataframe=True)
animal_TF = translated_DF["symbol"].str.upper()
len(animal_TF)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-2956...done.


2956

In [5]:
#DEG_list = [7,9,10]
#edge_list = [(2,5), (2,4), (1,5), (3,5), (3,4), (6,4), (6,7), (6,9), (6,10), (8,9), (8,10)]
#DG = nx.DiGraph()
#DG.add_edges_from(edge_list)
#sym1_list = [2, 2, 1, 3, 3, 6, 6, 6, 6, 8, 8]
#sym2_list = [5, 4, 5, 5, 4, 4, 7, 9, 10, 9, 10]
#source_nodes = list(set(zip(*DG.edges())[0]))
#print 'source_nodes: ' + str(source_nodes) ".\upstream_regulator_analysis_project\9606.protein.actions.v10.5.txt"

### Real code starts here ###

In [14]:
def load_slowkow(filename_list = ['./upstream_regulator_analysis_project/slowkow_databases/TRED_TF.txt',
                 './upstream_regulator_analysis_project/slowkow_databases/ITFP_TF.txt',
                 './upstream_regulator_analysis_project/slowkow_databases/ENCODE_TF.txt',
                 './upstream_regulator_analysis_project/slowkow_databases/Neph2012_TF.txt',
                 './upstream_regulator_analysis_project/slowkow_databases/TRRUST_TF.txt',
                  './upstream_regulator_analysis_project/slowkow_databases/Marbach2016_TF.txt']):
    
    # read files formatted as \n separated items
    return_list = []
    for file_name in filename_list:
        with open(file_name) as f:
            lines = f.read().splitlines()
            return_list.extend(lines)
    
    # convert everything to ALL CAPS
    [x.upper() for x in return_list]
    
    # remove duplicates
    return set(return_list)

len(load_slowkow())

2705

In [17]:
def load_jaspar(filename):
    
    # parse jaspar file
    jasp_df = pd.read_csv(filename, sep = "\t", header= None, names = ['col1', 'col2', 'col3', 'col4', 'tf_genes'])
    
    # return transcription factors with ALL CAPS names
    return list(jasp_df['tf_genes'].str.upper())
    
    
len(load_jaspar("./upstream_regulator_analysis_project/jaspar_genereg_matrix.txt"))

2049

In [18]:
def create_TF_list(slowkow_bool = True,
                   slowkow_files = ['./upstream_regulator_analysis_project/slowkow_databases/TRED_TF.txt',
                 './upstream_regulator_analysis_project/slowkow_databases/ITFP_TF.txt',
                 './upstream_regulator_analysis_project/slowkow_databases/ENCODE_TF.txt',
                 './upstream_regulator_analysis_project/slowkow_databases/Neph2012_TF.txt',
                 './upstream_regulator_analysis_project/slowkow_databases/TRRUST_TF.txt',
                  './upstream_regulator_analysis_project/slowkow_databases/Marbach2016_TF.txt'],
                   jaspar_bool = True, 
                   jaspar_file = "./upstream_regulator_analysis_project/jaspar_genereg_matrix.txt"):
    
    TF_list = []
    
    if slowkow_bool == True:
        slowkow_TFs = load_slowkow(slowkow_files)
        TF_list.extend(slowkow_TFs)
        
    if jaspar_bool == True:
        jaspar_TFs = load_jaspar(jaspar_file)
        TF_list.extend(jaspar_TFs)
        
    return list(set(TF_list))
        
        
TF_list = create_TF_list()
len(TF_list)

3983

#### Cross reference Brin's TR list with background STRING db


In [156]:
filename="./upstream_regulator_analysis_project/STRING_network.xlsx"

# Load STRING database as background network
STRING_DF = pd.read_excel(filename)
STRING_DF.drop(['ID1','ID2','Source_Type','Sign_Score','NumberOfScreens','Interaction_Database'], axis=1, inplace=True)
STRING_DF.rename(index=str, columns={'Symbol1':'source', 'Symbol2':'target','Edge_Sign':'sign','Weight':'weight'}, inplace = True)

In [157]:
# make all gene symbol names upper case
STRING_DF = pd.concat([STRING_DF[col].astype(str).str.upper() for col in STRING_DF.columns], axis=1)

In [158]:
STRING_DF

Unnamed: 0,source,target,sign,weight
0,BEG,CG12170,+,1.41421356237
1,NUP358,KARYBETA3,+,1.63299316186
2,PROSALPHA6T,PROSBETA3,+,1.41421356237
3,RLC1,MRPL46,+,1.41421356237
4,CG7993,CG6724,+,1.73205080757
5,CG13747,CG13243,+,1.41421356237
6,RPL38,RPL35,+,3.46410161514
7,CG7149,CG11438,+,1.41421356237
8,CG10881,RPS15,-,1.0
9,W-CUP,DJ,+,1.41421356237


In [159]:
# mark sign attribute as +1 for activating and -1 for inhibiting
STRING_DF[STRING_DF == '+'] = 1
STRING_DF[STRING_DF == '-'] = -1
STRING_DF['weight'] = map(lambda x: float(x), STRING_DF['weight'])

# make digraph
G_str = nx.from_pandas_dataframe(STRING_DF, 'source', 'target', ['sign', 'weight'], create_using=nx.DiGraph())

G_str.edges(data=True)

[('RCD-1', 'POP2', {'sign': -1, 'weight': 1.41421356237}),
 ('RCD-1', 'CG8232', {'sign': -1, 'weight': 1.41421356237}),
 ('RCD-1', 'EIF4E-4', {'sign': -1, 'weight': 1.41421356237}),
 ('RCD-1', 'L(2)NC136', {'sign': -1, 'weight': 1.73205080757}),
 ('NC73EF', 'TM2', {'sign': 1, 'weight': 1.41421356237}),
 ('NC73EF', 'WUPA', {'sign': 1, 'weight': 1.41421356237}),
 ('NC73EF', 'CG12338', {'sign': 1, 'weight': 1.41421356237}),
 ('NC73EF', 'BLW', {'sign': 1, 'weight': 1.0}),
 ('NC73EF', 'CG7755', {'sign': -1, 'weight': 1.41421356237}),
 ('NC73EF', 'UP', {'sign': -1, 'weight': 1.3416407865}),
 ('NC73EF', 'SUCB', {'sign': 1, 'weight': 1.41421356237}),
 ('NC73EF', 'IDH', {'sign': 1, 'weight': 1.41421356237}),
 ('NC73EF', 'GDH', {'sign': 1, 'weight': 1.3416407865}),
 ('NC73EF', 'PDSW', {'sign': 1, 'weight': 1.41421356237}),
 ('CG11790', 'VELO', {'sign': -1, 'weight': 1.3416407865}),
 ('CG34384', 'CG6847', {'sign': 1, 'weight': 1.41421356237}),
 ('CG34384', 'CG15625', {'sign': -1, 'weight': 1.4142

In [128]:
G_str

In [129]:
len(STRING_DF)

40216

In [130]:
len(G_str.edges())

40106

In [95]:
#G=nx.from_pandas_dataframe(STRING_DF, 0, 'b', ['weight', 'cost'])
STRING_DF

Unnamed: 0,ID1,Symbol1,ID2,Symbol2,Source_Type,Edge_Sign,Sign_Score,Weight,NumberOfScreens,Interaction_Database
0,FBgn0036691,beg,FBgn0037356,CG12170,Predicted,+,1.414214,1.414214,2,STRING
1,FBgn0039302,Nup358,FBgn0087013,Karybeta3,Predicted,+,1.632993,1.632993,6,STRING
2,FBgn0032492,Prosalpha6T,FBgn0026380,Prosbeta3,Predicted,+,1.414214,1.414214,2,STRING
3,FBgn0014023,Rlc1,FBgn0035272,mRpL46,Predicted,+,1.414214,1.414214,2,STRING
4,FBgn0038585,CG7993,FBgn0032298,CG6724,Predicted,+,1.732051,1.732051,3,STRING
5,FBgn0033364,CG13747,FBgn0028903,CG13243,Predicted,+,1.414214,1.414214,2,STRING
6,FBgn0040007,RpL38,FBgn0029785,RpL35,Predicted,+,3.464102,3.464102,12,STRING
7,FBgn0031948,CG7149,FBgn0037164,CG11438,Predicted,+,1.414214,1.414214,2,STRING
8,FBgn0038796,CG10881,FBgn0034138,RpS15,Predicted,-,-1.000000,1.000000,4,STRING
9,FBgn0032269,w-cup,FBgn0019828,dj,Predicted,+,1.414214,1.414214,2,STRING


In [112]:
G=nx.from_pandas_dataframe(STRING_DF, 'source', 'target', ['sign', 'weight'])

In [115]:
G.edges(data = True)

[('RCD-1', 'CG8232', {'sign': -1, 'weight': '1.41421356237'}),
 ('RCD-1', 'EIF4E-4', {'sign': -1, 'weight': '1.41421356237'}),
 ('RCD-1', 'L(2)NC136', {'sign': -1, 'weight': '1.73205080757'}),
 ('RCD-1', 'POP2', {'sign': -1, 'weight': '1.41421356237'}),
 ('NC73EF', 'TM2', {'sign': 1, 'weight': '1.41421356237'}),
 ('NC73EF', 'WUPA', {'sign': 1, 'weight': '1.41421356237'}),
 ('NC73EF', 'CG12338', {'sign': 1, 'weight': '1.41421356237'}),
 ('NC73EF', 'BLW', {'sign': 1, 'weight': '1.0'}),
 ('NC73EF', 'CG7755', {'sign': -1, 'weight': '1.41421356237'}),
 ('NC73EF', 'UP', {'sign': -1, 'weight': '1.3416407865'}),
 ('NC73EF', 'SUCB', {'sign': 1, 'weight': '1.41421356237'}),
 ('NC73EF', 'IDH', {'sign': 1, 'weight': '1.41421356237'}),
 ('NC73EF', 'GDH', {'sign': 1, 'weight': '1.3416407865'}),
 ('NC73EF', 'PDSW', {'sign': 1, 'weight': '1.41421356237'}),
 ('CG11790', 'VELO', {'sign': -1, 'weight': '1.3416407865'}),
 ('CG34384', 'CG6847', {'sign': 1, 'weight': '1.41421356237'}),
 ('CG34384', 'CG15625

In [111]:

STRING_DF

Unnamed: 0,source,target,sign,weight
0,BEG,CG12170,1,1.41421356237
1,NUP358,KARYBETA3,1,1.63299316186
2,PROSALPHA6T,PROSBETA3,1,1.41421356237
3,RLC1,MRPL46,1,1.41421356237
4,CG7993,CG6724,1,1.73205080757
5,CG13747,CG13243,1,1.41421356237
6,RPL38,RPL35,1,3.46410161514
7,CG7149,CG11438,1,1.41421356237
8,CG10881,RPS15,-1,1.0
9,W-CUP,DJ,1,1.41421356237


In [8]:
def load_and_process_STRING(filename = ".\upstream_regulator_analysis_project\9606.protein.actions.v10.5.txt", confidence_filter = 400):
    
    start = time.time()

    # read STRING file to dataframe
    df_full = pd.read_csv(filename, sep = "\t")

    df = df_full.loc[df_full['score'] > confidence_filter] # filter by confidence
    df_act = df.loc[df['action'] == 'activation'] # filter by activation and inhibition
    df_inh = df.loc[df['action'] == 'inhibition']
    
    # make separate source and target lists for activating and inhibiting
    sources_ut_a = [entry.split('.')[1] for entry in df_act['item_id_a']]
    targets_ut_a = [entry.split('.')[1] for entry in df_act['item_id_b']]
    sources_ut_i = [entry.split('.')[1] for entry in df_inh['item_id_a']]
    targets_ut_i = [entry.split('.')[1] for entry in df_inh['item_id_b']]
    
    # create edge list with affiliated sign
    edge_ut_a = zip(sources_ut_a, targets_ut_a, [1]*len(sources_ut_a))
    edge_ut_i = zip(sources_ut_i, targets_ut_i, [-1]*len(sources_ut_i))
    edges_ut = edge_ut_a + edge_ut_i
    
    # create the network
    G_str = nx.MultiDiGraph()
    G_str.add_weighted_edges_from(edges_ut)
    
    # make quick translation list
    to_translate = list(set( zip(*edges_ut)[0] + zip(*edges_ut)[1]))
    
    # translate quick list and make translation dictionary
    mg = mygene.MyGeneInfo()
    mg_temp = mg.querymany(to_translate,scopes='ensemblprotein',fields='symbol')
    ensembl_list = [x['query'] for x in mg_temp]
    symbol_list = [x['symbol'] if 'symbol' in x.keys() else 'None' for x in mg_temp]
    ensembl_to_symbol = dict(zip(ensembl_list,symbol_list))
    
    # relabel nodes with symbols
    G_str = nx.relabel_nodes(G_str,ensembl_to_symbol)  # only keep the proteins that
    G_str.remove_node('None')
    
    s = list(zip(*G_str.edges())[0])
    t = list(zip(*G_str.edges())[0])
    signs = [dict_w.values()[0] for dict_w in zip(*list(G_str.edges(data = True)))[2]]
    
    end = time.time()
    print end - start
    
    return df_full, list(zip(s,t)), list(zip(s,t,signs)), (end - start)

df_full, db_edges, db_sign_att, time3 = load_and_process_STRING()

KeyboardInterrupt: 

In [70]:
def load_and_process_STRING_test(filename = ".\upstream_regulator_analysis_project\9606.protein.actions.v10.5.txt", confidence_filter = 400):
    
    start = time.time()

    # read STRING file to dataframe
    df_full = pd.read_csv(filename, sep = "\t")

    df = df_full.loc[df_full['score'] > confidence_filter] # filter by confidence
    df_act = df.loc[df['action'] == 'activation'] # filter by activation and inhibition
    df_inh = df.loc[df['action'] == 'inhibition']
    
    # make separate source and target lists for activating and inhibiting
    sources_ut_a = [entry.split('.')[1] for entry in df_act['item_id_a']]
    targets_ut_a = [entry.split('.')[1] for entry in df_act['item_id_b']]
    sources_ut_i = [entry.split('.')[1] for entry in df_inh['item_id_a']]
    targets_ut_i = [entry.split('.')[1] for entry in df_inh['item_id_b']]
    
    # create edge with weight of 1
    edges_ut_a = zip(sources_ut_a, targets_ut_a, [1]*len(sources_ut_a))
    edges_ut_i = zip(sources_ut_i, targets_ut_i, [1]*len(sources_ut_i))
    edges_ut = edges_ut_a + edges_ut_i
    
    # create separate activating and inhibiting networks
    G_a = nx.MultiDiGraph()
    G_a.add_weighted_edges_from(edges_ut_a)
    
    G_i = nx.MultiDiGraph()
    G_i.add_weighted_edges_from(edges_ut_i)
    
    # add sign attribute
    edges_with_keys_a = G_a.edges(keys = True)
    signs_a = [1]*len(G_a.edges())
    edges_to_sign_a = dict(zip(edges_with_keys_a,signs_a))
    nx.set_edge_attributes(G_a, name = 'sign', values = edges_to_sign_a)
    
    edges_with_keys_i = G_i.edges(keys = True)
    signs_i = [-1]*len(G_i.edges())
    edges_to_sign_i = dict(zip(edges_with_keys_i,signs_i))
    nx.set_edge_attributes(G_i, name = 'sign', values = edges_to_sign_i)
    
    # combine two graphs
    G_str = nx.compose(G_a, G_i)
    
    # make quick translation list
    to_translate = list(set( zip(*edges_ut)[0] + zip(*edges_ut)[1]))
    
    # translate quick list and make translation dictionary
    mg = mygene.MyGeneInfo()
    mg_temp = mg.querymany(to_translate,scopes='ensemblprotein',fields='symbol')
    ensembl_list = [x['query'] for x in mg_temp]
    symbol_list = [x['symbol'] if 'symbol' in x.keys() else 'None' for x in mg_temp]
    ensembl_to_symbol = dict(zip(ensembl_list,symbol_list))
    
    # relabel nodes with symbols
    G_str = nx.relabel_nodes(G_str,ensembl_to_symbol)  # only keep the proteins that
    G_str.remove_node('None')
    
    end = time.time()
    print end - start
    
    return G_str 

G = load_and_process_STRING_test()

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-7374...done.
Finished.
312 input query terms found no hit:
	[u'ENSP00000376684', u'ENSP00000289352', u'ENSP00000202788', u'ENSP00000373637', u'ENSP00000367802',
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
46.0329999924


In [86]:
def filter_digraph(G,TF_list):
    edges = G.out_edges(TF_list,keys = True, data = True)
    H = nx.MultiDiGraph()
    H.add_edges_from(edges)
    return H

In [90]:
H = nx.MultiDiGraph()
H.add_edges_from([[0,0],[0,1],[0,0],[1,0],[2,1],[2,2],[2,2],[1,2]])
TF = [2]
I = filter_digraph(H,TF)
I.edges(data = True, keys = True)

[(2, 1, 0, {}), (2, 2, 0, {}), (2, 2, 1, {})]

In [10]:
def filter_background(db_edges, db_sign_att, TF_list):
    
    # extracting TR edge information from background database
    edge_list_filtered = []
    sign_att_list_filtered = []
    for i in range(len(db_edges)):
        if db_edges[i][0] in list(TF_list):
            edge_list_filtered.append(db_edges[i])
            sign_att_list_filtered.append(db_sign_att[i])
            
    return edge_list_filtered, sign_att_list_filtered

edge_list_filtered, sign_att_list_filtered = filter_background(db_edges, db_sign_att, TF_list)

In [11]:
sources = zip(*db_sign_att)[0]
targets = zip(*db_sign_att)[1]
signs = zip(*db_sign_att)[2]
to_dataframe = {'source': sources, 'target': targets, 'sign': signs}
df = pd.DataFrame(data=to_dataframe)
df

Unnamed: 0,sign,source,target
0,1,RNF14,RNF14
1,-1,RNF14,RNF14
2,-1,RNF11,RNF11
3,1,RNF13,RNF13
4,1,REM1,REM1
5,1,REM1,REM1
6,1,REM1,REM1
7,1,REM1,REM1
8,1,REM1,REM1
9,1,REM1,REM1


In [11]:
def make_digraph(db_edges, db_sign_att, TF_list):
    
    # use only edges from background network associated with our TF list
    edge_list_filtered, sign_att_list_filtered = filter_background(db_edges, db_sign_att, TF_list)
    
    # create networkx digraph from weighted edge list, add sign edge attributes 
    DG = nx.DiGraph()
    DG.add_weighted_edges_from(edge_list_filtered)
    for i in range(len(sign_att_list_filtered)):
        DG[sign_att_list_filtered[i][0]][sign_att_list_filtered[i][1]]['sign'] = sign_att_list_filtered[i][2]
    
    return DG

DG = make_digraph(db_edges, db_sign_att, TF_list)

In [12]:
len(DG.nodes())

687

### p-value with differencially expressed genes

In [13]:
def load_DEG_with_up_downs(filename = "differencially_expressed_genes.txt", filter_value = 0.3):

    # load differencially expressed genes (experimental results)
    DEG_db = pd.read_csv(filename, sep = "\t")

    # filtering for lfdr < 0.3
    DEG_list = []
    DEG_to_updown = {}
    for i in range(len(DEG_db)):

        # removing Nan values
        if str(DEG_db.symbol[i]).upper() != 'NAN':

            # filtering DEG list by lfdr < filter_value
            if (DEG_db['lfdr.89.12'][i] < filter_value):
                DEG_list.append(str(DEG_db.symbol[i]).upper())

                # creating dictionary between DEG symbols and their up/down value
                if DEG_db['log2.89.12'][i] != 0: 
                    DEG_to_updown[str(DEG_db.symbol[i]).upper()] = DEG_db['log2.89.12'][i]
                else:
                    DEG_to_updown[str(DEG_db.symbol[i]).upper()] = 0
    
    return DEG_list, DEG_to_updown

DEG_list, DEG_to_updown = load_DEG_with_up_downs()
print len(DEG_list)
print len(DEG_to_updown)

2782
2782


In [14]:
def add_updown_from_DEG(DG, DEG_filename = "differencially_expressed_genes.txt", DEG_filter_value = 0.3):
    
    DEG_list, DEG_to_updown = load_DEG_with_up_downs(DEG_filename, DEG_filter_value)
    
    # get all the differencially expressed genes in DG
    DEG_in_DG = set(DG.nodes()) & set(DEG_list)
    
    # add node attribute to each node in DG if it exists, otherwise set to zero
    zero_dict = dict(zip(DG.nodes(), [0]*len(DG.nodes())))
    for gene in DEG_in_DG:
        zero_dict[gene] = DEG_to_updown[gene]
    nx.set_node_attributes(DG, 'updown', zero_dict)
    
    return DEG_list

In [15]:
DEG_list = add_updown_from_DEG(DG)
DG.nodes(data = True)[0:20]

[(u'PDM2', {'updown': 0}),
 (u'ICLN', {'updown': 0}),
 (u'MCM10', {'updown': 0}),
 (u'CG30085', {'updown': 0}),
 (u'VPS2', {'updown': 0}),
 (u'SCB', {'updown': 0}),
 (u'SIN', {'updown': 0}),
 (u'BAP60', {'updown': 0}),
 (u'VPS4', {'updown': 0}),
 (u'SMG5', {'updown': 0.18674921190000002}),
 (u'SPZ', {'updown': 0}),
 (u'SMG6', {'updown': 0.1827090375}),
 (u'TAP', {'updown': 0}),
 (u'RPS19B', {'updown': 0}),
 (u'RPS19A', {'updown': 0}),
 (u'TAZ', {'updown': 0}),
 (u'MRPL51', {'updown': 0}),
 (u'PROSBETA5R', {'updown': 0}),
 (u'RPT3', {'updown': 0}),
 (u'RPT1', {'updown': 0})]

In [16]:
# calculating all the p-scores

def tr_pvalues(DG, db_edges, DEG_list):
    
    source_nodes = list(set(zip(*DG.edges())[0]))  #identifying unique source nodes in graph
    background_list = list(set(zip(*db_edges)[0]) | set(zip(*db_edges)[1]))
    
    TR_to_pvalue = {}
    for TR in source_nodes:
        x = len(list(set(DG.neighbors(TR)) & set(DEG_list))) # per TR, observed overlap between TR neighbors and DEG_list
        M = len(background_list)  # num unique nodes in universe, aka background network (STRING)
        n = len(DG.neighbors(TR)) # per TR, number of targets for that TR
        N = len(list(set(background_list) & set(DEG_list))) # number of DEG, picked from universe "at random"
    
        TR_to_pvalue[TR] = -(scipy.stats.hypergeom.logsf(x, M, n, N, loc=0)) # remove unnecessary negative sign
        
    return TR_to_pvalue 
    
TR_to_pvalue = tr_pvalues(DG, db_edges, DEG_list)
len(TR_to_pvalue)

102

### z-score with DEG

In [17]:
def tr_zscore(DG, DEG_list):
    
    source_nodes = list(set(zip(*DG.edges())[0])) #identifying unique source nodes in graph
    
    TR_to_zscore = {}
    for TR in source_nodes:
        N_minus = 0 # number of inhibiting predicting DEG edges
        N_plus = 0 # number of activating predicting DEG edges
        N_zero = 0 # number of edges with errorous calculations
    
        TRs_DEG_neighbors = set(DG.neighbors(TR)) & set(DEG_list)
        for n in TRs_DEG_neighbors:
                sign_of_edge = DG[TR][n]['sign']
                up_down_of_n = (DG.node[n]['updown']/abs(DG.node[n]['updown']))
                
                # predict whether this neighbor thinks the TR is Act. or Inhib.
                if ((sign_of_edge * up_down_of_n) == 1):
                    N_plus += 1
                elif ((sign_of_edge * up_down_of_n) == -1):
                    N_minus += 1
                else:
                    N_zero += 1 # mark an error if could not predict
                    print "Issue with edge (" + str(TR) + ',' + str(n) + ')'
                
        if N_zero != 0:
            print "Could not attribute activated or inhibiting trait to " + str(N_zero) + 'nodes'
      
        # prevent a divide-by-zero calculation
        N = N_plus + N_minus
        if N == 0:
            z_score = 0
        else:
            z_score = (N_plus - N_minus)/float(math.sqrt(N))
                
        TR_to_zscore[TR] = z_score #create zscore dict where 1 means activating
                                                            # -1 means inhibiting
                                                            # 0 means could not be calculated
    
    return TR_to_zscore

tr_zscore(DG, DEG_list)

{u'ABD-B': 0,
 u'ACHI': 0,
 u'AKT1': 0,
 u'ANTP': 0,
 u'ARR1': 0,
 u'ATF6': 0,
 u'BAP': 0,
 u'BUB3': 0,
 u'CDC16': -1.0,
 u'CDC27': -1.0,
 u'CDC6': -1.0,
 u'CG11294': 0,
 u'CNOT4': 0,
 u'DFD': 0,
 u'DL': 0.0,
 u'ECD': 0,
 u'EVE': 0,
 u'FKH': 0,
 u'GATA': 0,
 u'GCM': 0,
 u'GSTO1': 0,
 u'H': 0,
 u'HBN': 0,
 u'HDAC3': 0,
 u'HDAC6': 0,
 u'HKB': 0,
 u'ILK': 0,
 u'ING3': 0,
 u'INR': 0,
 u'INTS4': -1.0,
 u'INTS6': -1.0,
 u'INTS8': -1.0,
 u'KLHL18': 0,
 u'KR': 0,
 u'LIG3': 0,
 u'MAD': 0,
 u'MARS': 0,
 u'MAX': 0,
 u'MCM2': -1.414213562373095,
 u'MCM3': -1.414213562373095,
 u'MCM5': -1.0,
 u'MCM6': -0.5773502691896258,
 u'MCM7': -1.0,
 u'MED1': -1.0,
 u'MED15': -1.414213562373095,
 u'MRPL23': 0,
 u'MRPL24': -1.7320508075688774,
 u'MRPL44': 0,
 u'MSH6': 0,
 u'MYB': 0,
 u'NF1': 0,
 u'NUB': 0,
 u'NUP133': 0,
 u'NUP50': 0,
 u'ONECUT': 0,
 u'OPTIX': 0,
 u'PAN': 0,
 u'PAX': 0.0,
 u'PNR': 0,
 u'PTEN': -1.0,
 u'PXN': 0,
 u'RAE1': 0,
 u'RBBP5': 0,
 u'REL': -1.0,
 u'REPO': 0,
 u'RFC4': -1.0,
 u'RNPS1': -1

## Testing

In [18]:
TF_list = create_TF_list()
STRING_DF, db_edges, db_sign_att = load_and_process_STRING()
edge_list_filtered, sign_att_list_filtered = filter_background(db_edges, db_sign_att, TF_list)
DG = make_digraph(db_edges, db_sign_att, TF_list)
DEG_list = add_updown_from_DEG(DG)
tr_pvalues(DG, db_edges, DEG_list)

{u'ABD-B': 2.5619299458446618,
 u'ACHI': 4.1400668115602457,
 u'AKT1': 2.005135344406761,
 u'ANTP': 4.1400668115602457,
 u'ARR1': 3.4547738088220896,
 u'ATF6': 1.6680928067201766,
 u'BAP': 2.3873854072043716,
 u'BUB3': 1.2902562879794237,
 u'CDC16': 3.144198753053967,
 u'CDC27': 2.0214533281294007,
 u'CDC6': 3.9095518809588667,
 u'CG11294': 3.4547738088220896,
 u'CNOT4': 2.005135344406761,
 u'DFD': 4.1400668115602457,
 u'DL': 5.3485051248380939,
 u'ECD': 2.5619299458446618,
 u'EVE': 4.1400668115602457,
 u'FKH': 4.1400668115602457,
 u'GATA': 4.1400668115602457,
 u'GCM': 4.1400668115602457,
 u'GSTO1': 4.1400668115602457,
 u'H': 2.3873854072043716,
 u'HBN': 4.1400668115602457,
 u'HDAC3': 2.3873854072043716,
 u'HDAC6': 2.7772771668821576,
 u'HKB': 3.4547738088220896,
 u'ILK': 3.4547738088220896,
 u'ING3': 2.3873854072043716,
 u'INR': 1.9074744647340081,
 u'INTS4': 5.6306387511574956,
 u'INTS6': 6.0258370197536495,
 u'INTS8': 6.5263844837146863,
 u'KLHL18': 1.8198445447020257,
 u'KR': 4.140

In [19]:
not_biased_zcsores = tr_zscore(DG, DEG_list)

In [20]:
def calculate_bias(DG, DEG_list):
    
    source_nodes = list(set(zip(*DG.edges())[0])) #identifying unique source nodes in graph
    
    TR_to_bias = {}
    for TR in source_nodes:
        
        N_up = 0 # number of up regulated target
        N_down = 0 # number of down regulated targets
        
        N_act = 0 # number of activating edges
        N_inh = 0 # number of inhibiting edges
        
        N_problem = 0 # number of edges with errorous calculations
    
        TRs_DEG_neighbors = set(DG.neighbors(TR)) & set(DEG_list)
        for n in TRs_DEG_neighbors:
            
            # count up edge signs
            sign_of_edge = DG[TR][n]['sign']
            if sign_of_edge == 1:
                N_act += 1
            elif sign_of_edge == -1:
                N_inh += 1
            else:
                N_problem += 1
                print "Issue with edge (" + str(TR) + ',' + str(n) + ') A/I'

            # count up node regulations
            up_down_of_n = (DG.node[n]['updown']/abs(DG.node[n]['updown']))
            if up_down_of_n == 1:
                N_up += 1
            elif up_down_of_n == -1:
                N_down += 1
            else:
                N_problem += 1 
                print "Issue with edge (" + str(TR) + ',' + str(n) + ') up/down'
             
        # calculate up down bias
        if (N_up + N_down) != 0:
            u_data = (N_up - N_down)/float(N_up + N_down)
        else:
            u_data = 0
        
        # calculate act-inh bias
        if (N_act + N_inh) != 0:
            u_TR = (N_act - N_inh)/float(N_act + N_inh)
        else:
            u_TR = 0
            
        # calculate overall bias
        u = u_data * u_TR
        TR_to_bias[TR] = u
    
    return TR_to_bias

TR_to_bias = calculate_bias(DG, DEG_list)

In [21]:
def bias_corrected_tr_zscore(DG, DEG_list, TR_to_bias):

    source_nodes = list(set(zip(*DG.edges())[0]))  # identifying unique source nodes in graph
    
    TR_to_zscore = {}
    for TR in source_nodes:
        w = []
        x = []
        TRs_DEG_neighbors = set(DG.neighbors(TR)) & set(DEG_list)
        for n in TRs_DEG_neighbors:
            sign_of_edge = DG[TR][n]['sign']
            up_down_of_n = (DG.node[n]['updown'] / abs(DG.node[n]['updown']))

            # predict whether this neighbor thinks the TR is Act. or Inhib.
            prediction = sign_of_edge * up_down_of_n
            if ((prediction == 1) | (prediction == -1)):
                x.append(prediction)
            else:
                print "Issue with edge (" + str(TR) + ',' + str(n) + ')'   
                
            # keep track of each target's weight
            w.append(DG[TR][n]['weight'])
            
        u = TR_to_bias[TR]
        
        # calculate bias-corrected z-score
        z_score_top = 0
        z_score_bottom = 0
        for i in range(len(w)):
            z_score_top += w[i]*(x[i]-u)
            z_score_bottom += w[i]*w[i]
        z_score = z_score_top/((z_score_bottom)**(1/2))
        
        TR_to_zscore[TR] = z_score
            

    return TR_to_zscore
bias_zscores = bias_corrected_tr_zscore(DG, DEG_list, TR_to_bias)

In [22]:
zip(not_biased_zcsores.values(), bias_zscores.values())

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0.0, 0.0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (-1.0, 0.0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (-1.7320508075688774, 0.0),
 (1.6329931618554523, 9.3822741353567842),
 (0, 0),
 (-1.0, 0.0),
 (0.0, 0.31783724519577983),
 (0, 0),
 (-1.0, 0.0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (-1.0, 0.0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1.0, 0.0),
 (0.0, 1.0352761804100798),
 (-1.0, 0.0),
 (1.0, 0.0),
 (0.0, 2.0),
 (-1.0, 0.0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (-1.0, 0.0),
 (0, 0),
 (0, 0),
 (1.0, 0.0),
 (0.0, 0.31783724519577983),
 (1.889822365046136, 4.8952263445894966),
 (-1.0, 0.0),
 (-1.0, 0.0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1.6329931618554523, 2.9937022354948404),
 (2.23606797749979, 0.0),
 (1.3416407864998738, 2.7158479847794479),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (-1.0, 0.0),
 (0, 0),
 (0, 0