In [1]:
import pandas as pd
import numpy as np
import matplotlib as ml
import matplotlib.pyplot as plt
import igraph as ig
import datetime

In [2]:
rawLinks = pd.read_csv('data/9606.protein.links.v10.5.txt', delim_whitespace=True)
rawAlias = pd.read_csv('data/9606.protein.aliases.v10.5.txt', delimiter='\t')
print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 15:19:14.336482


In [3]:
#build a map of proteinId in rawLinks to gene product names
proteinIdToAlias = {}
for val in list(zip(rawAlias.string_protein_id, rawAlias.alias)):
    if not val[0] in proteinIdToAlias: #the alias file has lots of duplicates!
        proteinIdToAlias[val[0]]=[val[1]]
    else:
        proteinIdToAlias[val[0]].append(val[1])
        
print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 15:19:15.520723


In [4]:
#build the list of links mapped to gene product name
links = list(zip(rawLinks.protein1, rawLinks.protein2))
print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 15:19:16.962307


In [5]:
#filter links where both in and out are in the dataset
acceptableGeneSetB = set(pd.read_csv('Diff_1_2_01.txt', header=None)[0])
acceptableGeneSetC = set(pd.read_csv('Diff_2_3_01.txt', header=None)[0])
acceptableGeneSet = acceptableGeneSetB | acceptableGeneSetC

print('Acceptable: ', len(acceptableGeneSet))
print(str(datetime.datetime.now()))

filteredLinks = []
for link in links:
    aliasesOfStart = proteinIdToAlias[link[0]]
    aliasesOfEnd = proteinIdToAlias[link[1]]
    
    startIsAcceptable = not acceptableGeneSet.isdisjoint(aliasesOfStart)
    endIsAcceptable = not acceptableGeneSet.isdisjoint(aliasesOfEnd)
    
    if startIsAcceptable and endIsAcceptable:
        filteredLinks.append(link)
        
print('Links: ', len(filteredLinks))
print(str(datetime.datetime.now()))

Acceptable:  2575
2019-01-16 15:19:16.979202
Links:  263874
2019-01-16 15:20:48.066933


In [6]:
nodesDict = {}
for link in filteredLinks:
    nodesDict[link[0]] = 1
    nodesDict[link[1]] = 1
    
nodes = list(nodesDict.keys())
nodeColors = []
nodeSizes = []
redCnt = 0
blueCnt = 0
for node in nodes:
    nodeSizes.append(60)
    aliasesOfNode = proteinIdToAlias[node]
    if not acceptableGeneSetB.isdisjoint(aliasesOfNode):
        nodeColors.append('red')
        redCnt+=1
    else:
        nodeColors.append('blue')
        blueCnt+=1
        
print('Nodes: ', len(nodes))
print('Red Count ', redCnt)
print('Blue Count ', blueCnt)
print('Links: ', len(filteredLinks))
print(str(datetime.datetime.now()))

Nodes:  2399
Red Count  698
Blue Count  1701
Links:  263874
2019-01-16 15:20:48.164008


In [7]:
g = ig.Graph()
g.add_vertices(nodes)
g.add_edges(filteredLinks)
g.vs["label"] = g.vs["name"]
g.vs["color"] = nodeColors
g.vs["vertex_size"] = nodeSizes
layout = g.layout_lgl()
#g.write_svg('graph.svg', layout = layout, width=25000, height=25000)
print(str(datetime.datetime.now()))

2019-01-16 15:20:55.667112


In [8]:
print('start')
print(str(datetime.datetime.now()))
ig.plot(g, 'out/fullmap/graph_lgl_simplified_union_01.png', layout = layout, bbox = (30000, 30000), edge_color='#999999')
print(str(datetime.datetime.now()))
print('done -- open graph_lgl_simplified_union_01.png')

start
2019-01-16 15:20:55.672323
2019-01-16 15:23:27.624395
done -- open graph_lgl_simplified_union_01.png
