In [1]:
import pandas as pd
import numpy as np
import matplotlib as ml
import matplotlib.pyplot as plt
import igraph as ig
import datetime

In [2]:
rawLinks = pd.read_csv('data/9606.protein.links.v10.5.txt', delim_whitespace=True)
rawAlias = pd.read_csv('data/9606.protein.aliases.v10.5.txt', delimiter='\t')
print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 11:20:54.119011


In [3]:
#build a map of proteinId in rawLinks to gene product names
proteinIdToAlias = {}
for val in list(zip(rawAlias.string_protein_id, rawAlias.alias)):
    if not val[0] in proteinIdToAlias: #the alias file has lots of duplicates!
        proteinIdToAlias[val[0]]=[val[1]]
    else:
        proteinIdToAlias[val[0]].append(val[1])
        
print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 11:20:55.460644


In [4]:
#build the list of links mapped to gene product name
links = list(zip(rawLinks.protein1, rawLinks.protein2))
print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 11:20:57.300597


In [5]:
#filter links where both in and out are in the dataset
acceptableGeneList = pd.read_csv('Diff_2_3_01.txt', header=None)
#acceptableGeneList = ['ARF5','AK302958']
acceptableGeneSet = set(acceptableGeneList[0])

print('Acceptable: ', len(acceptableGeneList))
print(str(datetime.datetime.now()))

filteredLinks = []
for link in links:
    aliasesOfStart = proteinIdToAlias[link[0]]
    aliasesOfEnd = proteinIdToAlias[link[1]]
    
    startIsAcceptable = not acceptableGeneSet.isdisjoint(aliasesOfStart)
    endIsAcceptable = not acceptableGeneSet.isdisjoint(aliasesOfEnd)
    
    if startIsAcceptable and endIsAcceptable:
        filteredLinks.append(link)
        
print('Links: ', len(filteredLinks))
print(str(datetime.datetime.now()))

Acceptable:  1986
2019-01-16 11:20:57.319275
Links:  162118
2019-01-16 11:22:37.310346


In [6]:
nodesDict = {}
for link in filteredLinks:
    nodesDict[link[0]] = 1
    nodesDict[link[1]] = 1
    
nodes = list(nodesDict.keys())
print('Nodes: ', len(nodes))
print('Links: ', len(filteredLinks))
print(str(datetime.datetime.now()))

Nodes:  1836
Links:  162118
2019-01-16 11:22:37.492250


In [7]:
g = ig.Graph()
g.add_vertices(nodes)
g.add_edges(filteredLinks)
g.vs["label"] = g.vs["name"]
layout = g.layout_lgl()
#g.write_svg('graph.svg', layout = layout, width=25000, height=25000)
print(str(datetime.datetime.now()))

2019-01-16 11:22:42.983463


In [8]:
print('start')
print(str(datetime.datetime.now()))
ig.plot(g, 'out/fullmap/graph_lgl_simplified_2_3.png', layout = layout, bbox = (30000, 30000))
print(str(datetime.datetime.now()))
print('done -- open out/fullmap/graph_lgl_simplified_2_3.png')

start
2019-01-16 11:22:42.990600
2019-01-16 11:25:01.064199
done -- open graph_lgl_simplified_2_3.png
