In [1]:
import pandas as pd
import numpy as np
import matplotlib as ml
import matplotlib.pyplot as plt
import igraph as ig
import datetime

In [2]:
# Load the protein link table and the protein alias table from the data directory.
# The links file is whitespace-delimited. sep=r'\s+' is the documented equivalent of
# delim_whitespace=True, which is deprecated in recent pandas releases.
rawLinks = pd.read_csv('data/9606.protein.links.v10.5.txt', sep=r'\s+')
# The aliases file is tab-delimited.
rawAlias = pd.read_csv('data/9606.protein.aliases.v10.5.txt', delimiter='\t')
# Progress marker plus timestamp so the cost of each cell is visible in the output.
print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 16:11:38.806501


In [3]:
# Index the alias table in both directions:
#   aliasToProteinId: alias -> protein id (a later row overwrites an earlier row with the same alias)
#   proteinIdToAlias: protein id -> list of every alias listed for it, in file order
proteinIdToAlias = {}
aliasToProteinId = {}
for proteinId, alias in zip(rawAlias.string_protein_id, rawAlias.alias):
    aliasToProteinId[alias] = proteinId
    # Repeated aliases are appended as-is; no de-duplication happens here.
    proteinIdToAlias.setdefault(proteinId, []).append(alias)

print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 16:11:40.727038


In [4]:
# Pair the two endpoint columns so every link is a (protein1, protein2) tuple of protein ids.
links = list(zip(rawLinks['protein1'], rawLinks['protein2']))
print('Here')
print(str(datetime.datetime.now()))

Here
2019-01-16 16:11:42.158247


In [5]:
# Read column 0 of each headerless file into a set. The entries are later matched
# against protein aliases, so they are presumably gene names -- confirm against the files.
acceptableGeneSetB = set(pd.read_csv('Diff_1_2_01.txt', header=None)[0])
acceptableGeneSetC = set(pd.read_csv('Diff_2_3_01.txt', header=None)[0])
# A gene is acceptable when it appears in either list.
acceptableGeneSet = acceptableGeneSetB.union(acceptableGeneSetC)

print('Acceptable: ', len(acceptableGeneSet))
print(str(datetime.datetime.now()))

Acceptable:  2575
2019-01-16 16:11:42.171418


In [6]:
# Keep only the links whose two endpoints each have at least one alias in acceptableGeneSet.
# Acceptability depends on the protein alone, so it is computed once per protein and cached
# instead of being re-derived for both endpoints of every link.
proteinIsAcceptable = {}
validLinks = []
for link in links:
    for protein in (link[0], link[1]):
        if protein not in proteinIsAcceptable:
            # A protein id that is missing from proteinIdToAlias raises KeyError here
            # rather than being silently dropped.
            proteinIsAcceptable[protein] = not acceptableGeneSet.isdisjoint(proteinIdToAlias[protein])

    if proteinIsAcceptable[link[0]] and proteinIsAcceptable[link[1]]:
        validLinks.append(link)

print('Found ', len(validLinks), ' valid links')
print(str(datetime.datetime.now()))

Found  263874  valid links
2019-01-16 16:13:14.789960


In [19]:
def _findLinksWithinDegrees(proteinId, degreesToViz):
    """Collect the links reachable from ``proteinId`` within ``degreesToViz`` hops.

    Performs a breadth-first search over the notebook-level ``validLinks``, following
    each link from its first element to its second. A link is kept when both of its
    endpoints have at least one alias in ``acceptableGeneSet``.

    Parameters:
        proteinId: protein id to start from (depth 1).
        degreesToViz: proteins at depth < degreesToViz have their neighbours expanded
            too; the start protein is always expanded.

    Returns:
        List of link tuples in discovery order.

    Raises:
        KeyError: if an expanded protein or one of its neighbours is missing from
            ``proteinIdToAlias``.
    """
    # Index links by start protein once, so expanding a protein does not rescan
    # the whole link list. Per-protein link order matches validLinks order.
    linksByStart = {}
    for link in validLinks:
        linksByStart.setdefault(link[0], []).append(link)

    def isAcceptable(protein):
        return not acceptableGeneSet.isdisjoint(proteinIdToAlias[protein])

    filteredLinks = []
    expanded = set()  # proteins already expanded; re-expanding would only duplicate links
    frontier = [proteinId]
    depth = 1
    while frontier:
        nextFrontier = []
        for protein in frontier:
            if protein in expanded:
                continue
            expanded.add(protein)

            outgoing = linksByStart.get(protein, [])
            if not outgoing or not isAcceptable(protein):
                continue

            for link in outgoing:
                if isAcceptable(link[1]):
                    filteredLinks.append(link)
                    if depth < degreesToViz:
                        nextFrontier.append(link[1])
        frontier = nextFrontier
        depth += 1

    return filteredLinks


def createPlotProtein(proteinId, degreesToViz, fname):
    """Plot the link neighbourhood of one protein and save it under 'out/geneproducts/'.

    Steps: gather the links within ``degreesToViz`` hops of ``proteinId``, build an
    igraph graph from them, repeatedly drop vertices of degree <= 1, lay the remainder
    out with Kamada-Kawai and write the picture to ``'out/geneproducts/' + fname``.
    Progress and timestamps are printed along the way.

    Vertices are coloured red when any of their aliases is in ``acceptableGeneSetB``
    and blue otherwise.

    Parameters:
        proteinId: protein id at the centre of the plot.
        degreesToViz: search depth passed to the link search.
        fname: output file name, relative to 'out/geneproducts/'.

    Raises:
        KeyError: if a protein encountered during the search has no entry in
            ``proteinIdToAlias``.
    """
    print(str(datetime.datetime.now()))
    print ('Finding all potential links to plot ' + str(proteinId))

    filteredLinks = _findLinksWithinDegrees(proteinId, degreesToViz)

    print('Found links: ', len(filteredLinks))
    print(str(datetime.datetime.now()))

    # dict.fromkeys de-duplicates the endpoints while keeping first-seen order.
    nodes = list(dict.fromkeys(
        protein for link in filteredLinks for protein in (link[0], link[1])
    ))

    nodeColors = [
        'blue' if acceptableGeneSetB.isdisjoint(proteinIdToAlias[node]) else 'red'
        for node in nodes
    ]

    print('Which has nodes: ', len(nodes))
    print(str(datetime.datetime.now()))

    g = ig.Graph()
    g.add_vertices(nodes)
    g.add_edges(filteredLinks)
    g.vs["label"] = g.vs["name"]
    g.vs["color"] = nodeColors

    # Repeatedly remove vertices of degree <= 1 until none are left. Removing a batch
    # can lower the degree of the survivors, hence the loop. The surviving subgraph
    # does not depend on removal order, so batching is safe.
    while True:
        lowDegree = [index for index, degree in enumerate(g.degree()) if degree <= 1]
        if not lowDegree:
            break
        g.delete_vertices(lowDegree)

    print('After deleting nodes with 1 degree: ', len(g.vs))
    print(str(datetime.datetime.now()))

    layout = g.layout_kamada_kawai()
    # igraph's plot keyword is `margin`; the previous spelling `margins` is not that
    # keyword, so the requested margin was not applied.
    ig.plot(g, 'out/geneproducts/' + fname, layout=layout, bbox=(8000,6000), margin=(50,50))
    print('Done ' + fname)
    print(str(datetime.datetime.now()))

In [17]:
def createPlotGeneProduct(geneProductToViz, degreesToViz):
    """Plot the link neighbourhood of a gene product identified by alias.

    Looks ``geneProductToViz`` up in ``aliasToProteinId`` and hands the resulting
    protein id to ``createPlotProtein``. The output file is named
    ``<geneProductToViz>_degree_<degreesToViz>.png``.

    Raises:
        KeyError: if ``geneProductToViz`` is not a key of ``aliasToProteinId``.
    """
    print(str(datetime.datetime.now()))
    print ('Finding all potential links to plot ' + str(geneProductToViz))

    suffix = '_degree_' + str(degreesToViz) + '.png'
    outputName = geneProductToViz + suffix
    resolvedProteinId = aliasToProteinId[geneProductToViz]
    createPlotProtein(resolvedProteinId, degreesToViz, outputName)

In [20]:
# Plot the neighbourhood of the 'KPI2' alias out to two degrees.
degreesToViz = 2
createPlotGeneProduct(geneProductToViz='KPI2', degreesToViz=degreesToViz)
print('Finished')

2019-01-16 17:02:19.637155
Finding all potential links to plot KPI2
2019-01-16 17:02:19.637243
Finding all potential links to plot 9606.ENSP00000297293
Found links:  5591
2019-01-16 17:02:46.185258
Which has nodes:  1528
2019-01-16 17:02:46.195131
After deleting nodes with 1 degree:  1088
2019-01-16 17:02:46.514163
Done KPI2_degree_2.png
2019-01-16 17:02:58.324654
Finished


In [26]:
# Plot a two-degree neighbourhood for every protein id in column 0 of the
# headerless high-degree significant genes file.
degreesToViz = 2
highDegreeTable = pd.read_csv('out/fullmap/high_degree_significant_genes.txt', header=None)
proteinsToPlot = list(highDegreeTable[0])
for protein in proteinsToPlot:
    fname = protein + '_degree_' + str(degreesToViz) + '.png'
    createPlotProtein(protein, degreesToViz, fname)


2019-01-16 17:05:07.605946
Finding all potential links to plot 9606.ENSP00000229239
Found links:  186578
2019-01-16 17:18:34.999199
Which has nodes:  2340
2019-01-16 17:18:35.067794
After deleting nodes with 1 degree:  2279
2019-01-16 17:18:35.821816
Done 9606.ENSP00000229239_degree_2.png
2019-01-16 17:19:42.773613
2019-01-16 17:19:42.776144
Finding all potential links to plot 9606.ENSP00000215832
Found links:  150144
2019-01-16 17:28:21.473716
Which has nodes:  2322
2019-01-16 17:28:21.522047
After deleting nodes with 1 degree:  2233
2019-01-16 17:28:22.276039
Done 9606.ENSP00000215832_degree_2.png
2019-01-16 17:29:32.359641
2019-01-16 17:29:32.361647
Finding all potential links to plot 9606.ENSP00000349960
Found links:  158461
2019-01-16 17:38:17.856564
Which has nodes:  2330
2019-01-16 17:38:17.905784
After deleting nodes with 1 degree:  2253
2019-01-16 17:38:18.559605
Done 9606.ENSP00000349960_degree_2.png
2019-01-16 17:39:29.195359
2019-01-16 17:39:29.197705
Finding all potential 