In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib as ml
import matplotlib.pyplot as plt
import datetime

# Trajectory to export: used as the key into genesByTrajectory when the
# per-cluster gene sets are built, and embedded in the output .gml file names.
trajectory = 'VE-HG1'

In [2]:
# Load the two input tables from the local MOCA-data directory:
#   rawLinks - protein/protein links (protein1, protein2, combined_score are read later)
#   rawAlias - protein id / alias pairs (string_protein_id, alias are read later)
rawLinks = pd.read_csv(
    'MOCA-data/10090.protein.links.full.v11.0.CombinedScore.HighConfidence.csv'
)
rawAlias = pd.read_csv(
    'MOCA-data/10090.protein.aliases.v11.0.UniProt.csv'
)

# Progress marker plus wall-clock timestamp (print() applies str() itself).
print('Here')
print(datetime.datetime.now())

Here
2019-03-27 13:13:17.314791


In [3]:
# Index the alias table in both directions.
#   proteinIdToAlias:  protein id -> list of every alias listed for it
#   aliasToProteinId:  alias      -> list of every protein id it is listed with
#   duplicatedAliases: one str(alias) entry for each occurrence of an alias
#                      after its first one
proteinIdToAlias = {}
aliasToProteinId = {}
duplicatedAliases = []
for proteinId, alias in zip(rawAlias.string_protein_id, rawAlias.alias):
    idsForAlias = aliasToProteinId.get(alias)
    if idsForAlias is None:
        # First time this alias is seen.
        aliasToProteinId[alias] = [proteinId]
    else:
        # Alias already known: record the repeat and the additional protein id.
        duplicatedAliases.append(str(alias))
        idsForAlias.append(proteinId)

    # A protein id normally appears on several rows, one per alias.
    proteinIdToAlias.setdefault(proteinId, []).append(alias)

print('Here')
print('There are ', len(duplicatedAliases), ' aliases which point to multiple protein ids')
print(str(datetime.datetime.now()))

Here
There are  0  aliases which point to multiple protein ids
2019-03-27 13:13:17.428235


In [4]:
# Turn the link table into a list of mutable [protein1, protein2, combined_score]
# triples, one per row of rawLinks.
links = [
    [startProtein, endProtein, score]
    for startProtein, endProtein, score in zip(
        rawLinks.protein1, rawLinks.protein2, rawLinks.combined_score
    )
]
print('Here')
print(str(datetime.datetime.now()))

Here
2019-03-27 13:13:18.108712


In [5]:
# Load the gene table and group its rows by trajectory.
# genesByTrajectory maps each trajectory value to a list of
# {'mgi': <upper-cased mgi>, 'cluster': <cluster>} dicts, in file order.
print(str(datetime.datetime.now()))
rawGeneList = pd.read_csv('MOCA-data/41586_2019_933_MOESM4_ESM.csv')
genesByTrajectory = {}

for rowIndex, geneRow in rawGeneList.iterrows():
    geneEntry = {'mgi': geneRow.mgi.upper(), 'cluster': geneRow.cluster}
    # Create the trajectory's list on first sight, then append to it.
    genesByTrajectory.setdefault(geneRow.trajectory, []).append(geneEntry)

print(str(datetime.datetime.now()))

2019-03-27 13:13:18.114910
2019-03-27 13:13:18.316540


In [6]:
# Export one protein-interaction graph per gene cluster of the selected trajectory.
#
# For every cluster id, keep only the links whose two endpoint proteins both have
# at least one alias in that cluster's gene set, relabel each endpoint with the
# matching gene name, and write the resulting weighted directed graph as GML.
#
# Relies on names built by earlier cells: trajectory, genesByTrajectory, links
# and proteinIdToAlias.
print(str(datetime.datetime.now()))

# The trajectory's gene list is the same for every cluster, so look it up once.
trajectoryGenes = genesByTrajectory[trajectory]

for cluster in range(1, 12):
    acceptableGeneSet = {gene['mgi'] for gene in trajectoryGenes if gene['cluster'] == cluster}

    # proteinId -> gene name to use as node label, or None when none of the
    # protein's aliases is in this cluster. Resolved once per protein so the
    # alias lookup is not repeated for every link the protein takes part in.
    labelByProteinId = {}

    filteredLinks = []
    for link in links:
        endpointLabels = []
        for proteinId in link[:2]:
            if proteinId not in labelByProteinId:
                # .get(): a protein missing from the alias table has no gene
                # name to match, so it is treated as not acceptable instead of
                # aborting the whole export with a KeyError.
                matchingGenes = acceptableGeneSet.intersection(proteinIdToAlias.get(proteinId, ()))
                # min() makes the label deterministic when several aliases of
                # one protein are in the cluster; picking by set iteration order
                # would let the winner vary between kernel sessions.
                labelByProteinId[proteinId] = min(matchingGenes) if matchingGenes else None
            endpointLabels.append(labelByProteinId[proteinId])

        startLabel, endLabel = endpointLabels
        if startLabel is not None and endLabel is not None:
            # Build a new triple so the shared `links` entries are never modified.
            filteredLinks.append([startLabel, endLabel, link[2]])

    print('There are ', len(filteredLinks), ' filtered links')
    print(str(datetime.datetime.now()))
    print('Done...')

    # Edges are [start gene, end gene, combined score]; the score becomes the edge weight.
    G = nx.DiGraph()
    G.add_weighted_edges_from(filteredLinks)
    gmlPath = 'out/gephi/moca/moca' + trajectory + '_Cluster_' + str(cluster) + '.gml'
    nx.write_gml(G, gmlPath)
    print('Now open ' + gmlPath)
    print(str(datetime.datetime.now()))

2019-03-27 13:13:18.327471
There are  258  filtered links
2019-03-27 13:13:18.871784
Done...
Now open out/gephi/moca/mocaVE-HG1_Cluster_1.gml
2019-03-27 13:13:18.888978
There are  164  filtered links
2019-03-27 13:13:19.434878
Done...
Now open out/gephi/moca/mocaVE-HG1_Cluster_2.gml
2019-03-27 13:13:19.443229
There are  292  filtered links
2019-03-27 13:13:19.997940
Done...
Now open out/gephi/moca/mocaVE-HG1_Cluster_3.gml
2019-03-27 13:13:20.010045
There are  74  filtered links
2019-03-27 13:13:20.550134
Done...
Now open out/gephi/moca/mocaVE-HG1_Cluster_4.gml
2019-03-27 13:13:20.553849
There are  50  filtered links
2019-03-27 13:13:21.100760
Done...
Now open out/gephi/moca/mocaVE-HG1_Cluster_5.gml
2019-03-27 13:13:21.103751
There are  138  filtered links
2019-03-27 13:13:21.639818
Done...
Now open out/gephi/moca/mocaVE-HG1_Cluster_6.gml
2019-03-27 13:13:21.644449
There are  38  filtered links
2019-03-27 13:13:22.184540
Done...
Now open out/gephi/moca/mocaVE-HG1_Cluster_7.gml
2019-03-2