In [None]:
# Inject a CSS rule that sets the notebook's `.container` width to 80%.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:80% !important; }</style>"))

In [1]:
import community
import markov_clustering as mc
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from scipy.stats import hypergeom

import utils as ut

In [None]:
### TODO: in the louvain function, return the clusters rather than the partition

def read_graph(file):
    """Build an undirected graph from a tab-separated edge list.

    The file ``../interactomes/<file>.tsv`` is read; its first line is
    treated as a header and skipped. For every remaining non-blank line the
    first two tab-separated fields become the endpoints of an edge.

    Parameters
    ----------
    file : str
        Base name of the interactome file (without directory or extension).

    Returns
    -------
    networkx.Graph
        Graph containing one edge per data row.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer than two tab-separated fields.
    """
    G = nx.Graph()

    with open("../interactomes/" + file + ".tsv", "r") as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for row in f:
            # Strip only the line terminator: slicing off the last character
            # would truncate the final field when the last line has no newline.
            row = row.rstrip("\r\n")
            if not row:
                continue  # tolerate blank lines, e.g. at the end of the file
            edge = row.split("\t")
            G.add_edge(edge[0], edge[1])

    return G


def find_best_inflation(mat):
    """Pick the MCL inflation value that yields the highest modularity.

    MCL is run on `mat` once for each inflation in 1.5, 1.6, ..., 2.5 and the
    resulting clustering is scored with ``mc.modularity``.

    Returns the inflation of the best-scoring run. Because (Q, inflation)
    tuples are compared, equal modularities resolve to the larger inflation.
    """
    candidates = [step / 10 for step in range(15, 26)]

    def _modularity_for(inflation):
        # One full MCL run for this inflation, scored on its own result matrix.
        flow = mc.run_mcl(mat, inflation=inflation)
        groups = mc.get_clusters(flow)
        return mc.modularity(matrix=np.asmatrix(flow), clusters=groups)

    scored = [(_modularity_for(inflation), inflation) for inflation in candidates]
    best_q, best_inflation = max(scored)
    return best_inflation

def get_labels(G_lcc, clusters):
    """Replace positional node indices inside `clusters` with node names.

    Index ``k`` is mapped to the k-th node yielded by ``G_lcc.nodes()``.
    Items that are not a valid index are left unchanged.

    Parameters
    ----------
    G_lcc : graph-like
        Object whose ``nodes()`` yields the node names in index order.
    clusters : list
        List of iterables of node indices. It is modified in place: every
        entry is replaced by a list of names.

    Returns
    -------
    list
        The same `clusters` list object, now holding lists of node names.
    """
    # Build the index -> name map in a single pass over the nodes instead of
    # materializing the whole node list once per node.
    index_to_name = dict(enumerate(G_lcc.nodes()))

    for position, members in enumerate(clusters):
        clusters[position] = [index_to_name.get(item, item) for item in members]

    return clusters


def MCL(G_lcc):
    """Cluster `G_lcc` with Markov clustering and return clusters of node names.

    The inflation parameter is chosen by ``find_best_inflation``; the final
    MCL run uses that value and its index-based clusters are converted to
    node names by ``get_labels``.
    """
    adjacency = nx.to_numpy_matrix(G_lcc)

    # Inflation with the highest modularity on this adjacency matrix.
    best_inflation = find_best_inflation(adjacency)

    flow = mc.run_mcl(adjacency, inflation=best_inflation)
    index_clusters = mc.get_clusters(flow)

    return get_labels(G_lcc, index_clusters)


def louvain(G_lcc, as_clusters=False):
    """Detect communities in `G_lcc` with ``community.best_partition``.

    Parameters
    ----------
    G_lcc : graph
        Graph handed unchanged to ``community.best_partition``.
    as_clusters : bool, default False
        When False (the default), return the partition mapping
        node -> community id exactly as produced by ``best_partition``.
        When True, return a list of node lists, one per community, ordered by
        community id. This is the same shape that ``MCL`` returns.

    Returns
    -------
    dict or list of list
        See `as_clusters`.
    """
    partition = community.best_partition(G_lcc)

    if not as_clusters:
        return partition

    # Group the nodes by community id, keeping the partition's node order.
    members_by_community = {}
    for node, community_id in partition.items():
        members_by_community.setdefault(community_id, []).append(node)

    return [members_by_community[cid] for cid in sorted(members_by_community)]


def hypergeom_test(mod, genes, G_lcc):
    """Hypergeometric enrichment test of the seed genes inside one module.

    Parameters
    ----------
    mod : iterable
        Node names belonging to the module (the sample).
    genes : sequence
        Seed genes (the "successes").
    G_lcc : graph-like
        Graph whose nodes form the population.

    Returns
    -------
    tuple
        ``(pval, x, M, n, N)`` where ``M`` is the number of nodes in `G_lcc`,
        ``n`` is ``len(genes)``, ``N`` is ``len(mod)``, ``x`` is the number of
        seed genes found in `mod`, and ``pval`` is P(X >= x) under the
        hypergeometric distribution with those parameters.
    """
    M = len(G_lcc.nodes())
    # NOTE(review): n counts every seed gene, including any that are not
    # nodes of G_lcc - confirm this is the intended population of successes.
    n = len(genes)
    N = len(mod)
    x = len(set(genes).intersection(set(mod)))

    # Enrichment asks for the upper tail P(X >= x) = sf(x - 1).
    # cdf(x) is P(X <= x), which is close to 1 for an enriched module.
    pval = hypergeom.sf(x - 1, M, n, N)

    return pval, x, M, n, N



def find_modules(clusters, min_size=10):
    """Keep only the clusters that are large enough to count as modules.

    Parameters
    ----------
    clusters : iterable
        Clusters, each supporting ``len()``.
    min_size : int, default 10
        Minimum number of members (inclusive) a cluster needs to be kept.

    Returns
    -------
    list
        The clusters with at least `min_size` members, in their original order.
    """
    return [cluster for cluster in clusters if len(cluster) >= min_size]


In [2]:
# Initialize the gene list (one seed gene per line)
with open("seed_genes.txt","r") as f:
    genes = [gene.rstrip() for gene in f.readlines()]


def largest_component(graph):
    """Return the largest connected component of `graph` as a separate graph."""
    # nx.connected_component_subgraphs was removed in networkx 2.4;
    # connected_components + subgraph is available in old and new releases.
    biggest = max(nx.connected_components(graph), key=len)
    return graph.subgraph(biggest).copy()


def module_table(clusters, genes, graph, algo, columns):
    """Build the seed-gene enrichment table for one clustering.

    One row is produced per cluster. The hypergeometric test is delegated to
    ``ut.hypergeom_test`` and evaluated once per cluster; its result is read
    positionally: [0] p-value, [1] seed genes in the cluster, [4] cluster size.

    Parameters
    ----------
    clusters : iterable of iterables of node names
    genes : list of seed genes
    graph : graph the clusters were computed on
    algo : str, label stored in the 'cl_algo' column
    columns : list of str, column order of the resulting frame

    Returns
    -------
    pandas.DataFrame indexed 0..len(clusters)-1.
    """
    seed_set = set(genes)
    rows = []
    for mod_id, module in enumerate(clusters):
        outcome = ut.hypergeom_test(module, genes, graph)
        members = set(module)
        rows.append({
            'cl_algo': algo,
            'mod_id': mod_id,
            'n_sg': outcome[1],
            'n_g': outcome[4],
            'sg_id': list(seed_set.intersection(members)),
            'g_id': list(members),
            'p_value': outcome[0],
        })
    # Build the frame in one go instead of growing it cell by cell with .loc.
    return pd.DataFrame(rows, columns=columns)


### read data

ii = ut.read_graph("ii")
ui = ut.read_graph("ui")

# get LCC
G_ii = largest_component(ii)
G_ui = largest_component(ui)


### LOUVAIN

partition_ii = ut.louvain(G_ii)
partition_ui = ut.louvain(G_ui)


### MCL

cl_ii_mcl = ut.MCL(G_ii)
cl_ui_mcl = ut.MCL(G_ui)


# create table
col_lst = ['cl_algo', 'mod_id', 'n_sg', 'n_g', 'sg_id', 'g_id', 'p_value']


### hypergeometric test - ui, MCL
df_ui = module_table(cl_ui_mcl, genes, G_ui, 'MCL', col_lst)


### hypergeometric test - ii, MCL
df_ii = module_table(cl_ii_mcl, genes, G_ii, 'MCL', col_lst)


### hypergeometric test - ui, Louvain


### hypergeometric test - ii, Louvain


#save

df_ui.to_csv('df_ui.csv')
df_ii.to_csv('df_ii.csv')

In [None]:
# Display the enrichment table built from the MCL clusters of the ui network.
df_ui

In [None]:
# Display the enrichment table built from the MCL clusters of the ii network.
df_ii

In [None]:
# Number of rows in df_ui (one row per MCL cluster of the ui network).
df_ui.shape[0]

## LCC for ii - LOUVAIN

In [None]:
# read data and create graph
ii = read_graph("ii")

# find LCC (largest connected component)
# nx.connected_component_subgraphs was removed in networkx 2.4, so the
# component is selected with connected_components and extracted via subgraph.
G_ii = ii.subgraph(max(nx.connected_components(ii), key=len)).copy()

# community - LOUVAIN
partition_ii = community.best_partition(G_ii)


In [None]:
# Overview plot of G_ii (the largest connected component of the ii network).
nx.draw(G_ii, node_size=3)
plt.show()

In [None]:
# Draw G_ii with every Louvain community in its own grey level.
community_ids = set(partition_ii.values())
size = float(len(community_ids))
pos = nx.spring_layout(G_ii)

for count, com in enumerate(community_ids, start=1):
    # nodes assigned to this community, in partition order
    list_nodes = [node for node, label in partition_ii.items() if label == com]
    # the colour is the string form of count / size, a value in (0, 1]
    grey_level = str(count / size)
    nx.draw_networkx_nodes(G_ii, pos, list_nodes, node_size=20,
                           node_color=grey_level)

nx.draw_networkx_edges(G_ii, pos, alpha=0.5)
plt.show()

## LCC - ui - LOUVAIN

In [None]:
# read data and create graph
ui = read_graph("ui")

# find LCC
G_ui = max(nx.connected_component_subgraphs(ui), key=len)

# community - LOUVAIN 
partition_ui = community.best_partition(G_ui)

In [None]:
# Draw G_ui with every Louvain community in its own grey level.
community_ids = set(partition_ui.values())
size = float(len(community_ids))
pos = nx.spring_layout(G_ui)

for count, com in enumerate(community_ids, start=1):
    # nodes assigned to this community, in partition order
    list_nodes = [node for node, label in partition_ui.items() if label == com]
    # the colour is the string form of count / size, a value in (0, 1]
    grey_level = str(count / size)
    nx.draw_networkx_nodes(G_ui, pos, list_nodes, node_size=20,
                           node_color=grey_level)

nx.draw_networkx_edges(G_ui, pos, alpha=0.5)
plt.show()

## LCC - ii - MCL

In [None]:
# Adjacency matrix of G_ii; this is the input given to mc.run_mcl below.
# NOTE(review): nx.to_numpy_matrix is not available in networkx >= 3.0
# (nx.to_numpy_array is its replacement) - confirm the targeted version.
ii_mat = nx.to_numpy_matrix(G_ii)

In [None]:
#### hyperparameters

# perform clustering using different inflation values from 1.5 to 2.5
# (steps of 0.1); for each clustering run, calculate the modularity
infl_lst = []
for inflation in [i/10 for i in range(15, 26)]:
    result = mc.run_mcl(ii_mat, inflation=inflation)
    clusters = mc.get_clusters(result)
    Q = mc.modularity(matrix=np.asmatrix(result), clusters=clusters)
    infl_lst.append((Q, inflation))

# After the loop, `result` and `clusters` stay bound to the last run
# (inflation 2.5), not to the best one.
# Tuples compare on Q first, so this is the inflation with the highest
# modularity; equal Q values resolve to the larger inflation.
max_infl_ii = max(infl_lst)[1]

In [None]:
# Final MCL run for the ii network at the best inflation found above.
result_ii = mc.run_mcl(ii_mat, inflation = max_infl_ii)
# Extract the clusters from this run (result_ii), not from the leftover
# `result` of the inflation sweep, which belongs to inflation 2.5.
clusters_ii = mc.get_clusters(result_ii)

In [None]:
# Draw the clustering of the selected run (result_ii). The bare `clusters`
# variable is a leftover of the inflation sweep and describes its last run.
mc.draw_graph(ii_mat, mc.get_clusters(result_ii), node_size=50, with_labels=False, edge_color="silver")

In [None]:
# Peek at the first node of G_ui, i.e. the name that index 0 maps to.
list(G_ui.nodes())[0]

## LCC - ui - MCL

In [None]:
ui_mat = nx.to_numpy_matrix(G_ui)

# map node position (0, 1, 2, ...) to node name.
# Built in one pass; re-listing all nodes for every index is quadratic.
lbls = dict(enumerate(G_ui.nodes()))

In [None]:
#### hyperparameters

# perform clustering using different inflation values from 1.5 to 2.5
# (steps of 0.1); for each clustering run, calculate the modularity
infl_lst = []
for inflation in [i/10 for i in range(15, 26)]:
    result = mc.run_mcl(ui_mat, inflation=inflation)
    clusters = mc.get_clusters(result)
    Q = mc.modularity(matrix=np.asmatrix(result), clusters=clusters)
    infl_lst.append((Q, inflation))

# After the loop, `result` and `clusters` stay bound to the last run
# (inflation 2.5), not to the best one.
# Tuples compare on Q first, so this is the inflation with the highest
# modularity; equal Q values resolve to the larger inflation.
max_infl_ui = max(infl_lst)[1]

In [None]:
# Final MCL run for the ui network at the best inflation found above.
result_ui = mc.run_mcl(ui_mat, inflation = max_infl_ui)

# Extract the clusters from this run (result_ui), not from the leftover
# `result` of the inflation sweep, and translate node indices to node names.
clusters_ui = [
    [lbls.get(item, item) for item in cluster]
    for cluster in mc.get_clusters(result_ui)
]

In [None]:
# Draw the clustering of the selected run (result_ui). The index-based
# clusters are recomputed here because `clusters_ui` holds node names and the
# bare `clusters` variable belongs to the last run of the inflation sweep.
mc.draw_graph(ui_mat, mc.get_clusters(result_ui), node_size=50, with_labels=False, edge_color="silver")

## MCL - ui - hypergeometric test

In [None]:
def hypergeom_test(mod, genes, G_lcc):
    """Hypergeometric enrichment p-value of the seed genes inside one module.

    Parameters
    ----------
    mod : iterable
        Node names belonging to the module (the sample).
    genes : sequence
        Seed genes (the "successes").
    G_lcc : graph-like
        Graph whose nodes form the population.

    Returns
    -------
    float
        P(X >= x), where x is the number of seed genes found in `mod`,
        for a population of ``len(G_lcc.nodes())`` with ``len(genes)``
        successes and a sample of ``len(mod)``.
    """
    M = len(G_lcc.nodes())
    n = len(genes)
    N = len(mod)
    # Count the overlap with the module passed in, not with the global
    # `prova` variable.
    x = len(set(genes).intersection(set(mod)))

    # Enrichment asks for the upper tail P(X >= x) = sf(x - 1).
    # cdf(x) is P(X <= x).
    pval = hypergeom.sf(x - 1, M, n, N)

    return pval


In [None]:
# keep only the modules that contain at least 10 nodes
#mod_10_ii = [i for i in clusters_ii if len(i) >=10]
mod_10_ui = list(filter(lambda module: len(module) >= 10, clusters_ui))

In [None]:
# "prova" is Italian for "trial": take the third module with >= 10 nodes
# as a worked example and display it.
prova = mod_10_ui[2]
prova

In [None]:
####### hypergeometric test on the seed genes

# x-1 -> value passed to the survival function: P(X >= x) = sf(x - 1)
# M -> population size
# n -> number of successes in the population
# N -> sample size
# x -> number of seed genes in the cluster

In [None]:
# Load the seed genes: one entry per line, trailing whitespace removed.
with open("seed_genes.txt", "r") as f:
    genes = [line.rstrip() for line in f]

In [None]:
# Hypergeometric parameters for the trial module `prova`
M = len(G_ui)                      # population size: nodes of G_ui
n = len(genes)                     # successes in the population: seed genes
N = len(prova)                     # sample size: members of the module
x = len(set(prova) & set(genes))   # observed successes: seed genes in the module

In [None]:
from scipy.stats import hypergeom

# Enrichment p-value: probability of drawing at least x seed genes,
# P(X >= x) = sf(x - 1). cdf(x) would be the lower tail P(X <= x).
pval = hypergeom.sf(x - 1, M, n, N)
pval

In [None]:
# https://github.com/GuyAllard/markov_clustering

# https://blog.alexlenail.me/understanding-and-implementing-the-hypergeometric-test-in-python-a7db688a7458

# https://www.biostars.org/p/66729/

# 