In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx

import json
import pickle

import pandas as pd

In [5]:
g = nx.read_edgelist('network_finale.csv', data=(('weight', int),('industry',int)) , delimiter=",")

In [6]:
NNODES = g.number_of_nodes()
NEDGES = g.number_of_edges()

print("Number of Nodes: ", NNODES)
print("Number of Edges: ", NEDGES)

Number of Nodes:  14875
Number of Edges:  43932


In [7]:
def get_local_clustering_coefficient(node):
    return nx.clustering(g,node)

In [8]:
def get_nodes_from_community(comm):
    return list(g.subgraph(comm).nodes())

In [9]:
def get_edge_data(edge):
    return g.get_edge_data(edge[0], edge[1])

In [10]:
def get_community_index_from_node(node):
    to_ret = None

    for idx, com in enumerate(communities_map):
        if node in com:
            to_ret = idx
            break
            
    return to_ret

#### Degree

In [11]:
degrees = dict(g.degree())

#### Triangles

In [12]:
triangles = nx.triangles(g)

#### Centralities

In [13]:
degree_centrality = nx.degree_centrality(g)

In [14]:
eigenvector_centrality = nx.eigenvector_centrality(g, weight='weight', max_iter=1000)

In [15]:
pagerank_centrality = nx.pagerank(g, weight='weight')

In [16]:
closeness_centrality = nx.closeness_centrality(g)

In [17]:
harmonic_centrality = nx.harmonic_centrality(g)

In [18]:
betweenness_centrality = nx.betweenness_centrality(g, weight='weight')

#### Communities (louvain)

In [22]:
import dill as pickle
with open ("louvain.pickle", "rb") as f:
    Louvain = pickle.load(f)

In [25]:
communities_map = []

for idx, com in enumerate(Louvain.communities):
    links = get_nodes_from_community(com)
    
    communities_map.append(links)

### Creo il dataset

In [26]:
edge_list = list(g.edges())

In [27]:
data1 = []
data2 = []

for idx, edge in enumerate(edge_list):
    node_a = edge[0]
    node_b = edge[1]
    
    edge_data = get_edge_data(edge)
    
    weight = edge_data['weight']
    industry = edge_data['industry']
    
    triangles_a = triangles[node_a]
    triangles_b = triangles[node_b]
    triangles_avg = (triangles_a + triangles_b) / 2
    
    comm_a = get_community_index_from_node(node_a)
    comm_b = get_community_index_from_node(node_b)
    comm = 1 if comm_a == comm_b else 0
    
    cl_coeff_a = get_local_clustering_coefficient(node_a)
    cl_coeff_b = get_local_clustering_coefficient(node_b)
    cl_coeff_avg = (cl_coeff_a + cl_coeff_b) / 2
    
    degree_a = degrees[node_a]
    degree_b = degrees[node_b]
    degree_avg = (degree_a + degree_b) / 2
    
    degree_centr_a = degree_centrality[node_a]
    degree_centr_b = degree_centrality[node_b]
    degree_centr_avg = (degree_centr_a + degree_centr_b) / 2
    
    eig_centr_a = eigenvector_centrality[node_a]
    eig_centr_b = eigenvector_centrality[node_b]
    eig_centr_avg = (eig_centr_a + eig_centr_b) / 2
    
    pagerank_centr_a = pagerank_centrality[node_a]
    pagerank_centr_b = pagerank_centrality[node_b]
    pagerank_centr_avg = (pagerank_centr_a + pagerank_centr_b) / 2
    
    closeness_cent_a = closeness_centrality[node_a]
    closeness_cent_b = closeness_centrality[node_b]
    closeness_cent_avg = (closeness_cent_a + closeness_cent_b) / 2
    
    harmonic_centr_a = harmonic_centrality[node_a]
    harmonic_centr_b = harmonic_centrality[node_b]
    harmonic_centr_avg = (harmonic_centr_a + harmonic_centr_b) / 2
    
    betweenness_centr_a = betweenness_centrality[node_a]
    betweenness_centr_b = betweenness_centrality[node_b]
    betweenness_centr_avg = (betweenness_centr_a + betweenness_centr_b) / 2
    
    data1.append([
        node_a, node_b,
        weight, industry,
        triangles_a, triangles_b,
        comm_a, comm_b,
        cl_coeff_a, cl_coeff_b,
        degree_a, degree_b,
        degree_centr_a, degree_centr_b,
        eig_centr_a, eig_centr_b,
        pagerank_centr_a, pagerank_centr_b,
        closeness_cent_a, closeness_cent_b,
        harmonic_centr_a, harmonic_centr_b,
        betweenness_centr_a, betweenness_centr_b
    ])
    
    data2.append([
        node_a, node_b,
        weight, industry,
        triangles_avg,
        comm,
        cl_coeff_avg,
        degree_avg,
        degree_centr_avg,
        eig_centr_avg,
        pagerank_centr_avg,
        closeness_cent_avg, 
        harmonic_centr_avg,
        betweenness_centr_avg,
    ])
    
 

KeyError: 'weight'

In [None]:
df1 = pd.DataFrame(data1, columns=[
                                 'NODE_A', 'NODE_B', 
                                 'WEIGHT', 'INDUSTRY', 
                                 'TRIANGLES_A', 'TRIANGLES_B',
                                 'COMM_A', 'COMM_B', 
                                 'CL_COEF_A', 'CL_COEF_B', 
                                 'DEGREE_A', 'DEGREE_B', 
                                 'DEGREE_CENTR_A', 'DEGREE_CENTRALITY_B',
                                 'EIGENVECTOR_CENTR_A', 'EIGENVECTOR_CENTR_B',
                                 'PAGERANK_CENTR_A', 'PAGERANK_CENTR_B',
                                 'CLOSENESS_CENTR_A', 'CLOSENESS_CENTR_B',
                                 'HARMONIC_CENTR_A', 'HARMONIC_CENTR_B',
                                 'BETWEENESS_CENTR_A', 'BETWEENESS_CENTR_B'
                                ])


df2 = pd.DataFrame(data2, columns=[
                                 'NODE_A', 'NODE_B', 
                                 'WEIGHT', 'INDUSTRY', 
                                 'TRIANGLES_AVG', 
                                 'COMM',
                                 'CL_COEF_AVG',
                                 'DEGREE_AVG', 
                                 'DEGREE_CENTR_AVG',
                                 'EIGENVECTOR_CENTR_AVG',
                                 'PAGERANK_CENTR_AVG', 
                                 'CLOSENESS_CENTR_AVG', 
                                 'HARMONIC_CENTR_AVG', 
                                 'BETWEENESS_CENTR_AVG'
                                ])

In [None]:
df1.info()

In [None]:
df2.info()

In [None]:
for i, row in df1.iterrows():
    if row["COMM_A"] >  row["COMM_B"]:
        #print(row["COMM_A"], row["COMM_B"])
        temp = row["COMM_B"]
        df1.loc[i, "COMM_B"] = row["COMM_A"]
        df1.loc[i, "COMM_A"] = temp
        #print(df2["COMM_A"][i], df2["COMM_B"][i])    

In [None]:
lista = []

for i, row in df.iterrows():
    
    lista.append(str(row["COMM_A"])+"_"+str(row["COMM_B"]))
    
    
df1["COMM_UN"] = pd.Series(lista)
df2["COMM_UN"] = pd.Series(lista)

In [None]:
one_hot = pd.get_dummies(df["COMM_UN"])
df1 = df1.drop("COMM_A", axis = 1)
df1 = df1.drop("COMM_B", axis = 1)
df1 = df1.join(one_hot)

In [None]:
one_hot = pd.get_dummies(df_average["COMM_UN"])
df2 = df_average.drop("COMM", axis = 1)

df2 = df_average.join(one_hot)

In [None]:
df1.head()

In [None]:
df2.head()

In [None]:
df1.to_csv('DF_ALL_NEW_COMM.csv',index=False)

In [None]:
df2.to_csv('DF_AVG_NEW_COMM.csv',index=False)