In [1]:
import networkx as nx
from networkx.algorithms import community as comm1
import community as comm2
from tqdm import tqdm
import numpy as np
from matplotlib import pyplot as plt
import pickle
import igraph as ig
import pandas as pd

from collections import defaultdict #Used to transpose dictionary
from itertools import combinations

In [2]:
networks_folder = "networks_folder/"
nodestats_folder = "nodestats_folder/"

In [3]:
seed_nodes = [27522964, 42786325, 52352820, 72931184, 75736238, 87229781, 90657826, 114374226, 152852932, 154891961, 209693451, 271397818, 322027737, 402181258, 433462889, 1457805708, 2167525794, 2231109295, 2349347329, 2442888666, 2490088512, 2888660047, 2915187060, 2995401932, 3012158891, 3234613430, 3999537573, 813914393788481540, 817163123182465024, 832319306541178881, 850408392925499392, 857482409012547584, 894682675893735426, 903046544919732224]

In [4]:
with open(networks_folder+'2013-09-07_reg.pickle','rb') as f:
    g = pickle.load(f)
#Making iGraph, because networkx doesn't have good native community detection
#igraph graph can't store userid so using lookup table
g_ig_lookup_table = {}
counter = 0
for node in g:
    g_ig_lookup_table[node] = counter
    counter += 1
g_ig = ig.Graph(len(g), [(g_ig_lookup_table[x],g_ig_lookup_table[y]) for x,y in list(g.edges()) if y!='None'])

In [20]:
def shortestPath2Seeds(source_seed,seed_nodes,g):
    test_minimum_path = []
    #Remove edges from seed nodes
    seed_nodes_in_graph = [x for x in seed_nodes if x in g]
    if len(seed_nodes_in_graph)==0:
        return -2
    for remaining_seed in seed_nodes_in_graph:
        removed_seed_edges = g.edges([x for x in seed_nodes if x!= remaining_seed])
        h = nx.DiGraph(g)
        h.remove_edges_from(removed_seed_edges)
        if remaining_seed in h and nx.has_path(h,source_seed,remaining_seed):
            test_minimum_path.append( len(nx.shortest_path(h,source_seed,remaining_seed)) )
        #g.add_edges_from(removed_seed_edges)
    if test_minimum_path==[]:
        return -1
    else: 
        return min(test_minimum_path)
    
def transposeDict(original_dict,items=False):
    result = defaultdict(list)
    items_l = original_dict.items() if not items else original_dict
    for k,v in items_l:
        result[v].append(k)
    result = dict(result)
    return result

def incrementWeights(partition_transpose,g_temp,is_ig=False): #CHANGE THIS TO ACTUALLY BE RIGHT
    if is_ig:
        g_ig_lookup_table_rev = transposeDict(g_ig_lookup_table)
        for thing in partition_transpose:
            partition_transpose[thing] = set([g_ig_lookup_table_rev[x][0] for x in partition_transpose[thing]])
    for comm in partition_transpose:
        for e1,e2 in combinations(partition_transpose[comm],2):
            if g_temp.has_edge(e1,e2):
                g_temp[e1][e2]['weight'] += 1
#             else:
#                 g.add_edge(e1,e2,attr_dict={'weight':1})
    return g_temp

## Topics of interest
* Shortest path to seeds
* Distribution of people in k-shells over time
* Communities and their membership over time
* Figure out centrality distribution for evenness. Are bots really high?

Use LaNet-Vi to visualize k-core
How do check for number of overlaps

## Node statistics - Single

In [6]:
#k_core values
k_core = nx.core_number(g)

In [21]:
shortest_paths_2_seeds = {}
for node in tqdm(g):
    shortest_paths_2_seeds[node] = shortestPath2Seeds(node,seed_nodes,g)

  0%|          | 102/53133 [02:24<20:55:37,  1.42s/it]

NetworkXError: Input is not a correct NetworkX graph.

---

## Community detection - Single

In this section, I'm going evaluate the communities on the dynamic RT/MN networks to track the emergence and evolution of communities in the network. 

I implement the _consensus clustering_ method outlined in Lancichinetti & Fortunato's paper (2012) with application to dynamic networks as outlined in Fortunato (2016). Consensus clustering is basically running many community detection methods and then weighting the edges of the network with the mutual pairs in each community detection result. Then run a weighted community detection method on the weighted graph. For the dynamic networks, I'm going to use the Hungarian algorithm for the assignment problem of matching communities across moving bins that minimize the Jaccard distances between community membership. 

|                 | My ideas      | Learned from Santo |
|-----------------|---------------|--------------------|
| Partition Type  | Community     | Cover              |
| Distance metric | Jaccard D     | NMI                |
| Assignment Alg  | Min Hungarian | Pick best?         |

In [None]:
#Basic parameters
g_consensus_cluster = nx.Graph(g)
#Add edge attributes
nx.set_edge_attributes(g_consensus_cluster,name='weight',values=0)

#### Individual community assessments

In [None]:
#Communities over time - Using consensus clustering
#Louvain community
partition_lou = comm2.best_partition(nx.Graph(g))
partition_lou_transpose = transposeDict(partition_lou)
#Push to original graph
g_consensus_cluster = incrementWeights(partition_lou_transpose,g_consensus_cluster)

In [None]:
#Infomap
partition_info = g_ig.community_infomap(trials=20)
partition_memberships = zip([g_ig.vs[x].index for x in range(g_ig.vcount())], partition_info.membership)
partition_info_transpose = transposeDict(partition_memberships,items=True)
#Push to original graph
g_consensus_cluster = incrementWeights(partition_info_transpose,g_consensus_cluster,is_ig=True)

In [None]:
#Label propogation method
#comm1.label_propagation_communities(g) #networkx implementation, doesn't seem to work
partition_label = g_ig.community_label_propagation()
partition_memberships = zip([g_ig.vs[x].index for x in range(g_ig.vcount())], partition_label.membership)
partition_label_transpose = transposeDict(partition_memberships,items=True)
#Push to original graph
g_consensus_cluster = incrementWeights(partition_label_transpose,g_consensus_cluster,is_ig=True)

#### Consensus cluster community assessment

In [None]:
#Louvain
partition_f = comm2.best_partition(g_consensus_cluster, weight='weight')
partition_f_transpose = transposeDict(partition_f)

## Running this for all networks

In [None]:
for filename in tqdm(os.listdir(networks_folder)):
    with open(networks_folder+filename) as f:
        g = pickle.load(f)
    k_core = nx.core_number(g)
    shortest_paths_2_seeds = {}
    for node in tqdm(g):
        shortest_paths_2_seeds[node] = shortestPath2Seeds(node,seed_nodes,g)
    
    #Creating igraph
    g_ig_lookup_table = {}
    counter = 0
    for node in g:
        g_ig_lookup_table[node] = counter
        counter += 1
    g_ig = ig.Graph(len(g), [(g_ig_lookup_table[x],g_ig_lookup_table[y]) for x,y in list(g.edges()) if y!='None'])
    #Creating consensus cluster
    g_consensus_cluster = nx.Graph(g)
    nx.set_edge_attributes(g_consensus_cluster,name='weight',values=0)
    
    #Communities over time - Using consensus clustering
    #Louvain community
    partition_lou = comm2.best_partition(nx.Graph(g))
    partition_lou_transpose = transposeDict(partition_lou)
    #Push to original graph
    g_consensus_cluster = incrementWeights(partition_lou_transpose,g_consensus_cluster)

    #Infomap
    partition_info = g_ig.community_infomap(trials=20)
    partition_memberships = zip([g_ig.vs[x].index for x in range(g_ig.vcount())], partition_info.membership)
    partition_info_transpose = transposeDict(partition_memberships,items=True)
    #Push to original graph
    g_consensus_cluster = incrementWeights(partition_info_transpose,g_consensus_cluster,is_ig=True)

    #Label propogation method
    #comm1.label_propagation_communities(g) #networkx implementation, doesn't seem to work
    partition_label = g_ig.community_label_propagation()
    partition_memberships = zip([g_ig.vs[x].index for x in range(g_ig.vcount())], partition_label.membership)
    partition_label_transpose = transposeDict(partition_memberships,items=True)
    #Push to original graph
    g_consensus_cluster = incrementWeights(partition_label_transpose,g_consensus_cluster,is_ig=True)
    
    #Final community assignment
    partition_f = comm2.best_partition(g_consensus_cluster, weight='weight')
    
    df = pd.DataFrame([k_core,shortest_paths_2_seeds,partition_f]).T.rename(index=str,columns={0:"k_value",1:"SP2S",2:"Community"})
    with open(nodestats_folder+filename) as f:
        df.to_csv()

---

### Testing area