# Community Detection

In [4]:
import networkx as nx
import pandas as pd
import numpy as np

from sklearn.cluster import SpectralClustering
from node2vec import Node2Vec as n2v

## Create Network

In [6]:
def generate_graph_deg_dist(deg_dist, n):
    '''
    This function will generate a networkx graph G based on a degree distribution
    provided by the user.
    
    params:
        deg_dist (Dictionary) : The key will be the degree and the value is the probability
                                of a node having that degree. The probabilities must sum to
                                1
        n (Integer) : The number of nodes you want the graph to yield
                          
    example:
        G = generate_graph_deg_dist(
                deg_dist = {
                    6:0.2,
                    3:0.14,
                    8:0.35,
                    4:0.3,
                    11:0.01
                },
                n = 1000
        )
    '''
    deg = list(deg_dist.keys())
    proba = list(deg_dist.values())
    if sum(proba) == 1.:
        deg_sequence = np.random.choice(
            deg,
            n,
            proba
        )
        return nx.configuration_model(deg_sequence)
    raise ValueError("Probabilities do not equal to 1")
    
G = generate_graph_deg_dist(
        deg_dist = {
            6:0.2,
            3:0.14,
            8:0.35,
            4:0.3,
            11:0.01
        },
        n = 1000
)

print(nx.info(G))

Name: 
Type: MultiGraph
Number of nodes: 1000
Number of edges: 3232
Average degree:   6.4640


## Algorithmic Community Detection  
There are many algorithmic algorithms associated with community detection on networks. In this tutorial I will cover the Greedy and Girvan Newman's approach to community detection. There are many other algorithms (like Louvaine's algorithm) which does community detection on networks and I highly encourage you to research and implement those as well. You can find a lot of them here in the NetworkX [documentation](https://networkx.org/documentation/stable/reference/algorithms/community.html) [1]. I will go over the intution and python implementation of these two algorithms and see how they differ on the network in comparison to a semi-supervised machine learning approach. 

## Apply Node2Vec

In [7]:
WINDOW = 1 # Node2Vec fit window
MIN_COUNT = 1 # Node2Vec min. count
BATCH_WORDS = 4 # Node2Vec batch words

g_emb = n2v(
  G,
  dimensions=16
)

mdl = g_emb.fit(
    vector_size = 16,
    window=WINDOW,
    min_count=MIN_COUNT,
    batch_words=BATCH_WORDS
)

Computing transition probabilities:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:18<00:00,  1.85s/it]


## Generate Embeddings

In [8]:
emb_df = (
    pd.DataFrame(
        [mdl.wv.get_vector(str(n)) for n in G.nodes()],
        index = G.nodes
    )
)

## Spectral Clustering

In [10]:
X = emb_df.values

In [12]:
clustering = SpectralClustering(
    n_clusters=10, 
    assign_labels='discretize',
    random_state=0
).fit(X)

In [16]:
clustering

SpectralClustering(assign_labels='discretize', n_clusters=10, random_state=0)

## Identify Communities

In [19]:
comm_dct = dict(zip(emb_df.index, clustering.labels_))

{0: 0,
 1: 0,
 2: 6,
 3: 0,
 4: 0,
 5: 0,
 6: 0,
 7: 0,
 8: 0,
 9: 0,
 10: 0,
 11: 3,
 12: 0,
 13: 0,
 14: 0,
 15: 0,
 16: 5,
 17: 0,
 18: 0,
 19: 0,
 20: 0,
 21: 0,
 22: 0,
 23: 0,
 24: 0,
 25: 0,
 26: 0,
 27: 0,
 28: 0,
 29: 0,
 30: 0,
 31: 0,
 32: 0,
 33: 0,
 34: 0,
 35: 0,
 36: 0,
 37: 0,
 38: 0,
 39: 0,
 40: 0,
 41: 0,
 42: 5,
 43: 0,
 44: 5,
 45: 0,
 46: 0,
 47: 0,
 48: 0,
 49: 0,
 50: 0,
 51: 0,
 52: 0,
 53: 0,
 54: 0,
 55: 0,
 56: 0,
 57: 0,
 58: 0,
 59: 7,
 60: 0,
 61: 0,
 62: 0,
 63: 0,
 64: 0,
 65: 0,
 66: 2,
 67: 0,
 68: 0,
 69: 0,
 70: 0,
 71: 0,
 72: 0,
 73: 0,
 74: 0,
 75: 0,
 76: 0,
 77: 0,
 78: 0,
 79: 0,
 80: 6,
 81: 0,
 82: 0,
 83: 0,
 84: 0,
 85: 0,
 86: 0,
 87: 0,
 88: 0,
 89: 0,
 90: 0,
 91: 0,
 92: 0,
 93: 0,
 94: 0,
 95: 0,
 96: 0,
 97: 0,
 98: 0,
 99: 5,
 100: 0,
 101: 0,
 102: 0,
 103: 9,
 104: 0,
 105: 0,
 106: 0,
 107: 0,
 108: 5,
 109: 0,
 110: 0,
 111: 0,
 112: 5,
 113: 0,
 114: 0,
 115: 0,
 116: 0,
 117: 0,
 118: 0,
 119: 0,
 120: 0,
 121: 0,
 122: 0,
 12

## Concluding Remarks

## Resources
- [1] https://networkx.org/documentation/stable/reference/algorithms/community.html

---