# SD212: Graph mining
## Solution to Lab 3: Graph clustering

In this lab, you will learn to cluster a graph and interpret the results. 

## Import

In [None]:
from IPython.display import SVG

In [None]:
import numpy as np
from scipy import sparse

In [None]:
from sknetwork.data import load_netset, grid, karate_club
from sknetwork.clustering import Louvain, get_modularity
from sknetwork.ranking import PageRank
from sknetwork.linalg import normalize
from sknetwork.utils import get_membership
from sknetwork.visualization import svg_graph, svg_bigraph

## Data

We will work on the following graphs (see the [NetSet](https://netset.telecom-paris.fr/) collection for details):
* Openflights (graph)
* WikiVitals (directed graph)
* Cinema (bipartite graph)

In [None]:
# Load the three NetSet datasets used in this lab; each one is later
# rebound to the generic name `dataset` in its own section.
openflights = load_netset('openflights')
wikivitals = load_netset('wikivitals')
cinema = load_netset('cinema')

## 1. Graphs

The Louvain algorithm aims at maximizing [modularity](https://en.wikipedia.org/wiki/Modularity_(networks)).

## Grid

Consider a grid:

In [None]:
# Build the grid graph and keep its adjacency matrix and node positions.
# NOTE(review): the third positional argument is presumably `metadata=True` — confirm.
dataset = grid(9, 9, True)
adjacency = dataset.adjacency
position = dataset.position

In [None]:
# Draw the grid at the stored node positions.
image = svg_graph(adjacency, position, width=200, height=200)
SVG(image)

## To do

* Cluster the graph by Louvain and return the corresponding modularity.
* Try to shuffle the nodes and observe the results.
* Find a better clustering than Louvain in terms of modularity.<br>**Hint:** Propose a simple clustering and calculate its modularity!

In [None]:
# Louvain with its default parameters.
louvain = Louvain()

In [None]:
# Cluster label of each node of the grid.
labels = louvain.fit_predict(adjacency)

In [None]:
# Same drawing as before, with nodes colored by cluster.
image = svg_graph(adjacency, position, labels=labels, width=200, height=200)
SVG(image)

In [None]:
# Modularity of the Louvain clustering.
get_modularity(adjacency, labels)

In [None]:
# Relabel the nodes at random: the same permutation is applied to the rows
# and to the columns of the adjacency matrix.
n = adjacency.shape[0]
index = np.random.permutation(n)
adjacency_shuffle = adjacency[index, :][:, index]

In [None]:
# Cluster the shuffled graph with the same Louvain instance.
labels_shuffle = louvain.fit_predict(adjacency_shuffle)

In [None]:
# Positions are permuted with `index` so that each node keeps its place in the drawing.
image = svg_graph(adjacency_shuffle, position[index], labels=labels_shuffle, width=200, height=200)
SVG(image)

In [None]:
# Modularity obtained on the shuffled graph, to compare with the unshuffled run.
get_modularity(adjacency_shuffle, labels_shuffle)

In [None]:
# Hand-made clustering: label = 3 * ((i % 9) // 3) + (i // 9) // 3, which gives
# nine possible labels (0..8) on the 81 nodes.
# NOTE(review): presumably i % 9 and i // 9 are the two grid coordinates of
# node i, so each label is a 3 x 3 block — confirm against `grid`.
n = adjacency.shape[0]
nodes = np.arange(n)
labels = ((nodes % 9) // 3) * 3 + (nodes // 9) // 3

In [None]:
# Draw the hand-made clustering.
image = svg_graph(adjacency, position, labels=labels, width=200, height=200)
SVG(image)

In [None]:
# Its modularity, to compare with the value obtained by Louvain.
get_modularity(adjacency, labels)

## Karate Club


The [karate club graph](https://en.wikipedia.org/wiki/Zachary%27s_karate_club) provides ground-truth clusters.

In [None]:
# Load the karate club graph together with its metadata.
dataset = karate_club(metadata=True)

In [None]:
adjacency = dataset.adjacency
position = dataset.position
# Ground-truth cluster of each node.
labels_true = dataset.labels

In [None]:
# Draw the graph colored by ground-truth labels.
image = svg_graph(adjacency, position, labels=labels_true)
SVG(image)

## To do

* Cluster the graph by Louvain and display the labels.
* Give the modularity.
* Display the aggregate graph (use ``display_node_weight`` to show the volume of each cluster).
* Set the resolution to $\frac 1 2$ and display the new clustering. Compare with the ground truth.

In [None]:
# Louvain with its default parameters.
louvain = Louvain()

In [None]:
# Cluster the karate club graph.
labels = louvain.fit_predict(adjacency)
# The to-do list also asks for the modularity of this clustering.
print(get_modularity(adjacency, labels))

In [None]:
# Draw the graph colored by the Louvain clusters.
image = svg_graph(adjacency, position, labels=labels)
SVG(image)

In [None]:
# Aggregate graph stored by the fitted Louvain object (one node per cluster).
adjacency_aggregate = louvain.aggregate_

In [None]:
# Position of each cluster in the aggregate drawing: the transposed membership
# matrix (clusters x nodes) is normalized, then applied to the node positions.
# NOTE(review): presumably `normalize` makes each row sum to 1, so each
# cluster sits at the mean position of its nodes — confirm.
cluster_membership = get_membership(labels)
average = normalize(cluster_membership.T)
position_aggregate = average @ position

In [None]:
# Draw the aggregate graph: one node per cluster, colored by its label.
image = svg_graph(adjacency_aggregate, position_aggregate, labels=np.unique(labels), display_node_weight=True)
SVG(image)

In [None]:
# Same algorithm with the resolution parameter set to 1/2.
louvain = Louvain(resolution=0.5)

In [None]:
labels = louvain.fit_predict(adjacency)

In [None]:
# Clustering obtained at resolution 1/2.
image = svg_graph(adjacency, position, labels=labels)
SVG(image)

In [None]:
# Cluster indices are arbitrary: keep whichever of `labels` / `1 - labels`
# agrees with the ground truth on more nodes (this assumes labels in {0, 1}).
matches = np.sum(labels == labels_true)
matches_flipped = np.sum(labels == 1 - labels_true)
labels_pred = 1 - labels if matches < matches_flipped else labels

In [None]:
# Indices of the nodes whose aligned predicted label differs from the ground truth.
misclassified = np.flatnonzero(labels_pred != labels_true)

In [None]:
# Pass labels as a dict restricted to the misclassified nodes, so that only
# those nodes receive a label.
image = svg_graph(adjacency, position, labels={i: labels[i] for i in misclassified})
SVG(image)

## Openflights


In [None]:
# Switch to the Openflights dataset loaded at the top of the notebook.
dataset = openflights

In [None]:
adjacency = dataset.adjacency
position = dataset.position
names = dataset.names

In [None]:
# Draw the nodes only (edges hidden) at their stored positions.
image = svg_graph(adjacency, position, width=800, height=400, node_size=3, display_edges=False)
SVG(image)

## To do

* Display the same world map with the clusters found by Louvain (resolution 1).
* How many clusters are there?
* What is the Simpson index of this clustering? Check the function ``get_modularity`` (option ``return_all``).
* How do you interpret the *inverse* of the Simpson index?
* List the top airport of each cluster in number of flights.
* Display the aggregate graph with the cluster volumes and the name of the top airport of each cluster.
* Display the same graph restricted to clusters with at least 100 nodes.
* Which of these clusters has the highest traffic (inside + outside)?
* Which of these clusters is the strongest?

In [None]:
# Louvain with its default parameters.
louvain = Louvain()

In [None]:
labels = louvain.fit_predict(adjacency)

In [None]:
# Same map, colored by cluster.
# NOTE(review): `filename` presumably also writes the image to disk — confirm.
image = svg_graph(adjacency, position, width=800, height=400, node_size=3, labels=labels, display_edges=False, filename='openflights-louvain1')
SVG(image)

In [None]:
# Number of distinct clusters.
len(set(labels))

In [None]:
# All the values returned with `return_all=True`.
get_modularity(adjacency, labels, return_all=True)

In [None]:
# The third returned value is used here as the Simpson index.
simpson = get_modularity(adjacency, labels, return_all=True)[2]

In [None]:
# Inverse of the Simpson index, read as the approximate number of actual clusters.
1 / simpson

In [None]:
# Number of significant clusters, i.e. clusters with at least 100 nodes.
# The comparison is inclusive (>=) to match the "at least 100 nodes"
# threshold used for the restricted aggregate graph.
_, counts = np.unique(labels, return_counts=True)
np.sum(counts >= 100)

In [None]:
# Distinct cluster labels (sorted) and the number of nodes in each.
labels_unique, counts = np.unique(labels, return_counts=True)

In [None]:
# Weight of each node: the sum of its row in the adjacency matrix.
n = adjacency.shape[0]
weights = adjacency.dot(np.ones(n))

In [None]:
# For every cluster, keep the index of its node with the largest weight.
top_airports = []
for label in labels_unique:
    cluster = np.flatnonzero(labels == label)
    # argmax is relative to `cluster`, so map it back to a node index.
    heaviest = np.argmax(weights[cluster])
    top_airports.append(cluster[heaviest])

In [None]:
# Aggregate graph stored by the fitted Louvain object, and one position per
# cluster computed from the normalized transposed membership matrix.
adjacency_aggregate = louvain.aggregate_
average = normalize(get_membership(labels).T)
position_aggregate = average.dot(position)

In [None]:
# Inspect the aggregate adjacency matrix.
adjacency_aggregate

In [None]:
# Sorted list of cluster labels.
np.unique(labels)

In [None]:
# Aggregate graph: each cluster is named after its top airport.
image = svg_graph(adjacency_aggregate, position_aggregate, names=names[top_airports], width=800, height=400, display_node_weight=True, edge_width_max=200, labels=np.unique(labels))
SVG(image)

In [None]:
# Positions, in the sorted list of labels, of the clusters with at least 100 nodes.
counts = np.unique(labels, return_counts=True)[1]
index = np.flatnonzero(counts >= 100)

In [None]:
# Same aggregate graph, restricted (rows, columns, positions, names, labels)
# to the clusters selected by `index`.
image = svg_graph(adjacency_aggregate[index][:, index], position_aggregate[index], names=names[top_airports][index], width=800, height=400, display_node_weight=True, edge_width_max=200, labels=np.unique(labels)[index])
SVG(image)

In [None]:
# Traffic of a cluster: the sum of its row in the aggregate adjacency matrix.
# Strength of a cluster: its diagonal entry divided by its traffic.
n_aggregate = adjacency_aggregate.shape[0]
ones = np.ones(n_aggregate)
traffics = adjacency_aggregate @ ones
internal = adjacency_aggregate.diagonal()
strengths = internal / traffics

In [None]:
# Among the selected clusters, top airport of the one with the highest traffic.
names[top_airports][index][np.argmax(traffics[index])]

In [None]:
# Among the selected clusters, top airport of the one with the highest strength.
names[top_airports][index][np.argmax(strengths[index])]

## 2. Directed graphs

## Wikipedia Vitals

In [None]:
# Switch to the WikiVitals dataset loaded at the top of the notebook.
dataset = wikivitals

In [None]:
adjacency = dataset.adjacency
names = dataset.names

## To do

* Cluster the graph by Louvain (resolution 1).
* List the top-5 pages of each cluster in terms of Personalized PageRank.
* Display the aggregate graph with the cluster volumes and the name of the top page of each cluster.
* Display the same graph restricted to clusters with at least 100 nodes.
* Which of these clusters is the strongest? the weakest?
* Display the bipartite graph linking these clusters to their ground-truth labels.
* Display the same graph restricted to the main ground-truth labels of each cluster, each representing at least 10% of the labels of the cluster.
* Repeat the same experiments at resolution 2.

In [None]:
# Louvain at resolution 1 (change this value to repeat the experiments at resolution 2).
louvain = Louvain(resolution=1)

In [None]:
labels = louvain.fit_predict(adjacency)

In [None]:
# Number of distinct clusters.
len(set(labels))

In [None]:
# PageRank with its default parameters; personalized per cluster in the loop below.
pagerank = PageRank()

In [None]:
# For each cluster: run PageRank personalized on the cluster, print its five
# best-scored pages with the cluster size, and record the best page.
top_pages = []
for label in np.unique(labels):
    in_cluster = labels == label
    scores = pagerank.fit_predict(adjacency, weights=in_cluster)
    # Zero the scores outside the cluster so that only its own pages are ranked.
    scores *= in_cluster
    top_nodes = np.argsort(-scores)[:5]
    # The size is counted from the current cluster's mask; the previous code
    # read `cluster`, a variable this loop never assigns.
    print(names[top_nodes], np.count_nonzero(in_cluster))
    top_pages.append(top_nodes[0])

In [None]:
# Name of the top page of each cluster.
names[top_pages]

In [None]:
# Aggregate graph stored by the fitted Louvain object.
adjacency_aggregate = louvain.aggregate_

In [None]:
# Aggregate graph: each cluster is named after its top page (no positions given).
image = svg_graph(adjacency_aggregate, names=names[top_pages], display_node_weight=True, display_edge_weight=True, labels=np.unique(labels))
SVG(image)

In [None]:
# Number of nodes in each cluster, in sorted label order.
_, counts = np.unique(labels, return_counts=True)

In [None]:
# Positions of the clusters with at least 100 nodes.
index = np.argwhere(counts >= 100).ravel()

In [None]:
# Same aggregate graph restricted to the clusters selected by `index`.
image = svg_graph(adjacency_aggregate[index][:, index], names=names[top_pages][index], width=800, height=400, 
                    display_node_weight=True, display_edge_weight=True, edge_width_max=20, labels=np.unique(labels)[index])
SVG(image)

In [None]:
# Cluster sizes (recomputed; same values as in the earlier cell).
_, counts = np.unique(labels, return_counts=True)

In [None]:
# Out-links of a cluster: the sum of its row in the aggregate adjacency matrix.
# Strength of a cluster: its diagonal entry divided by its out-links.
n_aggregate = adjacency_aggregate.shape[0]
out_links = adjacency_aggregate.dot(np.ones(n_aggregate))
strengths = adjacency_aggregate.diagonal() / out_links

In [None]:
# Convert the list of top pages to an array.
top_pages = np.array(top_pages)

In [None]:
# Among the selected clusters, top page of the largest one (in number of nodes).
names[top_pages][index][np.argmax(counts[index])]

In [None]:
# Top page of the strongest selected cluster.
names[top_pages][index][np.argmax(strengths[index])]

In [None]:
# Top page of the weakest selected cluster.
names[top_pages][index][np.argmin(strengths[index])]

In [None]:
# Ground-truth labels of the pages and the names of these labels.
labels_true = dataset.labels
names_labels = dataset.names_labels

In [None]:
# Product of the transposed cluster membership with the ground-truth membership.
# NOTE(review): assuming `get_membership` returns a binary nodes x labels matrix,
# entry (c, l) counts the nodes of cluster c carrying ground-truth label l — confirm.
membership = get_membership(labels).astype(int).T.dot(get_membership(labels_true)).tocsr()


In [None]:
# (number of clusters, number of ground-truth labels)
membership.shape

In [None]:
# Bipartite graph between the selected clusters (rows, named after their top
# page) and the ground-truth labels (columns).
image = svg_bigraph(membership[index], names_row=names[top_pages][index], names_col=names_labels, 
                    display_node_weight=True, color_row='blue', color_col='red')
SVG(image)

In [None]:
# Keep only the entries whose normalized value exceeds 0.1; all others become 0.
# NOTE(review): the comparison is strict, whereas the task says "at least 10%";
# an entry at exactly 10% is dropped — confirm whether `>=` is intended.
membership_filter = membership.multiply(normalize(membership) > 0.1)

In [None]:
# Inspect the filtered matrix.
membership_filter

In [None]:
# Same bipartite graph, built from the filtered matrix.
image = svg_bigraph(membership_filter[index], names_row=names[top_pages][index], names_col=names_labels, 
                    display_node_weight=True, color_row='blue', color_col='red')
SVG(image)

## 3. Bipartite graphs

## Cinema

In [None]:
# Switch to the Cinema dataset loaded at the top of the notebook.
dataset = cinema

In [None]:
# Biadjacency matrix: rows are named `movies`, columns are named `actors`.
biadjacency = dataset.biadjacency
movies = dataset.names_row
actors = dataset.names_col

## To do

* Cluster the graph by Louvain (resolution 1). 
* List the 10 largest clusters and display the names of the top-5 actors and top-5 movies of each cluster in terms of Personalized PageRank.

In [None]:
# Louvain with its default parameters.
louvain = Louvain()

In [None]:
# Fit directly on the biadjacency matrix.
louvain.fit(biadjacency)

In [None]:
# Aggregate graph stored by the fitted object.
louvain.aggregate_

In [None]:
# Cluster labels of the rows (movies) and of the columns (actors).
labels_row = louvain.labels_row_
labels_col = louvain.labels_col_

In [None]:
# PageRank with its default parameters; personalized per cluster in the loop below.
pagerank = PageRank()

In [None]:
# For the first 10 row labels: PageRank personalized on the cluster, then print
# its top-5 movies, its top-5 actors and its size (movies, actors).
# NOTE(review): taking the first 10 labels as "the 10 largest clusters" presumes
# Louvain numbers clusters by decreasing size — confirm.
for label in np.unique(labels_row)[:10]:
    in_row = labels_row == label
    in_col = labels_col == label
    pagerank.fit(biadjacency, weights_row=in_row, weights_col=in_col)
    # Scores outside the cluster are zeroed so that only its members are ranked.
    top_movies = np.argsort(-(pagerank.scores_row_ * in_row))[:5]
    top_actors = np.argsort(-(pagerank.scores_col_ * in_col))[:5]
    print(movies[top_movies])
    print(actors[top_actors])
    print(np.sum(in_row), np.sum(in_col))