# SD212: Graph mining
## Solution to Lab 4: Hierarchical clustering

In this lab, you will learn to analyse the hierarchical structure of a graph. 

## Import

In [None]:
from IPython.display import SVG

In [None]:
import numpy as np
from scipy import sparse

In [None]:
from sknetwork.data import load_netset, karate_club
from sknetwork.hierarchy import LouvainIteration, Paris, cut_straight
from sknetwork.ranking import PageRank
from sknetwork.visualization import svg_graph, svg_dendrogram

## Data

We will work on the following graphs (see the [NetSet](https://netset.telecom-paristech.fr/) collection for details):
* Openflights (graph)
* WikiVitals (directed graph)

In [None]:
# Load both datasets once, up front; section 1 uses `openflights`, section 2 uses `wikivitals`.
openflights = load_netset('openflights')
wikivitals = load_netset('wikivitals')

## 1. Graphs

## Karate Club


The [karate club graph](https://en.wikipedia.org/wiki/Zachary%27s_karate_club) provides ground-truth clusters.

In [None]:
# NOTE(review): the positional True presumably asks for the metadata bundle rather than
# the bare adjacency matrix (confirm against the sknetwork docs); the next cell reads
# adjacency, position and labels from the result.
dataset = karate_club(True)

In [None]:
adjacency = dataset.adjacency
position = dataset.position
labels_true = dataset.labels  # ground-truth clusters, compared with the Paris clustering further down

In [None]:
# Draw the graph with the ground-truth labels, naming each node by its index.
image = svg_graph(adjacency, position, labels=labels_true, names=np.arange(len(labels_true)))
SVG(image)

## To do

* Cluster the graph by the hierarchical Louvain algorithm.
* Display the dendrogram.
* How many clusters are there at depth 1?
* Display the graph with the corresponding clustering.
* How many clusters are there at depth 2?

In [None]:
louvain = LouvainIteration()

In [None]:
dendrogram = louvain.fit_predict(adjacency)

In [None]:
image = svg_dendrogram(dendrogram, names=np.arange(34))
SVG(image)

In [None]:
labels = cut_straight(dendrogram, n_clusters=2)

In [None]:
image = svg_graph(adjacency, position, labels=labels, names=np.arange(34))
SVG(image)

In [None]:
labels = cut_straight(dendrogram, n_clusters=5)

In [None]:
len(set(labels))

## To do

* Cluster the graph by the Paris algorithm.
* Display the dendrogram.
* Display the graph with the top-2 clusters. Compare this clustering to the ground-truth labels.
* Display the graph with the top-4 clusters. Compare this clustering with that given by Louvain. 
* Display the corresponding aggregate dendrogram. 

In [None]:
# Paris hierarchical clustering with default parameters.
paris = Paris()

In [None]:
# `dendrogram` is rebound here: from now on it holds the Paris hierarchy, not the Louvain one.
dendrogram = paris.fit_predict(adjacency)

In [None]:
image = svg_dendrogram(dendrogram)
SVG(image)

In [None]:
# Top-2 clusters of the Paris dendrogram.
labels = cut_straight(dendrogram, 2)

In [None]:
image = svg_graph(adjacency, position, labels=labels)
SVG(image)

In [None]:
misclassified = np.flatnonzero(labels != labels_true)

In [None]:
image = svg_graph(adjacency, position, labels={i: labels[i] for i in misclassified})
SVG(image)

In [None]:
# Top-4 clusters of the Paris dendrogram.
labels = cut_straight(dendrogram, 4)

In [None]:
image = svg_graph(adjacency, position, labels=labels)
SVG(image)

In [None]:
# Refit Louvain under separate names (trailing underscore) so the Paris results stay available.
dendrogram_ = louvain.fit_predict(adjacency)
labels_ = cut_straight(dendrogram_, 4)

In [None]:
# Nodes whose cluster id differs between Paris (labels) and Louvain (labels_).
# NOTE(review): this compares raw cluster ids, which is only meaningful if both cuts
# number matching clusters identically - confirm how cut_straight orders its labels.
diff = np.flatnonzero(labels != labels_)

In [None]:
len(diff)

In [None]:
# Show only the differing nodes, labelled with their Paris cluster.
image = svg_graph(adjacency, position, labels={i: labels[i] for i in diff})
SVG(image)

In [None]:
# Same cut as above, this time also requesting the dendrogram of the resulting clusters.
labels, dendrogram_aggregate = cut_straight(dendrogram, 4, return_dendrogram=True)

In [None]:
# Cluster sizes, in sorted label order; used as leaf names of the aggregate dendrogram.
_, counts = np.unique(labels, return_counts=True)

In [None]:
image = svg_dendrogram(dendrogram_aggregate, names=counts)
SVG(image)

## Openflights


In [None]:
# Switch to the Openflights graph; adjacency, position and names are rebound below.
dataset = openflights

In [None]:
adjacency = dataset.adjacency
position = dataset.position
names = dataset.names

In [None]:
# Plot the nodes at their stored positions, without edges.
image = svg_graph(adjacency, position, width=800, height=400, node_size=3, display_edges=False)
SVG(image)

## To do

* Display the same world map with the top-10 clusters found by Paris.
* Display the aggregate dendrogram formed by the top-40 clusters found by Paris, with the name of the top airport (in terms of traffic) of each cluster.

In [None]:
paris = Paris()

In [None]:
# Paris hierarchy of the Openflights graph (rebinds `dendrogram`).
dendrogram = paris.fit_predict(adjacency)

In [None]:
# Top-10 clusters, shown on the map in the next cell.
labels = cut_straight(dendrogram, 10)

In [None]:
image = svg_graph(adjacency, position, width=800, height=400, node_size=3, labels=labels, display_edges=False)
SVG(image)

In [None]:
# Top-40 clusters, together with the dendrogram aggregated at that level.
labels, dendrogram_aggregate = cut_straight(dendrogram, n_clusters=40, return_dendrogram=True)

In [None]:
# Distinct cluster labels (sorted) and the number of nodes carrying each.
labels_unique, counts = np.unique(labels, return_counts=True)

In [None]:
# Row sums of the adjacency matrix; used below as the traffic of each airport.
weights = adjacency.dot(np.ones(adjacency.shape[0]))

In [None]:
top_airports = []
for l in labels_unique:
    cluster = np.flatnonzero(labels == l)
    top_airports.append(cluster[np.argmax(weights[cluster])])

In [None]:
image = svg_dendrogram(dendrogram_aggregate, names=names[top_airports], rotate=True, n_clusters=10, height=500)
SVG(image)

## 2. Directed graphs

## Wikipedia Vitals

In [None]:
# Switch to the WikiVitals graph; only adjacency and names are used in this section.
dataset = wikivitals

In [None]:
adjacency = dataset.adjacency
names = dataset.names

## To do

* Cluster the graph by the Paris algorithm.
* Give the size and top-5 articles of each of the top-40 clusters found by Paris.
* Display the aggregate dendrogram formed by the top-40 clusters, with the name of the top article of each cluster.

In [None]:
paris = Paris()

In [None]:
dendrogram = paris.fit_predict(adjacency)

In [None]:
labels, dendrogram_aggregate = cut_straight(dendrogram, n_clusters=40, return_dendrogram=True)

In [None]:
pagerank = PageRank()

In [None]:
for label in np.unique(labels):
    mask = labels == label
    scores = pagerank.fit_predict(adjacency, weights=mask)
    top = np.argsort(-scores[mask])[:5]
    print(names[mask][top], np.sum(mask))

In [None]:
top_articles = []
for label in np.unique(labels):
    mask = labels == label
    scores = pagerank.fit_predict(adjacency, weights=mask)
    top = np.argmax(scores[mask])
    top_articles.append(np.flatnonzero(mask)[top])

In [None]:
image = svg_dendrogram(dendrogram_aggregate, names=names[top_articles], rotate=True, scale=2)
SVG(image)

## To do

* Complete the following function that returns the nested clusters from a target node in a dendrogram.
* Give the size and top-5 articles of each nested cluster from the article **Riverboat**.

In [None]:
def get_cluster_nested(dendrogram: np.ndarray, node: int) -> list:
    """Get the nested clusters of a node, following the path from the corresponding leaf to the root in the dendrogram.

    Parameters
    ----------
    dendrogram:
        Dendrogram with one row per merge. Only the first two columns are read: they hold
        the indices of the two merged clusters. Leaves are indexed 0 to n - 1 and the
        cluster created by row ``t`` gets index ``n + t``, where n is the number of rows plus one.
    node:
        Target node (a leaf index between 0 and n - 1).

    Returns
    -------
    cluster_nested: list of list
        Nested clusters, as a partition of the set of nodes. The first entry is ``[node]``;
        each following entry is the cluster merged with the one containing the node,
        in the order of the merges.

    Raises
    ------
    ValueError
        If ``node`` is not a leaf index of the dendrogram.

    Example
    -------
    >>> dendrogram = np.array([[0, 1, 1, 2], [2, 3, 1, 2], [4, 5, 1, 4]])
    >>> get_cluster_nested(dendrogram, 0)
    [[0], [1], [2, 3]]
    """
    dendrogram = np.asarray(dendrogram)
    n = dendrogram.shape[0] + 1
    if not 0 <= node < n:
        # Without this check an unknown node would silently yield [[node]].
        raise ValueError(f"node must be a leaf index between 0 and {n - 1}, got {node}.")

    # Members of every cluster that is still alive (not yet merged into a larger one).
    cluster = {i: [i] for i in range(n)}
    cluster_nested = [[node]]
    # Index of the cluster that currently contains the target node.
    cluster_index = node

    for t, merge in enumerate(dendrogram):
        i, j = int(merge[0]), int(merge[1])
        if cluster_index in (i, j):
            # The other side of the merge is the next nested cluster on the path to the root.
            sibling = j if i == cluster_index else i
            cluster_nested.append(cluster[sibling])
            cluster_index = n + t
        cluster[n + t] = cluster.pop(i) + cluster.pop(j)

    return cluster_nested

In [None]:
{i: name for i, name in enumerate(names) if name=='Riverboat'}

In [None]:
node = 7696

In [None]:
names[node]

In [None]:
cluster_nested = get_cluster_nested(dendrogram, node)

In [None]:
len(cluster_nested)

In [None]:
cluster = []
for cluster_add in cluster_nested:
    cluster += cluster_add
    weights = {node: 1 for node in cluster}
    scores = pagerank.fit_predict(adjacency, weights)
    top_nodes = np.array(cluster)[np.argsort(-scores[cluster])[:5]]
    print(names[top_nodes], len(cluster))