# SD212: Graph mining
## Solution to Lab 5: Heat diffusion

In this lab, you will learn to use diffusion for **contrastive ranking** and **classification** of nodes.

## Import

In [None]:
from IPython.display import SVG

In [None]:
import numpy as np
from scipy import sparse

In [None]:
from sknetwork.data import load_netset, grid, karate_club
from sknetwork.regression import Dirichlet
from sknetwork.classification import DiffusionClassifier
from sknetwork.ranking import PageRank
from sknetwork.visualization import svg_graph

## Data

We will work on the following graphs (see the [NetSet](https://netset.telecom-paris.fr/) collection for details):
* Openflights (graph)
* WikiVitals (directed graph + bipartite graph)

In [None]:
# Load the two NetSet datasets used throughout this lab.
openflights = load_netset('openflights')
wikivitals = load_netset('wikivitals')

## 1. Graphs

## Grid

We first illustrate the notion of contrastive ranking on a $k\times k$ grid.

In [None]:
# Side length of the square grid (k x k nodes).
k = 5
# The third positional argument is presumably the `metadata` flag, so that the
# result exposes `.adjacency` and `.position` (both read below) — TODO confirm.
dataset = grid(k, k, True)
adjacency = dataset.adjacency
position = dataset.position

In [None]:
# Show each node's index so that the corner nodes can be identified.
image = svg_graph(adjacency, position, names=np.arange(k * k), width=200, height=200)
SVG(image)

## To do

* Display the graph with 1 cold source and 1 hot source, at the bottom-left and top-right corners of the square.
* Add a hot source at the top-left corner of the square. What is the free node of highest temperature?

In [None]:
# Heat-diffusion regressor used for the grid experiments.
dirichlet = Dirichlet()

In [None]:
# Indices of two opposite corners; these rely on the node numbering
# displayed in the previous figure.
bottom_left = 0
top_right = k * k - 1

In [None]:
# Imposed temperatures: node index -> value (0 = cold source, 1 = hot source).
values = {bottom_left: 0, top_right: 1}

In [None]:
# Temperature of every node given the two sources.
temperatures = dirichlet.fit_predict(adjacency, values=values)

In [None]:
# Color the nodes by temperature.
image = svg_graph(adjacency, position, scores=temperatures, width=200, height=200)
SVG(image)

In [None]:
# Index of the third corner used as an extra source.
top_left = k - 1

In [None]:
# Add a second hot source to the existing boundary conditions.
values[top_left] = 1

In [None]:
# Recompute the temperatures with the three sources.
temperatures = dirichlet.fit_predict(adjacency, values=values)

In [None]:
image = svg_graph(adjacency, position, scores=temperatures, width=200, height=200)
SVG(image)

In [None]:
# Free nodes are the nodes whose temperature is not imposed (not a key of `values`).
mask = ~np.isin(np.arange(k * k), list(values))
# Position of the hottest node among the free nodes only...
i = temperatures[mask].argmax()
# ...mapped back to its index in the full graph.
hot = np.flatnonzero(mask)[i]

In [None]:
# Same temperature map, with the hottest free node passed as `seeds`.
image = svg_graph(adjacency, position, scores=temperatures, seeds=hot, width=200, height=200)
SVG(image)

## Karate Club


We now consider the classification of nodes by heat diffusion. We use the [karate club graph](https://en.wikipedia.org/wiki/Zachary%27s_karate_club) that has ground-truth labels.

In [None]:
# The positional True is presumably the `metadata` flag, giving access to
# `.adjacency`, `.position` and `.labels` (all read below) — TODO confirm.
dataset = karate_club(True)

In [None]:
adjacency = dataset.adjacency
position = dataset.position
# Ground-truth label of each node.
labels_true = dataset.labels

In [None]:
# Color nodes by ground-truth label and show their indices.
image = svg_graph(adjacency, position, labels=labels_true, names=np.arange(len(labels_true)))
SVG(image)

## To do

* Select one node in each ground-truth cluster, and predict the labels of the other nodes by heat diffusion.
* Display the graph with the predicted labels. 
* What is the accuracy of the classification?

In [None]:
# Node classifier based on heat diffusion, with default parameters.
classifier = DiffusionClassifier()

In [None]:
# train set
# Keep the true label of the two seed nodes only; every other node gets -1
# (presumably the "unlabeled" marker expected by the classifier — TODO confirm).
labels = labels_true.copy()
seeds = np.array([12, 29])
# mask is True on seed (training) nodes, False on the nodes to predict.
mask = np.zeros(len(labels), dtype=bool)
mask[seeds] = 1
labels[~mask] = -1

In [None]:
labels_pred = classifier.fit_predict(adjacency, labels)

In [None]:
# Color nodes by predicted label.
image = svg_graph(adjacency, position, labels=labels_pred, names=np.arange(len(labels_true)))
SVG(image)

In [None]:
# accuracy
# Measured on non-seed nodes only (~mask).
np.mean(labels_true[~mask]==labels_pred[~mask])

## To do

The classifier applies temperature centering after diffusion.

* Repeat the same experiment without temperature centering.
* Do the same experiments with 3 nodes in a ground-truth label and 1 in the other.
* What is your conclusion?

In [None]:
classifier = DiffusionClassifier(centering=False)

In [None]:
labels_pred = classifier.fit_predict(adjacency, labels)

In [None]:
# accuracy
np.mean(labels_true==labels_pred)

In [None]:
# train set
labels = labels_true.copy()
seeds = np.array([0, 12, 19, 29])
mask = np.zeros(len(labels), dtype=bool)
mask[seeds] = 1
labels[~mask] = -1

In [None]:
labels[seeds]

In [None]:
labels_pred = classifier.fit_predict(adjacency, labels)
np.mean(labels_true==labels_pred)

In [None]:
classifier = DiffusionClassifier(centering=True)

In [None]:
labels_pred = classifier.fit_predict(adjacency, labels)
np.mean(labels_true==labels_pred)

## Openflights


We now show how to classify the nodes of a graph **without labels**, through human labelling of a few nodes.

In [None]:
# Switch the working graph to Openflights.
dataset = openflights

In [None]:
adjacency = dataset.adjacency
position = dataset.position
names = dataset.names

In [None]:
# Draw the nodes only (edges are hidden) at their stored positions.
image = svg_graph(adjacency, position, width=800, height=400, node_size=3, display_edges=False)
SVG(image)

## To do

* Display the same world map with the labels predicted given 3 nodes (Paris, New-York, Beijing), each with its own label.
* Add Madrid with another label and observe the result.

In [None]:
# Hard-coded node indices of the seed airports.
# NOTE(review): presumably valid for the current version of the dataset only;
# verify against `names` if the dataset changes.
paris = 622
newyork = 1842
beijing = 1618

In [None]:
classifier = DiffusionClassifier()

In [None]:
# Seeds given as a dict: node index -> label (one distinct label per city).
labels = {paris: 0, newyork: 1, beijing: 2}

In [None]:
labels_pred = classifier.fit_predict(adjacency, labels)

In [None]:
# World map colored by predicted label.
image = svg_graph(adjacency, position, width=800, height=400, node_size=3, labels=labels_pred, display_edges=False)
SVG(image)

In [None]:
# Hard-coded node index of the additional seed (same caveat as above).
madrid = 572

In [None]:
# Fourth seed with its own label.
labels[madrid] = 3

In [None]:
labels_pred = classifier.fit_predict(adjacency, labels)

In [None]:
# World map colored by the labels predicted from the four seeds.
image = svg_graph(adjacency, position, width=800, height=400, node_size=3, labels=labels_pred, display_edges=False)

SVG(image)

## To do

* List the top-10 airports that are close to Tokyo in terms of Personalized PageRank.
* List the top-10 airports that are close to Tokyo and far from Paris Charles de Gaulle, using heat diffusion (contrastive ranking).<br> What do you observe?
* Display the temperatures of these airports after heat diffusion and explain the result.

In [None]:
pagerank = PageRank()

In [None]:
# Hard-coded node index of the Tokyo airport.
# NOTE(review): presumably tied to the current dataset version — verify against `names`.
tokyo = 1084

In [None]:
# Personalized PageRank: all the weight is put on Tokyo.
scores = pagerank.fit_predict(adjacency, weights={tokyo:1})

In [None]:
# Indices of the 10 highest scores (argsort of the negated scores).
top_pagerank = np.argsort(-scores)[:10]

In [None]:
print(names[top_pagerank])

In [None]:
dirichlet = Dirichlet()

In [None]:
# Same index as above, repeated so that this part can be run on its own.
tokyo = 1084

In [None]:
# Contrastive ranking: Paris is the cold source (0), Tokyo the hot source (1).
values = {paris:0, tokyo: 1}

In [None]:
# `values` is rebound here: from the dict of imposed temperatures to the
# result of the diffusion (indexed per node below).
values = dirichlet.fit_predict(adjacency, values)

In [None]:
# Indices of the 10 hottest nodes.
top_diffusion = np.argsort(-values)[:10]

In [None]:
print(names[top_diffusion])

In [None]:
# no common node (except Tokyo)
# Number of airports present in both top-10 lists.
len(np.intersect1d(top_pagerank, top_diffusion))

In [None]:
# Temperatures of the 10 hottest airports after diffusion.
values[top_diffusion]

These airports are connected to Paris through Tokyo only.

## 2. Directed graphs

We now consider the graph of links of WikiVitals. We first focus on contrastive ranking, then on node classification.

## Wikipedia Vitals

In [None]:
# Switch the working graph to WikiVitals.
dataset = wikivitals

In [None]:
adjacency = dataset.adjacency
names = dataset.names
labels = dataset.labels
# Names of the labels (indexed by label id further below).
names_labels = dataset.names_labels

## Ranking

## To do

* List the top-10 articles that are close to **Cat** and **Dog** in terms of Personalized PageRank, considering the graph as bipartite.
* Compare with the list of top-10 articles that are close to **Cat** and **Dog** and far from **Bear** and **Tiger** using heat diffusion (contrastive ranking).
* List the top-10 articles that are close to **Bear** and **Tiger** and far from **Cat** and **Dog** (you can use previous diffusion). Interpret the results.

In [None]:
pagerank = PageRank()

In [None]:
cat = int(np.flatnonzero(names=='Cat'))
dog = int(np.flatnonzero(names=='Dog'))

In [None]:
scores = pagerank.fit_predict(adjacency, weights={cat:1, dog:1}, force_bipartite=True)

In [None]:
top_pagerank = np.argsort(-scores)[:10]
print(names[top_pagerank])

In [None]:
diffusion = Dirichlet()

In [None]:
bear = int(np.flatnonzero(names=='Bear'))
tiger = int(np.flatnonzero(names=='Tiger'))

In [None]:
values = diffusion.fit_predict(adjacency, values={cat:1, dog:1, bear:0, tiger:0})

In [None]:
top_diffusion = np.argsort(-values)[:10]
print(names[top_diffusion])

In [None]:
bottom_diffusion = np.argsort(values)[:10]
print(names[bottom_diffusion])

The articles 'Gloss (optics)' and 'Goby' are sinks: their temperatures remain constant (here 0).

In [None]:
# Temperatures of the 10 coldest articles.
values[bottom_diffusion]

In [None]:
from sknetwork.utils import get_degrees
# Degrees of the coldest articles, stored as `out_degrees`
# (presumably out-degrees for a directed adjacency — TODO confirm);
# a zero value identifies a node without outgoing links.
out_degrees = get_degrees(adjacency)
out_degrees[bottom_diffusion]

## Node classification

## To do

* What is the accuracy of node classification, using half of the nodes in the train set?
* Compare with the accuracy obtained on the undirected graph.

In [None]:
dataset = wikivitals
adjacency = dataset.adjacency
labels_true = dataset.labels

In [None]:
algo = DiffusionClassifier()

In [None]:
# train set
# Each node is put in the train set with probability 0.5 (unseeded random draw,
# so the split changes from one run to the next).
labels = labels_true.copy()
mask = np.random.random(size=len(labels_true)) < 0.5
# Hide the labels of the test nodes (-1 is presumably the "unlabeled" marker).
labels[~mask] = -1

In [None]:
# Classification on the directed graph.
labels_pred = algo.fit_predict(adjacency, labels)

In [None]:
# Accuracy on the test nodes only.
np.mean(labels_pred[~mask]==labels_true[~mask])

In [None]:
from sknetwork.utils import directed2undirected

In [None]:
# Undirected version of the graph; kept under a separate name so that the
# directed `adjacency` remains available.
adjacency_ = directed2undirected(adjacency)

In [None]:
# Classification on the undirected graph, with the same train/test split.
labels_pred = algo.fit_predict(adjacency_, labels)

In [None]:
# Accuracy on the test nodes only.
np.mean(labels_pred[~mask]==labels_true[~mask])

## To do

* Classify each article of the **People** category in one of the other categories (Mathematics, History, etc.), using all other labels.
* List top-5 people of each category using PageRank.

In [None]:
# Display the label names, to find the id of the People category.
print(names_labels)

In [None]:
# Hide the labels of all articles of category 6 (presumably People, per the
# list printed above); every other label is kept as training data.
labels = labels_true.copy()
labels[labels == 6] = -1

In [None]:
# Predict a category for the hidden articles, using the undirected graph.
labels_pred = algo.fit_predict(adjacency_, labels)

In [None]:
# For each category other than 6 (the relabelled one), list the top articles
# of category 6 assigned to it, ranked by Personalized PageRank.
# A single PageRank instance is reused; fit_predict is called afresh per category.
pagerank = PageRank()
for label in np.unique(labels_true):
    if label == 6:
        continue
    # Articles of category 6 predicted in the current category.
    # A dedicated name is used so that the train/test `mask` is not overwritten.
    is_selected = (labels_pred == label) & (labels_true == 6)
    count = int(np.sum(is_selected))
    print(names_labels[label], f'({count})')
    if count:
        # The selection is used as restart weights; the graph is handled as bipartite.
        scores = pagerank.fit_predict(adjacency, is_selected, force_bipartite=True)
        # Keep the scores of the selected articles only.
        scores *= is_selected
        # Never list more articles than were selected: the remaining entries
        # would be unrelated articles with a zero score.
        print(names[np.argsort(-scores)[:min(5, count)]])
    print('---')

## 3. Bipartite graphs

Finally, we consider the bipartite graph between articles and words.

In [None]:
dataset = wikivitals

In [None]:
# Bipartite graph between articles and words.
biadjacency = dataset.biadjacency

## To do

* Repeat the experiments on node classification using the bipartite graph between articles and words.
* Which information seems to be richer, the text or the links between articles?
* Propose a solution exploiting both sources of information.

In [None]:
algo = DiffusionClassifier()

In [None]:
# train set
# New random split (unseeded): each node is in the train set with probability 0.5.
labels = labels_true.copy()
mask = np.random.random(size=len(labels_true)) < 0.5
# Hide the labels of the test nodes.
labels[~mask] = -1

In [None]:
# Classification from the article-word graph.
labels_pred = algo.fit_predict(biadjacency, labels)

In [None]:
# better accuracy
# Measured on the test nodes only.
np.mean(labels_pred[~mask]==labels_true[~mask])

In [None]:
# concatenation
# Stack the link matrix and the word matrix side by side, so that each
# article row combines both sources of information.
full_matrix = sparse.hstack((adjacency, biadjacency))
labels_pred = algo.fit_predict(full_matrix, labels)

In [None]:
# Accuracy on the test nodes, for the combined matrix.
np.mean(labels_pred[~mask]==labels_true[~mask])

In [None]:
# Hide the labels of all articles of category 6 (presumably People — TODO
# confirm against names_labels); every other label is kept as training data.
labels = labels_true.copy()
labels[labels == 6] = -1

In [None]:
# Predict a category for the hidden articles, using the article-word graph.
labels_pred = algo.fit_predict(biadjacency, labels)

In [None]:
# For each category other than 6 (the relabelled one), list the top articles
# of category 6 assigned to it, ranked by Personalized PageRank on the
# article-word graph.
# A single PageRank instance is reused; fit_predict is called afresh per category.
pagerank = PageRank()
for label in np.unique(labels_true):
    if label == 6:
        continue
    # Articles of category 6 predicted in the current category.
    # A dedicated name is used so that the train/test `mask` is not overwritten.
    is_selected = (labels_pred == label) & (labels_true == 6)
    count = int(np.sum(is_selected))
    print(names_labels[label], f'({count})')
    if count:
        # The selection is used as restart weights.
        scores = pagerank.fit_predict(biadjacency, is_selected)
        # Keep the scores of the selected articles only.
        scores *= is_selected
        # Never list more articles than were selected: the remaining entries
        # would be unrelated articles with a zero score.
        print(names[np.argsort(-scores)[:min(5, count)]])
    print('---')