In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot styling: seaborn's "white" style and a 20 x 20 default
# figure size for every matplotlib figure created below.
sns.set_style("white")
plt.rcParams["figure.figsize"] = (20, 20)

import pickle
import numpy as np
import pandas as pd
import networkx as nx
from umap import UMAP
from itertools import combinations
from tqdm import tqdm_notebook as tqdm
from sklearn.cluster import AgglomerativeClustering

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# assemble the data

In [None]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so this path must only ever point at a trusted, locally produced pickle.
with open("/mnt/efs/wikipedia/good_article_links.pkl", "rb") as fp:
    graph_dict = pickle.load(fp)

# Build the graph only after the file handle has been released; the loaded
# object is interpreted as a {node: [neighbours, ...]} mapping.
G = nx.from_dict_of_lists(graph_dict)

In [None]:
# Dense adjacency matrix of G as a torch tensor (one row per node).
# NOTE(review): this materialises the full node-by-node matrix in memory —
# confirm the graph is small enough for that.
adjacency_matrix = torch.Tensor(nx.adjacency_matrix(G).todense())

### dataset

In [None]:
class AdjacencyDataset(Dataset):
    """Expose an adjacency matrix as a dataset of its rows.

    Indexing the dataset returns ``adjacency_matrix[index]`` and its length
    is ``len(adjacency_matrix)``, so each sample is one row of the matrix.
    """

    def __init__(self, adjacency_matrix):
        # The matrix is stored by reference, not copied.
        self.adjacency_matrix = adjacency_matrix

    def __len__(self):
        n_rows = len(self.adjacency_matrix)
        return n_rows

    def __getitem__(self, index):
        row = self.adjacency_matrix[index]
        return row

In [None]:
# Wrap the adjacency matrix so that each dataset item is one of its rows.
dataset = AdjacencyDataset(adjacency_matrix)

### dataloader

In [None]:
batch_size = 64

# Shuffled mini-batches of adjacency rows, loaded by 5 worker processes.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=5,
    # Page-locked host memory is required for `non_blocking=True` host-to-GPU
    # copies to actually be asynchronous; skip it when there is no GPU.
    pin_memory=torch.cuda.is_available(),
)

# define autoencoder model

In [None]:
class Encoder(nn.Module):
    """Three-layer MLP that compresses an input vector into an embedding.

    Layer widths are ``input_size -> mid_size -> embedding_size ->
    embedding_size`` with ReLU + Dropout after the first two linear layers.

    Args:
        input_size: width of each input vector.
        embedding_size: width of the produced embedding (default 50).
    """

    def __init__(self, input_size, embedding_size=50):
        super().__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        # use the multiplicative midpoint between the two sizes, i.e. roughly
        # sqrt(input_size * embedding_size), floored to an integer
        self.mid_size = int(
            self.input_size // np.sqrt(self.input_size / self.embedding_size)
        )
        # (a stray debugging `print()` used to sit here and emitted a blank
        # line every time an Encoder was constructed)
        self.encode = nn.Sequential(
            nn.Linear(in_features=self.input_size, out_features=self.mid_size),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(in_features=self.mid_size, out_features=self.embedding_size),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(
                in_features=self.embedding_size, out_features=self.embedding_size
            ),
        )

    def forward(self, x):
        """Return the embedding of ``x`` (last dimension ``embedding_size``)."""
        return self.encode(x)

In [None]:
class Decoder(nn.Module):
    """Three-layer MLP that expands an embedding back to the output width.

    Layer widths are ``embedding_size -> embedding_size -> mid_size ->
    output_size`` with ReLU + Dropout after the first two linear layers.
    No final activation is applied here.

    Args:
        output_size: width of each reconstructed vector.
        embedding_size: width of the incoming embedding (default 50).
    """

    def __init__(self, output_size, embedding_size=50):
        super().__init__()
        self.output_size = output_size
        self.embedding_size = embedding_size

        # Hidden width is the multiplicative midpoint of the two sizes,
        # i.e. roughly sqrt(output_size * embedding_size), floored.
        size_ratio_root = np.sqrt(self.output_size / self.embedding_size)
        self.mid_size = int(self.output_size // size_ratio_root)

        layers = [
            nn.Linear(self.embedding_size, self.embedding_size),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(self.embedding_size, self.mid_size),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(self.mid_size, self.output_size),
        ]
        self.decode = nn.Sequential(*layers)

    def forward(self, x):
        """Map an embedding ``x`` to a vector of width ``output_size``."""
        reconstruction = self.decode(x)
        return reconstruction

In [None]:
class Autoencoder(nn.Module):
    """Encoder/decoder pair that reconstructs its own input.

    The input is encoded to ``embedding_size`` dimensions, decoded back to
    ``input_size`` dimensions, and squashed element-wise with a sigmoid so
    every output lies in (0, 1).

    Args:
        input_size: width of each input (and reconstructed) vector.
        embedding_size: width of the bottleneck embedding (default 50).
    """

    def __init__(self, input_size, embedding_size=50):
        super().__init__()
        self.embedding_size = embedding_size
        self.input_size = input_size
        # The reconstruction has the same width as the input.
        self.output_size = input_size

        self.encoder = Encoder(self.input_size, self.embedding_size)
        self.decoder = Decoder(self.output_size, self.embedding_size)

    def forward(self, x):
        """Return the sigmoid-activated reconstruction of ``x``."""
        logits = self.decoder(self.encoder(x))
        return torch.sigmoid(logits)

# create model

In [None]:
# Number of rows in the adjacency matrix.
input_size = len(adjacency_matrix)

In [None]:
# Reuse the `input_size` computed from the adjacency matrix rather than
# recomputing it from the graph, so the model width is tied to the data that
# is actually fed to it. The bottleneck is 20-dimensional.
autoencoder = Autoencoder(input_size=input_size, embedding_size=20).to(device)

In [None]:
# Display the module tree to sanity-check the layer sizes.
autoencoder

# training

In [None]:
# Per-batch training losses, appended to by `train` across all calls.
losses = []


def train(model, train_loader, n_epochs, loss_function, optimiser, device=device):
    """Train ``model`` to reconstruct the batches yielded by ``train_loader``.

    Each batch is used as both the input and the reconstruction target.
    The loss of every batch is appended to the module-level ``losses`` list.

    Args:
        model: module mapping a batch to a reconstruction of the same shape.
        train_loader: iterable of batches (tensors).
        n_epochs: number of full passes over ``train_loader``.
        loss_function: callable ``(prediction, target) -> scalar loss tensor``.
        optimiser: optimiser already bound to the model's parameters.
        device: device the batches are moved to; it must match the device the
            model lives on. Defaults to the notebook-level ``device``.
    """
    model.train()
    for epoch in range(n_epochs):
        loop = tqdm(train_loader)
        for batch in loop:
            # Honour the `device` argument instead of hard-coding `.cuda()`,
            # which fails on CPU-only machines. One transfer is enough: the
            # input doubles as the target.
            data = batch.to(device, non_blocking=True)
            target = data

            optimiser.zero_grad()
            prediction = model(data)

            loss = loss_function(prediction, target)
            # Read the scalar once; every `.item()` forces a device sync.
            loss_value = loss.item()
            losses.append(loss_value)

            loss.backward()
            optimiser.step()

            loop.set_description(f"Epoch {epoch + 1}/{n_epochs}")
            loop.set_postfix(loss=loss_value)

In [None]:
# Let cuDNN auto-tune its kernels.
torch.backends.cudnn.benchmark = True

# Materialise the trainable parameters as a list: a `filter` object is a
# one-shot iterator that would be empty after the optimiser consumed it.
trainable_parameters = [p for p in autoencoder.parameters() if p.requires_grad]

# Binary cross-entropy between the model output and the target.
loss_function = nn.BCELoss()
optimiser = optim.Adam(trainable_parameters, lr=0.001)

In [None]:
# Run 10 epochs of training; arguments are listed in signature order.
train(
    model=autoencoder,
    train_loader=dataloader,
    n_epochs=10,
    loss_function=loss_function,
    optimiser=optimiser,
)

In [None]:
# Smooth the per-batch losses with a 15-step rolling mean, then plot them.
rolling_losses = pd.Series(losses).rolling(window=15)
loss_data = rolling_losses.mean()
ax = loss_data.plot(subplots=True);

In [None]:
# `train` leaves the model in training mode, where Dropout randomly zeroes
# activations. Switch to eval mode so the embeddings are deterministic.
autoencoder.eval()

# NOTE: despite the `_50d` suffix, the embedding width is whatever
# `embedding_size` the autoencoder was built with (20 in this notebook).
with torch.no_grad():
    embeddings_50d = autoencoder.encoder(adjacency_matrix.to(device)).cpu().numpy()

In [None]:
# Project the learned embeddings down to 2-D with UMAP (cosine metric).
umap_reducer = UMAP(n_components=2, metric="cosine")
embeddings_2d = umap_reducer.fit_transform(embeddings_50d)

In [None]:
# Cluster in the full embedding space (default AgglomerativeClustering
# settings), then attach the labels to the 2-D coordinates (columns 0 and 1).
cluster = AgglomerativeClustering()
cluster_labels = cluster.fit_predict(embeddings_50d)
df = pd.DataFrame(embeddings_2d).assign(cluster=cluster_labels)

In [None]:
# Scatter the 2-D projection, colouring each point by its cluster label.
df.plot.scatter(x=0, y=1, c=df["cluster"], cmap="Paired");

In [None]:
# Define `node_names` before first use so this cell works on a fresh
# top-to-bottom run (it was previously only defined further down).
node_names = np.array(G.nodes)

# Names of the nodes assigned to cluster 1.
node_names[df[df["cluster"] == 1].index.values]

# query with nmslib

In [None]:
import nmslib

# Build an HNSW nearest-neighbour index over the learned embeddings.
index = nmslib.init(method="hnsw")
index.addDataPointBatch(embeddings_50d)

# Index-time parameters (presumably "post" is the post-processing level —
# TODO confirm against the nmslib docs).
index_time_params = {"post": 2}
index.createIndex(index_time_params, print_progress=True)

In [None]:
node_names = np.array(G.nodes)

# Pick one node uniformly at random to use as the query.
query_index = np.random.choice(len(node_names))
query_node_name = node_names[query_index]

# Shape the query as a single-row 2-D array (1 x embedding width).
query_embedding = embeddings_50d[query_index][np.newaxis, :]

query_node_name

In [None]:
# Ask the index for the 10 nearest neighbours of the query embedding.
ids, distances = index.knnQuery(query_embedding, k=10)

In [None]:
# Node names of the retrieved neighbours.
node_names[ids]