# knowledge graph embeddings

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "white" style and make every figure 20x20 by default.
sns.set_style("white")
plt.rcParams["figure.figsize"] = (20, 20)

import pickle
import wikipedia
import numpy as np
import pandas as pd
from umap import UMAP
from tqdm import tqdm_notebook as tqdm
from sklearn.cluster import AgglomerativeClustering

# just using the featured articles
They're much more densely interconnected than the average article. There are 5,411 of them. There are also ~30,000 good articles, which we could use for a larger version of this.

In [None]:
# The set of link targets on the chosen index page is the node set of the graph.
# Alternative source (featured articles only):
# titles = set(wikipedia.page('Wikipedia:Featured_articles').links)
index_page = wikipedia.page("WP:GA/ALL")
titles = set(index_page.links)
len(titles)

In [None]:
# Map every resolvable title to the subset of its outgoing links that are
# themselves in `titles` (self-links are dropped).
graph_dict = {}

for title in tqdm(titles):
    try:
        page_links = wikipedia.page(title).links
    except (wikipedia.DisambiguationError, wikipedia.PageError):
        # Unresolvable pages are reported and left out of graph_dict.
        print(f"couldn't resolve page: {title}\n")
        continue

    outgoing = {name for name in page_links if name != title}
    graph_dict[title] = list(titles & outgoing)

In [None]:
# Restore a previously saved link graph from disk.
# NOTE: pickle.load executes arbitrary code - only load files you created yourself.
with open("featured_article_links.pkl", "rb") as pickle_file:
    graph_dict = pickle.load(pickle_file)

In [None]:
# Display the title -> linked-titles mapping.
graph_dict

In [None]:
# Number of titles that have an entry in graph_dict.
len(graph_dict)

In [None]:
# Assign each title a row/column position and keep the mapping in both directions.
index_to_title = dict(enumerate(titles))
title_to_index = {name: position for position, name in index_to_title.items()}

In [None]:
# Square all-zero matrix with one row and one column per title.
n_titles = len(titles)
adjacency_matrix = np.zeros(shape=(n_titles, n_titles))

In [None]:
# Fill the adjacency matrix: row = source article, column = linked article.
for title in titles:
    title_index = title_to_index[title]
    # Titles that could not be resolved while crawling were never added to
    # graph_dict, so treat a missing entry as "no outgoing links" instead of
    # failing with a KeyError.
    connections = graph_dict.get(title, [])
    for connection in connections:
        connection_index = title_to_index.get(connection)
        if connection_index is None:
            # Link target is not part of the current title set (possible when
            # graph_dict was loaded from a pickle built from another list).
            continue
        adjacency_matrix[title_index, connection_index] = 1

In [None]:
# 2-component projection of the adjacency rows (used for the scatter plots).
reducer_2d = UMAP(n_components=2, metric="cosine")
embedding = reducer_2d.fit_transform(adjacency_matrix)

# 300-component projection of the same rows (used as clustering input).
reducer_300d = UMAP(n_components=300, metric="cosine")
large_embedding = reducer_300d.fit_transform(adjacency_matrix)

In [None]:
# Cluster on the 300-component embedding, then attach the labels to the
# 2-component coordinates (columns 0 and 1) for plotting.
cluster = AgglomerativeClustering(n_clusters=40)
cluster_labels = cluster.fit_predict(large_embedding)
df = pd.DataFrame(embedding).assign(cluster=cluster_labels)

In [None]:
# Scatter of the 2-component embedding, coloured by cluster label.
df.plot(kind="scatter", x=0, y=1, s=2, c=df["cluster"], cmap="Paired");

In [None]:
# Highlight a single cluster: boolean column is True for its members only.
chosen_cluster = 26
df["selected"] = df["cluster"].eq(chosen_cluster)
df.plot(kind="scatter", x=0, y=1, s=2, c=df["selected"], cmap="Paired");

In [None]:
# List the article titles that fell into the chosen cluster.
cluster_members = df.loc[df["cluster"] == chosen_cluster]
for row_label in cluster_members.index:
    print(index_to_title[row_label])

In [None]:
# Persist the link graph so the crawl does not have to be repeated.
with open("featured_article_links.pkl", "wb") as pickle_file:
    pickle.dump(graph_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "white" style and make every figure 20x20 by default.
sns.set_style("white")
plt.rcParams["figure.figsize"] = (20, 20)

import pickle
import numpy as np
import pandas as pd
import networkx as nx
from umap import UMAP
from itertools import combinations
from tqdm import tqdm_notebook as tqdm
from sklearn.cluster import AgglomerativeClustering

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# assemble the data

In [None]:
# Load the saved title -> linked-titles mapping.
# NOTE: pickle.load executes arbitrary code - only load files you created yourself.
with open("/mnt/efs/wikipedia/good_article_links.pkl", "rb") as pickle_file:
    graph_dict = pickle.load(pickle_file)

# Build a networkx graph from the dict-of-lists; the file is no longer needed here.
G = nx.from_dict_of_lists(graph_dict)

In [None]:
# Dense adjacency matrix of G wrapped in a torch tensor.
sparse_adjacency = nx.adjacency_matrix(G)
adjacency_matrix = torch.Tensor(sparse_adjacency.todense())

In [None]:
# Dense adjacency matrix of G as a plain numpy ndarray.
# .toarray() is used rather than .todense() because .todense() can return an
# np.matrix, which scikit-learn's input validation does not accept;
# .toarray() always yields an ndarray.
adjacency_matrix = nx.adjacency_matrix(G).toarray()

In [None]:
# 2-component projection of the adjacency rows.
reducer_2d = UMAP(n_components=2, metric="cosine")
embedding = reducer_2d.fit_transform(adjacency_matrix)

In [None]:
# 300-component projection of the same adjacency rows.
reducer_300d = UMAP(n_components=300, metric="cosine")
large_embedding = reducer_300d.fit_transform(adjacency_matrix)

In [None]:
n_clusters = 100

# `node_names` was referenced without ever being defined. nx.adjacency_matrix
# orders rows by G.nodes() when no nodelist is passed, so that same ordering
# labels the embedding rows correctly.
node_names = list(G.nodes())
df = pd.DataFrame(embedding, index=node_names)

In [None]:
# NOTE(review): labels are fitted on the 2-component `embedding` here, whereas
# the earlier section of this notebook clusters on `large_embedding` - confirm
# which one is intended.
cluster = AgglomerativeClustering(n_clusters=n_clusters)
cluster_labels = cluster.fit_predict(embedding)
df["cluster"] = cluster_labels

In [None]:
# Scatter of the 2-component embedding, coloured by cluster label.
df.plot(kind="scatter", x=0, y=1, c=df["cluster"], cmap="Paired");

In [None]:
# Print up to `samples_per_cluster` randomly chosen member names per cluster.
samples_per_cluster = 10

for selected_cluster in range(n_clusters):
    df["selected_cluster"] = df["cluster"] == selected_cluster
    members = df.index.values[df["selected_cluster"].to_numpy()]
    # Sampling without replacement raises ValueError when asked for more items
    # than exist, so cap the sample at the cluster's size.
    sample_size = min(samples_per_cluster, len(members))
    print(
        np.random.choice(members, size=sample_size, replace=False),
        "\n\n",
    )