In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Plain white seaborn theme plus a very large default canvas (30 x 30 inches);
# presumably sized for the dense network drawing further down.
sns.set_style(style='white')
plt.rc('figure', figsize=(30, 30))

import re
import spacy
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML

import numpy as np
import pandas as pd
import networkx as nx

from umap import UMAP
from sklearn.cluster import KMeans, AgglomerativeClustering

from tqdm import tqdm_notebook as tqdm

from sklearn.decomposition import PCA
import community

In [None]:
# Load the raw records. 'AltRefNo' and 'Level' are reduced to the first
# element of each non-null value; rows where the field is missing stay NaN.
df = pd.read_json('../data/calm_records.json').assign(
    AltRefNo=lambda records: records['AltRefNo'].dropna().apply(lambda values: values[0]),
    Level=lambda records: records['Level'].dropna().apply(lambda values: values[0]),
)

In [None]:
# Reference numbers of every record whose level is 'Collection', in sorted order.
collection_ids = sorted(df.loc[df['Level'] == 'Collection', 'AltRefNo'].values)

In [None]:
# Index the records by their reference number so they can be looked up by
# collection id. The guard keeps this cell re-runnable: once 'AltRefNo' has
# become the index it is no longer a column, and a second set_index call
# would raise KeyError.
if df.index.name != 'AltRefNo':
    df = df.set_index('AltRefNo')

In [None]:
# First element of the 'AdminHistory' field for each collection id;
# collections whose 'AdminHistory' is missing are dropped.
collection_descriptions = (
    df.loc[collection_ids, 'AdminHistory']
    .dropna()
    .apply(lambda history: history[0])
)
collection_descriptions

In [None]:
# Named-entity labels, in their original order.
# NOTE(review): nothing in the visible cells reads this list — confirm whether
# it was meant to filter the extracted entities.
ent_types = [
    'PERSON', 'NORP', 'FACILITY', 'ORG',
    'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LAW', 'LANGUAGE',
]

In [None]:
# Load the English spaCy pipeline. The 'en' shortcut link only exists in
# spaCy 2.x; spaCy 3 removed shortcut links and raises OSError for it, so
# fall back to the explicit package name of the small English model.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [None]:
# Run named-entity recognition over every collection description.
# The per-collection entity lists are gathered in a dict and turned into a
# Series once at the end: growing a Series label by label re-allocates it on
# every assignment, and a dtype-less `pd.Series()` is deprecated (it defaults
# to float64 on older pandas, which is the wrong dtype for lists).
entity_lists = {}
for collection_id, record in tqdm(list(collection_descriptions.items())):
    # Descriptions contain HTML markup; strip it before tagging.
    plain_text = BeautifulSoup(record, 'html.parser').get_text()
    doc = nlp(plain_text)
    entity_lists[collection_id] = [str(ent) for ent in doc.ents]

# One list of entity strings per collection id, in description order.
entities_in_descriptions = pd.Series(entity_lists, dtype=object)

In [None]:
def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list.

    Only a single level of nesting is removed, and items keep their order.
    """
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

In [None]:
# Distinct entity strings found across all descriptions (np.unique returns them sorted).
unique_entities = np.unique(flatten(entities_in_descriptions))

In [None]:
# Count matrix: one row per collection, one column per unique entity. Each cell
# is the number of times that entity string appears in the collection's entity list.
occurences = pd.DataFrame(
    data=[
        [entities_in_descriptions[collection_id].count(name) for name in unique_entities]
        for collection_id in tqdm(collection_descriptions.index.values)
    ],
    index=collection_descriptions.index.values,
    columns=unique_entities,
)

In [None]:
# Entity-by-entity matrix: entry (i, j) is the sum over collections of
# count(entity i) * count(entity j), i.e. occurences^T . occurences.
adjacency = pd.DataFrame(
    data=np.dot(occurences.values.T, occurences.values),
    index=unique_entities,
    columns=unique_entities,
)

In [None]:
# Sanity check: rows and columns are both `unique_entities`, so this should be square.
adjacency.shape

In [None]:
# Build a graph with one node per entity from the entity-by-entity matrix.
# NOTE(review): the diagonal of `adjacency` is each entity's product with itself,
# so it is presumably non-zero and would add self-loops — confirm that is intended.
G = nx.from_pandas_adjacency(adjacency)

In [None]:
# Community id for every node (in the graph's node order), used as the node colour.
# NOTE(review): neither the partition nor the spring layout is seeded here, so the
# picture presumably differs between runs — confirm whether that matters.
partitions = community.best_partition(G)
values = list(map(partitions.get, G.nodes()))

nx.draw(G,
        pos=nx.spring_layout(G),
        node_size=60,
        node_color=values)

In [None]:
# Reduce each entity's row of `adjacency` to 50 principal components.
# Rows keep the order of `adjacency` (one row per entity).
# NOTE(review): this presumably fails when there are fewer than 50 entities, and the
# PCA solver may be randomised for large inputs — consider random_state; confirm.
embedding_nd = pd.DataFrame(PCA(n_components=50)
                              .fit_transform(adjacency))

In [None]:
# Project the PCA embedding to two dimensions for plotting and clustering.
# UMAP is stochastic; a fixed random_state makes the layout (and therefore the
# clusters derived from it) reproducible across kernel restarts.
embedding_2d = pd.DataFrame(
    UMAP(n_components=2, random_state=42).fit_transform(embedding_nd.values)
)

embedding_2d.plot.scatter(x=0, y=1);

In [None]:
n_clusters = 8

# Cluster on the two coordinate columns only. Using `embedding_2d.values` would
# also feed the previous 'labels' column back in as a feature whenever this cell
# is re-run. random_state pins the cluster numbering so that a cluster picked by
# number refers to the same group on every run; n_init=10 spells out the
# historical default explicitly.
embedding_2d['labels'] = (
    KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    .fit_predict(embedding_2d[[0, 1]].values)
)
embedding_2d.plot.scatter(x=0, y=1, c='labels', cmap='Paired');

In [None]:
cluster = 7

# The rows of `embedding_2d` follow the rows of `adjacency`, i.e. they are
# entities in `unique_entities` order, not collections. Indexing the collection
# ids with those positions would return unrelated collections (or fail when
# there are more entities than collections), so translate positions to entity
# names first.
entity_positions = embedding_2d[embedding_2d['labels'] == cluster].index.values
entities_in_cluster = unique_entities[entity_positions]

# A collection belongs to the cluster when its description mentions at least
# one of the cluster's entities.
mentions_cluster_entity = (occurences[entities_in_cluster] > 0).any(axis=1)
collection_ids_in_cluster = occurences.index.values[mentions_cluster_entity.values]
collection_ids_in_cluster

In [None]:
# How often each entity string occurs across the selected collections, most frequent first.
(pd.Series(flatten(entities_in_descriptions[collection_ids_in_cluster]))
 .value_counts())