# images of graphy subjects

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Plain white seaborn style, plus a large default figure size for the graph plots below.
sns.set_style('white')
plt.rcParams.update({'figure.figsize': (20, 20)})

import itertools
import re

import pandas as pd
import numpy as np
import networkx as nx

from sklearn.cluster import AgglomerativeClustering

from umap import UMAP
from tqdm import tqdm_notebook as tqdm

In [None]:
def flatten(input_list):
    """Concatenate the items of every sub-iterable of `input_list` into one list.

    Exactly one level of nesting is removed and item order is preserved.
    """
    flat = []
    for sublist in input_list:
        flat.extend(sublist)
    return flat

def clean(subject):
    """Normalise a raw subject string.

    The string is lower-cased, every '<p>' tag is removed, and surrounding
    whitespace is trimmed. Trimming happens last so that whitespace which
    only becomes leading/trailing once a tag is removed (e.g. '<p> Foo')
    does not survive in the result.
    """
    return subject.lower().replace('<p>', '').strip()

In [None]:
# Read the records export (path is relative to the notebook's working directory).
records_path = 'data/calm_records.json'
df = pd.read_json(records_path)

In [None]:
def _first_reference(value):
    """Return the first entry of a list-valued AltRefNo; pass anything else through."""
    if isinstance(value, (list, tuple)):
        # An empty list has no reference to keep.
        return value[0] if value else np.nan
    return value


# The isinstance guard keeps this cell safe to re-run: once AltRefNo has been
# collapsed to a single value, indexing it again with [0] would silently
# truncate a string reference to its first character.
df['AltRefNo'] = df['AltRefNo'].dropna().apply(_first_reference)
# Clean every subject string attached to a record; rows without subjects stay missing.
df['Subject'] = df['Subject'].dropna().apply(lambda raw: [clean(item) for item in raw])

In [None]:
# Every subject string attached to any record, duplicates included.
dirty_subjects = flatten(df['Subject'].dropna().tolist())
# Unique cleaned subjects. Sorted so the row/column order of the matrices built
# from this list is the same on every run; the iteration order of a set of
# strings can differ between interpreter sessions.
subjects = sorted(set(map(clean, dirty_subjects)))

In [None]:
# Square subject-by-subject matrix of co-occurrence counts, initialised to zero.
adjacency = pd.DataFrame(0, index=subjects, columns=subjects)

In [None]:
# For every pair of subjects, count the records that carry both of them.
# Self-pairs are included, so the diagonal holds each subject's own record count.
for row_of_subjects in tqdm(df['Subject'].dropna()):

    # De-duplicate within the record so a repeated subject is counted once.
    clean_row = list({clean(subject) for subject in row_of_subjects})
    if not clean_row:
        continue

    # A single label-based block update covers every ordered pair of the row.
    # Chained indexing (`adjacency[a][b] += 1`) writes through an intermediate
    # Series and is not guaranteed to reach the frame; with pandas
    # Copy-on-Write enabled it never does.
    adjacency.loc[clean_row, clean_row] += 1

In [None]:
# Build a graph from the full co-occurrence matrix and draw it with a spring layout.
G = nx.from_pandas_adjacency(adjacency)
nx.draw(G, pos=nx.spring_layout(G), node_size=60)

### quite complicated
Let's get rid of the rare subjects and only look at the common ones.

# Number of occurrences of each cleaned subject across all records.
subject_counts = pd.Series(map(clean, dirty_subjects)).value_counts()
# Subjects that occur fewer than 10 times.
is_rare = subject_counts < 10
rare_subjects = subject_counts[is_rare].index.values

# How many subjects remain once the rare ones are excluded.
len(subjects) - len(rare_subjects)

# Remove the rare subjects from both axes of the co-occurrence matrix.
mini_adjacency = adjacency.drop(index=rare_subjects, columns=rare_subjects)

In [None]:
# Zero out weak links: any pair of subjects that co-occurs fewer than 10 times.
# The mask is applied to the filtered frame itself. Applying it to the full
# `adjacency` returns a frame with the full matrix's shape, which re-introduces
# the rows and columns of the rare subjects that were just dropped.
mini_adjacency = mini_adjacency.mask(mini_adjacency < 10, 0)
mini_adjacency

In [None]:
# Graph of the thresholded co-occurrence matrix, drawn with a spring layout.
G = nx.from_pandas_adjacency(mini_adjacency)

nx.draw(G, pos=nx.spring_layout(G), node_size=60)

Lots of orphans... let's get rid of those.

In [None]:
# A subject is an orphan when its column holds nothing beyond its own diagonal
# entry, i.e. it has no remaining link to any other subject.
orphans = [subject for subject in mini_adjacency.index.values
           if mini_adjacency[subject].sum() == mini_adjacency[subject][subject]]

# Drop the orphans from the thresholded frame. Dropping them from the full
# `adjacency` instead would discard the rare-subject filter and the
# weak-link threshold applied above.
mini_adjacency = mini_adjacency.drop(index=orphans, columns=orphans)

In [None]:
# Rebuild the graph now that orphan subjects are gone.
G = nx.from_pandas_adjacency(mini_adjacency)

# nx.draw falls back to a spring layout when no positions are given,
# so the layout is requested explicitly here.
nx.draw_spring(G, node_size=60)

### now clustering

In [None]:
# Project each row of the subject matrix down to two dimensions.
# UMAP is stochastic; a fixed random_state makes the embedding, and the
# cluster labels derived from it, repeatable across runs.
embedding_2d = pd.DataFrame(UMAP(n_components=2, random_state=42)
                            .fit_transform(mini_adjacency))

In [None]:
n_clusters = 15

# Cluster on the coordinate columns only. Without this, re-running the cell
# would feed the previously assigned 'labels' column back in as a feature.
coordinates = embedding_2d.drop(columns='labels', errors='ignore').values
embedding_2d['labels'] = (AgglomerativeClustering(n_clusters=n_clusters)
                          .fit_predict(coordinates))

In [None]:
# Spring-layout drawing with nodes coloured by cluster label; the low alpha
# keeps the dense graph readable.
nx.draw(G,
        pos=nx.spring_layout(G),
        node_size=40,
        node_color=embedding_2d['labels'],
        alpha=0.1)

This thing is a real mess: subjects behave weirdly, and seem to be either weird one-offs or super-connected to everything. Still, we can extract little structures and clusters. It's fine. Let's start using them.

# connect records by subject

In [None]:
# Random sample of 1000 records that have subjects, indexed by reference number.
# The sample is seeded so that the matrices and plots below are repeatable.
with_subjects = (df[df['Subject'].notna()]
                 .set_index('AltRefNo')
                 .sample(1000, random_state=42))

In [None]:
# String labels for the sampled records.
# NOTE(review): nan_to_num looks intended to neutralise missing reference
# numbers; confirm it has any effect on a non-numeric index, since several
# missing values would otherwise end up sharing one label.
raw_labels = with_subjects.index.values
indicies = np.nan_to_num(raw_labels).astype(str)

In [None]:
# Re-key the sample by the string labels so lookups match the matrix labels.
with_subjects.index = pd.Index(indicies)

In [None]:
# Square record-by-record matrix of shared-subject counts, initialised to zero.
# Note: this rebinds `adjacency`, which held the subject matrix earlier on.
adjacency = pd.DataFrame(0, index=indicies, columns=indicies)

In [None]:
# Build each record's subject set once, rather than once per pair.
subject_sets = {record: set(record_subjects)
                for record, record_subjects in with_subjects['Subject'].items()}

# Number of unordered pairs, so the progress bar has a total without
# materialising the full list of combinations.
n_records = len(indicies)
n_pairs = n_records * (n_records - 1) // 2

for record_1, record_2 in tqdm(itertools.combinations(indicies, r=2), total=n_pairs):
    shared = len(subject_sets[record_1] & subject_sets[record_2])
    if shared:
        # The shared-subject count is symmetric, so both entries are written:
        # filling a single triangle makes column sums miss links stored in
        # the other triangle. `.at` writes straight into the frame, unlike
        # chained indexing (`adjacency[a][b] = ...`).
        adjacency.at[record_1, record_2] = shared
        adjacency.at[record_2, record_1] = shared

In [None]:
# A record is an orphan when it has no link to any other record. Rows and
# columns are both inspected, so the test is correct whether links were stored
# in one triangle of the matrix or in both; the diagonal is excluded.
link_totals = adjacency.sum(axis=0).values + adjacency.sum(axis=1).values
off_diagonal = link_totals - 2 * np.diag(adjacency.values)
orphans = adjacency.index[off_diagonal == 0].tolist()

adjacency = adjacency.drop(index=orphans, columns=orphans)

In [None]:
# Graph of records linked by shared subjects.
G = nx.from_pandas_adjacency(adjacency)

# nx.draw falls back to a spring layout when no positions are given,
# so the layout is requested explicitly here.
nx.draw_spring(G, node_size=60)

In [None]:
# Display the list of unique cleaned subjects built earlier in the notebook.
subjects

In [None]:
# Project each row of the record matrix down to two dimensions.
# UMAP is stochastic; a fixed random_state makes the embedding, and the
# cluster labels derived from it, repeatable across runs.
embedding_2d = pd.DataFrame(UMAP(n_components=2, random_state=42)
                            .fit_transform(adjacency))

In [None]:
n_clusters = 15

# Cluster on the coordinate columns only. Without this, re-running the cell
# would feed the previously assigned 'labels' column back in as a feature.
coordinates = embedding_2d.drop(columns='labels', errors='ignore').values
embedding_2d['labels'] = (AgglomerativeClustering(n_clusters=n_clusters)
                          .fit_predict(coordinates))

In [None]:
# Spring-layout drawing of the record graph, nodes coloured by cluster label.
nx.draw(G,
        pos=nx.spring_layout(G),
        node_size=40,
        node_color=embedding_2d['labels'])

# connect records by subject _and_ hierarchy

In [None]:
# Prefix used to select reference numbers.
alt_ref_no = 'GC'

# All reference numbers starting with that prefix, sorted; missing values never match.
has_prefix = df['AltRefNo'].str.startswith(alt_ref_no, na=False)
codes_as_str = sorted(df.loc[has_prefix, 'AltRefNo'].tolist())

In [None]:
# Map each reference number to its list of components, split on '/' or '.'.
# The pattern is a raw string: in a plain literal '\.' is an invalid escape
# sequence, which current Python versions warn about.
codes = {code: re.split(r'/|\.', code.strip())
         for code in codes_as_str}

In [None]:
# Square code-by-code matrix for parent/child links, initialised to zero.
adjacency_tree = pd.DataFrame(0, index=codes_as_str, columns=codes_as_str)

In [None]:
# Index the codes by their component path so that a child's parent can be
# looked up directly instead of comparing every pair of codes.
codes_by_path = {}
for code_str, code_list in codes.items():
    codes_by_path.setdefault(tuple(code_list), []).append(code_str)

for child_str, child_list in tqdm(codes.items()):
    # A parent is any code whose path equals the child's path minus its last component.
    for parent_str in codes_by_path.get(tuple(child_list[:-1]), ()):
        # Row = child, column = parent: the same cell the chained form
        # `adjacency_tree[parent_str][child_str]` addressed, but written
        # straight into the frame.
        adjacency_tree.at[child_str, parent_str] = 1

In [None]:
# Add the hierarchy matrix and the record matrix label by label; a label that
# appears in only one of the two frames contributes 0 from the other.
combined = adjacency_tree.add(adjacency, fill_value=0)
# Cells missing from both frames come out as NaN; treat them as "no link".
ack = combined.fillna(0)

In [None]:
# A node is an orphan when it has no link to any other node. The hierarchy
# links are stored in one direction only (row = child, column = parent), so
# rows and columns are both inspected; a column-only test would discard codes
# whose only link is to their parent. The diagonal is excluded.
link_totals = ack.sum(axis=0).values + ack.sum(axis=1).values
off_diagonal = link_totals - 2 * np.diag(ack.values)
orphans = ack.index[off_diagonal == 0].tolist()

ack = ack.drop(index=orphans, columns=orphans)

In [None]:
# (rows, columns) of the combined matrix after orphan removal.
ack.shape

In [None]:
# Undirected graph of the combined hierarchy + shared-subject matrix.
G = nx.from_pandas_adjacency(ack, create_using=nx.Graph)

In [None]:
# Spring-layout drawing of the combined graph.
nx.draw(G, pos=nx.spring_layout(G), node_size=40)

In [None]:
# Project each row of the combined matrix down to two dimensions.
# UMAP is stochastic; a fixed random_state makes the embedding, and the
# cluster labels derived from it, repeatable across runs.
embedding_2d = pd.DataFrame(UMAP(n_components=2, random_state=42)
                            .fit_transform(ack))

In [None]:
n_clusters = 15

# Cluster on the coordinate columns only. Without this, re-running the cell
# would feed the previously assigned 'labels' column back in as a feature.
coordinates = embedding_2d.drop(columns='labels', errors='ignore').values
embedding_2d['labels'] = (AgglomerativeClustering(n_clusters=n_clusters)
                          .fit_predict(coordinates))

In [None]:
# Spring-layout drawing of the combined graph, nodes coloured by cluster label.
nx.draw(G,
        pos=nx.spring_layout(G),
        node_size=40,
        node_color=embedding_2d['labels'])

In [None]:
# Labels present both among the prefixed reference numbers and among the
# sampled records, compared after trimming surrounding whitespace.
code_labels = {label.strip() for label in np.array(codes_as_str).astype(str)}
record_labels = {label.strip() for label in np.array(indicies).astype(str)}
code_labels & record_labels