In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Plain white seaborn style and a large default figure size for every plot below.
sns.set_style(style='white')
plt.rcParams.update({'figure.figsize': (20, 20)})

import pandas as pd
import numpy as np
import networkx as nx

from umap import UMAP
from tqdm import tqdm_notebook as tqdm

In [None]:
def flatten(list):
    """Concatenate the items of every sub-iterable of `list` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    # NOTE: the parameter name shadows the built-in `list`; it is kept so that
    # existing keyword callers keep working.
    flat = []
    for sublist in list:
        flat.extend(sublist)
    return flat

def cartesian(*arrays):
    """Return all combinations of elements drawn from the given 1-D inputs.

    The result is a 2-D array with one row per combination and one column per
    input, in the order produced by ``np.meshgrid`` with its default indexing.
    """
    grids = np.meshgrid(*arrays)
    # Flatten each coordinate grid so it becomes one column of the result.
    columns = [grid.reshape(-1) for grid in grids]
    return np.transpose(np.array(columns))

def clean_subject(subject):
    """Normalise a raw subject string.

    Lower-cases the text, removes every ``<p>`` tag (an upper-case ``<P>`` is
    caught too because lower-casing happens first) and trims surrounding
    whitespace.

    Parameters
    ----------
    subject : str
        Raw subject text.

    Returns
    -------
    str
        The cleaned subject.
    """
    # Strip last: removing a tag can expose whitespace that sat between the
    # tag and the text (e.g. '<p> Foo'), which stripping first would miss.
    return subject.lower().replace('<p>', '').strip()

In [None]:
# Load the records export into a DataFrame.
df = pd.read_json(path_or_buf='calm_records.json')

In [None]:
# Number of records (rows) loaded.
df.shape[0]

In [None]:
# Summarise every column after casting to str
# (presumably because some columns hold unhashable values such as lists - confirm).
(
    df
    .astype(str)
    .describe()
)

# explore columns

In [None]:
# Column labels of the records frame.
df.columns.tolist()

In [None]:
# Frequency of each distinct 'Subject' value, compared by its string form.
subject_as_text = df['Subject'].astype(str)
subject_as_text.value_counts()

# Subjects look interesting
There is a lot of intersection here - let's make an adjacency matrix.

In [None]:
# Unique raw subject strings across all records that have a 'Subject' value.
# sorted() (instead of list(set(...))) gives every subject the same position on
# each run; plain set iteration order for strings changes between kernel
# restarts because of hash randomisation.
subjects = sorted(set(flatten(df['Subject'].dropna().tolist())))

# Cleaned version of each subject, index-aligned with `subjects`.
clean_subjects = [clean_subject(subject) for subject in subjects]

In [None]:
# position -> cleaned subject text (used when printing cluster members)
index_to_subject = dict(enumerate(clean_subjects))
# raw subject text -> position (used to address rows/columns of the matrix)
subject_to_index = dict(zip(subjects, range(len(subjects))))

In [None]:
# Square subject-by-subject co-occurrence counts.
# np.zeros, not np.empty: the matrix is filled with `+= 1`, so it must start
# from zero; np.empty returns uninitialised memory and would yield garbage counts.
# NOTE: uint16 wraps silently above 65535 co-occurrences per pair.
n_subjects = len(subjects)
adjacency = np.zeros((n_subjects, n_subjects), dtype=np.uint16)

In [None]:
# For every record, count each ordered pair of its subjects (self-pairs
# included, so the diagonal is incremented as well).
# NOTE: re-running this cell without re-creating `adjacency` adds the counts again.
for row_of_subjects in tqdm(df['Subject'].dropna()):
    for first_subject, second_subject in cartesian(row_of_subjects, row_of_subjects):
        row_index = subject_to_index[first_subject]
        col_index = subject_to_index[second_subject]
        adjacency[row_index, col_index] += 1

In [None]:
# Project each subject's row of the adjacency matrix down to two dimensions.
# UMAP is stochastic: a fixed random_state makes the embedding (and everything
# derived from it below) reproducible across runs.
embedding_2d = pd.DataFrame(UMAP(n_components=2, random_state=42)
                            .fit_transform(adjacency))

# Columns 0 and 1 are the two UMAP components.
embedding_2d.plot.scatter(x=0, y=1);

In [None]:
from sklearn.cluster import AgglomerativeClustering

n_clusters = 15

# Cluster on the two embedding columns only. Using `embedding_2d.values` would
# also feed the 'labels' column back in as a feature whenever this cell is
# re-run, silently changing the result.
embedding_coordinates = embedding_2d[[0, 1]].values
embedding_2d['labels'] = (AgglomerativeClustering(n_clusters=n_clusters)
                          .fit_predict(embedding_coordinates))

# 'tab20' has 20 distinct colours; 'Paired' has only 12, so with 15 clusters
# several clusters would be drawn in the same colour.
embedding_2d.plot.scatter(x=0, y=1, c='labels', cmap='tab20');

In [None]:
# Print the sorted subject names belonging to each cluster, one cluster at a time.
for i in range(n_clusters):
    print('{} {}\n'.format(i, '-' * 80))
    member_indices = embedding_2d.index[embedding_2d['labels'] == i]
    cluster_subjects = [clean_subject(index_to_subject[index])
                        for index in member_indices]
    print(np.sort(cluster_subjects))
    print('\n')