In [None]:
from IPython.core.display import display, HTML

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
plt.rcParams['figure.figsize'] = (30, 30)

import pandas as pd
import numpy as np
import networkx as nx

import re

from umap import UMAP
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from tqdm import tqdm_notebook as tqdm

In [None]:
# Load the records from calm_records.json into a DataFrame; later cells read from `df`.
df = pd.read_json('calm_records.json')

# Looking for trees
- Trees are encoded in the `AltRefNo` column.
- Heads of trees are identified by the `Level` column.
- The arrangement is described in the `Arrangement` column.

In [None]:
# Take the first `Arrangement` entry of record 269057 and render it as HTML.
arrangement_html = df['Arrangement'][269057][0]
display(HTML(arrangement_html))

In [None]:
def _first_item(value):
    """Return the first element of a list-like cell.

    Values that are not lists/tuples are returned unchanged, so re-running
    this cell does not strip already-unwrapped strings down to their first
    character. An empty list yields None instead of raising IndexError.
    """
    if isinstance(value, (list, tuple)):
        return value[0] if value else None
    return value


# Unwrap the list-valued cells of both columns; rows that were missing
# stay missing because the result is re-aligned on the index on assignment.
for column in ('AltRefNo', 'Level'):
    df[column] = df[column].dropna().apply(_first_item)

In [None]:
# Root reference code whose hierarchy is explored in the cells below.
# The earlier per-record lookup was immediately overwritten, so only the
# effective value is kept; to start from a specific record instead, assign
# that record's `AltRefNo` value here.
alt_ref_no = 'PENROSE'

In [None]:
# Collect, in sorted order, every reference code that begins with the root code.
alt_ref_nos = df['AltRefNo']
# Missing reference codes produce NaN in the mask; treat them as non-matches.
has_root_prefix = alt_ref_nos.str.startswith(alt_ref_no).fillna(False)
codes_as_str = alt_ref_nos[has_root_prefix].tolist()
codes_as_str.sort()

In [None]:
# Split each reference code into its hierarchy tokens on '/' or '.'.
# The result is kept in the same order as `codes_as_str` (no independent
# sort) so that position i here always corresponds to codes_as_str[i];
# sorting the token lists separately can order them differently from the
# raw strings and mis-pair codes with tokens when the two are zipped.
# A raw string is used so the regex needs no invalid '\.' escape.
codes_as_list = [re.split(r'[/.]', code) for code in codes_as_str]

In [None]:
# Map each reference code to the token list at the same position.
codes = {code: tokens for code, tokens in zip(codes_as_str, codes_as_list)}
# Number of distinct reference codes in the mapping.
len(codes)

In [None]:
# Square, zero-filled matrix with one row and one column per reference code.
# Labels are de-duplicated (order preserved) because duplicate row/column
# labels would make single-cell lookups and assignments ambiguous.
node_labels = list(dict.fromkeys(codes_as_str))
adjacency = pd.DataFrame(data=0,
                         index=node_labels,
                         columns=node_labels)

In [None]:
# Sanity check before linking: flatten the matrix row by row and tally each cell value.
cells_before = np.hstack(adjacency.values)
pd.Series(cells_before).value_counts()

In [None]:
# Link every code to its direct parent(s): a parent is any code whose token
# list equals the child's token list with the final token removed.

# Index codes by their token tuple so parents are found by lookup instead of
# comparing every pair of codes.
codes_by_tokens = {}
for code, tokens in codes.items():
    codes_by_tokens.setdefault(tuple(tokens), []).append(code)

for child, tokens in tqdm(codes.items()):
    for parent in codes_by_tokens.get(tuple(tokens[:-1]), ()):
        # Row = child, column = parent: the same cell the chained form
        # adjacency[parent][child] addressed. A single .loc assignment is
        # used because chained assignment may write to a temporary copy.
        # Assigning 1 (rather than incrementing) keeps the cell re-runnable.
        adjacency.loc[child, parent] = 1

In [None]:
# Sanity check after linking: flatten the matrix row by row and tally each cell value.
cells_after = np.hstack(adjacency.values)
pd.Series(cells_after).value_counts()

In [None]:
# Build a networkx graph from the adjacency matrix; the nodes are the
# reference codes used as its row/column labels.
G = nx.from_pandas_adjacency(adjacency)

In [None]:
# Draw G with networkx's spring layout, using small (size 60) node markers.
nx.draw_spring(G, node_size=60)

Adjacency matrices can also be represented on a 2D plane using dimensionality reduction techniques more widely used in ML, e.g. PCA, t-SNE and UMAP. Here's what the above graph looks like under UMAP:

In [None]:
# Project each row of the adjacency matrix to two dimensions with UMAP.
# NOTE(review): no random_state is passed, so the embedding presumably
# differs between runs - consider fixing a seed for reproducibility.
reducer = UMAP(n_components=2)
embedding_2d = pd.DataFrame(reducer.fit_transform(adjacency))

# Scatter the two embedding coordinates (columns 0 and 1).
embedding_2d.plot.scatter(x=0, y=1);

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Number of clusters to cut the embedding into.
n_clusters = 15

# Cluster on the embedding coordinates only. Any 'labels' column left over
# from a previous run of this cell is excluded, so re-running the cell does
# not feed old cluster labels back in as an extra feature.
coordinates = embedding_2d.drop(columns='labels', errors='ignore').values
embedding_2d['labels'] = AgglomerativeClustering(n_clusters).fit_predict(coordinates)

# Same scatter as before, coloured by cluster label.
embedding_2d.plot.scatter(x=0, y=1, c='labels', cmap='Paired');

In [None]:
# Redraw the spring layout, colouring each node by the cluster label
# stored in the embedding frame.
cluster_labels = embedding_2d['labels']
nx.draw_spring(G, node_color=cluster_labels, node_size=100)

In [None]:
# Print, in sorted order, the reference code of every record whose Level is 'Collection'.
is_collection = df['Level'] == 'Collection'
collection_ids = df['AltRefNo'][is_collection].values
for collection_id in sorted(collection_ids):
    print(collection_id)

In [None]:
# Boolean mask of the rows whose AltRefNo equals the root code exactly
# (as opposed to merely starting with it).
df['AltRefNo'].eq(alt_ref_no)