In [None]:
%matplotlib inline
from pathlib import Path
from itertools import combinations
import sys
sys.path.append("..")
from importlib import reload

import numpy as np
import scipy as sp
from scipy.cluster.hierarchy import linkage, fcluster
import pandas as pd
from matplotlib import pyplot as plt
import igraph

import umap

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, OPTICS

import pythd

In [None]:
# Configuration: where the HELOC data lives and which clustering settings to compare.
CUR_DIR = Path.cwd()
# The data directory is a sibling of the current working directory.
DATA_DIR = CUR_DIR.parent.joinpath('data')

HELOC_NAME = 'heloc_dataset_v1.csv'
HELOC_PATH = DATA_DIR.joinpath(HELOC_NAME)

# Distance metric shared by the linkage, filter, THD and pairwise-distance calls.
METRIC = 'euclidean'
# Linkage methods that are compared against each other.
CLUSTER_METHODS = ['complete', 'average']

In [None]:
# Recode tables for the two coded delinquency columns.
# NOTE(review): the targets (0/30/60/90/120) look like days delinquent — confirm
# against the dataset's data dictionary.
# Any code that is not a key here becomes NaN under Series.map, so rows carrying
# such codes are removed by the dropna() below.
delq_last_12m_recode = {0: 0, 1: 120, 2: 90, 3: 60, 4: 30, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
delq_ever_recode = {1: 0, 2: 0, 3: 120, 4: 90, 5: 60, 6: 30, 7: 0, 8: 0, 9: 0}

df = pd.read_csv(HELOC_PATH, dtype={'RiskPerformance': 'category'})
df['MaxDelq2PublicRecLast12M'] = df['MaxDelq2PublicRecLast12M'].map(delq_last_12m_recode)
df['MaxDelqEver'] = df['MaxDelqEver'].map(delq_ever_recode)

# Drops every row that has a NaN in any column (including the two recoded ones).
df = df.dropna()

# Features: everything except the target and 'ExternalRiskEstimate', standardized.
feature_frame = df.drop(columns=['RiskPerformance', 'ExternalRiskEstimate'])
X = StandardScaler().fit_transform(feature_frame.values.astype(np.float32))
# Target: integer category codes of 'RiskPerformance'.
y = df['RiskPerformance'].cat.codes.values

In [None]:
# One linkage matrix per entry of CLUSTER_METHODS, all built from the same
# feature matrix X and the same distance metric.
Z_mats = dict(
    (method, linkage(X, method=method, metric=METRIC))
    for method in CLUSTER_METHODS
)

In [None]:
# UMAP is stochastic: without a fixed random_state the 2-D lens (and therefore
# the cover, the THD and every plot derived from it) changes on each
# Restart & Run All. Pinning the seed makes the notebook reproducible.
# NOTE(review): assumes ScikitLearnFilter forwards keyword arguments to the
# estimator constructor, as it already must for n_components/n_neighbors/min_dist
# — confirm.
UMAP_RANDOM_STATE = 42

filt = pythd.filter.ScikitLearnFilter(umap.UMAP, n_components=2, n_neighbors=9, min_dist=0.01,
                                      metric=METRIC, random_state=UMAP_RANDOM_STATE)
# Filter (lens) values of X computed by applying the filter object.
f_x = filt(X)

In [None]:
# Development convenience: re-import pythd and the listed submodules so that
# on-disk edits are picked up without restarting the kernel.
reload(pythd)
reload(pythd.clustering)
reload(pythd.mapper)
reload(pythd.thd)

# Interval cover built from the filter values.
# NOTE(review): 200 and 0.5 are presumably the number of intervals and the
# overlap fraction — confirm against pythd.cover.IntervalCover.
cov = pythd.cover.IntervalCover.EvenlySpacedFromValues(f_x, 200, 0.5)
clustering = pythd.clustering.HierarchicalClustering(method='complete', metric='precomputed')
#clustering = pythd.clustering.ScikitLearnClustering(OPTICS, min_samples=2, n_jobs=1, max_eps=100.0, metric='precomputed')
#clustering = pythd.clustering.ScikitLearnClustering(DBSCAN, n_jobs=1, metric='precomputed')
thd = pythd.thd.THD(X, filt, cov, full_df=X, clustering=clustering,
                    group_threshold=2, contract_amount=0.1,
                    precompute=True, metric=METRIC)

# Silence divide-by-zero / invalid-value floating point warnings only while the
# THD runs. np.errstate restores the previous settings on exit even when
# thd.run() raises, which a manual np.seterr()/restore pair does not.
with np.errstate(divide='ignore', invalid='ignore'):
    groups = thd.run(verbose=True)

In [None]:
# Convert the THD result to an igraph graph and draw it with a
# Reingold-Tilford layout rooted at vertex 0.
g = groups.as_igraph_graph()

vs = dict(
    margin=40,
    bbox=(700, 300),
    #vertex_label=g.vs["name"],
    vertex_label_size=10,
    vertex_size=5,
    vertex_label_dist=1.5,
    vertex_label_angle=0,
    layout=g.layout_reingold_tilford(root=[0]),
)

# Kept as the last expression so the notebook renders the plot.
igraph.plot(g, **vs)

In [None]:
def make_cut_params(method):
    """Return the keyword arguments used when cutting the THD on a distance.

    Parameters
    ----------
    method
        Value stored under ``'cluster_method'`` (the notebook passes entries
        of ``CLUSTER_METHODS``).

    Returns
    -------
    dict
        ``'combine_method'`` fixed to ``'max'``, ``'cluster_method'`` set to
        ``method`` and ``'metric'`` set to the module-level ``METRIC``.
    """
    return dict(combine_method='max', cluster_method=method, metric=METRIC)

# Drop the reference to the igraph graph created for plotting.
g = None
# Z[-1, 2] is the distance entry of the last row of each linkage matrix; the
# largest one across methods sets the upper end of the cut-distance grid.
max_dist = max(Z[-1, 2] for Z in Z_mats.values())
# 50 evenly spaced cut distances from 0 to max_dist (inclusive).
dists = np.linspace(0.0, max_dist, num=50)

In [None]:
# Flat cluster labels from each linkage matrix, one label array per cut distance.
hc_labels = {}
for name, Z in Z_mats.items():
    hc_labels[name] = [fcluster(Z, t=dist, criterion='distance') for dist in dists]

# Labels from cutting the THD at the same distances; element [1] of the value
# returned by cut_on_distance is used as the label array.
thd_labels = {}
for name in CLUSTER_METHODS:
    cut_params = make_cut_params(name)
    thd_labels[name] = [groups.cut_on_distance(dist, **cut_params)[1] for dist in dists]

# Number of clusters as a function of the cut distance (log-scaled y axis).
fig, ax = plt.subplots()
for name, labels in hc_labels.items():
    cluster_counts = [np.unique(y_pred).size for y_pred in labels]
    ax.semilogy(dists, cluster_counts, label=name)
for name, labels in thd_labels.items():
    # One label is subtracted from the THD counts.
    # NOTE(review): presumably one label value marks ungrouped points — confirm.
    cluster_counts = [np.unique(y_pred).size - 1 for y_pred in labels]
    ax.semilogy(dists, cluster_counts, label='thd ({})'.format(name))
ax.legend(loc='best')
ax.set_xlabel("distance")
ax.set_ylabel("num. clusters")
_ = plt.show()

In [None]:
def _ami_curve(label_sets):
    """Adjusted mutual information between y and each label array in label_sets."""
    scores = [metrics.adjusted_mutual_info_score(y, y_pred) for y_pred in label_sets]
    return np.array(scores)


# One AMI curve (one score per cut distance) per clustering method.
hc_ami = {name: _ami_curve(labels) for name, labels in hc_labels.items()}
thd_ami = {name: _ami_curve(labels) for name, labels in thd_labels.items()}

fig, ax = plt.subplots()
for name, values in hc_ami.items():
    ax.plot(dists, values, label=name)
for name, values in thd_ami.items():
    ax.plot(dists, values, label='thd ({})'.format(name))
ax.legend(loc='best')
ax.set_xlabel("Distance")
ax.set_ylabel("Adjusted Mutual Information")
_ = plt.show()

In [None]:
# Adjusted Rand score of every flat cut against the true labels y.
hc_ars = {}
for name, labels in hc_labels.items():
    hc_ars[name] = np.array([metrics.adjusted_rand_score(y, y_pred) for y_pred in labels])

thd_ars = {}
for name, labels in thd_labels.items():
    thd_ars[name] = np.array([metrics.adjusted_rand_score(y, y_pred) for y_pred in labels])

# Plain hierarchical clustering curves first, then the THD curves.
fig, ax = plt.subplots()
for name, values in hc_ars.items():
    ax.plot(dists, values, label=name)
for name, values in thd_ars.items():
    ax.plot(dists, values, label='thd ({})'.format(name))
ax.legend(loc='best')
ax.set_xlabel("Distance")
ax.set_ylabel("Adjusted Rand Score")
_ = plt.show()

In [None]:
# Full pairwise distance matrix, computed once and reused for every cut.
pairwise = metrics.pairwise_distances(X, metric=METRIC)


def _silhouette_by_distance(label_sets):
    """Score every cut for which the silhouette coefficient is defined.

    Parameters
    ----------
    label_sets
        Sequence of label arrays, aligned element-wise with ``dists``.

    Returns
    -------
    (kept_dists, scores) : tuple of np.ndarray
        ``scores[i]`` is the silhouette score of the cut at ``kept_dists[i]``.
        Cuts with fewer than 2 distinct labels, or with as many distinct
        labels as samples, are skipped because ``silhouette_score`` raises a
        ValueError for them.
    """
    n_samples = pairwise.shape[0]
    kept_dists, scores = [], []
    for dist, y_pred in zip(dists, label_sets):
        n_labels = np.unique(y_pred).shape[0]
        if 1 < n_labels < n_samples:
            kept_dists.append(dist)
            scores.append(metrics.silhouette_score(pairwise, y_pred, metric='precomputed'))
    return np.array(kept_dists), np.array(scores)


# hc_sil / thd_sil keep their original shape (name -> array of scores); the
# *_sil_dists dictionaries record which distance each score belongs to, so the
# curves stay correctly aligned even when a skipped cut is not at the tail.
hc_sil, hc_sil_dists = {}, {}
for name, labels in hc_labels.items():
    hc_sil_dists[name], hc_sil[name] = _silhouette_by_distance(labels)

thd_sil, thd_sil_dists = {}, {}
for name, labels in thd_labels.items():
    thd_sil_dists[name], thd_sil[name] = _silhouette_by_distance(labels)

for name, values in hc_sil.items():
    plt.plot(hc_sil_dists[name], values, label=name)
for name, values in thd_sil.items():
    plt.plot(thd_sil_dists[name], values, label='thd ({})'.format(name))
plt.legend(loc='best')
plt.xlabel('Distance')
plt.ylabel('Silhouette Score')
_ = plt.show()