In [None]:
%matplotlib inline
from pathlib import Path
from itertools import combinations
import sys
sys.path.append("..")
from importlib import reload

import numpy as np
import scipy as sp
from scipy.cluster.hierarchy import linkage, fcluster
import pandas as pd
from matplotlib import pyplot as plt
import igraph

from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import pythd

In [None]:
# Configuration: every path is derived from the directory the kernel was
# started in, so the notebook does not depend on an absolute location.
CUR_DIR = Path.cwd()
# The data folder is a sibling of the notebook's working directory.
DATA_DIR = CUR_DIR.parent.joinpath('data')

# File name and full path of the HELOC CSV that the next cell reads.
HELOC_NAME = 'heloc_dataset_v1.csv'
HELOC_PATH = DATA_DIR.joinpath(HELOC_NAME)

In [None]:
# Read the HELOC table with the target column parsed as a categorical, then
# discard every row that contains a NaN.
# NOTE(review): dropna() only removes NaN cells; if this file marks missing
# values with sentinel codes instead, those rows are kept -- confirm against
# the dataset's data dictionary.
df = (
    pd.read_csv(HELOC_PATH, dtype={'RiskPerformance': 'category'})
    .dropna()
)

# Integer category codes of the target column, one per retained row.
y = df['RiskPerformance'].cat.codes.values

# Feature matrix: all columns except the target and ExternalRiskEstimate.
X = df.drop(columns=['RiskPerformance', 'ExternalRiskEstimate']).values

In [None]:
# Hierarchical clustering of the rows of X: average linkage over cosine
# distances. Z is the linkage matrix consumed by fcluster in later cells.
Z = linkage(
    X,
    metric='cosine',
    method='average',
)

In [None]:
# Sweep 50 evenly spaced cut heights from 0 up to just below Z[-1, 2], the
# height recorded for the last row of the linkage matrix. Stopping 1e-4 short
# presumably keeps the final merge from being applied, so the coarsest cut
# still has more than one cluster.
dists = np.linspace(0.0, Z[-1, 2] - 1e-4, num=50)

# One flat cluster assignment per cut height; reused by every scoring cell.
labels = [fcluster(Z, t=dist, criterion='distance') for dist in dists]

# Number of distinct cluster ids produced at each cut height.
nlabs = [len(np.unique(y_pred)) for y_pred in labels]

# Log-scaled y axis for the cluster counts. Axis labels let the figure be
# read on its own.
plt.semilogy(dists, nlabs)
plt.xlabel("Distance")
plt.ylabel("Number of Clusters")
plt.show()

In [None]:
# Adjusted Rand score between the true classes y and the flat clustering at
# each cut height, plotted against the cut height.
adj_rand = np.array(
    [metrics.adjusted_rand_score(y, cut_labels) for cut_labels in labels]
)

plt.plot(dists, adj_rand)
plt.ylabel("Adjusted Rand Score")
plt.xlabel("Distance")
_ = plt.show()

In [None]:
# Adjusted mutual information between the true classes y and the flat
# clustering at each cut height, plotted against the cut height.
ami = np.array(
    [metrics.adjusted_mutual_info_score(y, cut_labels) for cut_labels in labels]
)

plt.plot(dists, ami)
plt.ylabel("Adjusted Mutual Information")
plt.xlabel("Distance")
_ = plt.show()

In [None]:
# Calinski-Harabasz score of the flat clustering at each cut height. Unlike
# the two cells above, this needs no ground-truth labels: it is computed from
# the feature matrix X and the cluster assignment alone.
# NOTE(review): the score is computed from X directly rather than from the
# cosine distances used to build the linkage -- confirm that is the intended
# comparison.
cal_har = np.array([metrics.calinski_harabasz_score(X, y_pred) for y_pred in labels])

# Same axis labelling as the other score-vs-distance plots.
plt.plot(dists, cal_har)
plt.xlabel("Distance")
plt.ylabel("Calinski-Harabasz Score")
plt.show()

In [None]:
# Pairwise cosine distances between all rows of X, computed once and reused
# for every cut height below.
# NOTE(review): this is a dense n-by-n matrix; memory grows quadratically
# with the number of rows in X.
pairwise = metrics.pairwise.cosine_distances(X)

# Mean silhouette score of the flat clustering at each cut height, evaluated
# on the precomputed cosine distances.
sil = np.array([metrics.silhouette_score(pairwise, y_pred, metric='precomputed') for y_pred in labels])

# Same axis labelling as the other score-vs-distance plots.
plt.plot(dists, sil)
plt.xlabel("Distance")
plt.ylabel("Silhouette Score")
plt.show()

In [None]:
# Fixed seed for the t-SNE embedding, so that a fresh Restart & Run All
# reproduces the same filter values and therefore the same cover and groups.
TSNE_SEED = 0

# Filter function: a 2-D t-SNE embedding under cosine distance.
# NOTE(review): the keyword arguments are TSNE constructor parameters, so
# ScikitLearnFilter presumably forwards them to TSNE unchanged -- confirm that
# random_state is forwarded the same way.
filt = pythd.filter.ScikitLearnFilter(
    TSNE,
    n_components=2,
    metric='cosine',
    random_state=TSNE_SEED,
)

# Filter values for the whole dataset; used only to size the cover below.
f_x = filt(X)

# NOTE(review): 10 and 0.25 look like the number of intervals per dimension
# and the overlap fraction -- confirm against pythd.cover.IntervalCover.
cov = pythd.cover.IntervalCover.EvenlySpacedFromValues(f_x, 10, 0.25)

In [None]:
# Re-import the pythd package and its thd submodule in place before use.
# NOTE(review): this looks like a development-time convenience for picking up
# local edits to pythd; filt and cov were created before the reload and are
# not rebuilt here -- confirm that mixing them with the reloaded module is
# intended, or drop the reloads for a finished notebook.
reload(pythd)
reload(pythd.thd)

# Build the THD from the feature matrix, the filter and the cover; X is
# passed both as the data and as full_df.
thd = pythd.thd.THD(X, filt, cov, full_df=X)
# Run the decomposition with verbose output; the result is kept in `groups`
# for the next cell.
groups = thd.run(verbose=True)

In [None]:
# Compute distances on the THD result using the cosine metric, the "single"
# cluster method and the "max" combine method. The call is the last
# expression of the cell, so its return value is what the cell displays.
groups.compute_distance(
    metric="cosine",
    cluster_method="single",
    combine_method="max",
)