In [None]:
from pathlib import Path
import sys
sys.path.append("..")
from importlib import reload

import numpy as np
import scipy as sp
import pandas as pd
import igraph

import umap

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import DBSCAN, OPTICS

import pythd

In [None]:
# Configuration
# ---------------------------------------------------------------
# Filesystem layout: the data directory is a sibling of the
# directory the kernel was started in.
CUR_DIR = Path.cwd()
DATA_DIR = CUR_DIR.parent.joinpath('data')

# Input dataset file, resolved inside DATA_DIR.
HELOC_NAME = 'heloc_dataset_v1.csv'
HELOC_PATH = DATA_DIR.joinpath(HELOC_NAME)

# Linkage method names available to later cells.
CLUSTER_METHODS = ['complete', 'average']

# Distance metric shared by the filter and the clustering steps.
METRIC = 'cosine'

In [None]:
# Load the HELOC table and recode the two delinquency columns in one chain.
#
# Both columns hold small integer codes; each code is replaced by a number
# taken from the lookup tables below (presumably "days delinquent", with the
# non-delinquency codes collapsed to 0 -- confirm against the data dictionary).
# Series.map with a dict yields NaN for any code that is not a key, so rows
# carrying an unlisted code in either column are removed by the dropna() that
# follows, together with rows that were already missing a value anywhere.
df = (
    pd.read_csv(HELOC_PATH, dtype={'RiskPerformance': 'category'})
    .assign(
        MaxDelq2PublicRecLast12M=lambda frame: frame['MaxDelq2PublicRecLast12M'].map(
            {0: 0, 1: 120, 2: 90, 3: 60, 4: 30, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
        ),
        MaxDelqEver=lambda frame: frame['MaxDelqEver'].map(
            {1: 0, 2: 0, 3: 120, 4: 90, 5: 60, 6: 30, 7: 0, 8: 0, 9: 0}
        ),
    )
    .dropna()
)

# Feature matrix: every column except the label and ExternalRiskEstimate,
# cast to float32 and then rescaled with RobustScaler.
X = RobustScaler().fit_transform(
    df.drop(columns=['RiskPerformance', 'ExternalRiskEstimate'])
    .values
    .astype(np.float32)
)

# Label vector: integer category codes of RiskPerformance.
y = df['RiskPerformance'].cat.codes.values

In [None]:
# Build the 2-D filter (lens) from a UMAP embedding of X and cover its image.
#
# UMAP's layout optimisation is randomised. Without a fixed seed the embedding
# f_x -- and therefore the cover built from it and every result computed
# downstream -- differs on each Restart & Run All. Pinning random_state makes
# the notebook reproducible. (Per the umap-learn docs, fixing the seed can make
# the fit run single-threaded.)
# NOTE(review): this relies on ScikitLearnFilter forwarding keyword arguments
# to umap.UMAP, as it evidently does for n_components / n_neighbors / metric.
UMAP_SEED = 42

filt = pythd.filter.ScikitLearnFilter(
    umap.UMAP,
    n_components=2,
    n_neighbors=20,
    min_dist=0.6,
    metric=METRIC,
    random_state=UMAP_SEED,
)

# Filter values for every row of X.
f_x = filt(X)

# NOTE(review): 200 and 0.5 are presumably the number of intervals per
# dimension and the fractional overlap between neighbours -- confirm against
# pythd.cover.IntervalCover.EvenlySpacedFromValues.
cov = pythd.cover.IntervalCover.EvenlySpacedFromValues(f_x, 200, 0.5)

In [None]:
# Development aid: re-import pythd so that edits to the library source are
# picked up without restarting the kernel. The parent package is reloaded
# first, then the two submodules; keep this order.
# NOTE(review): `filt` and `cov` were created before this reload, and
# pythd.thd / pythd.filter / pythd.cover are not reloaded here -- confirm that
# is intended.
reload(pythd)
reload(pythd.clustering)
reload(pythd.mapper)

# Clustering applied by the decomposition: scikit-learn OPTICS using the
# shared distance metric.
clustering = pythd.clustering.ScikitLearnClustering(
    OPTICS,
    min_samples=2,
    metric=METRIC,
)

# Assemble the THD from the scaled features, the filter and the cover.
# NOTE(review): full_df receives the scaled array X rather than the DataFrame
# `df` -- verify this is what THD expects.
thd = pythd.thd.THD(
    X,
    filt,
    cov,
    full_df=X,
    clustering=clustering,
    group_threshold=50,
    contract_amount=0.1,
)

# Run the decomposition with progress output enabled.
groups = thd.run(verbose=True)