# hdbscan test

In [None]:
import hdbscan
import csv
import pandas as pd
from IPython.display import display
import itertools

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%run 'preprocessor.ipynb' #our own preprocessor functions

### Distance metric to fuse different types of data (Gower)

In [None]:
def gower_distance(df,types):
    """
    Treat each type of data with an appropriate distance measure (Gower-style)
    and average the per-column distances into one pairwise matrix.

    Per-column distance between two rows:
        categorical: 0 if the two values are equal, 1 otherwise
        rating / ordinal / continuous: absolute difference divided by the
            column's range (max - min); a column with zero range contributes 0

    Inputs:
        df: dataframe containing all data (one row per sample)
        types: list containing type of data (length = number of data columns in df)
            categorical, rating (ordinal), continuous
            (labels are matched case-insensitively, in column order)
    Outputs:
        pairwise_distance_matrix: numpy array of shape (rows, rows) holding the
            mean per-column distance between every pair of rows
    Raises:
        ValueError: if len(types) differs from the number of columns, or a
            type label is not one of the recognised names
    """
    n_rows, n_cols = df.shape
    if len(types) != n_cols:
        raise ValueError(
            "types has %d entries but df has %d columns" % (len(types), n_cols)
        )

    pairwise_distance_matrix = np.zeros((n_rows, n_rows), dtype=float)
    if n_rows == 0 or n_cols == 0:
        #nothing to compare; also avoids max()/min() on empty columns below
        return pairwise_distance_matrix

    range_scaled_labels = ('rating', 'ordinal', 'continuous')

    for position, kind in enumerate(types):
        #positional access so duplicate column names still select a single column
        values = df.iloc[:, position].to_numpy()
        label = str(kind).strip().lower()

        if label == 'categorical':
            #simple matching: any mismatch counts as distance 1
            pairwise_distance_matrix += (values[:, None] != values[None, :])
        elif label in range_scaled_labels:
            numeric = values.astype(float)
            spread = numeric.max() - numeric.min()
            #a constant column cannot separate rows; skip it to avoid dividing by zero
            if spread > 0:
                pairwise_distance_matrix += np.abs(numeric[:, None] - numeric[None, :]) / spread
        else:
            raise ValueError("unknown data type label: %r" % (kind,))

    #every column carries equal weight
    pairwise_distance_matrix /= n_cols
    return pairwise_distance_matrix

### Prepare Dataset

In [None]:
#read every row of the raw csv export into memory
#newline='' is what the csv module expects, so quoted fields containing line breaks parse correctly
with open('data_w1w4.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

#obtain_data_matrix is not defined in this notebook; presumably it comes from preprocessor.ipynb
matrix = obtain_data_matrix(data)
samples = len(matrix)

print("Number of samples: " + str(samples))
print("First entry: " + str(matrix[0]))

#wrap the matrix in a dataframe and name its columns (order must match the matrix columns)
df = pd.DataFrame(matrix)
df.columns = ['uid',
              "Test Fit (tolerances)",
              "Test Strength",
              "Test Ergonomics",
              "Wearout",
              "Integration",
              "Ornamental / Design or Ornamental / Gift or Design (Look)",
              "Others",
              "Filament Used",
              "Print Time",
              "Satisfaction",
              "Print Failed"
             ]

### Perform clustering with hdbscan

In [None]:
#create clusterer
#TODO: distance metric appropriate to our mixed data? Alternatively, scale data?
clusterer = hdbscan.HDBSCAN()

#specify columns to ignore during clustering
remove_cols = ['uid']

#columns this cell writes back into df; they must never be used as features,
#otherwise re-running the cell would cluster on the previous run's labels
result_cols = ['clusters', 'probabilities']

#build the feature matrix (errors='ignore' because result_cols are absent on the first run)
features = df.drop(columns=remove_cols).drop(columns=result_cols, errors='ignore')

#apply clusterer to data
clusterer.fit(features)

#add cluster labels and cluster probabilities to dataframe
df['clusters'] = clusterer.labels_
df['probabilities'] = clusterer.probabilities_

### Make all pairwise 2D plots

In [None]:
"""
Plot every pairwise combination of data. 
Points are colored according to cluster and 
faded according to probability of cluster membership.
"""

#remove entries that were not assigned to a cluster (label -1 marks noise)
df_noiseless = df[df['clusters'] != -1]

if df_noiseless.empty:
    #max() below would raise ValueError on an empty selection, so stop here instead
    print("No clustered samples to plot: every entry was labelled as noise.")
else:
    #generate color palette for noiseless dataframe (from hdbscan tutorial)
    color_palette = sns.color_palette('colorblind', max(df_noiseless['clusters'])+1)

    #noise was filtered out above, so every remaining label indexes the palette directly
    cluster_colors = [color_palette[x] for x in df_noiseless['clusters']]

    #fade each point's color according to its cluster membership probability
    cluster_member_colors = [sns.desaturate(x, p) for x, p in
                             zip(cluster_colors, df_noiseless['probabilities'])]

    #make all the plots! (most will be nonsense)
    #permutations yields every ordered pair, so each pair of columns is drawn in both orientations
    for x_name, y_name in itertools.permutations(df.columns, 2):
        plt.scatter(*np.array(df_noiseless[[x_name, y_name]]).T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.show()

### Dump Dataframe

In [None]:
#print full dataframe
#lift pandas' row/column display limits only inside this block so nothing is truncated
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #sorted by cluster label so members of the same cluster appear together
    display(df.sort_values('clusters'))