In [None]:
# installs
# pip install scikit-learn


In [None]:
# imports - usual suspects
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# for distance and h-clustering
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist, squareform


# sklearn has some distance functionality too, but it is mostly a wrapper around scipy
from sklearn.metrics import pairwise_distances 
from sklearn.preprocessing import StandardScaler

In [None]:
# let's start basic
# x = np.array([1,2])
# y = np.array([3,4])
# z = np.array([2,4])
# a = np.stack([x,y,z])
# a_df = pd.DataFrame(a)  # dataframe version
# a                       # nd array

In [None]:
# lets get the euclidean distance


# prints out as pairs 
# 0/1, 0/2, 1/2
# https://stackoverflow.com/a/13079806/155406


In [None]:
# what is it
# technically it's a condensed distance matrix: the upper triangle flattened into a 1-D array


In [None]:
# but what we are mostly used to is the squareform


In [None]:
# NOTE: there are tools in sklearn, but some methods allow us to pass a condensed matrix,
#       which gives us, as analysts, control over the input space


In [None]:
# QUICK EXERCISE:
#                 calculate the cosine distance matrix
#                 Tricky: calculate the Manhattan distance 
#                       HINT: documentation is your friend



In [None]:
## there are other distance calcs, but I really don't see these come up that often in practical applications
## nothing stopping you from looping parameters to assess what works the best

In [None]:
# we can also use sklearn to calc distances


In [None]:
# QUICK NOTE:
#             some implementations may be faster in sklearn, note the docs



In [None]:
# let's start to code up a simple example

In [None]:
# auth into GCP Big Query

# COLAB Only
# from google.colab import auth
# auth.authenticate_user()
# print('Authenticated')

# for non-Colab
# see resources; as long as the credentials token is set up properly via an environment variable, the cell below should work

In [None]:
# get the data
# Pull every row of the mtcars table out of BigQuery into a DataFrame.
# NOTE(review): pd.read_gbq is deprecated in recent pandas releases in favour of
#               pandas_gbq.read_gbq - consider switching once pandas_gbq is imported.
YOUR_BILLING_PROJECT = "questrom"  # handed to read_gbq as the project id
SQL = "SELECT * from `questrom.datasets.mtcars`"

cars = pd.read_gbq(query=SQL, project_id=YOUR_BILLING_PROJECT)

In [None]:
# what do we have?


In [None]:
# the first few rows


In [None]:
# EXERCISE:
#         1) use the model column as the row index 
#         2) with the model column as index, we can now remove it
#         3) explore the data
#   Keep in mind that our goal is to use distance for clustering!
#   Does anything stand out?

In [None]:
# lets drop the model column and use it as the index


In [None]:
# confirm we have what we need


In [None]:
# ok, let's summarise the info


In [None]:
# no missing values is great, finally the summaries


In [None]:
# optional viz, but takes some time
# sns.pairplot(cars)


In [None]:
# keep just the continuous variables
# cars2 = pd.concat((cars.loc[:, "mpg"], cars.loc[:, "disp":"qsec"]), axis=1)

In [None]:
# confirm we have what we need


In [None]:
# eventually we want to run the distance matrix through linkage
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html


In [None]:
# use scipy for distance matrix


# cdist = pdist(cars2.values)

In [None]:
## Why?  We have more control, as we could always build our distance matrix to our needs
##       Above is just the mechanics of getting this done

In [None]:
# visualize the matrix with seaborn

# sns.heatmap(squareform(cdist), cmap="Reds")

In [None]:
# Thought exercise:  Why might this help us think about the # of clusters?

In [None]:
# lets perform our first hclust!

# hc1 = linkage(cdist)

# now visualize the dendrogram

# dendrogram(hc1, labels=cars.index)
# plt.show()

In [None]:
# the labels for the cluster - cleaner

# dendrogram(hc1,
#            labels = cars.index,
#            leaf_rotation=90,
#            leaf_font_size=10)
# plt.show()


In [None]:
# and the orientation/size

# plt.figure(figsize=(5,6))
# dendrogram(hc1,
#            labels = cars.index,
#            orientation = "left")
# plt.show()

In [None]:
# once we have seen the plots, we can start to think about cutting this up
# to define clusters - we use fcluster
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html

In [None]:
# we can slice up our clusters a few ways
# first, how many clusters (max)

# fcluster(hc1, 2, criterion="maxclust")

In [None]:
# we can also define by the distance



In [None]:
# want to visualize how you defined the cluster?

# DIST = 80
# plt.figure(figsize=(5,6))
# dendrogram(hc1, 
#            labels = cars.index,
#            orientation = "left", 
#            color_threshold = DIST)
# plt.axvline(x=DIST, c='grey', lw=1, linestyle='dashed')
# plt.show()

In [None]:
# YOUR TURN:
#           Use cosine distance
#           generate the linkage array
#           plot the dendrogram
#           assign the cluster labels back onto the ORIGINAL dataframe





In [None]:
# now that we have labels assigned, we can profile



In [None]:
# data dictionary for profiling above
# https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/mtcars.html

In [None]:
# DISCUSSION:
#            This is a simple dataset, but why is profiling important for us as analysts?
#            Applications of this approach?

In [None]:
# ok, 3 more things to consider

# scaling the data to give all features equal importance
# viz different approaches instead of 1x1
# more "advanced" ways to think about distance to help us inform cluster selection


In [None]:
# let's go back to our cars2 dataset (the one with just the "continuous" variables)


In [None]:
# scaling variables allows each to have equal importance
# since they are now on the same unit scale

# sc = StandardScaler()
# cars_scaled = sc.fit_transform(cars2)

In [None]:
# what do we have


In [None]:
# make it a dataframe

# cars_scaled = pd.DataFrame(cars_scaled, columns=cars2.columns, index=cars2.index)
# cars_scaled.head(3)

In [None]:
# confirm scaled



In [None]:
# METHODS = ['single', 'complete', 'average', 'ward']
# plt.figure(figsize=(15,5))


# # loop and build our plot
# for i, m in enumerate(METHODS):
#   plt.subplot(1, 4, i+1)
#   plt.title(m)
#   dendrogram(linkage(cars_scaled.values, method=m),
#              labels = cars_scaled.index,
#              leaf_rotation=90,
#              leaf_font_size=10)
  
# plt.show()


In [None]:
# I am going to choose ward, choose whatever you like below

# wlink = linkage(cars_scaled.values, method="ward")
# dendrogram(wlink,
#           labels = cars_scaled.index,
#           leaf_rotation=90,
#           leaf_font_size=10)

# plt.show()

In [None]:
# lets look at the distance added at each step
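# see the linkage docs (4th paragraph) for the output layout: each row is one merge,
# [cluster index a, cluster index b, distance between them, # of observations in the new cluster]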


In [None]:
# length of the entry


In [None]:
# look at the actual data


In [None]:
# lets look at the growth in distance added

# added_dist = wlink[:, 2]
# added_dist

In [None]:
# calculate the diff at each join

# penalty = np.diff(added_dist)
# penalty[-5:]

In [None]:
# elbow method - what clustering step starts to show signs of explosion in distance
# remember, we lost one via the diff

# sns.lineplot(x=range(1, len(penalty)+1), y=penalty)

In [None]:
# we can re-inspect


In [None]:
# set the clusters based on max dist

# labs2 = fcluster(wlink, 5.5, "distance")

In [None]:
# plot it

# dendrogram(wlink,
#           labels = cars_scaled.index,
#           leaf_rotation=90,
#           leaf_font_size=10)
# plt.axhline(y=5.5)
# plt.show()

In [None]:
# ensure intuition aligns with clusters
