# Hierarchical Clustering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

## The Data

In [2]:
# Load the MPG clustering dataset; the path is relative to the working directory.
# NOTE(review): the recorded run of this cell raised FileNotFoundError, so confirm
# that the DATA/ folder is reachable from where the notebook is launched.
data_path = 'DATA/cluster_mpg.csv'
df = pd.read_csv(data_path)

FileNotFoundError: [Errno 2] No such file or directory: 'DATA/cluster_mpg.csv'

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

In [None]:
# Summary statistics for the cleaned frame, shown before any scaling is applied.
df.describe()

In [None]:
# Count how many rows fall under each value of the 'origin' column.
origin_column = df['origin']
origin_column.value_counts()

In [None]:
# Remove the 'name' column, then one-hot encode whatever categorical columns remain.
features_only = df.drop(columns='name')
df_w_dummies = pd.get_dummies(features_only)

In [None]:
# Display the encoded frame to inspect the dummy columns created by get_dummies.
df_w_dummies

-----

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with its default output range spelled out: every feature maps to [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Learn each column's min/max, then rescale the same frame with those statistics.
scaled_data = scaler.fit(df_w_dummies).transform(df_w_dummies)

In [None]:
# Display the scaler's raw output (the column labels are re-attached in a later cell).
scaled_data

In [None]:
# Wrap the scaled values in a DataFrame that reuses the encoded frame's column labels.
feature_names = df_w_dummies.columns
scaled_df = pd.DataFrame(data=scaled_data, columns=feature_names)

In [None]:
# Heatmap of the scaled features, one row per observation; the trailing ';' hides the repr.
plt.figure(figsize=(15, 8))
sns.heatmap(data=scaled_df, cmap='magma');

In [None]:
# Cluster only the columns (features); rows keep their original order.
sns.clustermap(data=scaled_df, row_cluster=False, col_cluster=True)

In [None]:
# Cluster only the rows (observations); columns keep their original order.
sns.clustermap(data=scaled_df, row_cluster=True, col_cluster=False)

## Using Scikit-Learn

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Agglomerative model cut at four clusters; 'ward' is the library default, shown explicitly.
model = AgglomerativeClustering(n_clusters=4, linkage='ward')

In [None]:
# Fit on the scaled features and read back the label assigned to each row.
cluster_labels = model.fit(scaled_df).labels_

In [None]:
# Inspect the cluster label assigned to each row by the four-cluster model.
cluster_labels

In [None]:
# Plot unscaled mpg against weight, colouring each point by its assigned cluster label.
plt.figure(figsize=(12, 4), dpi=200)
sns.scatterplot(x='mpg', y='weight', hue=cluster_labels, data=df)

## Exploring Number of Clusters with Dendrograms

Make sure to read the documentation online!
https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html

#### Assuming every point starts as its own cluster

In [None]:
# No fixed cluster count and a zero distance threshold, so the threshold alone
# decides where merging stops. 'ward' is the library default, shown explicitly.
model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0,
    linkage='ward',
)

In [None]:
# Fit the zero-threshold model, then pull the per-row labels off the fitted estimator.
model.fit(scaled_df)
cluster_labels = model.labels_

In [None]:
# Inspect the labels produced by the zero-threshold model.
cluster_labels

In [None]:
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster import hierarchy

## Linkage Model

In [None]:
# Build the SciPy linkage matrix from the scaled observations themselves.
# hierarchy.linkage expects raw observations (or a condensed distance matrix);
# model.children_ is sklearn's array of merge indices, and passing it here would
# cluster those index pairs as if they were data points.
# method='ward' matches AgglomerativeClustering's default linkage, so the
# dendrogram describes the same hierarchy that the sklearn model builds.
linkage_matrix = hierarchy.linkage(scaled_df, method='ward')

In [None]:
# Display the linkage matrix. Per the SciPy docs, each row records one merge:
# the two cluster indices joined, their distance, and the size of the new cluster.
linkage_matrix

In [None]:
# Draw the complete dendrogram, one leaf per observation.
# Warning: rendering every leaf can take a while.
plt.figure(figsize=(20, 10))
dn = hierarchy.dendrogram(Z=linkage_matrix)

In [None]:
# Truncated dendrogram: keep only the last 48 merged clusters as leaves.
plt.figure(figsize=(20, 10))
dn = hierarchy.dendrogram(Z=linkage_matrix, p=48, truncate_mode='lastp')

### Choosing a Threshold Distance

**What is the distance between two points?**

In [None]:
# Summary statistics of the scaled features; the min/max rows show each column's range.
scaled_df.describe()

In [None]:
# Index label of the row with the largest scaled mpg.
scaled_df.loc[:, 'mpg'].idxmax()

In [None]:
# Index label of the row with the smallest scaled mpg.
scaled_df.loc[:, 'mpg'].idxmin()

In [None]:
# https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
# Euclidean distance between the highest-mpg and lowest-mpg rows of the scaled data.
# The rows are looked up from idxmax/idxmin rather than hard-coded positions, so the
# cell stays correct if the dataset or the row filtering changes.
a = scaled_df.loc[scaled_df['mpg'].idxmax()]
b = scaled_df.loc[scaled_df['mpg'].idxmin()]
dist = np.linalg.norm(a - b)

In [None]:
# Display the Euclidean distance computed between the two selected rows.
dist

#### Max possible distance?

Recall Euclidean distance: https://en.wikipedia.org/wiki/Euclidean_distance

In [None]:
# Upper bound on the distance between two rows: with every feature scaled to [0, 1],
# two points differ by at most 1 per column, giving sqrt(number of columns).
np.sqrt(scaled_df.shape[1])

### Creating a Model Based on Distance Threshold

* distance_threshold
    * The linkage distance threshold at or above which clusters will not be merged.

In [None]:
# Let the distance threshold decide the number of clusters: merging stops at linkage
# distance 2. 'ward' is the library default, shown explicitly.
model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=2,
    linkage='ward',
)

In [None]:
# Fit the thresholded model on the scaled array and read back each row's label.
cluster_labels = model.fit(scaled_data).labels_

In [None]:
# Inspect the per-row labels produced by the distance-threshold model.
cluster_labels

In [None]:
# List the distinct labels, which shows how many clusters the threshold produced.
np.unique(cluster_labels)

### Linkage Matrix

Source: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage

    A (n-1) by 4 matrix Z is returned. At the i-th iteration, clusters with indices Z[i, 0] and Z[i, 1] are combined to form cluster n + i. A cluster with an index less than n corresponds to one of the original observations. The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2]. The fourth value Z[i, 3] represents the number of original observations in the newly formed cluster.

In [None]:
# Build the SciPy linkage matrix from the scaled observations the model was fit on.
# hierarchy.linkage expects raw observations (or a condensed distance matrix);
# model.children_ holds sklearn's merge indices, so passing it would cluster those
# index pairs instead of the data.
# method='ward' matches AgglomerativeClustering's default linkage, so the distances
# in the matrix are comparable with the model's distance_threshold.
linkage_matrix = hierarchy.linkage(scaled_data, method='ward')

In [None]:
# Display the linkage matrix; each row is one merge in the format quoted above
# (two cluster indices, merge distance, observations in the new cluster).
linkage_matrix

In [None]:
# Truncated dendrogram: keep only the last 11 merged clusters as leaves.
plt.figure(figsize=(20, 10))
dn = hierarchy.dendrogram(Z=linkage_matrix, p=11, truncate_mode='lastp')