# Hierarchical Clustering Notebook

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform
%matplotlib inline

We're going to play with the Scotch whisky dataset. Let's take a look:

https://www.mathstat.strath.ac.uk/outreach/nessie/datasets/whiskies.txt

In [None]:
# Source of the raw table, kept in one named place so it is easy to spot or swap.
WHISKY_DATA_URL = 'https://www.mathstat.strath.ac.uk/outreach/nessie/datasets/whiskies.txt'

# Parse the comma-separated text file straight from the URL into a DataFrame.
df = pd.read_csv(WHISKY_DATA_URL)

In [None]:
# Preview the first rows of the loaded frame (bare last expression -> rich display).
df.head()

### Let's drop the columns we don't want. Which ones are those?

In [None]:
# Columns we do not want to keep. The leading spaces in ' Latitude' and
# ' Longitude' are part of the labels being dropped, so they must stay.
columns_to_drop = ['RowID','Postcode',' Latitude',' Longitude']

# Reassign instead of mutating in place, and ignore labels that are already
# gone, so this cell can be re-run without raising a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Display the column labels currently present on df.
df.columns

### Let's make the index something nice, so we can use it later as a label

In [None]:
# Move 'Distillery' into the index only while it is still a column, so that
# re-running this cell is a no-op instead of a KeyError.
if 'Distillery' in df.columns:
    df = df.set_index('Distillery')

## Let's now talk about the potential business applications of this clustered data

Okay, let's do it

In [None]:
def make_dendrogram(dataframe, linkage_method, metric, color_threshold=None):
    '''
    This function runs hierarchical clustering on the rows of a dataframe and
    plots the resulting dendrogram.

    INPUTS:
        dataframe: Pandas Dataframe; every row is one observation and the
            index supplies the leaf labels.
        linkage_method: string, linkage method handed to scipy's linkage
            (e.g. 'single', 'complete', 'average', 'ward').
        metric: string, distance metric handed to scipy's pdist
            (e.g. 'euclidean', 'cosine').
        color_threshold: float or None, forwarded unchanged to scipy's
            dendrogram; None lets scipy pick its own default threshold.

    OUTPUTS: None (the figure is displayed with plt.show())
    '''
    # linkage() wants the *condensed* distance vector that pdist returns.
    # A square (squareform) matrix would be read as raw observation vectors and
    # re-measured with Euclidean distance, so the requested metric would not
    # be the one that actually drives the clustering.
    condensed_distances = pdist(dataframe.values, metric=metric)
    Z = linkage(condensed_distances, linkage_method)
    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=12.,  # font size for the x axis labels
        labels = dataframe.index,
        color_threshold = color_threshold
    )
    plt.show()

### Which distance metric do we use?

### Which linkage should we use?

In [None]:
# Concrete starting choices so the notebook survives Restart & Run All
# (None for either argument makes scipy raise). Swap in other linkage
# methods / metrics here to compare the resulting trees.
# Ward linkage is only correctly defined for Euclidean distances, so the two
# are chosen together.
linktype = 'ward'
metric = 'euclidean'
make_dendrogram(df, linktype, metric, color_threshold=None)