# Modeling Exercises

## Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

from pydataset import data

from mall_wrangle import wrangle_mall_df

## Get Data

In [None]:
# Load the iris dataset through pydataset's `data` helper and preview it.
df = data('iris')
df.head()

## Prepare data

### Let's fix the column names by making everything lower case and replacing the dot separator with an underscore

In [None]:
# Normalise every column label: lower-case it and turn '.' separators into '_'.
df.columns = [label.lower().replace('.', '_') for label in df.columns]

df.head()

In [None]:
# Feature matrix for clustering: just the two petal measurements.
X = df[['petal_length', 'petal_width']]
X.head()

## Modeling

### Create and Fit cluster model

In [None]:
# Fit a 3-cluster K-Means model on the selected features.
# random_state pins the centroid initialisation so that the cluster numbering
# (and every plot built from it) is the same each time the notebook is run.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)

# Cluster label assigned to each row of X, in row order.
clusters = kmeans.predict(X)
clusters

### Add cluster to our original data

In [None]:
# Store the K-Means label of each row alongside the original measurements.
df['cluster'] = clusters
df.head()

### Visualize Clusters

In [None]:
# Per-species mean of every other column. This includes the 'cluster' label
# added above, whose mean is only a rough hint of which cluster dominates.
df.groupby('species').mean()

In [None]:
# Two panels on shared axes: the true species on the left, the K-Means
# labels on the right, both drawn over petal length vs. petal width.
fig, axs = plt.subplots(1, 2, figsize=(16, 9), sharex=True, sharey=True)
species_ax, cluster_ax = axs

for species, subset in df.groupby('species'):
    species_ax.scatter(subset.petal_length, subset.petal_width, label=species)
species_ax.legend()
species_ax.set(title='Actual Species')

# groupby yields the cluster ids in ascending order, one sub-frame per id.
for cluster_n, members in df.groupby('cluster'):
    cluster_ax.scatter(members.petal_length, members.petal_width, label=f'cluster_{cluster_n}')
cluster_ax.legend()
cluster_ax.set(title="K-Means Clusters")

### Use elbow method to determine cluster number

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# deprecated the old names (later releases reject them), so use whichever
# spelling this installation provides, falling back to the default style.
grid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # Inertia of one K-Means fit per candidate k. random_state keeps the
    # curve identical from run to run.
    inertia_by_k = {
        k: KMeans(n_clusters=k, random_state=42).fit(X).inertia_
        for k in range(2, 12)
    }
    pd.Series(inertia_by_k).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

#### This graph seems to show a drop off in inertia around **k=4** so let's look at around there to see what seems best

### Visualize clusters around k=4

In [None]:
# One panel per candidate k (2 through 5), all on shared axes.
fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)

for ax, k in zip(axs.ravel(), range(2, 6)):
    # Use a loop-local name so the notebook-level `clusters` from the main fit
    # is not silently overwritten. random_state makes the panels reproducible.
    k_labels = KMeans(n_clusters=k, random_state=42).fit(X).predict(X)
    ax.scatter(X.petal_length, X.petal_width, c=k_labels)
    ax.set(title=f'k = {k}', xlabel='petal length', ylabel='petal width')

### Clustering on 3 features

In [None]:
# Rebuild the feature matrix with a third measurement (sepal length) added.
X = df[['petal_length', 'petal_width', 'sepal_length']]
X.head()

In [None]:
# Fit a 3-cluster K-Means model on the three-feature matrix.
# random_state pins the centroid initialisation so that the cluster numbering
# (and the plots built from it) is the same each time the notebook is run.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)

# Cluster label assigned to each row of X, in row order.
clusters = kmeans.predict(X)
clusters

In [None]:
# Keep the three-feature labels in their own column so the earlier
# two-feature 'cluster' column is left untouched.
df['three_feature_cluster'] = clusters
df.head()

In [None]:
fig, axs = plt.subplots(3, 2, figsize=(22, 11))

# One row of panels per (x, y) feature pair: true species on the left,
# three-feature K-Means labels on the right.
feature_pairs = [
    ('petal_length', 'petal_width'),
    ('petal_width', 'sepal_length'),
    ('petal_length', 'sepal_length'),
]

for (species_ax, cluster_ax), (x_col, y_col) in zip(axs, feature_pairs):
    for species, subset in df.groupby('species'):
        species_ax.scatter(subset[x_col], subset[y_col], label=species)
    species_ax.legend()
    species_ax.set(title=f'Actual Species with {x_col} and {y_col}')

    # groupby yields the cluster ids in ascending order.
    for cluster_n, members in df.groupby('three_feature_cluster'):
        cluster_ax.scatter(members[x_col], members[y_col], label=f'cluster_{cluster_n}')
    cluster_ax.legend()
    cluster_ax.set(title=f"K-Means Clusters with {x_col} and {y_col}")

plt.show()

## Clustering the Mall Dataset

### Bring in our modeling data

In [None]:
# wrangle_mall_df comes from the local mall_wrangle module; it is unpacked
# here into a scaler plus three frames (presumably the train / validate /
# test splits -- confirm against mall_wrangle).
scaler, train, validate, test = wrangle_mall_df()
train.head()

### Let's take a quick look at our data

In [None]:
# Pairwise scatter plots of every column except 'is_male'.
sns.pairplot(train.drop(columns='is_male'))

### I see a really nice X shape in the combination of spending_score and annual_income, so let's start with clustering on those features

In [None]:
# Feature matrix for clustering the mall customers.
X = train[['spending_score', 'annual_income']]
X.head()

### Create our clusters

In [None]:
# Fit a 5-cluster K-Means model on spending score and annual income.
# random_state pins the centroid initialisation so that the cluster numbering
# (and every plot built from it) is the same each time the notebook is run.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)

# Cluster label assigned to each row of X, in row order.
clusters = kmeans.predict(X)
clusters

### Add our clusters onto the training set

In [None]:
# Store the K-Means label of each training row alongside its features.
train['cluster'] = clusters
train.head()

### Visualize our clusters

In [None]:
plt.figure(figsize=(5, 5))

# One scatter call per cluster so each gets its own colour and legend entry;
# groupby yields the cluster ids in ascending order.
for cluster_n, members in train.groupby('cluster'):
    plt.scatter(members.spending_score, members.annual_income, label=f'cluster_{cluster_n}')

plt.legend()
plt.title("K-Means Clusters")
plt.show()

### Use the elbow method to determine if we have the appropriate number of clusters

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# deprecated the old names (later releases reject them), so use whichever
# spelling this installation provides, falling back to the default style.
grid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # Inertia of one K-Means fit per candidate k. random_state keeps the
    # curve identical from run to run.
    inertia_by_k = {
        k: KMeans(n_clusters=k, random_state=42).fit(X).inertia_
        for k in range(2, 12)
    }
    pd.Series(inertia_by_k).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

#### This seems to show the elbow around five, so let's check around there

In [None]:
# One panel per candidate k (3 through 6), all on shared axes.
fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)

for ax, k in zip(axs.ravel(), range(3, 7)):
    # Use a loop-local name so the notebook-level `clusters` from the main fit
    # is not silently overwritten. random_state makes the panels reproducible.
    k_labels = KMeans(n_clusters=k, random_state=42).fit(X).predict(X)
    ax.scatter(X.spending_score, X.annual_income, c=k_labels)
    ax.set(title=f'k = {k}', xlabel='spending_score', ylabel='annual_income')

#### It looks like 5 clusters is good for capturing distinct groups in our data

## Clustering the Scaled Mall Dataset (Does scaling make a difference?)

In [None]:
# Reminder of what train holds now, including the added 'cluster' column.
train.head()

In [None]:
# Independent copy of train without the K-Means label column, so that the
# scaler fitted below only sees the columns train had before clustering.
train2 = train.drop(columns='cluster').copy()

In [None]:
# Confirm the 'cluster' column is gone.
train2.head()

In [None]:
# Fit the scaler on the training frame only, then push all three splits
# through that same fitted transformation. Note this rebinds `scaler`,
# replacing the one returned by wrangle_mall_df.
scaler = MinMaxScaler().fit(train2)

train_scaled, validate_scaled, test_scaled = (
    scaler.transform(split) for split in (train2, validate, test)
)

train_scaled

In [None]:
def _as_frame(values, like):
    """Wrap scaled `values` in a DataFrame with the columns and row index of `like`."""
    return pd.DataFrame(values, columns=like.columns.values).set_index([like.index.values])


# Give each scaled array back the labels of the split it was computed from.
train_scaled = _as_frame(train_scaled, train2)
validate_scaled = _as_frame(validate_scaled, validate)
test_scaled = _as_frame(test_scaled, test)

In [None]:
# Check the scaled frame: same columns and index as train2.
train_scaled.head()

In [None]:
# Same pairplot as for the unscaled data, now on the scaled columns.
sns.pairplot(train_scaled.drop(columns='is_male'))

In [None]:
# Same two clustering features as before, taken from the scaled frame.
X = train_scaled[['spending_score', 'annual_income']]
X.head()

In [None]:
# Fit a 5-cluster K-Means model on the scaled features.
# random_state pins the centroid initialisation so that the cluster numbering
# (and every plot built from it) is the same each time the notebook is run.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)

# Cluster label assigned to each row of X, in row order.
clusters = kmeans.predict(X)
clusters

In [None]:
# Store the labels from the scaled-data fit on the scaled frame.
train_scaled['cluster'] = clusters
train_scaled.head()

In [None]:
plt.figure(figsize=(5, 5))

# One scatter call per cluster so each gets its own colour and legend entry;
# groupby yields the cluster ids in ascending order.
for cluster_n, members in train_scaled.groupby('cluster'):
    plt.scatter(members.spending_score, members.annual_income, label=f'cluster_{cluster_n}')

plt.legend()
plt.title("K-Means Clusters")
plt.show()

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# deprecated the old names (later releases reject them), so use whichever
# spelling this installation provides, falling back to the default style.
grid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # Inertia of one K-Means fit per candidate k. random_state keeps the
    # curve identical from run to run.
    inertia_by_k = {
        k: KMeans(n_clusters=k, random_state=42).fit(X).inertia_
        for k in range(2, 12)
    }
    pd.Series(inertia_by_k).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# One panel per candidate k (3 through 6), all on shared axes.
fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)

for ax, k in zip(axs.ravel(), range(3, 7)):
    # Use a loop-local name so the notebook-level `clusters` from the main fit
    # is not silently overwritten. random_state makes the panels reproducible.
    k_labels = KMeans(n_clusters=k, random_state=42).fit(X).predict(X)
    ax.scatter(X.spending_score, X.annual_income, c=k_labels)
    ax.set(title=f'k = {k}', xlabel='spending_score', ylabel='annual_income')

## Did scaling change our clusters?

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(14, 7))

# Both panels use the same scatter recipe; only the frame and title differ.
panels = [
    (axs[0], train, "K-Means Unscaled Clusters"),
    (axs[1], train_scaled, "K-Means Scaled Clusters"),
]

for panel_ax, frame, panel_title in panels:
    # groupby yields the cluster ids in ascending order.
    for cluster_n, members in frame.groupby('cluster'):
        panel_ax.scatter(members.spending_score, members.annual_income, label=f'cluster_{cluster_n}')
    panel_ax.legend()
    panel_ax.set(title=panel_title)

plt.show()

## DBSCAN

Let's try a similar process with DBSCAN.

In [None]:
# First attempt: neighbourhood radius eps=5 with at least 4 samples per
# dense region. eps is in the units of the (unscaled) features it is fit on.
dbscan = DBSCAN(algorithm='auto', min_samples=4, eps=5)

In [None]:
# Fit on the unscaled training features (train, not train_scaled).
dbscan.fit(train[['spending_score', 'annual_income']])

In [None]:
# labels_ holds one cluster id per fitted row.
train['dbcluster'] = dbscan.labels_

DBSCAN assigns a value of `-1` to the cluster label of an observation that is considered "noise".

In [None]:
# Preview train with the new 'dbcluster' column.
train.head()

In [None]:
plt.figure(figsize=(10, 5))

# One scatter call per DBSCAN label, in ascending order, so the noise label
# -1 (if present) is drawn first.
for cluster_n, members in train.groupby('dbcluster'):
    plt.scatter(members.spending_score, members.annual_income, label=f'cluster_{cluster_n}')

plt.legend()
plt.title("DBSCAN Clusters, eps=5, min_samples=4")
plt.show()

Okay, maybe we need to tune our DBSCAN a bit. There are two primary hyperparameters:

$\epsilon$ (epsilon): The distance scanned to identify if an adjacent observation is part of the same cluster.

min_samples: The minimum number of datapoints that need to be closely grouped (determined by epsilon) before they are given their unique cluster identity/label.

### min-samples

Finding the optimal min-samples is a bit of trial and error, but there are some rules of thumb:

- For two-dimensional data (only clustering on two features), use min_samples = 4, the value suggested in the original DBSCAN paper (note that scikit-learn's own default is `min_samples=5`)
- For data with more than two dimensions, choose 2 * the number of dimensions
- If the data is noisy, choose a larger value for min_samples

For our purposes, we will stick with min_samples = 4, since we are clustering on two features.

### $\epsilon$ (epsilon)

Finding the optimal value for epsilon can be handled according to [this paper](https://iopscience.iop.org/article/10.1088/1755-1315/31/1/012012/pdf).

To find the optimal value for $\epsilon$, we are going to create an elbow plot. We start by training a KNN model, and set the value of `k` equal to 2:

In [None]:
# n_neighbors=2 because the query below uses the same points the model was
# fit on: each point's closest match is itself, so the second match is its
# true nearest neighbor.
neighbors = NearestNeighbors(n_neighbors=2)
neighbors_fit = neighbors.fit(train[['spending_score', 'annual_income']])

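`kneighbors` returns, for each point in the data set, the distances to (and row positions of) its 2 nearest neighbors. Because we query the same data we fit on, the closest "neighbor" of every point is the point itself at distance 0, so the second column holds the distance to its actual nearest neighbor: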

In [None]:
# Query the fitted model with the training points themselves. Both results
# have one row per point and one column per requested neighbor (2).
distances, indices = neighbors_fit.kneighbors(train[['spending_score', 'annual_income']])

In the array that is created, each row is an observation. 

|0|1|
|---|---|
|Distance to self|Distance to 1st closest neighbor|

In [None]:
# First ten rows of the distance array.
distances[:10] 

In [None]:
# Expect (number of training rows, 2).
distances.shape

We are now going to sort each column vertically. 

In [None]:
# axis=0 sorts each column independently, in ascending order. After this a
# row no longer corresponds to a single observation.
distances = np.sort(distances, axis=0)

In [None]:
# Inspect the column-wise sorted distances.
distances

In [None]:
# Sorting does not change the shape.
distances.shape

Now let's keep only the second column (the sorted distance from each point to its nearest neighbor) as a 1-D array:

In [None]:
# Column 0 is each point's distance to itself; column 1 is the distance to
# its nearest other point, which is the one the elbow plot needs.
distances = distances[:,1]

In [None]:
# Sorted nearest-neighbor distances, now one-dimensional.
distances

In [None]:
# Now a single axis: one value per training row.
distances.shape

In [None]:
# Sorted nearest-neighbor distances against their rank. The y value at the
# bend of this curve is the candidate eps.
plt.plot(distances)
plt.xlabel('Data Point')
plt.ylabel('Epsilon')
plt.grid(True)

The ideal value for $\epsilon$ is the elbow of the curve. For the example above, it looks like 6 is a decent value.

In [None]:
# Second attempt with the eps read off the elbow plot (6 instead of 5),
# fit on the same unscaled features as the first DBSCAN run.
dbscan2 = DBSCAN(algorithm='auto', min_samples=4, eps=6).fit(
    train[['spending_score', 'annual_income']]
)

# Keep these labels in their own column so the eps=5 result stays available.
train['dbcluster2'] = dbscan2.labels_
train.head()

In [None]:
plt.figure(figsize=(10, 5))

# One scatter call per DBSCAN label, in ascending order, so the noise label
# -1 (if present) is drawn first.
for cluster_n, members in train.groupby('dbcluster2'):
    plt.scatter(members.spending_score, members.annual_income, label=f'cluster_{cluster_n}')

plt.legend()
plt.title("DBSCAN Clusters, eps=6, min_samples=4")
plt.show()

Did changing epsilon from 5 to 6 improve our results? Debatable. 

Looks like KMeans is the winner here...at least with the unscaled data.

## Exercises

Do your work for this exercise in a notebook named `modeling` within your `clustering-exercises` repository. For these exercises, YOU ONLY NEED TO DO KMEANS. If you have extra time, then you can play around with DBSCAN.

1. Clustering with the Iris Dataset. Using this lesson as a guide, perform clustering on the iris dataset.
    a. Choose features other than the ones used in the lesson.
    b. Visualize the results of your clustering.
    c. Use the elbow method to visually select a good value for k.
    d. Repeat the clustering, this time with 3 different features.
2. Use the techniques discussed in this lesson, as well as the insights gained from the exploration exercise to perform clustering on the mall customers dataset. Be sure to visualize your results!
3. How does scaling impact the results of clustering? Compare k-means clustering results on scaled and unscaled data (you can choose any dataset for this exercise OR use the data/steps outlined in the bonus below). You can show how the resulting clusters differ either with descriptive statistics or visually.

**Bonus**
1. Read in the data from this google sheet: https://docs.google.com/spreadsheets/d/1j5EgXVTR5ikUj3G5ZCQmkq6ziz_gvtASGAdw23-5_6M/edit?usp=sharing
2. Visualize the data and guess the number and shape of the clusters.
3. Implement the KMeans algorithm on unscaled data and visualize the clusters.
4. Repeat the step above but with scaled data.
5. Write down the takeaways from this exercise.