# Unsupervised Learning Demo

### Dependencies and data

In [25]:
# Dependencies
import pandas as pd
import hvplot.pandas
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

%matplotlib inline

In [2]:
# Iris data: read the CSV into a DataFrame and print its column summary
iris_df = pd.read_csv('data/demo/iris.csv')
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [3]:
# Filter for numeric data: `class` is the only non-numeric (object) column.
# errors='ignore' keeps the cell re-runnable; without it a second run raises
# KeyError because the column has already been removed in place.
iris_df.drop(columns='class', inplace=True, errors='ignore')
iris_df.head(4)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2


In [4]:
# Shopping data: read the CSV into a DataFrame and preview the first 4 rows
shopping_df = pd.read_csv('data/demo/shopping_data.csv')
shopping_df.head(4)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0


### Iris data clustering

In [5]:
# Fit a 3-cluster K-means model on the iris measurements
# (`fit` returns the estimator itself, so it can be chained)
km = KMeans(n_clusters=3, random_state=3).fit(iris_df)

# Cluster label assigned to each row
km.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32)

In [6]:
# Store the cluster labels on the frame so the plots can group by them
iris_df['class'] = km.labels_

# Sepal length against sepal width, grouped by cluster label
iris_df.hvplot.scatter(
    x='sepal_length',
    y='sepal_width',
    by='class',
)

In [7]:
# Petal length against petal width, grouped by cluster label
iris_df.hvplot.scatter(
    x='petal_length',
    y='petal_width',
    by='class',
)

In [8]:
# Sepal length against petal length, grouped by cluster label
iris_df.hvplot.scatter(
    x='sepal_length',
    y='petal_length',
    by='class',
)

In [9]:
# Sepal width against petal width, grouped by cluster label
iris_df.hvplot.scatter(
    x='sepal_width',
    y='petal_width',
    by='class',
)

In [30]:
# Plot all 4 measurements: three on the axes, petal width as the marker size
fig = px.scatter_3d(
    iris_df,
    x='sepal_length', y='sepal_width', z='petal_length',
    size='petal_width',  # was 'sepal_width', which is already on the y-axis
    color='class', symbol='class', width=800,
)
# Anchor the legend at the top-left corner of the figure
fig.update_layout(legend={'x': 0, 'y': 1})
fig.show()

In [43]:
# Elbow plot variables
clusters = range(1, 11) # x-axis: candidate numbers of clusters
inertia = [] # y-axis: `inertia_` of the model fitted for each k

# Measurements only: the `class` column holds earlier cluster labels and must
# not be fed back into the model. Dropped once here instead of on every
# iteration; errors='ignore' keeps the cell runnable when no labels exist yet.
iris_features = iris_df.drop(columns='class', errors='ignore')

# K-means clustering with varying k
for k in clusters:
    # Use a separate name so the fitted 3-cluster model in `km` is not overwritten
    elbow_km = KMeans(n_clusters=k, random_state=11)
    elbow_km.fit(iris_features)
    inertia.append(elbow_km.inertia_)

# Elbow curve
ki = pd.DataFrame({'clusters': clusters, 'inertia': inertia}) # create dataframe for elbow data
ki.hvplot.line(x='clusters', y='inertia', title='Elbow Curve')

### Shopping data preprocessing

In [11]:
# Count the missing values in each column
missing_mask = shopping_df.isnull()
missing_mask.sum()

CustomerID                0
Card Member               2
Age                       2
Annual Income             0
Spending Score (1-100)    1
dtype: int64

In [12]:
# Drop rows with missing values
# (in place, so `shopping_df` keeps referring to the same DataFrame object)
shopping_df.dropna(inplace=True)
shopping_df.isnull().sum().sum() # total missing values left across all columns

0

In [13]:
# Encode `Card Member` as a numeric `member` flag: 1 for 'Yes', 0 otherwise
is_member = shopping_df['Card Member'] == 'Yes'
shopping_df['member'] = is_member.astype(int)

# Remove the ID column and the original text column now that `member` exists
unneeded_columns = ['CustomerID', 'Card Member']
shopping_df.drop(columns=unneeded_columns, inplace=True)

shopping_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 202
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     200 non-null    float64
 1   Annual Income           200 non-null    int64  
 2   Spending Score (1-100)  200 non-null    float64
 3   member                  200 non-null    int64  
dtypes: float64(2), int64(2)
memory usage: 7.8 KB


In [14]:
# Rename columns with an explicit old -> new mapping. Assigning a positional
# list to `.columns` would silently mislabel the data if the column order ever
# changed. `member` already has its final name, and names missing from the
# frame are skipped by default, so the cell can be re-run safely.
shopping_df.rename(
    columns={
        'Age': 'age',
        'Annual Income': 'income',
        'Spending Score (1-100)': 'spending_score',
    },
    inplace=True,
)
shopping_df.head(4)

Unnamed: 0,age,income,spending_score,member
0,19.0,15000,39.0,1
1,21.0,15000,81.0,1
2,20.0,16000,6.0,0
3,23.0,16000,77.0,0


### Shopping data clustering

In [38]:
def kmeans(data, k, x='income', y='spending_score', z='age', random_state=8):
    """
    Fit a K-means clustering model and optionally plot the clusters

    Parameters
    ----------
    data : pandas.core.frame.DataFrame
        Data to model. Every column is passed to the model, and the frame
        itself is not modified.
    k : int
        Number of clusters for K-means
    x : str, optional
        Name of feature to plot on the x-axis if plotting, by default "income"
    y : str, optional
        Name of feature to plot on the y-axis if plotting, by default "spending_score"
    z : str, optional
        Name of feature to plot on the z-axis if plotting, by default "age"
    random_state : int, optional
        Seed passed to KMeans, by default 8

    Returns
    -------
    sklearn.cluster._kmeans.KMeans
        Trained KMeans model

    Notes
    -----
    The 3D scatter plot is drawn only when `x`, `y` and `z` are all truthy;
    pass None (or an empty string) for any of them to skip plotting.
    """

    # K-means model
    km = KMeans(n_clusters=k, random_state=random_state)
    km.fit(data)

    # Plot clusters
    if x and y and z:
        # The copy is only needed for plotting: labels are added to it so the
        # caller's frame never gains a `class` column
        df = data.copy()
        df['class'] = km.labels_
        fig = px.scatter_3d(df, x=x, y=y, z=z, color='class', symbol='class', width=600)
        # Anchor the legend at the top-left corner of the figure
        fig.update_layout(legend={'x': 0, 'y': 1})
        fig.show()

    return km


# Cluster the shopping data into 2 groups (default plot axes)
kmeans(data=shopping_df, k=2)

KMeans(n_clusters=2, random_state=8)

In [40]:
# Cluster the shopping data into 3 groups (default plot axes)
kmeans(data=shopping_df, k=3)

KMeans(n_clusters=3, random_state=8)

In [41]:
# Cluster the shopping data into 4 groups (default plot axes)
kmeans(data=shopping_df, k=4)

KMeans(n_clusters=4, random_state=8)

In [42]:
# Cluster the shopping data into 5 groups (default plot axes)
kmeans(data=shopping_df, k=5)

KMeans(n_clusters=5, random_state=8)

KMeans(n_clusters=6, random_state=8)