In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Applications of clustering

## Example 1: Identifying patterns

In [None]:
# Load the traffic counts; the 'Date' column becomes the (parsed) index.
data = pd.read_csv(
    'traffic.csv',
    index_col='Date',
    parse_dates=True,
)
data.head()

In [None]:
# Reshape into a (time of day) x (calendar date) table:
# one row per distinct clock time, one column per date.
data_table = data.pivot_table(
    values='Fremont Bridge Total',
    index=data.index.time,
    columns=data.index.date,
)
data_table

In [None]:
# Fill missing values: along each row (one time of day), carry the value of the
# previous date forward, then back-fill anything still missing in the leading
# columns so that no NaN reaches the clustering step.
# `fillna(method=...)` is deprecated in recent pandas; ffill/bfill are the
# supported equivalents. Rebinding (instead of inplace=True) keeps the cell
# safe to re-run.
data_table = data_table.ffill(axis=1).bfill(axis=1)

In [None]:
# One semi-transparent line per date; the legend is disabled because it would
# contain one entry per date.
ax = data_table.plot(legend=False, alpha=0.2, figsize=(20, 5), title='hourly bicycle traffic')
ax.set_xlabel('time of day')
ax.set_ylabel('Fremont Bridge Total')
plt.show()

In [None]:
# Transpose so that each row is one date and each column one time of day
# (one sample per day for the clustering below).
days = data_table.transpose()
from sklearn.cluster import KMeans

In [None]:
# Elbow plot: fit k-means for k = 1..14 and record the inertia of each fit.
k_list = np.arange(1, 15)
inertias = []
for k in k_list:
    # Fixed seed and explicit n_init: the curve is then reproducible across
    # runs and does not depend on the scikit-learn version's n_init default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(days)
    inertias.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(k_list, inertias, 'o-.')
ax.set_xlabel('k (number of clusters)', fontsize=15)
ax.set_ylabel('inertia', fontsize=15)
plt.show()

In [None]:
# k = 2, 3 or 4 looks like a good choice (around the elbow of the inertia curve plotted above)

In [None]:
k = 4
# Cluster ids are arbitrary: without a fixed seed the numbering (and therefore
# any note that refers to a cluster by its id) can change from run to run.
cluster = KMeans(n_clusters=k, n_init=10, random_state=42)
cluster.fit(days)

centers = cluster.cluster_centers_
labels = cluster.labels_

# One column per cluster center, indexed by time of day like data_table.
cluster_centers = pd.DataFrame(centers.T, index=data_table.index)

# plot cluster centers
fig, ax = plt.subplots(figsize=(20, 7))
for i in range(k):
    cluster_centers[i].plot(ax=ax, label='cluster {}'.format(i))
ax.set_title('k-means cluster centers')
ax.set_xlabel('time of day')
ax.set_ylabel('Fremont Bridge Total')
ax.legend()
plt.show()

In [None]:
# All cluster centers on one line chart (one line per cluster id).
cluster_centers.plot.line(figsize=(20, 5))

In [None]:
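# clusters 0,2: weekdays
# clusters 1,3: weekends and holidays
# (cluster numbering depends on the k-means initialization, so re-check these ids after re-fitting)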

In [None]:
# Day-of-week counts (0 = Monday ... 6 = Sunday) for the dates assigned to cluster 0.
cluster_0_dates = days.index[labels == 0]
pd.to_datetime(cluster_0_dates).dayofweek.value_counts()

In [None]:
# Day-of-week counts (0 = Monday ... 6 = Sunday) for the dates assigned to cluster 1.
cluster_1_dates = days.index[labels == 1]
pd.to_datetime(cluster_1_dates).dayofweek.value_counts()

## Example 2: image segmentation

The goal is to partition an image into multiple segments. In this example, we will cluster pixels based on color intensities (color segmentation).

In [None]:
import matplotlib.pyplot as plt

In [None]:
# load image into a numeric array
image_path = 'Tova_the_cat.png'
image = plt.imread(image_path)

In [None]:
# display image without axis lines, ticks or labels
plt.imshow(image)
plt.axis('off')

In [None]:
# (rows, columns, channels): 220x294 pixels with 3 color channels (red, green and blue)
np.shape(image)

In [None]:
# reshape the array to get a long list of colors: one row per pixel, one
# column per color channel. The channel count is read from the image itself
# rather than hard-coding 220*294 and 3, so other image sizes (or an image
# with an extra alpha channel) work too.
X = image.reshape(-1, image.shape[2])

In [None]:
from sklearn.cluster import KMeans

In [None]:
# cluster colors
k = 5
# Fixed seed and explicit n_init so the segmentation is reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X)
centers = kmeans.cluster_centers_
labels = kmeans.labels_
# replace each color by its cluster center, then restore the original image
# layout (taken from the image rather than hard-coded)
segmented_image = centers[labels].reshape(image.shape)
# display segmented image
plt.imshow(segmented_image)
plt.axis('off')

## Example 3: topic modeling

The goal is to discover the latent "topics" that occur in a collection of documents.

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Newsgroups to include in the corpus.
categories = [
    'alt.atheism',
    'comp.windows.x',
    'rec.autos',
    'rec.sport.baseball',
    'sci.electronics',
    'sci.space',
]

# Fetch both train and test posts for those groups, with headers, footers
# and quoted replies stripped from each post.
docs = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# One row per post: the raw text and the target label of its newsgroup.
data = pd.DataFrame(
    {
        'doc': docs['data'],
        'category': docs['target'],
    }
)
data.head()

In [None]:
# Newsgroup names, in the order used by the labels in the 'category' column.
docs.target_names

In [None]:
# preprocess docs
import nltk 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Resources needed by word_tokenize, pos_tag, the stop-word list and the
# WordNet lemmatizer. Newer NLTK releases look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older resource names, so
# both generations are requested to keep the notebook running on either.
for resource in (
    'wordnet',
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

def process_pos(pos):
    """Translate a POS tag (as produced by ``nltk.pos_tag``) into a WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; every other tag falls back to noun.
    """
    wordnet_pos_by_prefix = {
        'J': wordnet.ADJ,   # adjectives
        'V': wordnet.VERB,  # verbs
        'N': wordnet.NOUN,  # nouns
        'R': wordnet.ADV,   # adverbs
    }
    return wordnet_pos_by_prefix.get(pos[:1], wordnet.NOUN)
    
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import string
punctuation = [punc for punc in string.punctuation]

# Tokens to drop, held in a set so each membership test is a hash lookup
# instead of a scan over the stop-word and punctuation lists for every token.
_excluded_tokens = frozenset(stop_words).union(punctuation)

def process_text(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is tokenized, lower-cased and POS-tagged; stop words and
    single-character punctuation tokens are dropped, and each remaining token
    is lemmatized using the WordNet POS derived from its tag.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    words = [word.lower() for word in word_tokenize(text)]
    lemmatized_words = [
        lemmatizer.lemmatize(word, pos=process_pos(pos))
        # tagging runs on the full token list, before any token is filtered out
        for word, pos in nltk.pos_tag(words)
        if word not in _excluded_tokens  # remove stop words and punctuation
    ]
    return ' '.join(lemmatized_words)

In [None]:
# Run the text normalization on every document and keep the result alongside the raw text.
data['processed_doc'] = data['doc'].map(process_text)

In [None]:
# Inspect the processed text of the document with index label 0.
data.loc[0, 'processed_doc']

In [None]:
# word frequencies
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer whose vocabulary is capped at 10,000 terms.
vect = TfidfVectorizer(max_features=10_000)

In [None]:
# Learn the vocabulary from the processed documents and build the document-term matrix.
X = vect.fit_transform(data['processed_doc'])

In [None]:
# Vocabulary size. get_feature_names() was removed from scikit-learn;
# get_feature_names_out() is its replacement.
len(vect.get_feature_names_out())

In [None]:
k = 15
# Fixed seed and explicit n_init: cluster ids are arbitrary, so without a seed
# the cluster inspected below by its id would change from run to run.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

In [None]:
# clusters = topics?
# Count the true newsgroup labels of the documents assigned to one cluster.
cluster = 1
in_cluster = labels == cluster
data.loc[in_cluster, 'category'].value_counts()

In [None]:
# top 10 words in each cluster
# Dense document-term table: one row per document, one column per vocabulary term.
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement.
words_df = pd.DataFrame(data=X.toarray(), columns=vect.get_feature_names_out())
words_df

In [None]:
# For every cluster, print the 10 terms with the highest mean weight over its documents.
for cluster_id in range(k):
    cluster_rows = words_df[labels == cluster_id]
    mean_weights = cluster_rows.mean()
    top_terms = mean_weights.sort_values(ascending=False).head(10).index
    print('---------------------------')
    print(top_terms)
    print('---------------------------')

## Example 4: Customer segmentation

Customer segmentation is the process of dividing customers into groups based on common characteristics

In [None]:
# Load the customers table keyed by CustomerID and give the four remaining
# columns short names.
column_names = ['gender', 'age', 'income', 'score']
data = pd.read_csv('Mall_Customers.csv', index_col='CustomerID')
data.columns = column_names
data.head()

The dataset contains the basic information (ID, age, gender, income, spending score) about the customers.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.cluster import KMeans

In [None]:
# Preprocessing: one-hot encode gender and rescale the numeric columns to [0, 1]
# so that all features contribute on a comparable scale to the distances.
processor = ColumnTransformer(transformers=[
    ('encoder', OneHotEncoder(), ['gender']),
    ('scaler', MinMaxScaler(), ['age', 'income', 'score']),
])


k = 5
# Fixed seed and explicit n_init: cluster ids are arbitrary, so without a seed
# any description of "cluster 0", "cluster 1", ... would not survive a re-run.
pipe = Pipeline(steps=[
    ('processor', processor),
    ('clusterer', KMeans(n_clusters=k, n_init=10, random_state=42)),
])

In [None]:
# Fit the preprocessing and the clusterer on the customers table.
pipe.fit(X=data)

In [None]:
# Cluster id assigned to each customer by the fitted clusterer step.
labels = pipe.named_steps['clusterer'].labels_

- cluster 0: female, young, medium annual income, high score
- cluster 1: male, young, medium annual income, high score
- cluster 2: male, old, medium annual income, low score
- cluster 3: female, middle age, medium annual income, low score
- cluster 4: male, middle age, high annual income, low score

In [None]:
# Gender breakdown of the customers assigned to the chosen cluster.
cluster = 4
in_cluster = labels == cluster
data.loc[in_cluster, 'gender'].value_counts()

In [None]:
# Mean age of the customers assigned to the chosen cluster.
cluster = 4
in_cluster = labels == cluster
data.loc[in_cluster, 'age'].mean()

In [None]:
# Mean income of the customers assigned to the chosen cluster.
cluster = 4
in_cluster = labels == cluster
data.loc[in_cluster, 'income'].mean()

In [None]:
# Mean spending score of the customers assigned to the chosen cluster.
cluster = 4
in_cluster = labels == cluster
data.loc[in_cluster, 'score'].mean()

In [None]:
# Overall range of the spending score, as (max, min), for comparison with the cluster means.
scores = data['score']
scores.max(), scores.min()