In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Applications of clustering

### Example 1: Identifying patterns 

In [2]:
data = pd.read_csv('traffic.csv',parse_dates=True,index_col='Date')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'traffic.csv'

In [None]:
data_table = pd.pivot_table(data = data, values = 'Fremont Bridge Total', 
                            index = data.index.time, 
                            columns = data.index.date)
data_table

In [None]:
# fill missing values
data_table.fillna(method='ffill',
                  axis=1,
                  inplace=True)

In [None]:
data_table.plot(legend=False,
                alpha=0.2, 
                figsize=(20,5), 
                title='hourly bicycle traffic')
#plt.title('hourly bicycle traffic (2012-2021)',fontsize=20)

In [None]:
days = data_table.T
from sklearn.cluster import KMeans

In [None]:
k_list = np.arange(1,15)
inertias = []
for k in k_list:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(days)
    inertias.append(kmeans.inertia_)
    
plt.figure(figsize=(12,5))
plt.plot(k_list,inertias,'o-.')
plt.xlabel('k (number of clusters)', fontsize=15)
plt.ylabel('inertia', fontsize=15)

In [None]:
# k=2,3,4 seems a good choice

In [None]:
k = 4
cluster = KMeans(n_clusters=k)
cluster.fit(days)

centers = cluster.cluster_centers_
labels = cluster.labels_

cluster_centers = pd.DataFrame(centers.T, index=data_table.index)

# plot cluster centers
fig, ax = plt.subplots(figsize=(20,7))
for i in range(k):
    cluster_centers[i].plot(ax=ax)
plt.legend()

In [None]:
cluster_centers.plot(figsize=(20,5))

In [None]:
# clusters 0,2: weekdays
# clusters 1,3: weekends and holidays

In [None]:
pd.to_datetime(days[labels==0].index).dayofweek.value_counts()

In [None]:
pd.to_datetime(days[labels==1].index).dayofweek.value_counts()

## Example 2: image segmentation

The goal is to partition an image into multiple segments. In this example, we will cluster pixels based on color intensities (color segmentation).

In [3]:
import matplotlib.pyplot as plt

In [4]:
# load image
image = plt.imread('Tova_the_cat.png')

FileNotFoundError: [Errno 2] No such file or directory: 'Tova_the_cat.png'

In [None]:
# display image
plt.imshow(image)
plt.axis(False)

In [None]:
# 220x294 pixels (3 color channels, red, green and blue)
image.shape

In [None]:
# reshape the array to get a long list of RGB colors
X = image.reshape(220*294,3)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# cluster colors
k = 5
kmeans = KMeans(n_clusters=k)
kmeans.fit(X)
centers = kmeans.cluster_centers_
labels = kmeans.labels_
# replace each color by its cluster center
segmented_image = centers[labels].reshape(220,294,3)
# display segmented image
plt.imshow(segmented_image)
plt.axis(False)

## Example 3: topic modeling

The goal is to discover the latent "topics'' that occur in a collection of documents.

In [5]:
from sklearn.datasets import fetch_20newsgroups

In [6]:
categories = [
    'alt.atheism',
    'comp.windows.x',
    'rec.autos',
    'rec.sport.baseball',
    'sci.electronics',
    'sci.space',
]

docs = fetch_20newsgroups(subset='all', 
                                categories=categories,
                                remove=('headers', 'footers', 'quotes')
                          )

In [7]:
data = pd.DataFrame({'doc' : docs['data'], 
                           'category' : docs['target']})
data.head()

Unnamed: 0,doc,category
0,\n McDonnell Douglas rolls out DC-X\n\n ...\...,5
1,\nPortuguese launch complex??? Gosh.... Polish...,5
2,^^^...,4
3,[reply to jimh@carson.u.washington.edu (James ...,0
4,Let's look at the effects of inflation on 1930...,3


In [8]:
docs['target_names']

['alt.atheism',
 'comp.windows.x',
 'rec.autos',
 'rec.sport.baseball',
 'sci.electronics',
 'sci.space']

In [9]:
# preprocess docs
import nltk 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')  
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def process_pos(pos):
    if pos.startswith('J'): # adjectives
        return wordnet.ADJ
    elif pos.startswith('V'): # verbes
        return wordnet.VERB
    elif pos.startswith('N'): # nouns
        return wordnet.NOUN
    elif pos.startswith('R'): # adverbs
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import string
punctuation = [punc for punc in string.punctuation]

def process_text(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word, pos=process_pos(pos)) 
                        for word,pos in nltk.pos_tag(words) 
                        if word not in stop_words # remove stop words
                        and word not in punctuation # remove punctuations
 ] 
    return ' '.join(lemmatized_words)

[nltk_data] Downloading package wordnet to C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to C:\Users\javier.perez-
[nltk_data]     alvaro\AppData\Roaming\nltk_data...


In [10]:
data['processed_doc'] = data.doc.apply(process_text)

In [11]:
data.processed_doc[0]

"mcdonnell douglas roll dc-x ... ssto research remain cloudy sdi organization -- pay 60 million dc-x -- ca n't afford fund full development follow-on vehicle get necessary hundred million require little peculiar way put sdio 's budget year 3-4 billion _could_ fund dc development one year budget course iron fire launcher development primary purpose dc development could easily pay divert money divert comparable sts ops budget ... oh flame start applaud sdio fund dc-x devlopment hope work launcher development nasas primary goal either imho suppose provide enable technology research others launcher development secondarily operate launcher require that's"

In [315]:
# word frequencies
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(stop_words='english', ngram_range = (1,1),max_df=0.8, max_features=15000)

In [316]:
X = vect.fit_transform(data.processed_doc)

In [317]:
k = 15
kmeans = KMeans(n_clusters=k)
kmeans.fit(X)
labels = kmeans.labels_

In [336]:
# clusters = topics?
cluster = 14
data.loc[labels==cluster,'category'].value_counts()

2    382
4     15
0      3
3      1
5      1
Name: category, dtype: int64

In [319]:
# put word frequencies into a dataframe
words_df = pd.DataFrame(data = X.toarray(),columns=vect.get_feature_names_out())
words_df

Unnamed: 0,00,000,0000,00000,00000074,001,0060,007,01,011,...,zombie,zond,zone,zoo,zoom,zooming,zt,zupcic,zw,zx
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.491384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5737,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5738,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5739,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5740,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [320]:
words_df[labels==5].mean().sort_values(ascending=False).head(10)

detector     0.167510
radar        0.120499
morality     0.070596
moral        0.055562
objective    0.047309
animal       0.042237
natural      0.041878
goal         0.039491
specie       0.030238
claim        0.026860
dtype: float64

In [321]:
# top 10 words in each cluster 
for i in range(k):
    print('cluster '+str(i))
    print('---------------------------')
    print(words_df[labels==i].mean().sort_values(ascending=False).head(10).index.to_list())
    print('---------------------------')

cluster 0
---------------------------
['life', 'sky', 'baden', 'bison', 'ca', 'people', 'mb', 'human', 'act', 'sys6626']
---------------------------
cluster 1
---------------------------
['god', 'believe', 'say', 'belief', 'atheist', 'atheism', 'exist', 'faith', 'ico', 'bobbe']
---------------------------
cluster 2
---------------------------
['use', 'know', 'like', 'work', 'make', 'think', 'edu', 'look', 'want', 'need']
---------------------------
cluster 3
---------------------------
['widget', 'motif', 'use', 'software', 'program', 'copy', 'code', 'application', 'user', 'like']
---------------------------
cluster 4
---------------------------
['space', 'nasa', 'launch', 'shuttle', 'cost', 'program', 'station', 'think', 'moon', 'people']
---------------------------
cluster 5
---------------------------
['detector', 'radar', 'morality', 'moral', 'objective', 'animal', 'natural', 'goal', 'specie', 'claim']
---------------------------
cluster 6
---------------------------
['window', 'se

## Example 4: Customer segmentation

Customer segmentation is the process of dividing customers into groups based on common characteristics

In [None]:
data = pd.read_csv('Mall_Customers.csv', index_col='CustomerID')
data.columns = ['gender','age','income','score']
data.head()

The dataset contains the basic information (ID, age, gender, income, spending score) about the customers.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.cluster import KMeans

In [None]:
processor = ColumnTransformer(transformers=[
    ('encoder', OneHotEncoder(),['gender']),
    ('scaler', MinMaxScaler(),['age','income','score'])
])


k = 5
pipe = Pipeline(steps=[
    ('processor',processor),
    ('clusterer',KMeans(n_clusters=k))
])

In [None]:
pipe.fit(data)

In [None]:
labels = pipe['clusterer'].labels_

- cluster 0: female, young, medium annual income, high score
- cluster 1: male, young, medium annual income, high score
- cluster 2: male, old, medium annual income, low score
- cluster 3: female, middle age, medium annual income, low score
- cluster 4: male, middle age, high annual income, low score

In [None]:
cluster = 4
data.loc[labels==cluster,'gender'].value_counts()

In [None]:
cluster = 4
data.loc[labels==cluster,'age'].mean()

In [None]:
cluster = 4
data.loc[labels==cluster,'income'].mean()

In [None]:
cluster = 4
data.loc[labels==cluster,'score'].mean()

In [None]:
data.score.max(), data.score.min()