# Product Segmentation

In [1]:
# Optional setup: uncomment to install yellowbrick, which a later cell imports
# (`from yellowbrick.cluster import SilhouetteVisualizer`).
#!pip install yellowbrick

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [3]:
import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation notices raised by the libraries used in this notebook.
warnings.filterwarnings('ignore')

In [9]:
# Location of the beer dataset: a raw CSV hosted on GitHub, so loading it
# requires network access.
data_file = "https://raw.githubusercontent.com/surajdwivedi0307/UnsupervisedLearning/main/Clustering/beer.csv"

In [10]:
# Read the CSV at `data_file` into a DataFrame.
# NOTE(review): the file's first column has no header, so it shows up as a
# regular column named 'Unnamed: 0' in the output below; if it is only a row
# index, `index_col=0` would drop it -- confirm before changing.
beer_df = pd.read_csv( data_file )

In [11]:
# Show the loaded frame as this cell's output.
beer_df

Unnamed: 0,name,calories,sodium,alcohol,cost
0,Budweiser,144,15,4.7,0.43
1,Schlitz,151,19,4.9,0.43
2,Kronenbourg,170,7,5.2,0.73
3,Heineken,152,11,5.0,0.77
4,Old_Milwaukee,145,23,4.6,0.28
5,Augsberger,175,24,5.5,0.4
6,Srohs_Bohemian_Style,149,27,4.7,0.42
7,Miller_Lite,99,10,4.3,0.43
8,Budweiser_Light,113,8,3.7,0.4
9,Coors,140,18,4.6,0.44


In [None]:
# Kernel density estimate of the raw `calories` column; the trailing ';'
# suppresses the textual repr of the returned object.
sn.kdeplot(beer_df.calories);

In [None]:
# Kernel density estimate of the raw `sodium` column.
sn.kdeplot(beer_df.sodium);

In [None]:
# Alcohol content (x) against calories (y), one point per row of beer_df.
sn.scatterplot(x='alcohol', y='calories', data=beer_df);

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the four numeric features. `fit_transform` returns a NumPy
# array here (despite the `_df` suffix); the next cell wraps it in a DataFrame.
scaler = StandardScaler()
scaled_beer_df = scaler.fit_transform(
    beer_df[['calories', 'sodium', 'alcohol', 'cost']]
)

In [None]:
# Re-attach the feature names so the scaled values can be addressed by column.
scaled_beer_df = pd.DataFrame(
    scaled_beer_df,
    columns=['calories', 'sodium', 'alcohol', 'cost'],
)

In [None]:
# Densities of two standardised columns drawn from the same cell, so they can
# be compared on a common scale.
sn.kdeplot(scaled_beer_df.sodium);
sn.kdeplot(scaled_beer_df.calories);

## How many clusters exist?

#### Using Dendrogram

In [None]:
# Clustered heatmap of the scaled features (seaborn adds the dendrograms).
cmap = sn.cubehelix_palette(rot=-.3, light=1, as_cmap=True)
sn.clustermap(scaled_beer_df, figsize=(6, 6), linewidths=.2, cmap=cmap);

In [None]:
# Show the rows at positions 9 and 15.
# NOTE(review): presumably a pair that sits close together in the dendrogram
# above -- confirm against the rendered plot.
beer_df.iloc[[9,15]]

In [None]:
# Show the rows at positions 16 and 14.
# NOTE(review): presumably another neighbouring pair from the dendrogram --
# confirm against the rendered plot.
beer_df.iloc[[16,14]]

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit K-Means for k = 1..9 on the scaled features and record the
# inertia (within-cluster sum of squared distances) of every fit.
cluster_range = range( 1, 10 )
cluster_errors = []

for num_clusters in cluster_range:
  # Fixed seed (same value as the other seeded K-Means fits in this notebook)
  # so the curve is identical on every Restart & Run All.
  clusters = KMeans( n_clusters = num_clusters, random_state = 42 )
  clusters.fit( scaled_beer_df )
  cluster_errors.append( clusters.inertia_ )

plt.figure(figsize=(8,4))
plt.plot( cluster_range, cluster_errors, marker = "o" )
plt.xlabel( "Number of clusters (k)" )
plt.ylabel( "Inertia" )
plt.title( "Elbow curve for K-Means" );

## Evaluating cluster quality using the silhouette score

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# One silhouette plot per candidate k, laid out on a 2x2 grid in reading order
# (k=2, k=3 on the top row; k=4, k=5 on the bottom row).
fig, ax = plt.subplots(2, 2, figsize=(15,8))
num_clusters = [2, 3, 4, 5]
for i, k in enumerate(num_clusters):
    km = KMeans(n_clusters=k,
                random_state=42)
    # divmod(i, 2) maps i = 0, 1, 2, 3 to (row, col) = (0,0), (0,1), (1,0), (1,1).
    # Index the grid with the row directly: `q - 1` only worked through
    # negative-index wraparound and swapped the two rows.
    q, mod = divmod(i, 2)
    visualizer = SilhouetteVisualizer(km,
                                      colors='yellowbrick',
                                      ax=ax[q][mod])
    visualizer.fit(scaled_beer_df)
    # Add the title, axis labels and legend so each panel identifies its k.
    visualizer.finalize()

#### Rescaling the dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale the same four features with MinMaxScaler. This rebinds both `scaler`
# and `scaled_beer_df`; the latter is now a plain NumPy array, replacing the
# standardised DataFrame built earlier.
scaler = MinMaxScaler()
scaled_beer_df = scaler.fit_transform(
    beer_df[['calories', 'sodium', 'alcohol', 'cost']]
)

#### Creating 4 Clusters

We will set k to 4 for running the *KMeans* algorithm and create a new column *clusterid* in *beer_df* to capture the cluster number each beer is assigned to.

In [None]:
# Final K-Means model: k clusters with a fixed seed; the resulting label of
# every row is stored on beer_df as `clusterid`.
k = 4

clusters = KMeans(n_clusters=k, random_state=42).fit(scaled_beer_df)
beer_df["clusterid"] = clusters.labels_

In [None]:
# Cluster index assigned to each row by the fitted K-Means model.
clusters.labels_

## Understanding the clusters

In [None]:
# Distribution of calories within each K-Means cluster.
sn.violinplot(x='clusterid', y='calories', data=beer_df);

In [None]:
# Distribution of cost within each K-Means cluster.
sn.violinplot(x='clusterid', y='cost', data=beer_df);

In [None]:
# Cost (x) against calories (y), coloured by K-Means cluster.
sn.scatterplot(x='cost', y='calories', hue='clusterid', data=beer_df);

#### Cluster 0

In [None]:
# Rows assigned to K-Means cluster 0.
beer_df.loc[beer_df["clusterid"] == 0]

#### Cluster 1

In [None]:
# Rows assigned to K-Means cluster 1.
beer_df.loc[beer_df["clusterid"] == 1]

#### Cluster 2

In [None]:
# Rows assigned to K-Means cluster 2.
beer_df.loc[beer_df["clusterid"] == 2]

#### Cluster 3

In [None]:
# Rows assigned to K-Means cluster 3.
beer_df.loc[beer_df["clusterid"] == 3]

# Evaluating Cluster Performance

## Calinski-Harabasz Index

In [None]:
from sklearn.metrics import calinski_harabasz_score

In [None]:
# Calinski-Harabasz score for k = 3, 4, 5 on the scaled features
# (a higher score indicates better-defined clusters).
cluster_range = range( 3, 6 )
ch_scores = []

for num_clusters in cluster_range:
  # Fixed seed so the reported scores are reproducible across runs.
  clusters = KMeans( n_clusters = num_clusters, random_state = 42 )
  clusters.fit( scaled_beer_df )
  score = calinski_harabasz_score( scaled_beer_df, clusters.labels_ )
  # Keep the scores (in cluster_range order) instead of leaving the list empty.
  ch_scores.append( score )
  print(f"n_cluster: {num_clusters} - CH Score: {score}" )

## Davies-Bouldin Index

In [None]:
from sklearn.metrics import davies_bouldin_score

In [None]:
# Davies-Bouldin score for k = 3, 4, 5 on the scaled features
# (a lower score indicates better-separated clusters).
cluster_range = range( 3, 6 )
db_scores = []

for num_clusters in cluster_range:
  # Fixed seed so the reported scores are reproducible across runs.
  clusters = KMeans( n_clusters = num_clusters, random_state = 42 )
  clusters.fit( scaled_beer_df )
  score = davies_bouldin_score( scaled_beer_df, clusters.labels_ )
  db_scores.append( score )
  # Label the value as Davies-Bouldin; it was previously printed as "CH Score".
  print(f"n_cluster: {num_clusters} - DB Score: {score}" )

# Hierarchical Clustering

Cluster method:

https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

In [None]:
# Same clustered heatmap as before, drawn from the current `scaled_beer_df`
# (the MinMax-scaled array when the notebook is run top to bottom).
cmap = sn.cubehelix_palette(rot=-.3, light=1, as_cmap=True)
sn.clustermap(scaled_beer_df, figsize=(6, 6), linewidths=.2, cmap=cmap);

In [None]:
from sklearn.cluster import AgglomerativeClustering

Try different metrics and linkages:

- metric: "euclidean", "manhattan", "cosine", "minkowski"
- linkage: single, complete, ward, average (note: `ward` only works with the `euclidean` metric)

In [None]:
# Agglomerative model: 4 clusters, Ward linkage on Euclidean distances.
# compute_distances=True makes the fitted model expose `distances_`.
h_cluster = AgglomerativeClustering(
    n_clusters=4,
    linkage='ward',
    metric='euclidean',
    compute_distances=True,
)

In [None]:
# Alternative configuration (disabled): Manhattan metric with complete linkage.
# Uncomment to replace the Ward/Euclidean model defined above. Note that it
# omits compute_distances, so `distances_` would not be available afterwards.
#h_cluster = AgglomerativeClustering(n_clusters = 4,
#                                    metric = 'manhattan',
#                                    linkage = 'complete')

In [None]:
# Fit the agglomerative model on the current scaled feature matrix.
h_cluster.fit(scaled_beer_df)

In [None]:
# Cluster label assigned to each row by the hierarchical model.
h_cluster.labels_

In [None]:
# Store the hierarchical labels on beer_df in a new `hcluster_clusterid` column.
beer_df['hcluster_clusterid'] = h_cluster.labels_

In [None]:
# Show beer_df ordered by hierarchical cluster so members appear together.
beer_df.sort_values('hcluster_clusterid')

In [None]:
# Number of leaves in the fitted hierarchical tree.
h_cluster.n_leaves_

In [None]:
# Children of each non-leaf node, i.e. which two nodes were merged at each step.
h_cluster.children_

In [None]:
# Merge distances between the nodes listed in `children_`; available because
# the model was created with compute_distances=True.
h_cluster.distances_

# HDBSCAN Clustering

In [None]:
from sklearn.cluster import HDBSCAN

In [None]:
# HDBSCAN model that accepts groups of as few as 2 samples as a cluster.
hdbscan = HDBSCAN(min_cluster_size=2)

In [None]:
# Fit HDBSCAN on the current scaled feature matrix.
hdbscan.fit(scaled_beer_df)

In [None]:
# Label per row; HDBSCAN marks noisy samples with -1.
hdbscan.labels_

In [None]:
# Store the HDBSCAN labels on beer_df in a new `hdbscan_clusterid` column.
beer_df['hdbscan_clusterid'] = hdbscan.labels_

In [None]:
# Rows that HDBSCAN placed in cluster 0.
beer_df.loc[beer_df["hdbscan_clusterid"] == 0]

In [None]:
# Rows that HDBSCAN placed in cluster 1.
beer_df.loc[beer_df["hdbscan_clusterid"] == 1]

In [None]:
# Rows that HDBSCAN placed in cluster 2 (empty if fewer clusters were found).
beer_df.loc[beer_df["hdbscan_clusterid"] == 2]

In [None]:
# Rows that HDBSCAN placed in cluster 3 (empty if fewer clusters were found).
beer_df.loc[beer_df["hdbscan_clusterid"] == 3]

In [None]:
# Rows labelled -1, i.e. the samples HDBSCAN treated as noise.
beer_df.loc[beer_df["hdbscan_clusterid"] == -1]