# 1. Extracting the clusters

In [20]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import umap.umap_ as umap
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
import ast

import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [22]:
# Display configuration: show every column and never truncate cell contents,
# so the wide association-rule tables render in full.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", None)

In [23]:
# Read the customer table once, indexed by customer_id. This is the full frame
# (all columns) that the cluster labels are attached to afterwards.
customer_info = pd.read_csv('../data/raw/cleanerer_customer_info.csv', index_col="customer_id")

# Feature matrix used for clustering: the same rows without the columns listed
# below. `drop` returns a new frame, so later changes to `customer_info` do not
# leak into the clustering input.
clustering_customer_info = customer_info.drop(columns=[
    'customer_gender',
    'age',
    'percentage_of_products_bought_promotion',
    'typical_hour',
    'distinct_stores_visited',
    'number_complaints',
    "kids_home",
    "teens_home"])

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])


In [24]:
n_clusters = 7

# Stage 1 - agglomerative clustering gives an initial partition of the customers.
hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
hierarchical_labels = hierarchical.fit_predict(clustering_customer_info)

# 2-D UMAP embedding of the same feature matrix (seeded for reproducibility).
reducer = umap.UMAP(random_state=42)
embedding_hierarchical = reducer.fit_transform(clustering_customer_info)

# Stage 2 - seed KMeans with the hierarchical solution: the centre of each
# hierarchical cluster is the column-wise mean of the rows assigned to it.
cluster_centers = np.array([
    clustering_customer_info.loc[hierarchical_labels == label].mean(axis=0)
    for label in range(n_clusters)
])

# A single KMeans run (n_init=1) started from those explicit centres.
kmeans = KMeans(
    n_clusters=n_clusters,
    init=cluster_centers,
    n_init=1,
    random_state=42,
)
kmeans_labels = kmeans.fit_predict(clustering_customer_info)

  warn(


In [25]:
# Attach the final KMeans label to each customer. The assignment is positional,
# so it relies on `customer_info` having the same row order as the frame that
# was clustered (both are read from the same CSV).
customer_info['cluster'] = kmeans_labels

In [26]:
# Exporting the clusters to each customer.
# `to_csv` returns None when given a path, so keep the exported frame in
# `assigned_clusters` and write it out as a separate step.
assigned_clusters = customer_info[['cluster']]
assigned_clusters.to_csv('../data/raw/assigned_clusters.csv', index_label='customer_id')

# 2. Cluster Analysis

## 2.1. Association Rules

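We use the Apriori algorithm to mine the frequent itemsets of each cluster and then derive association rules from them. This gives us a fuller overview, because every rule can be evaluated on several metrics (e.g. support, confidence and lift).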

### 2.1.1 Extracting the transactions of each cluster

In [27]:
customer_basket = pd.read_csv('../data/raw/customer_basket.csv')

# Linking each transaction to each cluster
customer_basket = customer_basket.merge(customer_info[['cluster']], on='customer_id', how='left')

# `list_of_goods` holds each basket as the text of a Python list. Parse it with
# ast.literal_eval rather than eval: it only accepts literals, so a malformed or
# malicious cell in the CSV cannot execute code. Parsing happens once, up front.
parsed_baskets = customer_basket['list_of_goods'].apply(ast.literal_eval)

# Grouping all transactions of each cluster.
# Baskets whose customer has no cluster (NaN after the left merge) are left out
# by groupby; the previous loop produced an empty list for them.
cluster_transactions = {
    cluster_label: baskets.tolist()
    for cluster_label, baskets in parsed_baskets.groupby(customer_basket['cluster'])
}

### Cluster Zero: Hyper-Hygienic

In [28]:
# One-hot encode the baskets of cluster 0: one row per transaction and one
# column per distinct item seen by the encoder.
te = TransactionEncoder()
te_fit = te.fit_transform(cluster_transactions[0])
transactions_0 = pd.DataFrame(data=te_fit, columns=te.columns_)

In [29]:
# Frequent itemsets of cluster 0, using a minimum support of 0.05 and item
# names (rather than column indices) in the result.
frequent_itemsets_grocery = apriori(transactions_0, min_support=0.05, use_colnames=True)

In [30]:
# Derive the association rules of cluster 0, keeping rules with confidence >= 0.2.
# `num_itemsets` is documented by mlxtend as the number of transactions in the
# original data, so pass the row count of the encoded transactions rather than
# the number of frequent itemsets.
rules_grocery = association_rules(frequent_itemsets_grocery,
                                  metric="confidence",
                                  min_threshold=0.2,
                                  num_itemsets=len(transactions_0))
# Show the ten rules with the highest lift.
rules_grocery.sort_values(by='lift', ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
43,(white wine),(cider),0.10687,0.077683,0.053658,0.50209,6.463327,1.0,0.045356,1.852378,0.946425,0.409936,0.460153,0.596412
42,(cider),(white wine),0.077683,0.10687,0.053658,0.690735,6.463327,1.0,0.045356,2.887911,0.916475,0.409936,0.653729,0.596412
2,(babies food),(ratchet & clank),0.136305,0.077187,0.050481,0.370357,4.798206,1.0,0.039961,1.465614,0.916514,0.309683,0.317692,0.512188
3,(ratchet & clank),(babies food),0.077187,0.136305,0.050481,0.654019,4.798206,1.0,0.039961,2.496368,0.857799,0.309683,0.599418,0.512188
198,"(cologne, oil)","(barbecue sauce, chicken)",0.267696,0.128959,0.059218,0.221213,1.715377,1.0,0.024696,1.118459,0.569487,0.175493,0.105912,0.340206
201,"(barbecue sauce, chicken)","(cologne, oil)",0.128959,0.267696,0.059218,0.459199,1.715377,1.0,0.024696,1.354111,0.478781,0.175493,0.261508,0.340206
220,"(cologne, ham)","(barbecue sauce, oil)",0.089,0.338578,0.051574,0.579476,1.711497,1.0,0.02144,1.57285,0.45633,0.137162,0.364212,0.3659
221,"(oil, ham)","(cologne, barbecue sauce)",0.15482,0.196019,0.051574,0.33312,1.699424,1.0,0.021226,1.205585,0.486956,0.172334,0.170527,0.298112
219,"(cologne, barbecue sauce)","(oil, ham)",0.196019,0.15482,0.051574,0.263105,1.699424,1.0,0.021226,1.146947,0.51191,0.172334,0.12812,0.298112
199,"(cologne, barbecue sauce)","(chicken, oil)",0.196019,0.178001,0.059218,0.302102,1.697195,1.0,0.024326,1.177821,0.510948,0.188111,0.150975,0.317392


### Cluster One: Discount Driven

We can now define a function that automates this process and shows which items are most often bought together in each cluster:

In [31]:
def associationRules(clusternum, min_support=0.05, min_confidence=0.2, top_n=10):
    """Return the strongest association rules (by lift) for one customer cluster.

    Parameters
    ----------
    clusternum
        Key of the cluster in the module-level ``cluster_transactions`` dict.
    min_support : float, default 0.05
        Minimum support passed to ``apriori`` when mining frequent itemsets.
    min_confidence : float, default 0.2
        Minimum confidence a rule needs in order to be kept.
    top_n : int, default 10
        Number of rules to return after sorting by lift, highest first.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rules, sorted by descending ``lift``.

    Raises
    ------
    KeyError
        If ``clusternum`` is not a key of ``cluster_transactions``.
    """
    baskets = cluster_transactions[clusternum]

    # One-hot encode the baskets: one row per transaction, one column per item.
    encoder = TransactionEncoder()
    onehot = pd.DataFrame(encoder.fit_transform(baskets), columns=encoder.columns_)

    frequent_itemsets = apriori(onehot, min_support=min_support, use_colnames=True)

    # `num_itemsets` is documented by mlxtend as the number of transactions in
    # the original data, hence len(onehot) and not the number of frequent itemsets.
    rules = association_rules(
        frequent_itemsets,
        metric="confidence",
        min_threshold=min_confidence,
        num_itemsets=len(onehot),
    )
    return rules.sort_values(by='lift', ascending=False).head(top_n)

In [32]:
associationRules(1)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
410,"(oil, muffins)","(cake, tea)",0.178101,0.214206,0.051081,0.286811,1.33895,1.0,0.012931,1.101803,0.308001,0.1497,0.092397,0.26264
408,"(cake, tea)","(oil, muffins)",0.214206,0.178101,0.051081,0.238468,1.33895,1.0,0.012931,1.079271,0.322153,0.1497,0.073449,0.26264
409,"(cake, muffins)","(oil, tea)",0.100112,0.388143,0.051081,0.510242,1.314572,1.0,0.012224,1.249305,0.265918,0.116844,0.199555,0.320923
437,"(oil, muffins)","(tea, cooking oil)",0.178101,0.275727,0.064256,0.360782,1.308474,1.0,0.015148,1.13306,0.286837,0.164939,0.117434,0.296911
438,"(tea, cooking oil)","(oil, muffins)",0.275727,0.178101,0.064256,0.23304,1.308474,1.0,0.015148,1.071633,0.3255,0.164939,0.066844,0.296911
439,"(cooking oil, muffins)","(oil, tea)",0.128076,0.388143,0.064256,0.501698,1.29256,1.0,0.014544,1.227884,0.259588,0.14217,0.185591,0.333622
412,(muffins),"(cake, oil, tea)",0.220234,0.180462,0.051081,0.231941,1.285262,1.0,0.011337,1.067025,0.284634,0.146107,0.062815,0.2575
404,"(cake, oil, tea)",(muffins),0.180462,0.220234,0.051081,0.283058,1.285262,1.0,0.011337,1.087628,0.270821,0.146107,0.080568,0.2575
399,"(gums, cake)","(oil, tea)",0.113721,0.388143,0.056426,0.496175,1.278329,1.0,0.012285,1.214423,0.245667,0.126674,0.176564,0.320774
428,"(gums, oil)","(tea, cooking oil)",0.207494,0.275727,0.073018,0.351902,1.276268,1.0,0.015806,1.117536,0.273141,0.178003,0.105174,0.30836


### Cluster Two: Coupon Karen

In [33]:
associationRules(2)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
15,(beer),(salt),0.171701,0.096735,0.067081,0.390686,4.03873,1.0,0.050472,1.482429,0.908365,0.333149,0.325431,0.542071
16,(salt),(beer),0.096735,0.171701,0.067081,0.693456,4.03873,1.0,0.050472,2.702053,0.832975,0.333149,0.629911,0.542071
18,(white wine),(beer),0.134163,0.171701,0.088516,0.659768,3.842531,1.0,0.06548,2.434512,0.854381,0.407256,0.58924,0.587646
17,(beer),(white wine),0.171701,0.134163,0.088516,0.515524,3.842531,1.0,0.06548,1.787162,0.893102,0.407256,0.440454,0.587646
13,(beer),(cider),0.171701,0.095402,0.05542,0.322768,3.383244,1.0,0.039039,1.335729,0.850449,0.261805,0.251345,0.451838
14,(cider),(beer),0.095402,0.171701,0.05542,0.580908,3.383244,1.0,0.039039,1.976412,0.778717,0.261805,0.494033,0.451838
210,"(asparagus, cooking oil)","(cake, oil)",0.12761,0.291537,0.057641,0.451697,1.549364,1.0,0.020438,1.292101,0.40644,0.159447,0.226067,0.324706
248,"(napkins, cooking oil)","(oil, candy bars)",0.143159,0.227454,0.0502,0.350659,1.541669,1.0,0.017638,1.189739,0.410055,0.156672,0.159479,0.285681
249,"(oil, candy bars)","(napkins, cooking oil)",0.227454,0.143159,0.0502,0.220703,1.541669,1.0,0.017638,1.099506,0.454798,0.156672,0.0905,0.285681
246,"(napkins, oil)","(candy bars, cooking oil)",0.189249,0.172812,0.0502,0.265258,1.534952,1.0,0.017495,1.125821,0.429865,0.160969,0.11176,0.277873


### Cluster Three: Gadget Geeks

In [34]:
associationRules(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
20,(cider),(white wine),0.123529,0.156078,0.074608,0.603968,3.869646,1.0,0.055328,2.130944,0.846096,0.363941,0.530724,0.540992
21,(white wine),(cider),0.156078,0.123529,0.074608,0.478015,3.869646,1.0,0.055328,1.679111,0.878729,0.363941,0.404447,0.540992
22,(white wine),(dessert wine),0.156078,0.101373,0.055686,0.356784,3.519532,1.0,0.039864,1.397085,0.848268,0.275996,0.284224,0.453053
23,(dessert wine),(white wine),0.101373,0.156078,0.055686,0.549323,3.519532,1.0,0.039864,1.872564,0.796627,0.275996,0.465973,0.453053
231,"(pancakes, energy bar)","(energy drink, protein bar)",0.128824,0.265588,0.057843,0.449011,1.690627,1.0,0.023629,1.332896,0.46891,0.171861,0.249754,0.333402
232,"(energy drink, protein bar)","(pancakes, energy bar)",0.265588,0.128824,0.057843,0.217793,1.690627,1.0,0.023629,1.113741,0.556232,0.171861,0.102125,0.333402
211,"(gadget for tiktok streaming, pancakes)","(energy drink, airpods)",0.153529,0.200196,0.051373,0.33461,1.671414,1.0,0.020637,1.202009,0.474564,0.169909,0.168059,0.295611
215,"(energy drink, airpods)","(gadget for tiktok streaming, pancakes)",0.200196,0.153529,0.051373,0.256611,1.671414,1.0,0.020637,1.138665,0.502253,0.169909,0.121778,0.295611
240,"(gadget for tiktok streaming, pancakes)","(energy drink, protein bar)",0.153529,0.265588,0.067647,0.440613,1.659008,1.0,0.026871,1.312887,0.469278,0.192469,0.23832,0.34766
244,"(energy drink, protein bar)","(gadget for tiktok streaming, pancakes)",0.265588,0.153529,0.067647,0.254707,1.659008,1.0,0.026871,1.135755,0.540882,0.192469,0.119528,0.34766


### Cluster Four: Anti-PAN Families

In [35]:
associationRules(4)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
32,(white wine),(cider),0.145778,0.113215,0.077054,0.528571,4.668753,1.0,0.06055,1.88106,0.919913,0.423517,0.468385,0.604587
31,(cider),(white wine),0.113215,0.145778,0.077054,0.680602,4.668753,1.0,0.06055,2.674475,0.886133,0.423517,0.626095,0.604587
42,(dessert wine),(white wine),0.093336,0.145778,0.057554,0.616633,4.229941,1.0,0.043948,2.228208,0.842197,0.316997,0.551209,0.505719
41,(white wine),(dessert wine),0.145778,0.093336,0.057554,0.394805,4.229941,1.0,0.043948,1.498136,0.893901,0.316997,0.332504,0.505719
22,(white wine),(beer),0.145778,0.089549,0.050549,0.346753,3.8722,1.0,0.037495,1.393732,0.868333,0.273566,0.282502,0.455618
21,(beer),(white wine),0.089549,0.145778,0.050549,0.564482,3.8722,1.0,0.037495,1.961393,0.814705,0.273566,0.490158,0.455618
29,(champagne),(fresh tuna),0.175123,0.110943,0.066073,0.377297,3.400826,1.0,0.046645,1.427739,0.855829,0.300344,0.299592,0.48643
30,(fresh tuna),(champagne),0.110943,0.175123,0.066073,0.595563,3.400826,1.0,0.046645,2.039569,0.794048,0.300344,0.5097,0.48643
24,(champagne),(bluetooth headphones),0.175123,0.107535,0.060772,0.347027,3.227107,1.0,0.041941,1.366772,0.83664,0.273891,0.268349,0.456084
23,(bluetooth headphones),(champagne),0.107535,0.175123,0.060772,0.565141,3.227107,1.0,0.041941,1.896883,0.77328,0.273891,0.472819,0.456084


### Cluster Five: Vegetarians

In [36]:
associationRules(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
358,"(asparagus, shallot)","(tomatoes, carrots)",0.094832,0.459651,0.054596,0.57571,1.252492,1.0,0.011006,1.273535,0.222712,0.109216,0.214784,0.347243
305,"(tomatoes, cauliflower)","(asparagus, carrots)",0.165433,0.390846,0.080622,0.487342,1.24689,1.0,0.015964,1.188227,0.237254,0.169497,0.15841,0.346809
304,"(asparagus, carrots)","(tomatoes, cauliflower)",0.390846,0.165433,0.080622,0.206276,1.24689,1.0,0.015964,1.051458,0.325048,0.169497,0.04894,0.346809
306,"(carrots, cauliflower)","(asparagus, tomatoes)",0.120859,0.539975,0.080622,0.667079,1.23539,1.0,0.015362,1.381786,0.216733,0.138953,0.276299,0.408193
368,"(zucchini, carrots)","(asparagus, tomatoes)",0.100965,0.539975,0.066861,0.662222,1.226395,1.0,0.012343,1.361917,0.205334,0.116467,0.265741,0.393022
303,"(asparagus, cauliflower)","(tomatoes, carrots)",0.143146,0.459651,0.080622,0.563218,1.225316,1.0,0.014825,1.237114,0.214604,0.154397,0.191667,0.369309
367,"(zucchini, tomatoes)","(asparagus, carrots)",0.140154,0.390846,0.066861,0.477054,1.220569,1.0,0.012082,1.164852,0.210166,0.144054,0.141522,0.324061
359,"(tomatoes, shallot)","(asparagus, carrots)",0.114651,0.390846,0.054596,0.47619,1.218359,1.0,0.009785,1.162931,0.202433,0.121081,0.140103,0.307938
296,"(avocado, tomatoes)","(asparagus, carrots)",0.118166,0.390846,0.056241,0.475949,1.217742,1.0,0.010056,1.162396,0.202768,0.124215,0.139708,0.309923
300,"(asparagus, tomatoes, carrots)",(cauliflower),0.333034,0.198863,0.080622,0.242084,1.217339,1.0,0.014394,1.057026,0.267684,0.178654,0.053949,0.32375


### Cluster Six: Regular Families

In [37]:
associationRules(6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
153,"(spaghetti, bluetooth headphones)",(fresh tuna),0.080245,0.275775,0.051824,0.645822,2.341847,1.0,0.029694,2.044808,0.622977,0.170363,0.510956,0.416871
177,(fresh tuna),"(spaghetti, champagne)",0.275775,0.137592,0.088467,0.320795,2.331491,1.0,0.050523,1.269731,0.788553,0.272291,0.212432,0.481881
172,"(spaghetti, champagne)",(fresh tuna),0.137592,0.275775,0.088467,0.642967,2.331491,1.0,0.050523,2.028452,0.662204,0.272291,0.507013,0.481881
158,"(cottage cheese, champagne)",(fresh tuna),0.0891,0.275775,0.056884,0.638429,2.315036,1.0,0.032312,2.002994,0.623604,0.184693,0.500748,0.422349
162,(fresh tuna),"(cottage cheese, champagne)",0.275775,0.0891,0.056884,0.206269,2.315036,1.0,0.032312,1.147619,0.784344,0.184693,0.12863,0.422349
167,(frozen smoothie),"(champagne, fresh tuna)",0.14649,0.218385,0.073919,0.504606,2.310624,1.0,0.041928,1.577763,0.664569,0.254058,0.366191,0.421544
163,"(champagne, fresh tuna)",(frozen smoothie),0.218385,0.14649,0.073919,0.338482,2.310624,1.0,0.041928,1.290231,0.725698,0.254058,0.224945,0.421544
166,(fresh tuna),"(champagne, frozen smoothie)",0.275775,0.116213,0.073919,0.268043,2.306471,1.0,0.041871,1.207429,0.782129,0.232401,0.171794,0.452055
164,"(champagne, frozen smoothie)",(fresh tuna),0.116213,0.275775,0.073919,0.636067,2.306471,1.0,0.041871,1.989995,0.640921,0.232401,0.497486,0.452055
168,"(laptop, champagne)",(fresh tuna),0.07898,0.275775,0.050221,0.635878,2.305788,1.0,0.028441,1.988964,0.614871,0.164913,0.497226,0.408994
