# 1. Extracting the clusters

In [1]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import ast

import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [3]:
# Show every column and the full cell contents so the wide association-rule
# tables further down are not truncated. (display.max_columns was previously
# set twice; once is enough.)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", None)

In [4]:
# Read the cleaned customer table once and derive the clustering features from
# it, instead of parsing the same CSV twice.
customer_info = pd.read_csv('../data/raw/cleanerer_customer_info.csv', index_col="customer_id")

# Columns that stay in customer_info but are excluded from the clustering features.
non_clustering_columns = [
    'customer_gender',
    'age',
    'percentage_of_products_bought_promotion',
    'typical_hour',
    'distinct_stores_visited',
    'number_complaints',
    "kids_home",
    "teens_home",
]

# drop() returns a new frame, so columns added to customer_info later
# (e.g. the cluster label) do not leak into the clustering features.
clustering_customer_info = customer_info.drop(columns=non_clustering_columns)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])


In [5]:
# Stage 1: over-segment the customers into k_large K-Means clusters.
k_large = 200
kmeans = KMeans(n_clusters=k_large, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(clustering_customer_info)

# Stage 2: merge the K-Means centroids into 7 groups with agglomerative clustering.
agg = AgglomerativeClustering(n_clusters=7)
hier_labels_centroids = agg.fit_predict(kmeans.cluster_centers_)

# Translate each customer's K-Means label into the label of the group its
# centroid was merged into. label_map keeps the same mapping as a dict.
label_map = dict(enumerate(hier_labels_centroids))
final_labels = hier_labels_centroids[kmeans_labels]

In [6]:
# Attach the final cluster label to every customer. final_labels is a plain
# array, so the assignment is positional: it relies on customer_info and
# clustering_customer_info sharing the same row order (both come from the same CSV).
customer_info['cluster'] = final_labels

In [7]:
# Exporting the clusters to each customer
# DataFrame.to_csv returns None when it is given a path, so keep the exported
# frame in assigned_clusters rather than the return value of the write.
assigned_clusters = customer_info[['cluster']]
assigned_clusters.to_csv('../data/raw/assigned_clusters.csv', index_label='customer_id')

# 2. Cluster Analysis

## 2.1. Cluster Sizes

In [8]:
# Number of customers per cluster, ordered by cluster label.
cluster_sizes = (
    customer_info['cluster']
    .value_counts()
    .sort_index()
)
print(cluster_sizes)

0    16308
1     1746
2     6256
3     3009
4      769
5     4500
6      739
Name: cluster, dtype: int64


Apart from cluster zero (Hyper-Hygienic), which holds roughly half of all customers (16,308 of 33,327), our clusters are somewhat evenly distributed. Even with this larger cluster, we consider the distribution acceptable.

## 2.2. Association Rules

In our implementation, we will make use of the Apriori algorithm, as it gives us a better overview by evaluating more metrics.

### 2.2.1 Extracting the transactions of each cluster

In [9]:
customer_basket = pd.read_csv('../data/raw/customer_basket.csv')

# Linking each transaction to each cluster
# (left join: a basket whose customer has no cluster gets NaN and is left out of the grouping below)
customer_basket = customer_basket.merge(customer_info[['cluster']], on='customer_id', how='left')

# list_of_goods is presumably stored as the text of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, rather than eval, which would
# execute any code found in the CSV.
parsed_goods = customer_basket['list_of_goods'].apply(ast.literal_eval)

# Grouping all transactions of each cluster in a single pass over the baskets
cluster_transactions = {
    cluster_label: goods.tolist()
    for cluster_label, goods in parsed_goods.groupby(customer_basket['cluster'])
}

### Cluster Zero: Hyper-Hygienic

In [10]:
# One-hot encode the baskets of cluster 0: one row per transaction, one column per item.
raw_transactions_0 = cluster_transactions[0]

te = TransactionEncoder()
te_fit = te.fit(raw_transactions_0).transform(raw_transactions_0)
transactions_0 = pd.DataFrame(te_fit, columns=te.columns_)

In [11]:
# Mine the itemsets that appear in at least 5% of cluster 0's transactions.
frequent_itemsets_grocery = apriori(transactions_0, min_support=0.05, use_colnames=True)

In [12]:
# Keep rules with confidence >= 0.2. mlxtend documents num_itemsets as the
# number of transactions in the original data, so pass the row count of the
# encoded frame rather than the number of frequent itemsets.
rules_grocery = association_rules(
    frequent_itemsets_grocery,
    metric="confidence",
    min_threshold=0.2,
    num_itemsets=len(transactions_0),
)
# Show the ten rules with the highest lift.
rules_grocery.sort_values(by='lift', ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
98,(bluetooth headphones),"(champagne, fresh tuna)",0.140889,0.125565,0.061855,0.439035,3.496464,1.0,0.044164,1.558805,0.831087,0.302323,0.358483,0.465823
97,"(champagne, fresh tuna)",(bluetooth headphones),0.125565,0.140889,0.061855,0.492612,3.496464,1.0,0.044164,1.693203,0.816524,0.302323,0.409403,0.465823
99,(champagne),"(bluetooth headphones, fresh tuna)",0.241812,0.073968,0.061855,0.255798,3.458246,1.0,0.043969,1.244329,0.937546,0.243596,0.196354,0.546022
96,"(bluetooth headphones, fresh tuna)",(champagne),0.073968,0.241812,0.061855,0.836246,3.458246,1.0,0.043969,4.630036,0.767615,0.243596,0.784019,0.546022
84,"(cologne, oil)",(barbecue sauce),0.119916,0.189755,0.076011,0.633866,3.340439,1.0,0.053256,2.212976,0.796103,0.325303,0.54812,0.517219
89,(barbecue sauce),"(cologne, oil)",0.189755,0.119916,0.076011,0.400571,3.340439,1.0,0.053256,1.468205,0.864724,0.325303,0.318896,0.517219
93,(barbecue sauce),"(oil, deodorant)",0.189755,0.094106,0.058394,0.307735,3.270072,1.0,0.040537,1.308593,0.856774,0.258992,0.23582,0.464124
92,"(oil, deodorant)",(barbecue sauce),0.094106,0.189755,0.058394,0.620514,3.270072,1.0,0.040537,2.13511,0.766311,0.258992,0.53164,0.464124
86,"(barbecue sauce, oil)",(cologne),0.150374,0.161278,0.076011,0.505476,3.134201,1.0,0.051759,1.69602,0.801458,0.322569,0.410384,0.48839
87,(cologne),"(barbecue sauce, oil)",0.161278,0.150374,0.076011,0.471303,3.134201,1.0,0.051759,1.607018,0.811877,0.322569,0.37773,0.48839


### Cluster One: Discount Driven

We may now define a function to automate this process and check which items are most often bought together in each cluster:

In [13]:
def associationRules(clusternum, min_support=0.05, min_confidence=0.2, top_n=10):
    """Return the highest-lift association rules for one customer cluster.

    Parameters
    ----------
    clusternum : hashable
        Key into ``cluster_transactions`` selecting the cluster's baskets.
    min_support : float, default 0.05
        Minimum support passed to ``apriori``.
    min_confidence : float, default 0.2
        Minimum confidence a rule needs to be kept.
    top_n : int, default 10
        Number of rules to return.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rules, sorted by descending lift.
    """
    baskets = cluster_transactions[clusternum]

    # One-hot encode the baskets: one row per transaction, one column per item.
    encoder = TransactionEncoder()
    encoded = pd.DataFrame(encoder.fit_transform(baskets), columns=encoder.columns_)

    frequent_itemsets = apriori(encoded, min_support=min_support, use_colnames=True)
    rules = association_rules(
        frequent_itemsets,
        metric="confidence",
        min_threshold=min_confidence,
        # mlxtend documents num_itemsets as the number of transactions in the
        # original data, not the number of frequent itemsets.
        num_itemsets=len(encoded),
    )
    return rules.sort_values(by='lift', ascending=False).head(top_n)

In [14]:
# Ten highest-lift association rules for cluster 1.
associationRules(1)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
103,(beer),"(salt, white wine)",0.309548,0.088844,0.074774,0.241558,2.718899,1.0,0.047272,1.201353,0.915638,0.231056,0.167605,0.541594
102,"(salt, white wine)",(beer),0.088844,0.309548,0.074774,0.841629,2.718899,1.0,0.047272,4.359713,0.693848,0.231056,0.770627,0.541594
100,"(beer, white wine)",(salt),0.180101,0.15799,0.074774,0.415179,2.62788,1.0,0.04632,1.439773,0.755538,0.283969,0.305446,0.444231
105,(salt),"(beer, white wine)",0.15799,0.180101,0.074774,0.473282,2.62788,1.0,0.04632,1.556621,0.735698,0.283969,0.357583,0.444231
98,(white wine),"(beer, dessert wine)",0.26794,0.097889,0.066533,0.248312,2.536658,1.0,0.040304,1.200113,0.827501,0.222297,0.166745,0.463992
95,"(beer, dessert wine)",(white wine),0.097889,0.26794,0.066533,0.679671,2.536658,1.0,0.040304,2.285342,0.671515,0.222297,0.562429,0.463992
94,"(beer, white wine)",(dessert wine),0.180101,0.150352,0.066533,0.36942,2.457036,1.0,0.039454,1.347407,0.723266,0.252094,0.257834,0.405967
99,(dessert wine),"(beer, white wine)",0.150352,0.180101,0.066533,0.442513,2.457036,1.0,0.039454,1.470707,0.697942,0.252094,0.320055,0.405967
19,(beer),(salt),0.309548,0.15799,0.119598,0.386364,2.445495,1.0,0.070693,1.372165,0.856084,0.343732,0.271224,0.571681
20,(salt),(beer),0.15799,0.309548,0.119598,0.756997,2.445495,1.0,0.070693,2.841338,0.701993,0.343732,0.648053,0.571681


### Cluster Two: Coupon Karen

In [15]:
# Ten highest-lift association rules for cluster 2.
associationRules(2)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
401,"(tea, cooking oil)","(oil, muffins)",0.246627,0.162233,0.057847,0.234551,1.445769,1.0,0.017836,1.094479,0.409262,0.164799,0.086323,0.295559
400,"(oil, muffins)","(tea, cooking oil)",0.162233,0.246627,0.057847,0.356566,1.445769,1.0,0.017836,1.170862,0.368034,0.164799,0.145929,0.295559
402,"(cooking oil, muffins)","(oil, tea)",0.117496,0.346425,0.057847,0.492329,1.421172,1.0,0.017143,1.287399,0.335812,0.142454,0.22324,0.329656
404,(muffins),"(oil, tea, cooking oil)",0.200798,0.204949,0.057847,0.288085,1.405642,1.0,0.016693,1.116778,0.361087,0.166274,0.104567,0.285167
396,"(oil, tea, cooking oil)",(muffins),0.204949,0.200798,0.057847,0.282249,1.405642,1.0,0.016693,1.113482,0.362972,0.166274,0.101917,0.285167
372,"(gums, cake)","(oil, tea)",0.103403,0.346425,0.050145,0.484945,1.399855,1.0,0.014323,1.268941,0.318583,0.125461,0.211941,0.314847
391,"(gums, oil)","(tea, cooking oil)",0.189436,0.246627,0.065221,0.344291,1.395998,1.0,0.018501,1.148944,0.349962,0.175873,0.129635,0.304371
394,"(tea, cooking oil)","(gums, oil)",0.246627,0.189436,0.065221,0.264452,1.395998,1.0,0.018501,1.101987,0.376528,0.175873,0.092548,0.304371
373,"(cake, tea)","(gums, oil)",0.190474,0.189436,0.050145,0.263264,1.389725,1.0,0.014062,1.100209,0.346416,0.152062,0.091082,0.263985
374,"(gums, oil)","(cake, tea)",0.189436,0.190474,0.050145,0.264706,1.389725,1.0,0.014062,1.100956,0.345973,0.152062,0.091698,0.263985


### Cluster Three: Gadget Geeks

In [16]:
# Ten highest-lift association rules for cluster 3.
associationRules(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
346,"(energy drink, protein bar)","(gadget for tiktok streaming, pancakes)",0.323499,0.185742,0.083343,0.25763,1.387028,1.0,0.023255,1.096835,0.412466,0.195687,0.088286,0.353166
342,"(gadget for tiktok streaming, pancakes)","(energy drink, protein bar)",0.185742,0.323499,0.083343,0.448702,1.387028,1.0,0.023255,1.227106,0.342685,0.195687,0.185074,0.353166
334,"(energy drink, protein bar)","(pancakes, energy bar)",0.323499,0.158306,0.070945,0.219304,1.385324,1.0,0.019733,1.078134,0.411155,0.172674,0.072472,0.333728
333,"(pancakes, energy bar)","(energy drink, protein bar)",0.158306,0.323499,0.070945,0.448151,1.385324,1.0,0.019733,1.22588,0.330461,0.172674,0.18426,0.333728
354,"(pancakes, iphone 10)","(energy drink, protein bar)",0.122718,0.323499,0.054988,0.448082,1.385112,1.0,0.015289,1.225728,0.31693,0.140552,0.184158,0.309031
303,"(gadget for tiktok streaming, pancakes)","(energy drink, airpods)",0.185742,0.243944,0.062565,0.336836,1.380788,1.0,0.017254,1.140073,0.338684,0.170419,0.122863,0.296653
307,"(energy drink, airpods)","(gadget for tiktok streaming, pancakes)",0.243944,0.185742,0.062565,0.256471,1.380788,1.0,0.017254,1.095125,0.364756,0.170419,0.086862,0.296653
295,"(energy drink, airpods)","(pancakes, energy bar)",0.243944,0.158306,0.053151,0.217882,1.37634,1.0,0.014533,1.076174,0.361661,0.152253,0.070782,0.276816
294,"(pancakes, energy bar)","(energy drink, airpods)",0.158306,0.243944,0.053151,0.335751,1.37634,1.0,0.014533,1.13821,0.324863,0.152253,0.121428,0.276816
171,"(phone car charger, energy drink)",(energy bar),0.141316,0.263001,0.05097,0.360682,1.371412,1.0,0.013804,1.15279,0.315395,0.14425,0.13254,0.277242


### Cluster Four: Anti-PAN Families

In [17]:
# Ten highest-lift association rules for cluster 4.
associationRules(4)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
37,(dessert wine),(cider),0.113918,0.140957,0.05984,0.525292,3.726599,1.0,0.043783,1.809622,0.825724,0.306818,0.447399,0.47491
36,(cider),(dessert wine),0.140957,0.113918,0.05984,0.424528,3.726599,1.0,0.043783,1.539748,0.851714,0.306818,0.350543,0.47491
39,(white wine),(cider),0.193706,0.140957,0.101064,0.521739,3.701395,1.0,0.07376,1.79618,0.905168,0.432638,0.443263,0.61936
38,(cider),(white wine),0.140957,0.193706,0.101064,0.716981,3.701395,1.0,0.07376,2.848907,0.849587,0.432638,0.648988,0.61936
25,(cider),(beer),0.140957,0.109043,0.054521,0.386792,3.54717,1.0,0.039151,1.452946,0.835913,0.278912,0.311743,0.443396
24,(beer),(cider),0.109043,0.140957,0.054521,0.5,3.54717,1.0,0.039151,1.718085,0.80597,0.278912,0.417957,0.443396
27,(white wine),(beer),0.193706,0.109043,0.074025,0.382151,3.504605,1.0,0.052903,1.442031,0.886353,0.323643,0.306534,0.530506
26,(beer),(white wine),0.109043,0.193706,0.074025,0.678862,3.504605,1.0,0.052903,2.510739,0.802127,0.323643,0.601711,0.530506
49,(white wine),(dessert wine),0.193706,0.113918,0.076684,0.395881,3.475127,1.0,0.054618,1.466733,0.883351,0.332054,0.318213,0.534516
50,(dessert wine),(white wine),0.113918,0.193706,0.076684,0.673152,3.475127,1.0,0.054618,2.466877,0.803809,0.332054,0.594629,0.534516


### Cluster Five: Vegetarians

In [18]:
# Ten highest-lift association rules for cluster 5.
associationRules(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
358,"(asparagus, shallot)","(tomatoes, carrots)",0.094981,0.460375,0.054682,0.57571,1.250525,1.0,0.010955,1.271831,0.221361,0.109216,0.213732,0.347243
304,"(asparagus, carrots)","(tomatoes, cauliflower)",0.391461,0.165693,0.080749,0.206276,1.244932,1.0,0.015887,1.05113,0.323304,0.169497,0.048643,0.346809
305,"(tomatoes, cauliflower)","(asparagus, carrots)",0.165693,0.391461,0.080749,0.487342,1.244932,1.0,0.015887,1.187027,0.235816,0.169497,0.157559,0.346809
306,"(carrots, cauliflower)","(asparagus, tomatoes)",0.121049,0.540824,0.080749,0.667079,1.23345,1.0,0.015283,1.379235,0.215331,0.138953,0.27496,0.408193
368,"(zucchini, carrots)","(asparagus, tomatoes)",0.101124,0.540824,0.066966,0.662222,1.224469,1.0,0.012276,1.359403,0.203943,0.116467,0.264383,0.393022
303,"(asparagus, cauliflower)","(tomatoes, carrots)",0.143371,0.460375,0.080749,0.563218,1.223392,1.0,0.014745,1.235458,0.213161,0.154397,0.190584,0.369309
367,"(zucchini, tomatoes)","(asparagus, carrots)",0.140375,0.391461,0.066966,0.477054,1.218652,1.0,0.012015,1.163676,0.20872,0.144054,0.140654,0.324061
359,"(tomatoes, shallot)","(asparagus, carrots)",0.114831,0.391461,0.054682,0.47619,1.216445,1.0,0.00973,1.161757,0.201015,0.121081,0.139235,0.307938
296,"(avocado, tomatoes)","(asparagus, carrots)",0.118352,0.391461,0.05633,0.475949,1.215829,1.0,0.009999,1.161222,0.201346,0.124215,0.138839,0.309923
300,"(asparagus, tomatoes, carrots)",(cauliflower),0.333558,0.199176,0.080749,0.242084,1.215427,1.0,0.014312,1.056613,0.265956,0.178654,0.05358,0.32375


### Cluster Six: Regular Families

In [19]:
# Ten highest-lift association rules for cluster 6.
associationRules(6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
264,(tea),"(gums, oil)",0.18672,0.098485,0.052139,0.279236,2.835322,1.0,0.03375,1.250778,0.795921,0.223709,0.200497,0.404324
260,"(gums, oil)",(tea),0.098485,0.18672,0.052139,0.529412,2.835322,1.0,0.03375,1.72822,0.718021,0.223709,0.42137,0.404324
263,(gums),"(oil, tea)",0.118984,0.154635,0.052139,0.438202,2.833792,1.0,0.03374,1.50475,0.734511,0.235412,0.335438,0.387689
262,"(oil, tea)",(gums),0.154635,0.118984,0.052139,0.337176,2.833792,1.0,0.03374,1.329185,0.765487,0.235412,0.247659,0.387689
77,(gums),(tea),0.118984,0.18672,0.06016,0.505618,2.707892,1.0,0.037944,1.645043,0.715888,0.245009,0.392113,0.413907
78,(tea),(gums),0.18672,0.118984,0.06016,0.322196,2.707892,1.0,0.037944,1.299809,0.775513,0.245009,0.230656,0.413907
4,(asparagus),(carrots),0.247772,0.140374,0.05303,0.214029,1.5247,1.0,0.018249,1.093711,0.457485,0.158245,0.085682,0.295903
5,(carrots),(asparagus),0.140374,0.247772,0.05303,0.377778,1.5247,1.0,0.018249,1.208938,0.400329,0.158245,0.172828,0.295903
136,"(oil, tomatoes)",(asparagus),0.143939,0.247772,0.051248,0.356037,1.436956,1.0,0.015584,1.168124,0.355214,0.150524,0.143926,0.281436
137,(asparagus),"(oil, tomatoes)",0.247772,0.143939,0.051248,0.206835,1.436956,1.0,0.015584,1.079296,0.404245,0.150524,0.07347,0.281436
