### The purpose of this notebook is to showcase how to do association rule mining

* we will need to install mlxtend:
 - conda install -c conda-forge mlxtend

In [45]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth

In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from datetime import datetime, timedelta
import collections


 - first, prepare the data

In [47]:
from math import floor
def prep_data(filename, dropna,datecol):
    baskets = pd.read_csv(filename, parse_dates = [datecol])
    if dropna:
        baskets.dropna(inplace=True)
    for s in baskets.columns:
        if ("id" in s):
            baskets.loc[:,s] = pd.Categorical(baskets.loc[:,s].apply(lambda x: floor(x)))
    if datecol:
        baskets['date'] = baskets[datecol].dt.date
        baskets['year'] = baskets[datecol].dt.year
        baskets['month'] = baskets[datecol].dt.month
        baskets['day'] = baskets[datecol].dt.day
        baskets['hour'] = baskets[datecol].dt.hour
        baskets['weekday'] = baskets[datecol].dt.weekday
        baskets["spent"] = baskets["qty"] * baskets["price"]
    return baskets

def make_merchants(baskets):
    merchants = baskets.groupby(['merchant_id']).agg(
        total_spent = ('spent', 'sum'), 
        num_orders = ('order_id', 'nunique'), 
        num_days = ('date', 'nunique'), 
        num_skus = ('sku_id','nunique'), 
        num_top_cats = ('top_cat_id','nunique'), 
        num_sub_cats = ('sub_cat_id','nunique'),
    ).reset_index()
    merchants['avg_spent_per_order'] = merchants.total_spent / merchants.num_orders
    return merchants

def make_skus(baskets):
    skus_by_day = baskets.groupby(['sku_id','date']).agg(
        avg_price_by_day = ('price','mean'),
        num_order_by_day = ('order_id', 'nunique'), 
        num_merchants_by_day = ('merchant_id', 'nunique'),
    ).reset_index()
    return skus_by_day

def make_top_cats(baskets):
    top_cats = baskets.groupby(['top_cat_id']).agg(
        avg_price = ('price', 'mean'),
        total_spent = ('spent', 'sum'),
        total_quantity = ('qty' , 'sum'),
        num_orders = ('order_id', 'nunique'), 
        num_days = ('date' , 'nunique'),
        num_merchants = ('merchant_id', 'nunique')
    ).reset_index()
    return top_cats


In [48]:
dropna = True
datecol = 'placed_at'
filename = 'new_baskets_full.csv'

baskets = prep_data(filename, dropna,datecol)
merchants = make_merchants(baskets)
skus = make_skus(baskets)
top_cats = make_top_cats(baskets)

In [49]:
df = baskets
df.groupby('order_id').all()

Unnamed: 0_level_0,id,placed_at,merchant_id,sku_id,top_cat_id,sub_cat_id,qty,price,date,year,month,day,hour,weekday,spent
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62044,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True
62045,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True
62046,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True
62047,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True


In [50]:
transaction_data = []
for groups in df.groupby('order_id').groups.values():
    transaction_data.append(list(df.loc[groups]['sku_id'].to_numpy()))

In [66]:
transaction_data[0]

[470,
 237,
 236,
 238,
 547,
 253,
 355,
 1354,
 281,
 425,
 204,
 1367,
 570,
 276,
 269,
 271,
 569,
 493,
 404,
 235,
 1558,
 1300,
 299,
 438,
 589,
 435,
 432,
 1545,
 460,
 1544,
 481,
 485,
 762,
 327,
 267,
 336,
 184,
 337,
 710,
 717,
 501,
 1577,
 1335,
 262]

In [52]:
def oneHotCoding(transaction_data):
    te = TransactionEncoder()
    te_ary = te.fit(transaction_data).transform(transaction_data)
    transaction_df = pd.DataFrame(te_ary, columns=te.columns_)
    return transaction_df

transaction_sku_data = oneHotCoding(transaction_data)

In [65]:
transaction_sku_data[1]

0        False
1        False
2        False
3        False
4        False
         ...  
62042    False
62043    False
62044    False
62045    False
62046    False
Name: 1, Length: 62047, dtype: bool

In [54]:
def frq_sku_itemsets(data,suppVar):
    min_support = suppVar/data.shape[0]
    frequent_itemsets = fpgrowth(data, min_support=min_support, use_colnames=True)
    return frequent_itemsets

freq_itemsets = frq_sku_itemsets(transaction_sku_data,30)

In [55]:
freq_itemsets.head(50)

Unnamed: 0,support,itemsets
0,0.12086,(327)
1,0.090077,(438)
2,0.059906,(276)
3,0.056054,(184)
4,0.055313,(432)
5,0.052476,(485)
6,0.05159,(336)
7,0.049463,(547)
8,0.042307,(337)
9,0.041549,(1300)


In [56]:
# compute and print the association rules

def basket_rules(freq_itemsets,metrics,threshold):
    asso_rules = association_rules(freq_itemsets, metric=metrics, min_threshold=threshold)
    return asso_rules.sort_values(by='lift', ascending=False)[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

fp_rules = basket_rules(freq_itemsets,"confidence",0.10)
fp_rules.head(60)

Unnamed: 0,antecedents,consequents,support,confidence,lift
884882,(1245),(1244),0.000564,0.564516,875.663306
884881,(1244),(1245),0.000564,0.875,875.663306
872749,"(432, 858)","(859, 435)",0.000484,0.491803,726.545667
872746,"(859, 435)","(432, 858)",0.000484,0.714286,726.545667
872747,"(432, 859)","(858, 435)",0.000484,0.454545,705.079545
872748,"(858, 435)","(432, 859)",0.000484,0.75,705.079545
884369,"(1105, 1107)",(1106),0.000548,0.944444,681.394703
884372,(1106),"(1105, 1107)",0.000548,0.395349,681.394703
881905,"(977, 978)","(976, 975)",0.000548,0.723404,669.926326
881904,"(976, 975)","(977, 978)",0.000548,0.507463,669.926326


In [57]:
fp_rules

Unnamed: 0,antecedents,consequents,support,confidence,lift
884882,(1245),(1244),0.000564,0.564516,875.663306
884881,(1244),(1245),0.000564,0.875000,875.663306
872749,"(432, 858)","(859, 435)",0.000484,0.491803,726.545667
872746,"(859, 435)","(432, 858)",0.000484,0.714286,726.545667
872747,"(432, 859)","(858, 435)",0.000484,0.454545,705.079545
...,...,...,...,...,...
683866,(283),(327),0.000967,0.111524,0.922755
683868,"(282, 283)",(327),0.000532,0.104762,0.866804
866618,(626),(327),0.000580,0.104651,0.865888
793910,"(419, 420)",(327),0.001451,0.103093,0.852993


In [58]:
def predict(antecedent, rules, max_results= 6):
    
    # get the rules for this antecedent
    preds = rules[rules['antecedents'] == antecedent]
    
    # a way to convert a frozen set with one element to string
    preds = preds['consequents'].apply(iter).apply(next)
    
    return preds[:max_results]

In [67]:
preds = predict({859, 435}, fp_rules)
preds

872746    432
872739    858
872384    432
Name: consequents, dtype: int64

In [60]:
fp_rules.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 885767 entries, 884882 to 793902
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   antecedents  885767 non-null  object 
 1   consequents  885767 non-null  object 
 2   support      885767 non-null  float64
 3   confidence   885767 non-null  float64
 4   lift         885767 non-null  float64
dtypes: float64(3), object(2)
memory usage: 40.5+ MB
