# Market Basket Analysis - 'Product' level

Based on https://github.com/chris1610/pbpython/blob/master/notebooks/Market_Basket_Intro.ipynb

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Render every float in pandas output with exactly three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

%matplotlib inline

## Load sales data

In [2]:
def load_sales_data():
    """Read the kiosk sales export into a DataFrame.

    The file ``kiosk_produkty/KIOSK_Produkty.csv`` is looked up inside the
    directory named by the ``DATA_PATH`` environment variable. It is read as
    a ``;``-delimited CSV with ``,`` as the thousands separator. ``Date`` is
    parsed as a date and ``HourName`` becomes an ordered categorical running
    from ``00:00`` to ``23:00``.

    Returns:
        pandas.DataFrame: the loaded sales rows.

    Raises:
        KeyError: if ``DATA_PATH`` is not set in the environment.
    """
    # Zero-padded hour labels in chronological order: '00:00' ... '23:00'.
    hour_labels = [('0' + str(h) if h < 10 else str(h)) + ':00'
                   for h in range(24)]
    ordered_hours = pd.CategoricalDtype(categories=hour_labels, ordered=True)

    # Low-cardinality text columns are stored as plain categoricals.
    category_columns = ['QuarterName', 'Product', 'FamilyGroup', 'MajorGroup',
                        'LocationType', 'Concept', 'ItemType', 'SalesChannel']
    column_types = {name: "category" for name in category_columns}
    column_types.update({
        # Identifier-like columns are kept as generic objects.
        'GUESTCHECKID': object,
        'MPK': object,
        'Restaurant': object,
        'Date': str,
        'HourName': ordered_hours,
        'ComboMealNum': np.float64,
        'ile_razy': np.float64,
    })

    csv_path = os.path.join(os.environ['DATA_PATH'],
                            'kiosk_produkty/KIOSK_Produkty.csv')
    return pd.read_csv(csv_path,
                       delimiter=";",
                       thousands=',',
                       dtype=column_types,
                       parse_dates=['Date'])

In [3]:
%%time
# Load the whole sales export into memory; DATA_PATH must be set in the environment.
data = load_sales_data()

CPU times: user 1min 10s, sys: 6.06 s, total: 1min 16s
Wall time: 1min 17s


## Analysis - 'Product' level
We will create baskets of individual products and see if we can find some interesting relationships.

### Prepare basket data

In [4]:
# https://github.com/pandas-dev/pandas/issues/19136#issuecomment-380908428
def reset_index(df):
    '''Return a new DataFrame with the index levels of ``df`` as leading columns.

    The index is converted to its own frame and joined back positionally
    onto the values, which are renumbered 0..n-1. ``df`` itself is not
    modified.
    '''
    index_columns = df.index.to_frame(index=False)
    values = df.reset_index(drop=True)
    # The index frame must stay on the left-hand side of the merge: per the
    # original author's note, the reversed order does not work when a
    # Categorical is involved.
    return index_columns.merge(values, left_index=True, right_index=True)


# Convert the units to 1 hot encoded values
def encode_units(x):
    """Map a summed quantity to a 0/1 basket flag.

    Args:
        x: numeric quantity of an item on one receipt.

    Returns:
        int: ``1`` when the quantity is strictly positive, otherwise ``0``.

    The previous two-branch version fell through and returned ``None`` for
    fractional quantities between 0 and 1 and for NaN. Any positive quantity
    now counts as "item present"; NaN compares false and is encoded as 0.
    Results for ``x <= 0`` and ``x >= 1`` are unchanged.
    """
    return 1 if x > 0 else 0


def create_basket_sets(data, variable):
    """Build a one-hot basket matrix from the sales rows.

    Args:
        data: sales DataFrame with at least the columns ``GUESTCHECKID``,
            ``ile_razy`` and the column named by ``variable``.
        variable: name of the column whose values become the basket columns
            (e.g. ``'Product'`` or ``'FamilyGroup'``).

    Returns:
        pandas.DataFrame: indexed by ``GUESTCHECKID``, one column per value
        of ``variable``, each cell being ``encode_units`` applied to the
        summed ``ile_razy`` for that pair (missing pairs count as 0).
    """
    # Total quantity per (receipt, item), pivoted so items become columns.
    quantities = (data.groupby(['GUESTCHECKID', variable])['ile_razy']
                  .sum()
                  .unstack())

    # The reset_index/set_index round trip is kept from the original code.
    # The encoding is applied exactly once: the original ran the same
    # element-wise applymap twice, which only repeated the slowest step.
    basket_sets = (reset_index(quantities)
                   .fillna(0)
                   .set_index('GUESTCHECKID')
                   .applymap(encode_units))

    return basket_sets

In [None]:
%%time
# Build the 0/1 basket matrix with one column per 'Product' value.
basket_sets = create_basket_sets(data, 'Product')

In [24]:
%%time
# Cache the product-level basket matrix as CSV under DATA_PATH so it can be reloaded later.
basket_sets.to_csv(os.path.join(os.environ['DATA_PATH'], 'basket_sets/basket_sets_product.csv'))

CPU times: user 24min 32s, sys: 5.11 s, total: 24min 38s
Wall time: 24min 46s


### Load data

In [2]:
%%time
# Reload the cached product-level basket matrix; the first CSV column becomes the index.
basket_sets = pd.read_csv(os.path.join(os.environ['DATA_PATH'], 'basket_sets/basket_sets_product.csv'), index_col=0)

  mask |= (ar1 == a)


CPU times: user 2min 24s, sys: 14.7 s, total: 2min 38s
Wall time: 2min 39s


In [90]:
# Drop the 'Customer' column from the basket matrix.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
basket_sets = basket_sets.drop(columns=['Customer'], errors='ignore')

In [91]:
# Preview the first rows of the product-level basket matrix.
basket_sets.head()

Unnamed: 0_level_0,10HotWings,10Strips,10xCOBHS,10xCOBKent,10xHotWings,10xStripsHS,11HotWings,11Strips,12xHotWings,12xStripsHS,...,Zinger_wege,app_2xCheesburg,app_2xSmallCoff,app_Bites&Fries,Halloumi,TwisteHaluBox,TwisteHalum,mHalloumi,mHalloumiBox,mTwisteHalum
GUESTCHECKID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000000246,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000000247,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000000406,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000000415,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000000416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Create basket rules

In [107]:
def analyze_basket(basket_sets, min_support=0.005, metric="lift", min_threshold=1, top_n=25):
    """Mine frequent itemsets from a basket matrix and derive association rules.

    Args:
        basket_sets: one-hot DataFrame (rows = baskets, columns = items).
        min_support: minimum support passed to ``apriori``.
        metric: rule metric passed to ``association_rules``.
        min_threshold: minimum value of ``metric`` for a rule to be kept.
        top_n: how many of the most supported itemsets to display.

    Returns:
        The rules DataFrame produced by ``association_rules``.

    The defaults reproduce the previously hard-coded values, so existing
    calls ``analyze_basket(basket_sets)`` behave as before.
    """
    # Build up the frequent itemsets, labelled by column name.
    frequent_itemsets = apriori(basket_sets, min_support=min_support, use_colnames=True)

    # Show the most frequent itemsets as a quick sanity check.
    display(frequent_itemsets.sort_values('support', ascending=False).head(top_n))

    # Create the rules from the frequent itemsets.
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold)

    return rules

In [None]:
%%time
# Mine association rules from the product-level basket matrix.
rules = analyze_basket(basket_sets)

### Explore basket rules

#### Metric explanations
- Lift is the confidence of a rule divided by the support of its consequent. Say that the association rule “if Toast then Coffee” has a confidence of 70% and a lift of 1.48. This means that customers who purchase Toast are 1.48 times more likely to purchase Coffee than randomly chosen customers. A larger lift means a more interesting rule. Association rules with high support are potentially interesting, and rules with high confidence are interesting as well.

In [None]:
# Keep only rules whose consequent is a single item.
# The itemset length is checked directly; searching the string repr for ','
# would wrongly discard a single item whose own name contains a comma.
rules_sic = rules[rules['consequents'].apply(len) == 1]
print(f"Total number of rules found: {len(rules)} Number of rules with single item consequents: {len(rules_sic)} ")
rules = rules_sic

In [None]:
# Discard rules whose consequent mentions fries or a Pepsi refill.
# Matching is by plain substring on the consequent's string form, so
# 'Fries' already covers 'LargeFries'; the latter is kept for explicitness.
rules = rules[
    ~rules['consequents'].astype(str).str.contains('Fries', regex=False)
    & ~rules['consequents'].astype(str).str.contains('LargeFries', regex=False)
    & ~rules['consequents'].astype(str).str.contains('PepsiRefill', regex=False)
]

In [None]:
# Show up to 300 rules with lift strictly between 1.2 and 4 and confidence above 0.3, highest lift first.
rules.query('4 >lift > 1.2 and confidence > 0.3').sort_values('lift', ascending=False).head(300)

### Remove standard sets - related associations
Example: If someone buys Bsmart, then we expect fries or a sandwich to be in the basket too.

Those kinds of associations are not interesting and we will remove them.

In [110]:
def phrase_filter(rules, phrase):
    """Flag the rules that mention ``phrase`` on either side.

    Args:
        rules: DataFrame with ``antecedents`` and ``consequents`` columns.
        phrase: literal text (not a regex) to look for.

    Returns:
        Boolean Series aligned with ``rules``: True where the string form of
        the antecedents or of the consequents contains ``phrase``.
    """
    def _mentions(column):
        # Plain substring search on the column's string representation.
        return rules[column].astype(str).str.contains(phrase, regex=False)

    return _mentions('antecedents') | _mentions('consequents')

In [111]:
# Number of rules currently held in `rules`.
len(rules)

8996

In [112]:
# Keep only rules that mention none of the standard-set markers on either side.
fillter = ~(
    phrase_filter(rules, 'Bsmart')
    | phrase_filter(rules, 'app_bucketfor1')
    | phrase_filter(rules, 'ex_45BitesAddon')
)
# How many rules survive, followed by the 50 with the highest lift.
display(len(rules[fillter]))
rules[fillter].sort_values('lift', ascending=False).head(50)

6070

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
6304,"(DipMajoGarlic, 8Strips (new))","(8HotWings, 2ChickenKent., Fries)",0.006,0.006,0.006,0.855,150.605,0.005,6.868
6301,"(8HotWings, 2ChickenKent., Fries)","(DipMajoGarlic, 8Strips (new))",0.006,0.006,0.006,0.974,150.605,0.005,37.869
8420,"(DipMajoGarlic, 8Strips (new))","(8HotWings, mClassicBucketXL, 2ChickenKent., F...",0.006,0.006,0.005,0.834,150.549,0.005,5.988
8397,"(8HotWings, mClassicBucketXL, 2ChickenKent., F...","(DipMajoGarlic, 8Strips (new))",0.006,0.006,0.005,0.973,150.549,0.005,37.356
6302,"(8HotWings, DipMajoGarlic, 2ChickenKent.)","(8Strips (new), Fries)",0.006,0.007,0.006,0.991,150.541,0.005,107.802
6303,"(8Strips (new), Fries)","(8HotWings, DipMajoGarlic, 2ChickenKent.)",0.007,0.006,0.006,0.84,150.541,0.005,6.222
8419,"(8Strips (new), Fries)","(8HotWings, DipMajoGarlic, mClassicBucketXL, 2...",0.007,0.005,0.005,0.819,150.515,0.005,5.503
8398,"(8HotWings, DipMajoGarlic, mClassicBucketXL, 2...","(8Strips (new), Fries)",0.005,0.007,0.005,0.991,150.515,0.005,105.818
7815,"(8Strips (new), Fries)","(8HotWings, DipMajoGarlic, mClassicBucketXL)",0.007,0.006,0.006,0.955,150.511,0.006,22.145
7814,"(8HotWings, DipMajoGarlic, mClassicBucketXL)","(8Strips (new), Fries)",0.006,0.007,0.006,0.991,150.511,0.006,105.551


#### Conclusions
- It looks like association analysis gives a blurry picture of the situation because of obvious patterns that are very frequent:
    - Bsmart: Fries, some main (longer, 2strips, iTwistB)
    - app_bucketfor1: ex_45BitesAddon, app_bucketfor1, 2HotWings, DrumstickKent., fries
    - etc.
- A solution could be to remove such obvious product combinations from the data by hand.

### TODO:  Filtering data
We will filter out less popular products and those which are not actual products.

In [13]:
# Products sold on fewer than `limit` rows count as "less popular".
limit = 1000
val_counts = data['Product'].value_counts()
less_popular_products = val_counts.loc[val_counts < limit].index.tolist()
# 'Customer' is filtered out together with the rare products.
val_to_filter = ['Customer'] + less_popular_products

In [14]:
# Number of entries in the 'Product' value counts.
len(val_counts)

585

In [15]:
# Sales rows whose product is not on the exclusion list.
data_prod_filtered = data.loc[~data['Product'].isin(val_to_filter)]

In [18]:
# Standard sets: for each set marker, the items listed alongside it.
# Bound to a name so later cells can use it; the bare literal was only
# displayed and then lost.
standard_sets = {
    'Bsmart': ['mStripsBsmart', 'mMiniTwistBsmart', 'mLongerBsmart', 'Fries', '2Strips', 'Longer', 'iTwistB'],
    'app_bucketfor1': ['DrumstickKent.', '2HotWings', 'ex_45BitesAddon', 'Fries'],
}
standard_sets

{'Bsmart': ['mStripsBsmart',
  'mMiniTwistBsmart',
  'mLongerBsmart',
  'Fries',
  '2Strips',
  'Longer',
  'iTwistB'],
 'app_bucketfor1': ['DrumstickKent.', '2HotWings', 'ex_45BitesAddon', 'Fries']}

## Analysis - 'FamilyGroup' level

In [19]:
# Delete the product-level basket matrix, which is no longer needed, to free memory
del basket_sets

### Prepare basket data

In [20]:
%%time
# Build the 0/1 basket matrix with one column per 'FamilyGroup' value.
basket_sets = create_basket_sets(data, 'FamilyGroup')

CPU times: user 51.9 s, sys: 420 ms, total: 52.4 s
Wall time: 52.5 s


In [21]:
%%time
# Cache the family-group basket matrix as CSV under DATA_PATH so it can be reloaded later.
basket_sets.to_csv(os.path.join(os.environ['DATA_PATH'], 'basket_sets/basket_sets_familygroup.csv'))

CPU times: user 47.5 s, sys: 216 ms, total: 47.7 s
Wall time: 48.2 s


In [25]:
%%time
# Load the family-group basket matrix from the CSV cache (if prepared before);
# the first CSV column becomes the index.
basket_sets = pd.read_csv(os.path.join(os.environ['DATA_PATH'], 'basket_sets/basket_sets_familygroup.csv'), index_col=0)

CPU times: user 2.41 s, sys: 24 ms, total: 2.44 s
Wall time: 2.44 s


In [23]:
# Preview the first rows of the family-group basket matrix.
basket_sets.head()

Unnamed: 0_level_0,Box,Bucket,Burgers & Sandwiches,Chicken,Cold Beverages,Condiments,Desserts,Hot Beverages,Menus,Promos,Salads,Sides,Value,Coupons,Breakfast,LSM
GUESTCHECKID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1000000246,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0
1000000247,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1000000406,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
1000000415,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0
1000000416,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0


### Create basket rules

In [79]:
%%time
# Mine association rules from the family-group basket matrix.
rules = analyze_basket(basket_sets)

Unnamed: 0,support,itemsets
8,0.998,(Promos)
9,0.752,(Sides)
38,0.75,"(Sides, Promos)"
2,0.641,(Burgers & Sandwiches)
22,0.64,"(Promos, Burgers & Sandwiches)"
23,0.495,"(Sides, Burgers & Sandwiches)"
59,0.494,"(Sides, Promos, Burgers & Sandwiches)"
3,0.443,(Chicken)
27,0.442,"(Chicken, Promos)"
28,0.408,"(Sides, Chicken)"


CPU times: user 6.21 s, sys: 8 ms, total: 6.22 s
Wall time: 6.28 s


### Explore basket rules

In [80]:
# List the column labels of the basket matrix.
basket_sets.columns.values

array(['Box', 'Bucket', 'Burgers & Sandwiches', 'Chicken',
       'Cold Beverages', 'Condiments', 'Desserts', 'Hot Beverages',
       'Menus', 'Promos', 'Salads', 'Sides', 'Value', 'Coupons',
       'Breakfast', 'LSM'], dtype=object)

In [81]:
# Number of rules currently held in `rules`.
len(rules)

472

In [82]:
# Allow up to 300 rows to be rendered so the whole rule table is visible.
pd.set_option('display.max_rows', 300)
# Hide every rule that mentions 'Promos' on either side, then rank by lift.
fillter = ~phrase_filter(rules, 'Promos')
rules.loc[fillter].sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
308,(Menus),"(Sides, Burgers & Sandwiches, Cold Beverages)",0.173,0.24,0.135,0.777,3.234,0.093,3.407
297,"(Sides, Burgers & Sandwiches, Cold Beverages)",(Menus),0.24,0.173,0.135,0.56,3.234,0.093,1.879
59,"(Sides, Cold Beverages)",(Box),0.312,0.072,0.071,0.227,3.135,0.048,1.2
62,(Box),"(Sides, Cold Beverages)",0.072,0.312,0.071,0.977,3.135,0.048,29.529
304,"(Burgers & Sandwiches, Cold Beverages)","(Sides, Menus)",0.258,0.171,0.135,0.521,3.046,0.09,1.729
301,"(Sides, Menus)","(Burgers & Sandwiches, Cold Beverages)",0.171,0.258,0.135,0.787,3.046,0.09,3.485
100,(Menus),"(Burgers & Sandwiches, Cold Beverages)",0.173,0.258,0.136,0.786,3.042,0.091,3.467
97,"(Burgers & Sandwiches, Cold Beverages)",(Menus),0.258,0.173,0.136,0.527,3.042,0.091,1.747
303,"(Burgers & Sandwiches, Menus)","(Sides, Cold Beverages)",0.143,0.312,0.135,0.942,3.024,0.09,11.88
302,"(Sides, Cold Beverages)","(Burgers & Sandwiches, Menus)",0.312,0.143,0.135,0.432,3.024,0.09,1.509


In [77]:
# Five rules with the highest lift. The former `.head(20).head()` chain was
# redundant: the trailing `.head()` always cut the result down to 5 rows.
rules.sort_values('lift', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
286,"(Sides, Promos, Burgers & Sandwiches, Cold Bev...",(Menus),0.24,0.173,0.134,0.56,3.235,0.093,1.88
315,(Menus),"(Sides, Promos, Burgers & Sandwiches, Cold Bev...",0.173,0.24,0.134,0.776,3.235,0.093,3.387
310,"(Promos, Menus)","(Sides, Burgers & Sandwiches, Cold Beverages)",0.173,0.24,0.134,0.777,3.234,0.093,3.407
291,"(Sides, Burgers & Sandwiches, Cold Beverages)","(Promos, Menus)",0.24,0.173,0.134,0.559,3.234,0.093,1.876
190,(Menus),"(Sides, Burgers & Sandwiches, Cold Beverages)",0.173,0.24,0.135,0.777,3.234,0.093,3.407


In [33]:
# First rules that mention 'Cold Beverages' in the antecedents or consequents.
display(rules[phrase_filter(rules,'Cold Beverages')].head())

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Box),(Cold Beverages),0.072,0.352,0.072,0.989,2.811,0.046,56.728
1,(Cold Beverages),(Box),0.352,0.072,0.072,0.203,2.811,0.046,1.164
12,(Burgers & Sandwiches),(Cold Beverages),0.641,0.352,0.258,0.403,1.146,0.033,1.086
13,(Cold Beverages),(Burgers & Sandwiches),0.352,0.641,0.258,0.735,1.146,0.033,1.352
22,(Chicken),(Cold Beverages),0.443,0.352,0.175,0.395,1.124,0.019,1.072


In [36]:
# First rules that mention 'Salads' in the antecedents or consequents.
display(rules[phrase_filter(rules,'Salads')].head())

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


#### Conclusions

- Some obvious associations can be found when the data is not filtered. It is not surprising that people who bought chicken might also buy fries or a cold beverage.
- It is somewhat interesting that people who bought:
    - chicken are about 2x as likely to buy a salad


In the next step we would like to explore less popular groups of products like hot beverages, desserts, salads.

#### Keep carts with at least one item bought in one of the categories: 'Hot Beverages', 'Desserts', 'Salads', 'Breakfast', 'LSM'

In [50]:
# Less popular groups we want to study in isolation.
product_groups = ['Hot Beverages', 'Desserts', 'Salads', 'Breakfast', 'LSM']
# Mine rules only from baskets containing at least one item from those groups.
# `axis` is passed by keyword: the positional form `.any(1)` is deprecated
# and rejected by pandas 2.x.
rules = analyze_basket(basket_sets[(basket_sets[product_groups] != 0).any(axis=1)])

Unnamed: 0,support,itemsets
0,0.076,(Bucket)
1,0.473,(Burgers & Sandwiches)
2,0.296,(Chicken)
3,0.214,(Cold Beverages)
4,0.106,(Condiments)


In [51]:
# Preview the first rules mined from the restricted set of baskets.
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Chicken),(Bucket),0.296,0.076,0.076,0.256,3.377,0.053,1.242
1,(Bucket),(Chicken),0.076,0.296,0.076,1.0,3.377,0.053,25320.651
2,(Promos),(Bucket),0.998,0.076,0.076,0.076,1.0,0.0,1.0
3,(Bucket),(Promos),0.076,0.998,0.076,0.998,1.0,0.0,1.188
4,(Chicken),(Burgers & Sandwiches),0.296,0.473,0.144,0.485,1.027,0.004,1.024


In [53]:
# For each group of interest, show its name followed by up to 50 rules that
# mention it on either side, highest lift first.
for product in product_groups:
    display(product)
    display(rules[phrase_filter(rules,product)].sort_values('lift', ascending=False).head(50))

'Hot Beverages'

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
575,"(Sides, Promos)","(Chicken, Hot Beverages)",0.493,0.078,0.072,0.147,1.879,0.034,1.081
578,"(Chicken, Hot Beverages)","(Sides, Promos)",0.078,0.493,0.072,0.928,1.879,0.034,6.987
210,(Sides),"(Chicken, Hot Beverages)",0.495,0.078,0.073,0.147,1.879,0.034,1.081
209,"(Chicken, Hot Beverages)",(Sides),0.078,0.495,0.073,0.93,1.879,0.034,7.169
579,(Sides),"(Chicken, Promos, Hot Beverages)",0.495,0.078,0.072,0.147,1.879,0.034,1.08
574,"(Chicken, Promos, Hot Beverages)",(Sides),0.078,0.495,0.072,0.929,1.879,0.034,7.167
572,"(Sides, Promos, Hot Beverages)",(Chicken),0.136,0.296,0.072,0.531,1.794,0.032,1.501
581,(Chicken),"(Sides, Promos, Hot Beverages)",0.296,0.136,0.072,0.245,1.794,0.032,1.143
208,"(Sides, Hot Beverages)",(Chicken),0.137,0.296,0.073,0.531,1.793,0.032,1.501
211,(Chicken),"(Sides, Hot Beverages)",0.296,0.137,0.073,0.245,1.793,0.032,1.144


'Desserts'

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
787,(Value),"(Sides, Promos, Burgers & Sandwiches, Desserts)",0.226,0.219,0.125,0.552,2.52,0.075,1.744
760,"(Sides, Promos, Burgers & Sandwiches, Desserts)",(Value),0.219,0.226,0.125,0.57,2.52,0.075,1.8
475,"(Sides, Burgers & Sandwiches, Desserts)",(Value),0.22,0.226,0.125,0.57,2.52,0.076,1.8
486,(Value),"(Sides, Burgers & Sandwiches, Desserts)",0.226,0.22,0.125,0.554,2.52,0.076,1.748
764,"(Sides, Burgers & Sandwiches, Desserts)","(Promos, Value)",0.22,0.226,0.125,0.569,2.52,0.075,1.795
783,"(Promos, Value)","(Sides, Burgers & Sandwiches, Desserts)",0.226,0.22,0.125,0.554,2.52,0.075,1.748
481,"(Burgers & Sandwiches, Value)","(Sides, Desserts)",0.172,0.299,0.125,0.726,2.43,0.074,2.561
480,"(Sides, Desserts)","(Burgers & Sandwiches, Value)",0.299,0.172,0.125,0.419,2.43,0.074,1.425
765,"(Sides, Promos, Desserts)","(Burgers & Sandwiches, Value)",0.298,0.172,0.125,0.419,2.43,0.074,1.425
782,"(Burgers & Sandwiches, Value)","(Sides, Promos, Desserts)",0.172,0.298,0.125,0.724,2.43,0.074,2.548


'Salads'

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
215,(Chicken),"(Promos, Salads)",0.296,0.136,0.079,0.266,1.956,0.039,1.177
214,"(Promos, Salads)",(Chicken),0.136,0.296,0.079,0.579,1.956,0.039,1.673
212,"(Chicken, Promos)",(Salads),0.295,0.136,0.079,0.267,1.956,0.039,1.178
217,(Salads),"(Chicken, Promos)",0.136,0.295,0.079,0.578,1.956,0.039,1.67
27,(Salads),(Chicken),0.136,0.296,0.079,0.579,1.956,0.039,1.672
26,(Chicken),(Salads),0.296,0.136,0.079,0.267,1.956,0.039,1.178
309,(Salads),"(Sides, Promos)",0.136,0.493,0.091,0.669,1.356,0.024,1.531
304,"(Sides, Promos)",(Salads),0.493,0.136,0.091,0.185,1.356,0.024,1.06
306,"(Promos, Salads)",(Sides),0.136,0.495,0.091,0.671,1.356,0.024,1.534
307,(Sides),"(Promos, Salads)",0.495,0.136,0.091,0.184,1.356,0.024,1.059


'Breakfast'

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


'LSM'

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


#### Conclusions


- **Some products groups are special**
    - Desserts, hot beverages or salads are rarely sold by themselves. Customers who have already bought other items add those to the cart. 
    - Desserts are usually bought when the customer has a coupon or when there is a promotion for them.
- **Juice with chicken nuggets**.
    - Orange juice is common next choice when someone buys chicken bites (nuggets).
- It might make sense to recommend a dessert purchase only when the user has already selected several other items or a hot beverage. We could recommend orange juice when it is likely that the customers are parents. It seems that some product groups are an “extra” thing to buy, but never the main reason to enter the restaurant, so there is little point in exposing them heavily at an early stage. 
