##### <p> Samuel Wolfe <br> November 718, 2023 <br> MSBA 207 <br> Chapter 14 </p>

In [44]:
# need to run "pip install mlxtend" first
# need to run "pip install surprise" first
# "conda install -c conda-forge scikit-surprise" in "terminal" (Mac) or "Anaconda Prompt" (Windows)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from IPython.display import clear_output

from pathlib import Path

import heapq
from collections import defaultdict

import pandas as pd
pd.set_option('display.width', 250)
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

In [45]:
# Working directory:
#
# We assume that data are kept in the same directory as the notebook. If you keep your 
# data in a different folder, replace the argument of the `Path`
DATA = Path('E:/Aliit/School/MSBA/206/MSBA-206/dmba/instacart')
#DATA = Path('C:/Users/Min Li/OneDrive/teaching/DS110/dmba')
# and then load data using 
#
# pd.read_csv(DATA / ‘filename.csv’)
# Load and preprocess data set 

In [46]:
file1 = pd.read_csv(DATA / 'order_products__train.csv')
file2 = pd.read_csv(DATA / 'products.csv')

In [47]:
df_instacart = pd.merge(left=file2, right=file1.head(700000),on='product_id')
df_instacart['product_name'] = df_instacart['product_name'].replace(' ','_',regex=True)
df_instacart

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered
0,1,Chocolate_Sandwich_Cookies,61,19,6695,7,1
1,1,Chocolate_Sandwich_Cookies,61,19,48361,9,0
2,1,Chocolate_Sandwich_Cookies,61,19,63770,4,0
3,1,Chocolate_Sandwich_Cookies,61,19,75339,9,0
4,1,Chocolate_Sandwich_Cookies,61,19,240996,3,1
...,...,...,...,...,...,...,...
699995,49686,Artisan_Baguette,112,3,367913,9,1
699996,49686,Artisan_Baguette,112,3,674266,3,1
699997,49686,Artisan_Baguette,112,3,1258283,1,1
699998,49687,Smartblend_Healthy_Metabolism_Dry_Cat_Food,41,8,1092104,1,0


In [48]:
list_single_items = df_instacart.drop_duplicates(subset='order_id',keep=False)['order_id'].to_list()

In [49]:
df_instacart_multi = df_instacart[~df_instacart['order_id'].isin(list_single_items)]
df_instacart_multi

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered
0,1,Chocolate_Sandwich_Cookies,61,19,6695,7,1
1,1,Chocolate_Sandwich_Cookies,61,19,48361,9,0
2,1,Chocolate_Sandwich_Cookies,61,19,63770,4,0
3,1,Chocolate_Sandwich_Cookies,61,19,75339,9,0
4,1,Chocolate_Sandwich_Cookies,61,19,240996,3,1
...,...,...,...,...,...,...,...
699995,49686,Artisan_Baguette,112,3,367913,9,1
699996,49686,Artisan_Baguette,112,3,674266,3,1
699997,49686,Artisan_Baguette,112,3,1258283,1,1
699998,49687,Smartblend_Healthy_Metabolism_Dry_Cat_Food,41,8,1092104,1,0


In [50]:
binary_instacart = df_instacart_multi.pivot_table(index='order_id',columns='product_name',values='product_id',fill_value=0,aggfunc='count')

In [51]:
binary_instacart

product_name,#2_Coffee_Filters,#4_Natural_Brown_Coffee_Filters,&_Go!_Hazelnut_Spread_+_Pretzel_Sticks,0%_Fat_Black_Cherry_Greek_Yogurt_y,0%_Fat_Blueberry_Greek_Yogurt,0%_Fat_Free_Organic_Milk,0%_Fat_Greek_Yogurt_Black_Cherry_on_the_Bottom,0%_Fat_Greek_Yogurt_Vanilla,0%_Fat_Organic_Greek_Vanilla_Yogurt,0%_Fat_Peach_Greek_Yogurt,...,with_Olive_Oil_Mayonnaise_Dressing,with_Seasoned_Roasted_Potatoes_Scrambled_Eggs_&_Sausage,with_Sweet_&_Smoky_BBQ_Sauce_Cheeseburger_Sliders,with_Xylitol_Cinnamon_18_Sticks_Sugar_Free_Gum,with_Xylitol_Minty_Sweet_Twist_18_Sticks_Sugar_Free_Gum,with_Xylitol_Original_Flavor_18_Sticks_Sugar_Free_Gum,with_Xylitol_Unwrapped_Original_Flavor_50_Sticks_Sugar_Free_Gum,with_Xylitol_Watermelon_Twist_18_Sticks_Sugar_Free_Gum,with_a_Splash_of_Mango_Coconut_Water,with_a_Splash_of_Pineapple_Coconut_Water
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720817,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1720859,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1720910,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1720932,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# create frequent itemsets
itemsets = apriori(binary_instacart, min_support=0.004, use_colnames=True)
itemsets

Unnamed: 0,support,itemsets
0,0.010181,(100%_Raw_Coconut_Water)
1,0.010054,(100%_Recycled_Paper_Towels)
2,0.017222,(100%_Whole_Wheat_Bread)
3,0.004250,(100_Calorie__Per_Bag_Popcorn)
4,0.004773,(2%_Reduced_Fat_DHA_Omega-3_Reduced_Fat_Milk)
...,...,...
583,0.004282,"(Seedless_Red_Grapes, Strawberries)"
584,0.004409,"(Bag_of_Organic_Bananas, Organic_Baby_Spinach,..."
585,0.004361,"(Bag_of_Organic_Bananas, Organic_Hass_Avocado,..."
586,0.005677,"(Bag_of_Organic_Bananas, Organic_Hass_Avocado,..."


In [None]:
# and convert into rules
rules = association_rules(itemsets, metric='confidence', min_threshold=0.01)

In [81]:
rules.sort_values(by=['lift'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
243,(Lime_Sparkling_Water),(Sparkling_Water_Grapefruit),0.016334,0.027403,0.004472,0.273786,9.990986,0.004024,1.339271,0.914853
242,(Sparkling_Water_Grapefruit),(Lime_Sparkling_Water),0.027403,0.016334,0.004472,0.163194,9.990986,0.004024,1.175501,0.925265
198,(Green_Bell_Pepper),(Red_Peppers),0.020140,0.023169,0.004076,0.202362,8.734125,0.003609,1.224655,0.903707
199,(Red_Peppers),(Green_Bell_Pepper),0.023169,0.020140,0.004076,0.175907,8.734125,0.003609,1.189016,0.906510
363,(Organic_Italian_Parsley_Bunch),(Organic_Garlic),0.019855,0.033477,0.004123,0.207668,6.203274,0.003459,1.219845,0.855786
...,...,...,...,...,...,...,...,...,...,...
166,(Banana),(Organic_Yellow_Onion),0.149656,0.034714,0.005376,0.035922,1.034809,0.000181,1.001253,0.039558
98,(Bag_of_Organic_Bananas),(Strawberries),0.122776,0.052713,0.006502,0.052958,1.004639,0.000030,1.000258,0.005263
99,(Strawberries),(Bag_of_Organic_Bananas),0.052713,0.122776,0.006502,0.123345,1.004639,0.000030,1.000650,0.004874
152,(Organic_Hass_Avocado),(Banana),0.058248,0.149656,0.007200,0.123605,0.825926,-0.001517,0.970275,-0.182872


In [79]:
sup_mean = rules.describe()['support'][1]
rules_best_sup = rules.loc[rules['support'] >= sup_mean].sort_values(by=['lift'], ascending=False)
rules_best_sup_no_lap = rules_best_sup.drop(index=rules_best_sup.index[::2])
rules_best_sup_no_lap.head(10).drop(columns=['antecedent support','consequent support','conviction','zhangs_metric'])

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
368,(Organic_Garlic),(Organic_Yellow_Onion),0.006962,0.207958,5.990605,0.0058
248,(Limes),(Organic_Cilantro),0.007786,0.156569,5.5033,0.006372
208,(Large_Lemon),(Limes),0.013591,0.205911,4.140406,0.010308
394,(Organic_Strawberries),(Organic_Raspberries),0.012893,0.149311,3.391668,0.009092
379,(Organic_Raspberries),(Organic_Hass_Avocado),0.008294,0.188401,3.234458,0.00573
351,(Organic_Cucumber),(Organic_Hass_Avocado),0.006454,0.174379,2.993731,0.004298
220,(Organic_Garlic),(Large_Lemon),0.00655,0.195642,2.964148,0.00434
335,(Organic_Strawberries),(Organic_Blueberries),0.009912,0.114784,2.961564,0.006565
245,(Limes),(Organic_Avocado),0.008801,0.176977,2.952333,0.00582
211,(Organic_Avocado),(Large_Lemon),0.011291,0.18836,2.853818,0.007335


#### In order to make it a bit more interesting, I removed every other entry, otherwise we would really only be comparing 5 different combos.
#### Looking at the items, the sequence is the same. The support value is how likely the collection of antecedents and consequents are to appear in the dataset. Then we have confidence, which is, given the antecedent, the consequent has a chance that the item in the rhs value will be in the cart given the lhs item. The higher the confidence, the higher this chance. Then we have lift, which is a representation of the strength of item association in the given dataset. Given all of these rules have lifts over 1.0, we can safely assume they are all associated at least a little bit. Finally we have leverage, which is a range of -1,0,1. Where the closer to 0 for the value the more independent the items are. Negative means they are less likely to associate together, positive indicates they are more likely associate together.
#### All of these items are grocery based, so we can see some common pairings. Garlic and onion, both organic. Limes and cilantro, interestingly the limes do not appear to have an organic label. Lemon and limes. Its all various pairings of these common fruits and vegetables.
#### Looking at their support value, I am rather shocked at how low this number is overall for the rules. I would have expected a much higher value for this column. Confidence is also low. Lift is healthy. Leverage is low as well.
#### I think grocery items like these must suffer from what I like to call "commonality". They are the bare bones most common additives to home cooking, thus they will likely be in a lot of carts. Thus suggesting to someone who has Organic Garlic in their cart if they would like to add Organic Yellow Onion is a fairly safe move.