# Perform a Market Basket Analysis

## Step 1: Get an Intuition for Association Rule Mining

In [1]:
# Six toy shopping baskets; each list holds the items bought in one transaction.
T1 = ["Beer", "Water", "Diapers"]
T2 = ["Beer", "Water"]
T3 = ["Beer", "Diapers"]
T4 = ["Water", "Fruits", "Tea", "Diapers"]
T5 = ["Diapers", "Beer"]
T6 = ["Beer"]

In [2]:
# Bundle the individual transactions into one list of item lists.
dataset = [
    T1,
    T2,
    T3,
    T4,
    T5,
    T6,
]
dataset

[['Beer', 'Water', 'Diapers'],
 ['Beer', 'Water'],
 ['Beer', 'Diapers'],
 ['Water', 'Fruits', 'Tea', 'Diapers'],
 ['Diapers', 'Beer'],
 ['Beer']]

## Step 2: Frequent Itemsets

In [3]:
from mlxtend.preprocessing import TransactionEncoder

# Learn the item vocabulary first, then encode every transaction as one
# boolean row with a column per distinct item.
te = TransactionEncoder()
te.fit(dataset)
dataset_one_hot = te.transform(dataset)
dataset_one_hot

array([[ True,  True, False, False,  True],
       [ True, False, False, False,  True],
       [ True,  True, False, False, False],
       [False,  True,  True,  True,  True],
       [ True,  True, False, False, False],
       [ True, False, False, False, False]])

In [4]:
import pandas as pd

# Label the encoded columns with the item names learned by the encoder.
df_one_hot = pd.DataFrame(data=dataset_one_hot, columns=te.columns_)
df_one_hot

Unnamed: 0,Beer,Diapers,Fruits,Tea,Water
0,True,True,False,False,True
1,True,False,False,False,True
2,True,True,False,False,False
3,False,True,True,True,True
4,True,True,False,False,False
5,True,False,False,False,False


In [5]:
from mlxtend.frequent_patterns import apriori

# Keep itemsets whose support is at least 0.2; use_colnames reports item
# names instead of column positions.
frequent_itemsets = apriori(
    df_one_hot,
    min_support=0.2,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.833333,(Beer)
1,0.666667,(Diapers)
2,0.5,(Water)
3,0.5,"(Diapers, Beer)"
4,0.333333,"(Water, Beer)"
5,0.333333,"(Diapers, Water)"


## Step 3: Association Rules

In [6]:
from mlxtend.frequent_patterns import association_rules

# Derive rules from the toy itemsets, filtering on confidence with a 0.7 threshold.
toy_rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)
toy_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Diapers),(Beer),0.666667,0.833333,0.5,0.75,0.9,-0.055556,0.666667


## Step 4: Load Online Retail Data

In [7]:
# Cleaned online-retail line items, read straight from the hosted CSV.
RETAIL_DATA_URL = "https://ucarecdn.com/8d8cd2ee-47d4-474f-b3a7-66eb9a20b43e/retail_data_clean.csv"
df = pd.read_csv(RETAIL_DATA_URL)
df

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate,StockCode,Quantity,UnitPrice,Revenue
0,13047,536367,2010-12-01 08:34:00,84879,32,1.69,54.08
1,13047,536367,2010-12-01 08:34:00,22745,6,2.10,12.60
2,13047,536367,2010-12-01 08:34:00,22748,6,2.10,12.60
3,13047,536367,2010-12-01 08:34:00,22749,8,3.75,30.00
4,13047,536367,2010-12-01 08:34:00,22310,6,1.65,9.90
...,...,...,...,...,...,...,...
91145,17581,581581,2011-12-09 12:20:00,23562,6,2.89,17.34
91146,17581,581581,2011-12-09 12:20:00,23561,6,2.89,17.34
91147,17581,581581,2011-12-09 12:20:00,23681,10,1.65,16.50
91148,17581,581582,2011-12-09 12:21:00,23552,6,2.08,12.48


In [8]:
# Stock-code lookup table, read from the hosted CSV.
STOCK_CODES_URL = "https://ucarecdn.com/5cef20a8-c7d8-46e1-af8a-830388dc89c9/stock_codes.csv"
stock_codes_df = pd.read_csv(STOCK_CODES_URL)

# Left join on the columns both frames share, so every retail row is kept
# and gains the lookup table's extra columns.
df = df.merge(stock_codes_df, how="left")
df

Unnamed: 0,CustomerID,InvoiceNo,InvoiceDate,StockCode,Quantity,UnitPrice,Revenue,Description
0,13047,536367,2010-12-01 08:34:00,84879,32,1.69,54.08,ASSORTED COLOUR BIRD ORNAMENT
1,13047,536367,2010-12-01 08:34:00,22745,6,2.10,12.60,POPPY'S PLAYHOUSE BEDROOM
2,13047,536367,2010-12-01 08:34:00,22748,6,2.10,12.60,POPPY'S PLAYHOUSE KITCHEN
3,13047,536367,2010-12-01 08:34:00,22749,8,3.75,30.00,FELTCRAFT PRINCESS CHARLOTTE DOLL
4,13047,536367,2010-12-01 08:34:00,22310,6,1.65,9.90,IVORY KNITTED MUG COSY
...,...,...,...,...,...,...,...,...
91145,17581,581581,2011-12-09 12:20:00,23562,6,2.89,17.34,SET OF 6 RIBBONS PERFECTLY PRETTY
91146,17581,581581,2011-12-09 12:20:00,23561,6,2.89,17.34,SET OF 6 RIBBONS PARTY
91147,17581,581581,2011-12-09 12:20:00,23681,10,1.65,16.50,LUNCH BAG RED VINTAGE DOILY
91148,17581,581582,2011-12-09 12:21:00,23552,6,2.08,12.48,BICYCLE PUNCTURE REPAIR KIT


## Step 5: Transform Dataset

In [9]:
# Build one transaction per invoice: the list of distinct product descriptions
# that appear on it.
#
# The 'Description' column is selected *before* apply so the function only
# receives that column. Applying to the whole group frame is slower and, on
# pandas >= 2.2, emits a DeprecationWarning about operating on the grouping
# columns. The resulting Series is indexed by InvoiceNo and is named
# 'Description'.
#
# Missing descriptions are dropped: a left merge can leave NaN for stock codes
# without a match, and mixing NaN with strings would break the encoder that
# consumes these lists.
dataset = (
    df.groupby('InvoiceNo')['Description']
    .apply(lambda descriptions: descriptions.dropna().unique().tolist())
)
dataset

InvoiceNo
536367    [ASSORTED COLOUR BIRD ORNAMENT, POPPY'S PLAYHO...
536368    [JAM MAKING SET WITH JARS, RED COAT RACK PARIS...
536369                           [BATH BUILDING BLOCK WORD]
536370    [ALARM CLOCK BAKELIKE PINK, ALARM CLOCK BAKELI...
536385    [SET 3 WICKER OVAL BASKETS W LIDS, JAM MAKING ...
                                ...                        
581572    [SET 6 SCHOOL MILK BOTTLES IN CRATE, MINT KITC...
581579    [JUMBO BAG PINK POLKADOT, JUMBO SHOPPER VINTAG...
581580    [IVORY WICKER HEART SMALL, VINTAGE SNAP CARDS,...
581581    [SET OF 6 RIBBONS PERFECTLY PRETTY  , SET OF 6...
581582    [BICYCLE PUNCTURE REPAIR KIT , CLASSIC BICYCLE...
Length: 4311, dtype: object

In [10]:
# One-hot encode the invoice-level item lists with a fresh encoder:
# one row per invoice, one boolean column per product description.
te = TransactionEncoder()
te.fit(dataset)
dataset_one_hot = te.transform(dataset)
df_one_hot = pd.DataFrame(data=dataset_one_hot, columns=te.columns_)
df_one_hot

Unnamed: 0,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE OR GIFT BAG LARGE SPOT,SET 2 TEA TOWELS I LOVE LONDON OR SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,TRELLIS COAT RACK,...,ZINC HERB GARDEN CONTAINER OR METAL HERB GERDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE OR ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4306,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4307,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4308,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4309,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# Same encoding again, this time requested as a sparse matrix and wrapped in
# a sparse-backed DataFrame. Note that this rebinds dataset_one_hot.
dataset_one_hot = te.fit_transform(dataset, sparse=True)
df_one_hot_sparse = pd.DataFrame.sparse.from_spmatrix(
    dataset_one_hot,
    columns=te.columns_,
)
df_one_hot_sparse

Unnamed: 0,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE OR GIFT BAG LARGE SPOT,SET 2 TEA TOWELS I LOVE LONDON OR SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,TRELLIS COAT RACK,...,ZINC HERB GARDEN CONTAINER OR METAL HERB GERDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE OR ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4307,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4308,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4309,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Step 6: Find Frequent Itemsets in Retail Data

In [12]:
# Mine the retail baskets with a minimum support of 0.02; this replaces the
# toy frequent_itemsets from the earlier step.
frequent_itemsets = apriori(
    df_one_hot,
    min_support=0.02,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.025980,( SET 2 TEA TOWELS I LOVE LONDON OR SET 2 TEA...
1,0.044073,(6 RIBBONS RUSTIC CHARM)
2,0.023660,(60 CAKE CASES VINTAGE CHRISTMAS)
3,0.032475,(60 TEATIME FAIRY CAKE CASES)
4,0.023892,(72 SWEETHEART FAIRY CAKE CASES)
...,...,...
238,0.023196,"(RED HANGING HEART T-LIGHT HOLDER, WHITE HANGI..."
239,0.022037,"(STRAWBERRY CHARLOTTE BAG, RED RETROSPOT CHARL..."
240,0.020877,"(WOODLAND CHARLOTTE BAG, RED RETROSPOT CHARLOT..."
241,0.021109,"(ROSES REGENCY TEACUP AND SAUCER , REGENCY CAK..."


In [13]:
# Record how many items each frequent itemset contains, then show the table
# ordered by that size (smallest sets first).
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)
frequent_itemsets.sort_values(by="length")

Unnamed: 0,support,itemsets,length
0,0.025980,( SET 2 TEA TOWELS I LOVE LONDON OR SET 2 TEA...,1
128,0.022269,(PLASTERS IN TIN STRONGMAN),1
129,0.024124,(PLASTERS IN TIN VINTAGE PAISLEY ),1
130,0.025980,(PLASTERS IN TIN WOODLAND ANIMALS),1
131,0.037810,(PLEASE ONE PERSON METAL SIGN),1
...,...,...,...
215,0.028996,(LUNCH BAG SUKI DESIGN OR LUNCH BAG SUKI DES...,2
216,0.020877,"(LUNCH BAG BLACK SKULL., LUNCH BAG WOODLAND)",2
217,0.020181,(LUNCH BAG RED RETROSPOT OR LUNCH BAG RED SPOT...,2
219,0.029460,"(LUNCH BAG CARS BLUE, LUNCH BAG RED RETROSPOT ...",2


In [14]:
# Show the complete itemset stored under index label 242 (the table view
# truncates long itemsets).
frequent_itemsets.loc[242, 'itemsets']

frozenset({'WOODEN FRAME ANTIQUE WHITE ', 'WOODEN PICTURE FRAME WHITE FINISH'})

## Step 7: Find Association Rules

In [15]:
# Generate rules from the retail itemsets, filtering on lift with a threshold of 1.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.035955,0.043841,0.023428,0.651613,14.862980,0.021852,2.744529
1,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.043841,0.035955,0.023428,0.534392,14.862980,0.021852,2.070507
2,(SPACEBOY LUNCH BOX ),(DOLLY GIRL LUNCH BOX),0.044073,0.032939,0.021109,0.478947,14.540437,0.019657,1.855976
3,(DOLLY GIRL LUNCH BOX),(SPACEBOY LUNCH BOX ),0.032939,0.044073,0.021109,0.640845,14.540437,0.019657,2.661600
4,(GARDENERS KNEELING PAD KEEP CALM ),(GARDENERS KNEELING PAD CUP OF TEA ),0.040594,0.031547,0.022965,0.565714,17.932311,0.021684,2.229990
...,...,...,...,...,...,...,...,...,...
83,(RED RETROSPOT CHARLOTTE BAG),(WOODLAND CHARLOTTE BAG),0.043145,0.033635,0.020877,0.483871,14.385984,0.019426,1.872332
84,(ROSES REGENCY TEACUP AND SAUCER ),(REGENCY CAKESTAND 3 TIER),0.041290,0.096729,0.021109,0.511236,5.285224,0.017115,1.848071
85,(REGENCY CAKESTAND 3 TIER),(ROSES REGENCY TEACUP AND SAUCER ),0.096729,0.041290,0.021109,0.218225,5.285224,0.017115,1.226326
86,(WOODEN PICTURE FRAME WHITE FINISH),(WOODEN FRAME ANTIQUE WHITE ),0.042682,0.043609,0.024124,0.565217,12.960916,0.022263,2.199698


In [16]:
# Show the three rules with the highest confidence.
rules_by_confidence = rules.sort_values(by="confidence", ascending=False)
rules_by_confidence.head(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.027836,0.035259,0.022037,0.791667,22.453125,0.021055,4.630759
9,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER ),0.035259,0.04129,0.02714,0.769737,18.642334,0.025684,4.163542
77,(PINK REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER ),0.027836,0.04129,0.020645,0.741667,17.9625,0.019496,3.711137


## Step 8: Filter Rules by Individual Product Name

In [17]:
# Item table for the high-value customers, read from the hosted CSV.
TOP_ITEMS_URL = "https://ucarecdn.com/10f0c3b9-550a-4175-8cc5-9ec35b35a9a8/high_value_customers_top_items.csv"
high_value_customers_top_items = pd.read_csv(TOP_ITEMS_URL)
high_value_customers_top_items

Unnamed: 0,StockCode,Revenue,Quantity,Description
0,22423,13677.25,1193,REGENCY CAKESTAND 3 TIER
1,85123A,8542.15,3169,WHITE HANGING HEART T-LIGHT HOLDER OR CREAM HA...
2,79321,7578.2,1556,CHILLI LIGHTS
3,84997D,6999.58,1882,PINK 3 PIECE POLKADOT CUTLERY SET OR CHILDRENS...
4,85099B,6813.54,3507,JUMBO BAG RED RETROSPOT
5,84879,6527.16,4044,ASSORTED COLOUR BIRD ORNAMENT
6,23084,5603.48,3070,RABBIT NIGHT LIGHT
7,84997C,5450.5,1462,BLUE 3 PIECE POLKADOT CUTLERY SET OR CHILDRENS...
8,47566,5411.85,1157,PARTY BUNTING
9,22086,4416.9,1622,PAPER CHAIN KIT 50'S CHRISTMAS


In [18]:
# Select the rules whose antecedent is exactly the first listed product.
first_description = high_value_customers_top_items.loc[0, 'Description']
antecedent_filter = rules['antecedents'] == frozenset({first_description})
rules.loc[antecedent_filter]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
85,(REGENCY CAKESTAND 3 TIER),(ROSES REGENCY TEACUP AND SAUCER ),0.096729,0.04129,0.021109,0.218225,5.285224,0.017115,1.226326


## Step 9: Filter Rules by Multiple Product Names

In [19]:
# For every listed product, collect the rules whose antecedent is exactly that
# single product. The matches are stacked in the order of the product table,
# so rules for the first product come first.
rules_per_item = [
    rules.loc[rules['antecedents'] == frozenset({description})]
    for description in high_value_customers_top_items['Description']
]

filtered_rules = pd.concat(rules_per_item)
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
85,(REGENCY CAKESTAND 3 TIER),(ROSES REGENCY TEACUP AND SAUCER ),0.096729,0.04129,0.021109,0.218225,5.285224,0.017115,1.226326
79,(WHITE HANGING HEART T-LIGHT HOLDER OR CREAM H...,(RED HANGING HEART T-LIGHT HOLDER),0.107632,0.032939,0.023196,0.215517,6.542921,0.019651,1.232737
13,(JUMBO BAG RED RETROSPOT),(JUMBO BAG DOILEY PATTERNS OR JUMBO BAG VINTAG...,0.078172,0.055904,0.02598,0.332344,5.944962,0.02161,1.414047
19,(JUMBO BAG RED RETROSPOT),(JUMBO BAG PINK POLKADOT),0.078172,0.047089,0.028764,0.367953,7.814007,0.025083,1.507658
20,(JUMBO BAG RED RETROSPOT),(JUMBO BAG STRAWBERRY),0.078172,0.036882,0.022733,0.290801,7.884553,0.019849,1.358036
22,(JUMBO BAG RED RETROSPOT),(JUMBO SHOPPER VINTAGE RED PAISLEY),0.078172,0.042682,0.021805,0.278932,6.535189,0.018468,1.327639
74,(PARTY BUNTING),"(SPOTTY BUNTING OR BUNTING , SPOTTY )",0.074925,0.054048,0.022269,0.297214,5.49909,0.018219,1.346002
73,(PAPER CHAIN KIT 50'S CHRISTMAS ),(PAPER CHAIN KIT VINTAGE CHRISTMAS),0.0508,0.036418,0.022501,0.442922,12.162028,0.020651,1.729708
