## Purpose of script:
#### Lightweight market basket analysis
#### A dockerized version will be created separately
#### Tutorial URL:
#### https://www.geeksforgeeks.org/implementing-apriori-algorithm-in-python/

In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Read the raw Online Retail workbook, report its dimensions, and preview
# the first two rows.
orig = pd.read_excel(io='../Datasets/Online Retail.xlsx')
print(orig.shape)
orig.iloc[:2]

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [16]:
# Work on an independent (deep) copy so the raw frame `orig` stays untouched.
data = orig.copy(deep=True)

In [17]:
# Row counts per country, most frequent first; show the top 20.
data['Country'].value_counts().head(20)

United Kingdom     495478
Germany              9495
France               8557
EIRE                 8196
Spain                2533
Netherlands          2371
Belgium              2069
Switzerland          2002
Portugal             1519
Australia            1259
Norway               1086
Italy                 803
Channel Islands       758
Finland               695
Cyprus                622
Sweden                462
Unspecified           446
Austria               401
Denmark               389
Japan                 358
Name: Country, dtype: int64

In [18]:
# Clean the transaction rows without mutating the frame in place, so the cell
# can be re-run without triggering SettingWithCopyWarning:
#   1. strip surrounding whitespace from the product descriptions,
#   2. drop the rows without any invoice number,
#   3. cast invoice numbers to strings,
#   4. drop invoices whose number contains the letter 'C'.
# NOTE(review): the original comment described the 'C' invoices as "done on
# credit"; the UCI dataset description says a 'C' prefix marks a cancellation
# -- confirm against the data source.
data = (
    data
    .assign(Description=lambda df: df['Description'].str.strip())
    .dropna(axis=0, subset=['InvoiceNo'])
    .assign(InvoiceNo=lambda df: df['InvoiceNo'].astype('str'))
    # regex=False: 'C' is a literal character, not a pattern
    .loc[lambda df: ~df['InvoiceNo'].str.contains('C', regex=False)]
    # own the filtered rows so later column assignments do not warn
    .copy()
)
data.shape

(532621, 8)

In [19]:
# Transactions done in the United Kingdom, reshaped into an invoice-by-product
# matrix: one row per InvoiceNo, one column per Description, each cell holding
# the summed Quantity (0 where the invoice has no such product).
uk_quantity = (
    data.loc[data['Country'] == "United Kingdom"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
)
basket_UK = (
    uk_quantity.unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [20]:
def hot_encode(x):
    """Encode a basket quantity as a presence flag.

    Parameters
    ----------
    x : int or float
        Summed quantity of one product on one invoice.

    Returns
    -------
    int
        1 if the quantity is strictly positive, otherwise 0.

    Notes
    -----
    The previous two-branch version returned ``None`` for values strictly
    between 0 and 1 and for NaN, because neither comparison matched. Every
    input now maps to 0 or 1; results for integer quantities are unchanged.
    """
    # NaN compares False against everything, so it falls through to 0.
    return 1 if x > 0 else 0

In [21]:
# Convert summed quantities into 0/1 presence flags with one vectorized
# comparison instead of calling a Python function on every cell via
# `applymap` (slow on a frame this size, and deprecated in recent pandas).
# Positive quantity -> 1, everything else -> 0.
basket_encoded = (basket_UK > 0).astype('int64')
basket_UK = basket_encoded

basket_UK.shape

(18667, 4175)

In [48]:
# Mine frequent itemsets from the boolean basket matrix (min_support = 0.01),
# derive association rules using the lift metric (min_threshold = 1), and
# rank them by confidence, then lift, both descending.
frq_items = apriori(
    basket_UK.astype(bool),
    min_support=0.01,
    use_colnames=True,
)
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.shape)
rules.head(10)

(3760, 9)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
117,(BEADED CRYSTAL HEART PINK ON STICK),(DOTCOM POSTAGE),0.011036,0.037928,0.010768,0.975728,25.725872,0.010349,39.637371
2018,"(JAM MAKING SET PRINTED, SUKI SHOULDER BAG)",(DOTCOM POSTAGE),0.011625,0.037928,0.011196,0.963134,25.393807,0.010755,26.096206
2295,"(HERB MARKER THYME, HERB MARKER MINT)",(HERB MARKER ROSEMARY),0.010714,0.012375,0.010232,0.955,77.173095,0.010099,21.947227
2302,"(HERB MARKER ROSEMARY, HERB MARKER PARSLEY)",(HERB MARKER THYME),0.011089,0.012321,0.010553,0.951691,77.240055,0.010417,20.444951
2301,"(HERB MARKER THYME, HERB MARKER PARSLEY)",(HERB MARKER ROSEMARY),0.011089,0.012375,0.010553,0.951691,76.905682,0.010416,20.443842
2278,"(HERB MARKER THYME, HERB MARKER BASIL)",(HERB MARKER ROSEMARY),0.010875,0.012375,0.010339,0.950739,76.828759,0.010205,20.048792
3357,"(REGENCY TEA PLATE PINK, REGENCY TEA PLATE ROSES)",(REGENCY TEA PLATE GREEN),0.012643,0.018,0.011946,0.944915,52.496229,0.011719,17.827083
3374,"(WOODEN HEART CHRISTMAS SCANDINAVIAN, WOODEN T...",(WOODEN STAR CHRISTMAS SCANDINAVIAN),0.012428,0.02566,0.011732,0.943966,36.787065,0.011413,17.388217
2288,"(HERB MARKER THYME, HERB MARKER MINT)",(HERB MARKER PARSLEY),0.010714,0.012214,0.010071,0.94,76.960439,0.00994,16.463099
2277,"(HERB MARKER ROSEMARY, HERB MARKER BASIL)",(HERB MARKER THYME),0.011036,0.012321,0.010339,0.936893,76.039067,0.010203,15.65091


In [49]:
## Sample of associations: rules whose antecedent is exactly {'JUMBO BAG TOYS'}
rules.loc[rules['antecedents'] == frozenset(['JUMBO BAG TOYS'])]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
913,(JUMBO BAG TOYS),(JUMBO BAG RED RETROSPOT),0.02716,0.10382,0.018107,0.666667,6.421397,0.015287,2.688541
1055,(JUMBO BAG TOYS),(JUMBO STORAGE BAG SUKI),0.02716,0.060535,0.015803,0.581854,9.61192,0.014159,2.24674
1051,(JUMBO BAG TOYS),(JUMBO BAG WOODLAND ANIMALS),0.02716,0.039374,0.015107,0.556213,14.126297,0.014037,2.16461
841,(JUMBO BAG TOYS),(JUMBO BAG PINK POLKADOT),0.02716,0.062088,0.014946,0.550296,8.863134,0.01326,2.08562
343,(JUMBO BAG TOYS),(DOTCOM POSTAGE),0.02716,0.037928,0.014625,0.538462,14.19698,0.013595,2.08449
1052,(JUMBO BAG TOYS),(JUMBO SHOPPER VINTAGE RED PAISLEY),0.02716,0.060695,0.014089,0.518738,8.546581,0.012441,1.951752
2761,(JUMBO BAG TOYS),"(JUMBO BAG RED RETROSPOT, JUMBO STORAGE BAG SUKI)",0.02716,0.037392,0.012589,0.463511,12.395927,0.011573,1.794273
2563,(JUMBO BAG TOYS),"(JUMBO BAG PINK POLKADOT, JUMBO BAG RED RETROS...",0.02716,0.042053,0.012428,0.457594,10.881403,0.011286,1.766106
2749,(JUMBO BAG TOYS),"(JUMBO BAG RED RETROSPOT, JUMBO BAG WOODLAND A...",0.02716,0.025392,0.011946,0.439842,17.321803,0.011257,1.73988
2083,(JUMBO BAG TOYS),"(JUMBO BAG RED RETROSPOT, DOTCOM POSTAGE)",0.02716,0.025982,0.011625,0.428008,16.47345,0.010919,1.702853
