This example follows the market basket analysis tutorial at https://pbpython.com/market-basket-analysis.html

In [107]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the transaction lines from the Excel workbook into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_excel('Online Retail.xlsx')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
# Trim surrounding whitespace from the item descriptions and discard any
# rows that have no invoice number.
df = (
    df.assign(Description=df['Description'].str.strip())
      .dropna(subset=['InvoiceNo'])
)

In [5]:
# Treat invoice numbers as text, then keep only the invoices whose number
# does not contain the letter 'C'.
# NOTE(review): 'C' presumably marks credit/cancelled invoices - confirm
# against the dataset description.
df = df.assign(InvoiceNo=df['InvoiceNo'].astype(str))
df = df.loc[~df['InvoiceNo'].str.contains('C')]

In [6]:
# Preview the frame again after the cleaning steps above.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [47]:
# for i in sorted(df.Country.unique()):
# #     print(i, df[df['Country']==i].shape)
#     if df[df['Country']==i].shape[0]<1000:
#         print(i, df[df['Country']==i].shape)
#         print(df[df['Country']==i]['InvoiceNo'].nunique())

# Rules for Portugal

In [91]:
# Restrict the analysis to the transaction lines recorded for Portugal and
# report how many distinct invoices and how many lines that leaves.
data = df.loc[df['Country'].eq('Portugal')]
print('The number of invoices', data['InvoiceNo'].nunique())
print('The number of lines', len(data))

The number of invoices 58
The number of lines 1501


In [79]:
# One row per invoice and one column per item description; each cell holds
# the summed quantity, with 0 where the invoice does not contain the item.
basket = data.pivot_table(
    values=['Quantity'],
    index=['InvoiceNo'],
    columns=['Description'],
    aggfunc='sum',
    fill_value=0,
)

In [80]:
# The pivot was built with values=['Quantity'], so its columns carry an outer
# 'Quantity' level; strip it to leave plain item descriptions.
# The guard keeps this cell safe to re-run: dropping a level from an index
# that is already flat raises an error.
if basket.columns.nlevels > 1:
    basket.columns = basket.columns.droplevel(0)

In [81]:
# Remove the POSTAGE column so it cannot appear in the mined itemsets
# (presumably a shipping charge rather than a product - confirm).
# errors='ignore' lets the cell run when the column is absent, e.g. for a
# subset without postage lines or when the cell is executed a second time.
basket = basket.drop(columns=['POSTAGE'], errors='ignore')

In [82]:
def encode_units(x):
    """Map a basket quantity to a 0/1 purchase flag.

    Parameters
    ----------
    x : number
        Quantity value from one cell of the basket table.

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0.

    Notes
    -----
    Every input yields 0 or 1. Values strictly between 0 and 1, and NaN,
    map to 0 instead of falling through and returning ``None``.
    """
    return 1 if x >= 1 else 0

In [83]:
# Convert the summed quantities into 0/1 purchase flags (1 when at least one
# unit was bought, 0 otherwise). The comparison is vectorised, so it yields
# the same flags as calling encode_units on every cell of these integer
# quantities, without relying on DataFrame.applymap (deprecated in pandas 2.1).
basket_sets = (basket >= 1).astype(int)

In [84]:
# Release the quantity pivot now that the 0/1 matrix (basket_sets) exists.
# After this cell runs, any earlier cell that reads `basket` cannot be
# re-run until the pivot is rebuilt. Deleting `df` is left disabled.
# del df
del basket

In [95]:
# Mine frequent itemsets from the 0/1 basket matrix.
# min_support=0.07: with the 58 Portuguese invoices reported above (assuming
# one basket row per invoice), an itemset must occur in at least 5 baskets
# (5/58 ~= 0.086, the smallest support in the output below).
# use_colnames=True reports itemsets by item description instead of column
# position; low_memory=True requests mlxtend's lower-memory code path.
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True, low_memory=True)

In [98]:
# Display the frequent itemsets together with their support.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.120690,(4 TRADITIONAL SPINNING TOPS)
1,0.086207,(60 TEATIME FAIRY CAKE CASES)
2,0.206897,(BAKING SET 9 PIECE RETROSPOT)
3,0.086207,(BLUE HARMONICA IN BOX)
4,0.086207,(BOX OF VINTAGE JIGSAW BLOCKS)
...,...,...
237,0.086207,"(LUNCH BAG CARS BLUE, JUMBO SHOPPER VINTAGE RE..."
238,0.086207,"(LUNCH BAG DOLLY GIRL DESIGN, LUNCH BAG RED RE..."
239,0.086207,"(LUNCH BAG RED RETROSPOT, JUMBO SHOPPER VINTAG..."
240,0.086207,"(LUNCH BAG CARS BLUE, JUMBO BAG SCANDINAVIAN B..."


In [99]:
# Derive association rules from the frequent itemsets, using lift as the
# selection metric. The 0.1 threshold is deliberately loose; the stricter
# lift/confidence cut is applied when the rules are displayed below.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=.1)

In [108]:
# Show only the strong rules (lift of at least 5 and confidence of at least
# 0.8) whose conviction is not infinite, ordered by ascending confidence and
# then ascending lift.
rules.loc[
    rules['lift'].ge(5)
    & rules['confidence'].ge(0.8)
    & rules['conviction'].ne(np.inf)
].sort_values(by=['confidence', 'lift'])

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
20,(JUMBO SHOPPER VINTAGE RED PAISLEY),(JUMBO BAG PINK VINTAGE PAISLEY),0.189655,0.155172,0.155172,0.818182,5.272727,0.125743,4.646552
240,"(LUNCH BAG CARS BLUE, LUNCH BAG PINK POLKADOT)",(JUMBO BAG PINK VINTAGE PAISLEY),0.103448,0.155172,0.086207,0.833333,5.370370,0.070155,5.068966
252,"(LUNCH BAG SUKI DESIGN, LUNCH BAG CARS BLUE)",(JUMBO BAG PINK VINTAGE PAISLEY),0.103448,0.155172,0.086207,0.833333,5.370370,0.070155,5.068966
264,"(LUNCH BAG DOLLY GIRL DESIGN, LUNCH BAG RED RE...",(JUMBO BAG PINK VINTAGE PAISLEY),0.103448,0.155172,0.086207,0.833333,5.370370,0.070155,5.068966
270,"(LUNCH BAG RED RETROSPOT, LUNCH BAG PINK POLKA...",(JUMBO BAG PINK VINTAGE PAISLEY),0.103448,0.155172,0.086207,0.833333,5.370370,0.070155,5.068966
...,...,...,...,...,...,...,...,...,...
19,(JUMBO BAG PINK VINTAGE PAISLEY),(JUMBO BAG SCANDINAVIAN BLUE PAISLEY),0.155172,0.155172,0.137931,0.888889,5.728395,0.113853,7.603448
163,"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ...",(JUMBO BAG SCANDINAVIAN BLUE PAISLEY),0.155172,0.155172,0.137931,0.888889,5.728395,0.113853,7.603448
166,(JUMBO BAG SCANDINAVIAN BLUE PAISLEY),"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ...",0.155172,0.155172,0.137931,0.888889,5.728395,0.113853,7.603448
167,(JUMBO BAG PINK VINTAGE PAISLEY),"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ...",0.155172,0.137931,0.137931,0.888889,6.444444,0.116528,7.758621
