This example is done using the following url: https://pbpython.com/market-basket-analysis.html

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
df = pd.read_excel('Online Retail.xlsx')

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df['Description'] = df['Description'].str.strip()
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)

In [5]:
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df['InvoiceNo'].str.contains('C')]

In [6]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [7]:
# for i in sorted(df.Country.unique()):
# #     print(i, df[df['Country']==i].shape)
#     if df[df['Country']==i].shape[0]<1000:
#         print(i, df[df['Country']==i].shape)
#         print(df[df['Country']==i]['InvoiceNo'].nunique())

# Rules for Portugal

In [8]:
data = df[df['Country']=='Portugal']
print('The number of invoices', data['InvoiceNo'].nunique())
print('The number of lines',data.shape[0])

The number of invoices 58
The number of lines 1501


In [9]:
basket = pd.pivot_table(data, index=['InvoiceNo'], columns=['Description'], values=['Quantity'], aggfunc='sum', fill_value=0)

In [10]:
basket.columns = basket.columns.droplevel()

In [11]:
basket.drop(['POSTAGE'], inplace=True, axis=1)

In [12]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

In [13]:
basket_sets = basket.applymap(encode_units)

  basket_sets = basket.applymap(encode_units)


In [14]:
# del df
del basket

In [15]:
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True, low_memory=True)



In [21]:
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.12069,(4 TRADITIONAL SPINNING TOPS)
1,0.086207,(60 TEATIME FAIRY CAKE CASES)
2,0.206897,(BAKING SET 9 PIECE RETROSPOT)
3,0.086207,(BLUE HARMONICA IN BOX)
4,0.086207,(BOX OF VINTAGE JIGSAW BLOCKS)


In [17]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=.1)

In [18]:
rules[(rules['lift'] >= 5) & (rules['confidence'] >= 0.8) & (rules['conviction'] != np.inf)].sort_values(['confidence','lift'])

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
20,(JUMBO SHOPPER VINTAGE RED PAISLEY),(JUMBO BAG PINK VINTAGE PAISLEY),0.189655,0.155172,0.155172,0.818182,5.272727,1.0,0.125743,4.646552,1.000000,0.818182,0.784787,0.909091
242,"(LUNCH BAG CARS BLUE, LUNCH BAG PINK POLKADOT)",(JUMBO BAG PINK VINTAGE PAISLEY),0.103448,0.155172,0.086207,0.833333,5.370370,1.0,0.070155,5.068966,0.907692,0.500000,0.802721,0.694444
253,"(LUNCH BAG SUKI DESIGN, LUNCH BAG CARS BLUE)",(JUMBO BAG PINK VINTAGE PAISLEY),0.103448,0.155172,0.086207,0.833333,5.370370,1.0,0.070155,5.068966,0.907692,0.500000,0.802721,0.694444
266,"(LUNCH BAG DOLLY GIRL DESIGN, LUNCH BAG RED RE...",(JUMBO BAG PINK VINTAGE PAISLEY),0.103448,0.155172,0.086207,0.833333,5.370370,1.0,0.070155,5.068966,0.907692,0.500000,0.802721,0.694444
272,"(LUNCH BAG RED RETROSPOT, LUNCH BAG PINK POLKA...",(JUMBO BAG PINK VINTAGE PAISLEY),0.103448,0.155172,0.086207,0.833333,5.370370,1.0,0.070155,5.068966,0.907692,0.500000,0.802721,0.694444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19,(JUMBO BAG PINK VINTAGE PAISLEY),(JUMBO BAG SCANDINAVIAN BLUE PAISLEY),0.155172,0.155172,0.137931,0.888889,5.728395,1.0,0.113853,7.603448,0.977041,0.800000,0.868481,0.888889
164,"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ...",(JUMBO BAG SCANDINAVIAN BLUE PAISLEY),0.155172,0.155172,0.137931,0.888889,5.728395,1.0,0.113853,7.603448,0.977041,0.800000,0.868481,0.888889
165,(JUMBO BAG SCANDINAVIAN BLUE PAISLEY),"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG ...",0.155172,0.155172,0.137931,0.888889,5.728395,1.0,0.113853,7.603448,0.977041,0.800000,0.868481,0.888889
167,(JUMBO BAG PINK VINTAGE PAISLEY),"(JUMBO BAG SCANDINAVIAN BLUE PAISLEY, JUMBO SH...",0.155172,0.137931,0.137931,0.888889,6.444444,1.0,0.116528,7.758621,1.000000,0.888889,0.871111,0.944444
