In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Read the retail workbook into a DataFrame and preview its first rows.
data = pd.read_excel('Online_Retail.xlsx')
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
# List the column labels of the loaded frame.
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [4]:
# Show the distinct values present in the Country column.
data.Country.unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [5]:
# Normalise the two key columns: invoice numbers become strings and
# product descriptions lose their leading/trailing whitespace.
data['InvoiceNo'] = data['InvoiceNo'].astype('str')
data['Description'] = data['Description'].str.strip()

# Drop every row whose invoice number contains the letter 'C'
# (presumably credit/cancellation invoices -- confirm against the data source).
data = data.loc[~data['InvoiceNo'].str.contains('C')]

In [6]:
def build_basket(frame, country):
    """Return the invoice-by-product quantity matrix for one country.

    Rows of ``frame`` whose ``Country`` equals ``country`` are grouped by
    ``InvoiceNo`` and ``Description`` and their ``Quantity`` values summed.
    The sums are then pivoted so that each invoice is a row (index
    ``InvoiceNo``) and each description is a column; invoice/product
    combinations that never occur are filled with 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Transactions with at least the columns ``Country``, ``InvoiceNo``,
        ``Description`` and ``Quantity``.
    country : str
        Exact value of the ``Country`` column to select.

    Returns
    -------
    pandas.DataFrame
        Summed quantities indexed by ``InvoiceNo`` with one column per
        description.
    """
    return (frame[frame['Country'] == country]
            .groupby(['InvoiceNo', 'Description'])['Quantity']
            .sum().unstack().reset_index().fillna(0)
            .set_index('InvoiceNo'))


# One basket matrix per country of interest; the pipeline is identical for
# each, only the country filter differs.
basket_France = build_basket(data, "France")
basket_UK = build_basket(data, "United Kingdom")
basket_Por = build_basket(data, "Portugal")
basket_Sweden = build_basket(data, "Sweden")

In [7]:
def hot_encode(x):
  """Map a quantity to a 0/1 presence flag.

  Returns 1 when ``x`` is at least 1 and 0 for everything else. Using a
  single total expression means values strictly between 0 and 1, and NaN
  (for which every comparison is False), yield 0 instead of falling
  through both checks and returning an implicit ``None``.
  """
  return 1 if x >= 1 else 0

In [8]:
# Binarise each basket: 1 where the invoice holds at least one unit of the
# product, 0 otherwise. The vectorised comparison yields the same 0/1
# integers as applying hot_encode to every cell for whole-number quantities,
# but avoids a Python-level call per element (the UK matrix is large) and
# does not rely on DataFrame.applymap, which newer pandas releases deprecate.
basket_France = (basket_France >= 1).astype(int)
basket_UK = (basket_UK >= 1).astype(int)
basket_Por = (basket_Por >= 1).astype(int)
basket_Sweden = (basket_Sweden >= 1).astype(int)

# The original cell left this name bound to the last encoded basket; keep it
# so that any cell relying on it continues to work.
basket_encoded = basket_Sweden

In [9]:
# 1) France
# Frequent itemsets with support of at least 0.05, then rules with lift of at
# least 1, ordered by descending confidence and, within ties, descending lift.
frq_items = apriori(basket_France, use_colnames=True, min_support=0.05)
rules = (
    association_rules(frq_items, min_threshold=1, metric="lift")
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

                                           antecedents  ... conviction
44                        (JUMBO BAG WOODLAND ANIMALS)  ...        inf
259  (PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...  ...        inf
272  (PLASTERS IN TIN WOODLAND ANIMALS, RED TOADSTO...  ...        inf
301  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...  ...  34.897959
300  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...  ...  34.489796

[5 rows x 9 columns]


In [10]:
# 2) United Kingdom
# Frequent itemsets with support of at least 0.01, then rules with lift of at
# least 1, ordered by descending confidence and, within ties, descending lift.
frq_items = apriori(basket_UK, use_colnames=True, min_support=0.01)
rules = (
    association_rules(frq_items, min_threshold=1, metric="lift")
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

                                       antecedents  ... conviction
116           (BEADED CRYSTAL HEART PINK ON STICK)  ...  39.637371
2019  (JAM MAKING SET PRINTED, SUKI  SHOULDER BAG)  ...  26.096206
2295         (HERB MARKER THYME, HERB MARKER MINT)  ...  21.947227
2302   (HERB MARKER PARSLEY, HERB MARKER ROSEMARY)  ...  20.444951
2301      (HERB MARKER THYME, HERB MARKER PARSLEY)  ...  20.443842

[5 rows x 9 columns]


In [11]:
# 3) Portugal
# Frequent itemsets with support of at least 0.05, then rules with lift of at
# least 1, ordered by descending confidence and, within ties, descending lift.
frq_items = apriori(basket_Por, use_colnames=True, min_support=0.05)
rules = (
    association_rules(frq_items, min_threshold=1, metric="lift")
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

                              antecedents  ... conviction
1170     (SET 12 COLOUR PENCILS SPACEBOY)  ...        inf
1171   (SET 12 COLOUR PENCILS DOLLY GIRL)  ...        inf
1172   (SET OF 4 KNICK KNACK TINS LONDON)  ...        inf
1173   (SET 12 COLOUR PENCILS DOLLY GIRL)  ...        inf
1174  (SET OF 4 KNICK KNACK TINS POPPIES)  ...        inf

[5 rows x 9 columns]


In [12]:
# 4) Sweden
# Frequent itemsets with support of at least 0.05, then rules with lift of at
# least 1, ordered by descending confidence and, within ties, descending lift.
frq_items = apriori(basket_Sweden, use_colnames=True, min_support=0.05)
rules = (
    association_rules(frq_items, min_threshold=1, metric="lift")
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())

                        antecedents  ... conviction
0     (12 PENCILS SMALL TUBE SKULL)  ...        inf
1     (PACK OF 72 SKULL CAKE CASES)  ...        inf
4           (36 DOILIES DOLLY GIRL)  ...        inf
5    (ASSORTED BOTTLE TOP  MAGNETS)  ...        inf
180  (CHILDRENS CUTLERY DOLLY GIRL)  ...        inf

[5 rows x 9 columns]
