In [1]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

In [2]:
data = pd.read_excel('Online Retail.xlsx')

In [3]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
# Dropping all records with empty values
data.dropna(inplace = True)
data.isna().any()

InvoiceNo      False
StockCode      False
Description    False
Quantity       False
InvoiceDate    False
UnitPrice      False
CustomerID     False
Country        False
dtype: bool

In [5]:
# Removing negative values
data = data[data.Quantity >= 0]
data.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,397924.0,397924.0,397924.0
mean,13.021823,3.116174,15294.315171
std,180.42021,22.096788,1713.169877
min,1.0,0.0,12346.0
25%,2.0,1.25,13969.0
50%,6.0,1.95,15159.0
75%,12.0,3.75,16795.0
max,80995.0,8142.75,18287.0


In [16]:
# Finding rules only from  France as running for the entire dataset gave no results as dataset was huge
df = data[data['Country'] =="France"].groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
df

InvoiceNo  Description                        
536370      SET 2 TEA TOWELS I LOVE LONDON        24
           ALARM CLOCK BAKELIKE GREEN             12
           ALARM CLOCK BAKELIKE PINK              24
           ALARM CLOCK BAKELIKE RED               24
           CHARLOTTE BAG DOLLY GIRL DESIGN        20
           CIRCUS PARADE LUNCH BOX                24
           INFLATABLE POLITICAL GLOBE             48
           LUNCH BOX I LOVE LONDON                24
           MINI JIGSAW CIRCUS PARADE              24
           MINI JIGSAW SPACEBOY                   24
           MINI PAINT SET VINTAGE                 36
           PANDA AND BUNNIES STICKER SHEET        12
           POSTAGE                                 3
           RED TOADSTOOL LED NIGHT LIGHT          24
           ROUND SNACK BOXES SET OF4 WOODLAND     24
           SET/2 RED RETROSPOT TEA TOWELS         18
           SPACEBOY LUNCH BOX                     24
           STARS GIFT TAPE                        24

In [17]:
# List of all invoice nos (or transactions)
transactions = list(set(data[data['Country'] =="France"]['InvoiceNo']))
transactions

[567296,
 563202,
 573442,
 567300,
 557069,
 542735,
 540688,
 559134,
 575519,
 544817,
 544818,
 548913,
 579634,
 569402,
 565321,
 565322,
 553044,
 567380,
 555096,
 563288,
 575581,
 575584,
 540789,
 579708,
 575629,
 577687,
 540824,
 540835,
 575661,
 540851,
 553143,
 542904,
 575671,
 569531,
 557247,
 565443,
 553161,
 542922,
 575692,
 579792,
 569568,
 553195,
 557295,
 553208,
 551163,
 567552,
 575747,
 547087,
 536852,
 545051,
 579870,
 540972,
 563502,
 540976,
 543030,
 559418,
 545086,
 559422,
 561470,
 577856,
 567618,
 545105,
 559441,
 565587,
 569686,
 567640,
 569699,
 553316,
 569701,
 575845,
 567657,
 565612,
 547194,
 547196,
 575880,
 575884,
 553357,
 536974,
 549274,
 545180,
 545181,
 539050,
 557483,
 573867,
 573868,
 575916,
 557489,
 541120,
 541121,
 553411,
 573891,
 567756,
 563662,
 541138,
 545235,
 543188,
 571864,
 551389,
 537065,
 539113,
 567793,
 578033,
 559607,
 547327,
 563712,
 555531,
 571923,
 545301,
 580120,
 555547,
 580126,
 

In [18]:
dataset = []
for transaction in transactions:
    dataset.append(list(df[transaction].index))
dataset

[[' TRELLIS COAT RACK',
  'ASSORTED COLOUR MINI CASES',
  'EDWARDIAN PARASOL BLACK',
  'EDWARDIAN PARASOL NATURAL',
  'FOOT STOOL HOME SWEET HOME ',
  'HANGING QUILTED PATCHWORK APPLES',
  'ICE CREAM BUBBLES',
  'LUNCH BAG RED RETROSPOT',
  'LUNCH BAG VINTAGE DOILY ',
  'LUNCH BAG WOODLAND',
  'POSTAGE',
  'RED RETROSPOT MINI CASES',
  'SET OF 6 TEA TIME BAKING CASES',
  'SET OF 72 RETROSPOT PAPER  DOILIES',
  'STOOL HOME SWEET HOME ',
  'TEA PARTY BIRTHDAY CARD',
  'TRAVEL CARD WALLET KEEP CALM',
  'TRAVEL CARD WALLET PANTRY'],
 ['CHEST OF DRAWERS GINGHAM HEART ', 'CREAM SWEETHEART MINI CHEST'],
 ['CHICK GREY HOT WATER BOTTLE',
  'COSY SLIPPER SHOES SMALL GREEN',
  'FAWN BLUE HOT WATER BOTTLE',
  'MINI LIGHTS WOODLAND MUSHROOMS',
  'RABBIT NIGHT LIGHT',
  'RED TOADSTOOL LED NIGHT LIGHT',
  'WHITE SKULL HOT WATER BOTTLE '],
 [' I LOVE LONDON MINI BACKPACK',
  '3 PIECE SPACEBOY COOKIE CUTTER SET',
  'CHILDRENS CUTLERY CIRCUS PARADE',
  'CHILDRENS CUTLERY DOLLY GIRL ',
  'CHILDRENS CUTLE

In [19]:
te = TransactionEncoder()
te_array = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_array, columns=te.columns_)
df

Unnamed: 0,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,NINE DRAWER OFFICE TIDY,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,TRELLIS COAT RACK,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,...,WRAP SUKI AND FRIENDS,WRAP VINTAGE PETALS DESIGN,YELLOW COAT RACK PARIS FASHION,YELLOW GIANT GARDEN THERMOMETER,ZINC STAR T-LIGHT HOLDER,ZINC FOLKART SLEIGH BELLS,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL
0,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [20]:
# Finding frequent itemsets with minimum support of 10% using FP Growth algorithm
frequent_itemsets = fpgrowth(df, min_support = 0.10, use_colnames = True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.771208,(POSTAGE)
1,0.154242,(LUNCH BAG RED RETROSPOT)
2,0.138817,(RED RETROSPOT MINI CASES)
3,0.118252,(LUNCH BAG WOODLAND)
4,0.187661,(RABBIT NIGHT LIGHT)
5,0.179949,(RED TOADSTOOL LED NIGHT LIGHT)
6,0.159383,(ROUND SNACK BOXES SET OF4 WOODLAND )
7,0.125964,(SPACEBOY LUNCH BOX )
8,0.123393,(STRAWBERRY LUNCH BOX WITH CUTLERY)
9,0.100257,(DOLLY GIRL LUNCH BOX)


In [21]:
# Finding association rules which habe minimum confidence of 0.9
rules = association_rules(frequent_itemsets, metric = "confidence", min_threshold = 0.9)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ROUND SNACK BOXES SET OF4 WOODLAND ),(POSTAGE),0.159383,0.771208,0.1491,0.935484,1.213011,0.026183,3.546272
1,(STRAWBERRY LUNCH BOX WITH CUTLERY),(POSTAGE),0.123393,0.771208,0.115681,0.9375,1.215625,0.020519,3.660668
2,(SET/6 RED SPOTTY PAPER PLATES),(SET/6 RED SPOTTY PAPER CUPS),0.128535,0.138817,0.123393,0.96,6.915556,0.10555,21.529563
3,"(SET/6 RED SPOTTY PAPER PLATES, POSTAGE)",(SET/6 RED SPOTTY PAPER CUPS),0.107969,0.138817,0.102828,0.952381,6.86067,0.08784,18.084833
4,"(SET/20 RED RETROSPOT PAPER NAPKINS , SET/6 RE...",(SET/6 RED SPOTTY PAPER PLATES),0.102828,0.128535,0.100257,0.975,7.5855,0.08704,34.858612
5,"(SET/20 RED RETROSPOT PAPER NAPKINS , SET/6 RE...",(SET/6 RED SPOTTY PAPER CUPS),0.102828,0.138817,0.100257,0.975,7.023611,0.085983,34.447301
