In [1]:
import os

import pandas as pd

In [2]:
# Location of the Online Retail workbook. Set the ONLINE_RETAIL_XLSX environment
# variable to run the notebook on another machine; the original local path is
# kept as the fallback so the existing setup keeps working unchanged.
DATA_PATH = os.environ.get(
    'ONLINE_RETAIL_XLSX',
    r'C:\Users\barsuraj1\Desktop\Product Recommendation\datasets\Online Retail.xlsx',
)
df = pd.read_excel(DATA_PATH)

In [3]:
# Preview the first rows to see which columns the workbook provides
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


The goal is to recommend products that are purchased together by checking how frequently different items appear in the same order, so we only need information about individual orders and individual products.
Because it is more convenient for display purposes, we will also use the Description.
We do not require the rest of the attributes.

We will keep two DataFrames.

One for Building the recommendation system with the following features:

InvoiceNo
StockCode

And one for matching the description to the StockCode:

StockCode
Description

In [4]:
# Modify StockCode to always be a string by prepending '_'.
# The startswith guard keeps this cell idempotent: df is mutated in place, so
# without it every re-run of the cell would stack another underscore.
# NOTE(review): assumes no raw stock code already begins with '_' - confirm.
df['StockCode'] = df['StockCode'].apply(
    lambda x: str(x) if str(x).startswith('_') else '_' + str(x)
)

In [5]:
# Frame used to build the recommender: one row per (invoice, product) pair
order_columns = ['InvoiceNo', 'StockCode']
orders = df.loc[:, order_columns]
orders.head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,_85123A
1,536365,_71053
2,536365,_84406B
3,536365,_84029G
4,536365,_84029E


In [6]:
# Frame used to translate a StockCode back into a readable Description
product_columns = ['StockCode', 'Description']
products = df.loc[:, product_columns].copy()
products.head()

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.


In [7]:
# Remove rows that repeat an earlier (StockCode, Description) pair exactly,
# keeping the first occurrence
products = products.drop_duplicates()
products.head()

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.


In [8]:
# Keep only rows whose Description is unchanged by upper-casing, i.e. drop
# descriptions that are not uppercase
description = products['Description']
is_uppercase = description.str.upper() == description
products = products[is_uppercase]

In [9]:
# One row per StockCode: when a code has several descriptions, the first wins
products = products.drop_duplicates(subset=['StockCode'])

products

Unnamed: 0,StockCode,Description
0,_85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,_71053,WHITE METAL LANTERN
2,_84406B,CREAM CUPID HEARTS COAT HANGER
3,_84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,_84029E,RED WOOLLY HOTTIE WHITE HEART.
...,...,...
509369,_85179a,GREEN BITTY LIGHT CHAIN
512588,_23617,SET 10 CARDS SWIRLY XMAS TREE 17104
527065,_90214U,"LETTER ""U"" BLING KEY RING"
537224,_47591b,SCOTTIES CHILDRENS APRON


In [10]:
# (rows, columns) left after the de-duplication steps above
products.shape

(3905, 2)

In [11]:
# Set the index to StockCode so a description can be looked up by its code.
# Note: this consumes the StockCode column, so the cell cannot be re-run on
# its own result without re-creating `products` first.
products = products.set_index('StockCode')

In [12]:
# Convert to a Series for even easier lookups: products[stock_code] -> description
products = products['Description']

In [15]:
# Test it out: look up the description stored for one stock code
products['_71053']

'WHITE METAL LANTERN'

In [16]:
# Number of unique stock codes (the Series has one entry per StockCode)
len(products)

3905

In [22]:
### Number of distinct descriptions; nunique() counts the Series values, not the
### StockCode index, so a lower figure than len(products) means some stock
### codes share a description
products.nunique()

3779

### Restructure the data

We would like each Invoice Number to give us a list of stock codes.

In [23]:
def string_list(x):
    """Return a new list holding ``str(item)`` for every item of ``x``."""
    return list(map(str, x))

In [24]:
# Collapse the (invoice, product) rows into one row per invoice that holds the
# list of every stock code on that invoice
stock_codes_by_invoice = orders.groupby('InvoiceNo')['StockCode'].apply(list)
orders = stock_codes_by_invoice.reset_index()
orders.head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,"[_85123A, _71053, _84406B, _84029G, _84029E, _..."
1,536366,"[_22633, _22632]"
2,536367,"[_84879, _22745, _22748, _22749, _22310, _8496..."
3,536368,"[_22960, _22913, _22912, _22914]"
4,536369,[_21756]


In [26]:
from mlxtend.preprocessing import TransactionEncoder

In [27]:
# Learn the set of stock codes from the per-invoice lists, then encode every
# invoice as one row of that vocabulary (fit() returns the encoder itself,
# which allows the chained transform call)
te = TransactionEncoder()
basket_lists = orders['StockCode']
orders_1hot = te.fit(basket_lists).transform(basket_lists)

In [28]:
# Wrap the encoded matrix in a DataFrame whose columns are the stock codes
# learned by the encoder
one_hot_columns = te.columns_
orders_1hot = pd.DataFrame(data=orders_1hot, columns=one_hot_columns)
orders_1hot.head()

Unnamed: 0,_10002,_10080,_10120,_10123C,_10123G,_10124A,_10124G,_10125,_10133,_10134,...,_M,_PADS,_POST,_S,_gift_0001_10,_gift_0001_20,_gift_0001_30,_gift_0001_40,_gift_0001_50,_m
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Apriori

In [29]:
from mlxtend.frequent_patterns import apriori

In [30]:
%%timeit -n1 -r1

# Benchmark only: a single timed run (-n1 -r1) whose result is discarded.
# The itemsets themselves are computed again in a later cell.
apriori(orders_1hot, min_support=0.01, max_len=2, use_colnames=True)

5min 9s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [31]:
# Frequent itemsets from Apriori: support of at least 0.01, at most 2 items per
# itemset, reported with the stock-code column names instead of column indices
is_ap = apriori(orders_1hot, min_support=0.01, max_len=2, use_colnames=True)

In [32]:
# Peek at the first Apriori itemsets and their support
is_ap.head()

Unnamed: 0,support,itemsets
0,0.020193,(_15036)
1,0.012587,(_15056BL)
2,0.017876,(_15056N)
3,0.011236,(_16237)
4,0.01251,(_20675)


## FP Growth

In [33]:
from mlxtend.frequent_patterns import fpgrowth

In [34]:
%%timeit -n1 -r1

# Benchmark only: a single timed run (-n1 -r1) with the same parameters as the
# Apriori call, so the two timings are comparable. The result is discarded.
fpgrowth(orders_1hot, min_support=0.01, max_len=2, use_colnames=True)

13.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [35]:
# Frequent itemsets from FP-Growth, using the same parameters as the Apriori
# run (min_support=0.01, max_len=2)
is_fp = fpgrowth(orders_1hot, min_support=0.01, max_len=2, use_colnames=True)

In [36]:
# Peek at the first FP-Growth itemsets; the row order differs from the Apriori output
is_fp.head()

Unnamed: 0,support,itemsets
0,0.086718,(_85123A)
1,0.017915,(_84029G)
2,0.016911,(_84029E)
3,0.014865,(_22752)
4,0.013205,(_71053)


## Show that is_ap and is_fp both contain the same itemsets

In [37]:
def itemset_to_ordered_string(itemset):
    """Serialise an itemset as its items in sorted order, joined by commas.

    Sorting makes the string independent of the set's iteration order, so two
    equal itemsets always produce the same string.
    """
    ordered_items = sorted(itemset)
    return ','.join(ordered_items)

In [38]:
# Turn every Apriori itemset into its canonical comma-separated string
ap_itemset_strings = is_ap['itemsets'].map(itemset_to_ordered_string)

In [39]:
# Sort the strings and renumber from 0 so the comparison ignores the original row order
ap_sorted = ap_itemset_strings.sort_values()
ap_itemset_strings = ap_sorted.reset_index(drop=True)

In [40]:
# First few sorted Apriori itemset strings
ap_itemset_strings.head()

0      _15036
1    _15056BL
2     _15056N
3      _16237
4      _20675
Name: itemsets, dtype: object

In [41]:
# Turn every FP-Growth itemset into its canonical comma-separated string
fp_itemset_strings = is_fp['itemsets'].map(itemset_to_ordered_string)

In [42]:
# Sort the strings and renumber from 0 so the comparison ignores the original row order
fp_sorted = fp_itemset_strings.sort_values()
fp_itemset_strings = fp_sorted.reset_index(drop=True)
fp_itemset_strings.head()

0      _15036
1    _15056BL
2     _15056N
3      _16237
4      _20675
Name: itemsets, dtype: object

In [43]:
# Test whether the two sorted, re-indexed itemset string Series are equal,
# i.e. whether Apriori and FP-Growth found the same itemsets
fp_itemset_strings.equals(ap_itemset_strings)

True

## Calculate Association Rules

In [44]:
from mlxtend.frequent_patterns import association_rules

In [45]:
# Show the FP-Growth itemsets that the association rules are derived from
is_fp

Unnamed: 0,support,itemsets
0,0.086718,(_85123A)
1,0.017915,(_84029G)
2,0.016911,(_84029E)
3,0.014865,(_22752)
4,0.013205,(_71053)
...,...,...
997,0.010077,"(_23203, _23344)"
998,0.010039,"(_23344, _22086)"
999,0.011853,"(_23293, _23295)"
1000,0.010077,"(_23293, _23296)"


In [46]:
# Derive association rules from the FP-Growth itemsets, keeping only rules
# whose lift is at least 10
rules = association_rules(is_fp, metric="lift", min_threshold=10)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(_21754),(_21755),0.030386,0.024363,0.011467,0.377382,15.490025,0.010727,1.566993
1,(_21755),(_21754),0.024363,0.030386,0.011467,0.470681,15.490025,0.010727,1.831815
2,(_22748),(_22745),0.016988,0.016448,0.012124,0.713636,43.387751,0.011844,3.434626
3,(_22745),(_22748),0.016448,0.016988,0.012124,0.737089,43.387751,0.011844,3.738955
4,(_22726),(_22727),0.038726,0.041737,0.024942,0.644068,15.431412,0.023326,2.692261


In [47]:
# (number of rules, number of rule columns)
rules.shape

(280, 9)

##  Predictions

In [48]:
def predict(antecedent, rules, max_results= 6):
    """Return the items recommended for ``antecedent`` by the association rules.

    Parameters
    ----------
    antecedent : str or iterable of str
        Item(s) forming the left-hand side to look up. A set or frozenset
        works as before; a list, tuple or a single item string is accepted too.
    rules : pandas.DataFrame
        Rule table with ``antecedents`` and ``consequents`` columns whose
        cells are sets of items.
    max_results : int, default 6
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        One item per matching rule, in the order the rules appear in ``rules``
        (no re-ranking is applied), re-indexed from 0. The Series is empty when
        no rule has exactly this antecedent.
    """
    # A bare string would otherwise be split into single characters by frozenset()
    if isinstance(antecedent, str):
        antecedent = (antecedent,)
    wanted = frozenset(antecedent)

    # Compare each rule's antecedent set with the requested one by content.
    # astype(bool) keeps the mask boolean even when `rules` has no rows.
    is_match = rules['antecedents'].map(lambda items: items == wanted).astype(bool)
    matches = rules.loc[is_match, 'consequents']

    # Each consequent is a set; take one element from it. Only the first
    # iterated element is used, so rule tables with multi-item consequents
    # lose the remaining items here.
    preds = matches.map(lambda consequent: next(iter(consequent)))

    return preds[:max_results].reset_index(drop=True)

In [61]:
# Recommendations for stock code _20712 (up to the default max_results)
preds = predict({'_20712'}, rules)

In [62]:
# Recommended stock codes, still as raw codes
preds

0    _22379
1    _20713
2    _21930
3    _21931
4    _22386
5    _21928
Name: consequents, dtype: object

In [63]:
# Description of the product the recommendations were requested for
print(products['_20712'])

JUMBO BAG WOODLAND ANIMALS


In [64]:
# Print the description of every recommended stock code. `products` only keeps
# codes that survived the description clean-up, so a code without an entry is
# printed as-is instead of aborting the loop with a KeyError.
for stockid in preds:
    print(products.get(stockid, stockid))

RECYCLING BAG RETROSPOT 
JUMBO BAG OWLS
JUMBO STORAGE BAG SKULLS
JUMBO STORAGE BAG SUKI
JUMBO BAG PINK POLKADOT
JUMBO BAG SCANDINAVIAN PAISLEY


In [82]:
# Description of the next product to get recommendations for
print(products['_21755'])

LOVE BUILDING BLOCK WORD


In [83]:
# get the predictions
preds = predict({'_21755'}, rules)

# Display the descriptions of the predictions. A stock code with no entry in
# `products` is printed as-is instead of raising a KeyError.
for stockid in preds:
    print(products.get(stockid, stockid))

HOME BUILDING BLOCK WORD


In [84]:
# Description of the next product to get recommendations for
print(products['_22748'])

POPPY'S PLAYHOUSE KITCHEN


In [85]:
# get the predictions
preds = predict({'_22748'}, rules)

# Display the descriptions of the predictions. A stock code with no entry in
# `products` is printed as-is instead of raising a KeyError.
for stockid in preds:
    print(products.get(stockid, stockid))

POPPY'S PLAYHOUSE BEDROOM 
POPPY'S PLAYHOUSE LIVINGROOM 


In [77]:
# List the antecedent of every rule, i.e. the products predict() can be queried with
rules.antecedents

0      (_21754)
1      (_21755)
2      (_22748)
3      (_22745)
4      (_22726)
         ...   
275    (_23295)
276    (_23293)
277    (_23296)
278    (_23355)
279    (_22112)
Name: antecedents, Length: 280, dtype: object