# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Emit a confirmation line once the import statements above have run.
print('Libraries loaded.')

Libraries loaded.


# Example 1: Simple use of algorithm

## Creating Dataset

In [2]:
# Five toy shopping baskets; each inner list is one transaction.
basket_lists = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

# Learn the item vocabulary and encode the baskets in a single call.
trans_encoder = TransactionEncoder()
encoded_baskets = trans_encoder.fit_transform(basket_lists)

# One row per basket, one boolean column per distinct item.
dataset_df = pd.DataFrame(data=encoded_baskets, columns=trans_encoder.columns_)

dataset_df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


## Computing support values

In [3]:
# Frequent itemsets of the toy baskets with support of at least 0.6,
# reported with item names (use_colnames=True).
frequent_itemsets = apriori(dataset_df, use_colnames=True, min_support=0.6)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Eggs, Onion)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


# Example 2: Association rule mining

## Loading dataset

In [87]:
# Read the Online Retail workbook into a DataFrame and preview the first rows.
excel_path = 'Data/Online Retail.xlsx'
basket_df = pd.read_excel(excel_path)

basket_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


### Dataset description

## Some basic exploratory analysis

In [88]:
# (rows, columns) of the loaded frame.
basket_df.shape

(541909, 8)

In [89]:
# dtype of each column, to see how every field was parsed on load.
basket_df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [90]:
# Number of missing values in each column.
basket_df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [91]:
# Rows whose invoice number contains a 'C' are treated as canceled.
# .str.contains yields NaN for non-string invoice numbers (presumably the
# purely numeric ones); na=False counts those as "no match".
is_canceled = basket_df['InvoiceNo'].str.contains('C', na=False)
canceled = basket_df.loc[is_canceled]

canceled.shape

(9288, 8)

## Cleansing dataset

In [92]:
# Trim leading/trailing whitespace from the product descriptions.
basket_df['Description'] = basket_df['Description'].str.strip()

# Remove canceled records.
# The frame is rebound instead of dropped in place, and errors='ignore' skips
# labels that are already gone, so re-running this cell is a harmless no-op
# rather than a KeyError.
rows_before = basket_df.shape[0]
basket_df = basket_df.drop(index=canceled.index, errors='ignore')

# Report the number of rows actually removed by this execution.
rows_removed = rows_before - basket_df.shape[0]

print(f'{rows_removed} has removed from dataset, new dataset size is {basket_df.shape[0]}')

9288 has removed from dataset, new dataset size is 532621


## Preparing dataset for analysis

### Selecting one region (France)

In [106]:
# Keep only the rows whose Country is France.
in_france = basket_df['Country'].eq('France')
france_basket = basket_df.loc[in_france]

france_basket.shape

(8408, 8)

### Creating transaction item sets

In [107]:
# Collect, for every invoice, the list of stock codes it contains.
# Grouping the StockCode column directly avoids a per-row .loc lookup,
# which built a full row Series for each item just to read one field.
codes_by_invoice = france_basket.groupby('InvoiceNo')['StockCode']

transaction_nom = []  # invoice numbers, in group order
basket = []           # basket[i] holds the stock codes of transaction_nom[i]

for transaction, codes in codes_by_invoice:
    transaction_nom.append(transaction)
    # Cast every code to str so all codes share one type.
    basket.append([str(code) for code in codes])

### Creating binary dataset

In [108]:
# Encode the per-invoice stock-code lists as a boolean matrix.
trans_encoder = TransactionEncoder()
encoded_baskets = trans_encoder.fit_transform(basket)

# Rows are labelled with the invoice numbers, columns with the stock codes.
binary_df = pd.DataFrame(
    data=encoded_baskets,
    index=transaction_nom,
    columns=trans_encoder.columns_,
)

binary_df

Unnamed: 0,10002,10120,10125,10135,11001,15036,15039,15044C,15056BL,15056N,...,90030C,90031,90099,90184B,90184C,90201B,90201C,C2,M,POST
536370,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
536852,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
536974,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
537065,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
537463,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580986,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
581001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
581171,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
581279,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


### Association extraction (Recommendation based on antecedents)

In [109]:
# Frequent itemsets with a support of at least 0.05 across the invoices.
frequencies_items = apriori(binary_df, use_colnames=True, min_support=0.05)

# Derive rules using lift (threshold 1) as the selection metric, rank them by
# confidence and then lift (both descending), and renumber the rows from 0.
rules = (
    association_rules(frequencies_items, metric="lift", min_threshold=1)
    .sort_values(['confidence', 'lift'], ascending=[False, False])
    .reset_index(drop=True)
)

rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(20712),(POST),0.076531,0.765306,0.076531,1.0,1.306667,0.017961,inf
1,"(22554, 21731)",(POST),0.053571,0.765306,0.053571,1.0,1.306667,0.012573,inf
2,"(21731, 22556)",(POST),0.05102,0.765306,0.05102,1.0,1.306667,0.011974,inf
3,"(21086, 21080)",(21094),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
4,"(21094, 21080)",(21086),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796


### Filtering results

In [110]:
# Exclude rules that involve the 'POST' code on either side.
condition0 = rules['antecedents'].apply(lambda items: 'POST' not in items)
condition1 = rules['consequents'].apply(lambda items: 'POST' not in items)

# Keep only rules whose antecedent holds two or more items.
condition2 = rules['antecedents'].apply(lambda items: len(items) > 1)

selected = condition0 & condition1 & condition2
rules.loc[selected]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
3,"(21086, 21080)",(21094),0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959
4,"(21094, 21080)",(21086),0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796
53,"(22726, 22728)",(22727),0.07398,0.094388,0.063776,0.862069,9.133271,0.056793,6.565689
54,"(22727, 22728)",(22726),0.07398,0.096939,0.063776,0.862069,8.892922,0.056604,6.547194
81,"(21086, 21094)",(21080),0.122449,0.132653,0.09949,0.8125,6.125,0.083247,4.62585
84,"(22726, 22727)",(22728),0.079082,0.102041,0.063776,0.806452,7.903226,0.055706,4.639456
109,"(22556, 22551)",(22554),0.089286,0.170918,0.068878,0.771429,4.513433,0.053617,3.627232
143,"(22554, 22556)",(22551),0.102041,0.137755,0.068878,0.675,4.9,0.054821,2.653061
147,"(22554, 22551)",(22556),0.104592,0.168367,0.068878,0.658537,3.911308,0.051268,2.435496


# Exercise 1

# Exercise 2