<a href="https://colab.research.google.com/github/xinxingwu-uk/Experiments_C_S/blob/main/AssociationAnalysis_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# CSV of grocery transactions. /content is the Colab runtime's working
# directory, so point this elsewhere when running outside Colab.
path = "/content/GroceryStoreDataSet.csv"

In [3]:
# Read the file as one headerless column called 'products'
# (the comma separator is read_csv's default, so it is not spelled out).
loadData = pd.read_csv(path, header=None, names=["products"])

In [4]:
# Show the loaded frame to eyeball the raw comma-separated transactions.
loadData

Unnamed: 0,products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"
5,"BREAD,TEA,BOURNVITA"
6,"MAGGI,TEA,CORNFLAKES"
7,"MAGGI,BREAD,TEA,BISCUIT"
8,"JAM,MAGGI,BREAD,TEA"
9,"BREAD,MILK"


In [5]:
# (rows, columns) of the loaded frame.
loadData.shape

(20, 1)

In [6]:
# Break every comma-separated 'products' string into its individual item
# names; 'data' ends up as a list of transactions (one item list per row).
data = [basket.split(",") for basket in loadData["products"]]
data

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

---
# Apriori Algorithm and One-Hot Encoding

#### The Apriori implementation in mlxtend expects its input to be one-hot encoded as True/False or 1/0 values.

#### Using TransactionEncoder, we convert the list of transactions into a one-hot encoded Boolean array.

#### Products that a customer bought or did not buy in a transaction will now be represented by the values 1 and 0, respectively.
---

In [7]:
# One-hot encode the transactions: one row per transaction, one column per item.
te = TransactionEncoder()
te_data = te.fit_transform(data)  # boolean array; column order is te.columns_
te_df = pd.DataFrame(te_data, columns=te.columns_)
# Cast True/False to 1/0 in a single step. astype(int) is used instead of
# chaining replace(False, 0) and replace(True, 1), whose integer result relies
# on implicit downcasting that pandas 2.2 deprecates.
te_df_rp01 = te_df.astype(int)
te_df_rp01

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,1,0,1,0,0,0,0,0,1,0,0
1,1,0,1,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,1,1,1,0,0
4,1,0,0,0,0,0,0,1,0,0,1
5,0,1,1,0,0,0,0,0,0,0,1
6,0,0,0,0,0,1,0,1,0,0,1
7,1,0,1,0,0,0,0,1,0,0,1
8,0,0,1,0,0,0,1,1,0,0,1
9,0,0,1,0,0,0,0,0,1,0,0


---
# Applying Apriori and Viewing the Results

#### The next step is to build the Apriori model. The `apriori` function in the mlxtend package exposes several parameters that we can adjust.

#### For this model we use the minimum support parameter (`min_support`).

#### I set `min_support` to a threshold of 20% and display the resulting frequent itemsets below.
---

In [8]:
# Mine the frequent itemsets: keep every itemset whose support is at least
# 0.2 (20% of transactions), labelled with item names instead of column indices.
frequent_items = apriori(
    te_df_rp01,
    min_support=0.2,
    use_colnames=True,
)
frequent_items

Unnamed: 0,support,itemsets
0,0.35,(BISCUIT)
1,0.2,(BOURNVITA)
2,0.65,(BREAD)
3,0.4,(COFFEE)
4,0.3,(CORNFLAKES)
5,0.25,(MAGGI)
6,0.25,(MILK)
7,0.3,(SUGER)
8,0.35,(TEA)
9,0.2,"(BREAD, BISCUIT)"


---
#### We choose a minimum confidence of 60%. In other words, we keep only the rules X → Y for which at least 60% of the transactions containing product X also contain product Y.
---

In [9]:
# Derive association rules from the frequent itemsets, keeping only the
# rules whose confidence is at least 0.6.
rules = association_rules(
    frequent_items,
    metric="confidence",
    min_threshold=0.6,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75
1,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05
2,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
3,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
4,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25


In [10]:
# Rank the rules from highest to lowest confidence.
rules_sorted = rules.sort_values("confidence", ascending=False)
rules_sorted

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75
4,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25
1,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05
2,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
3,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
