In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [None]:
# prompt: # load the data set ans show the first five transaction
# You can get the dataset here: https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

import pandas as pd
!wget https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv
df = pd.read_csv('retail_dataset.csv')
print(df.head())

--2024-11-28 07:51:19--  https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8420 (8.2K) [text/plain]
Saving to: ‘retail_dataset.csv’


2024-11-28 07:51:19 (37.8 MB/s) - ‘retail_dataset.csv’ saved [8420/8420]

        0       1     2       3       4       5       6
0   Bread    Wine  Eggs    Meat  Cheese  Pencil  Diaper
1   Bread  Cheese  Meat  Diaper    Wine    Milk  Pencil
2  Cheese    Meat  Eggs    Milk    Wine     NaN     NaN
3  Cheese    Meat  Eggs    Milk    Wine     NaN     NaN
4    Meat  Pencil  Wine     NaN     NaN     NaN     NaN


# Get the set of product that has been purchased


In [None]:

product = set()
for col in df:
    product.update(df[col].unique())
print (product)


{'Diaper', nan, 'Meat', 'Cheese', 'Milk', 'Wine', 'Bagel', 'Bread', 'Pencil', 'Eggs'}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [7]:
#create an itemset based on the products
# encoding the feature
itemset = set()
for i in range(0, len(df)):
    itemset.update(df.iloc[i].dropna().values)
# Check if nan is in itemset before removing
if np.nan in itemset:
  itemset.remove(np.nan)
print(itemset)

# create a dictionary to store the encoding
encoding = {}
for i, item in enumerate(itemset):
  encoding[item] = i

encoding

{'Diaper', 'Meat', 'Cheese', 'Milk', 'Wine', 'Bagel', 'Bread', 'Pencil', 'Eggs'}


{'Diaper': 0,
 'Meat': 1,
 'Cheese': 2,
 'Milk': 3,
 'Wine': 4,
 'Bagel': 5,
 'Bread': 6,
 'Pencil': 7,
 'Eggs': 8}

In [9]:
encoded_df = pd.DataFrame(0, index=np.arange(len(df)), columns=list(itemset))

for i in range(0, len(df)):
    for item in df.iloc[i].dropna().values:
        if item in encoding:
            encoded_df.loc[i, item] = 1

display(encoded_df)

Unnamed: 0,Diaper,Meat,Cheese,Milk,Wine,Bagel,Bread,Pencil,Eggs
0,1,1,1,0,1,0,1,1,1
1,1,1,1,1,1,0,1,1,0
2,0,1,1,1,1,0,0,0,1
3,0,1,1,1,1,0,0,0,1
4,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
310,0,0,1,0,0,0,1,0,1
311,0,1,0,1,0,0,0,1,0
312,1,1,1,0,1,0,1,1,1
313,0,1,1,0,0,0,0,0,0


In [10]:
# Since, the encoded dataframe consist of the empty column. We will drop the NaN column or u can use the index.

# Option 1: Drop the NaN column by name if it exists
if np.nan in encoded_df.columns:
    encoded_df = encoded_df.drop(columns=[np.nan])

# Option 2: Select all columns except the first one (if NaN is the first column)
# encoded_df = encoded_df.iloc[:, 1:]

display(encoded_df)

Unnamed: 0,Diaper,Meat,Cheese,Milk,Wine,Bagel,Bread,Pencil,Eggs
0,1,1,1,0,1,0,1,1,1
1,1,1,1,1,1,0,1,1,0
2,0,1,1,1,1,0,0,0,1
3,0,1,1,1,1,0,0,0,1
4,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
310,0,0,1,0,0,0,1,0,1
311,0,1,0,1,0,0,0,1,0
312,1,1,1,0,1,0,1,1,1
313,0,1,1,0,0,0,0,0,0


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products.
For this case study, we will min_support=0.2

In [11]:
#Set threshold value untuk digunakan dalam penghitungan support
from mlxtend.frequent_patterns import apriori, association_rules

# Set the minimum support threshold
min_support = 0.2

# Apply Apriori algorithm
frequent_itemsets = apriori(encoded_df, min_support=min_support, use_colnames=True)

# Display the frequent itemsets
display(frequent_itemsets)




Unnamed: 0,support,itemsets
0,0.406349,(Diaper)
1,0.47619,(Meat)
2,0.501587,(Cheese)
3,0.501587,(Milk)
4,0.438095,(Wine)
5,0.425397,(Bagel)
6,0.504762,(Bread)
7,0.361905,(Pencil)
8,0.438095,(Eggs)
9,0.2,"(Cheese, Diaper)"


The we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [12]:
# Set the minimum confidence threshold
min_confidence = 0.6

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

# Display the rules
display(rules)

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
1,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
2,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
3,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
4,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
5,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754,0.330409
6,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
7,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
8,"(Cheese, Milk)",(Meat),0.304762,0.47619,0.203175,0.666667,1.4,0.05805,1.571429,0.410959
9,"(Cheese, Meat)",(Milk),0.32381,0.501587,0.203175,0.627451,1.250931,0.040756,1.337845,0.296655


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__, __conviction__ and the interpretation from the case above (please use text section)

The metrics in association rule mining provide insights into the relationships between items in a dataset. Antecedent support measures how often the antecedent (the "if" part of the rule) appears in transactions, while consequent support measures how often the consequent (the "then" part) appears. Support indicates the proportion of transactions containing both the antecedent and consequent, representing their joint probability. Confidence measures the likelihood that the consequent appears in a transaction given the presence of the antecedent, indicating the strength of the rule. Lift compares the observed co-occurrence of antecedent and consequent to what would be expected if they were independent, with values greater than 1 suggesting a positive association. Leverage quantifies the difference between the observed co-occurrence and the expected frequency under independence, with higher positive values indicating stronger associations. Conviction assesses how strongly the consequent depends on the antecedent, with higher values implying more reliable rules. For example, in a rule like {Milk} → {Bread}, high confidence means that customers who buy milk are very likely to buy bread, and a lift greater than 1 implies that this co-occurrence is not due to random chance but a meaningful relationship. These metrics collectively help identify and interpret actionable patterns in transactional data.