In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [2]:
# load the data set ans show the first five transaction
url = "https://raw.githubusercontent.com/tsalisacamila/Data-Mining/refs/heads/main/retail_dataset.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


In [3]:
print(set(np.ravel(df)))

{'Diaper', 'Wine', 'Bread', 'Pencil', 'Eggs', 'Cheese', 'Bagel', 'Meat', 'Milk', nan}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [6]:
#create an itemset based on the products
itemset = set(np.ravel(df))
# encoding the feature
encodedValue = []
for index, row in df.iterrows():
    rowset = set(row)
    labels = {}
    uncommons = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for i in uncommons:
        labels[i] = 0
    for j in commons:
        labels[j] = 1
    encodedValue.append(labels)

print(labels)



{'Diaper': 0, 'Cheese': 0, 'Pencil': 0, 'Milk': 0, 'Wine': 1, 'Bread': 1, 'Eggs': 1, 'Bagel': 1, 'Meat': 1, nan: 1}


In [9]:
# create new dataframe from the encoded features
encodeddf = pd.DataFrame(encodedValue)
# show the new dataframe
encodeddf.head()


Unnamed: 0,NaN,Milk,Bagel,Diaper,Wine,Bread,Pencil,Eggs,Cheese,Meat
0,0,0,0,1,1,1,1,1,1,1
1,0,1,0,1,1,1,1,0,1,1
2,1,1,0,0,1,0,0,1,1,1
3,1,1,0,0,1,0,0,1,1,1
4,1,0,0,0,1,0,1,0,0,1


In [10]:
# Since, the encoded dataframe consist of the empty column. We will drop the NaN column or u can use the index.
encodeddf = encodeddf.drop(encodeddf.columns[2], axis=1)
encodeddf


Unnamed: 0,NaN,Milk,Diaper,Wine,Bread,Pencil,Eggs,Cheese,Meat
0,0,0,1,1,1,1,1,1,1
1,0,1,1,1,1,1,0,1,1
2,1,1,0,1,0,0,1,1,1
3,1,1,0,1,0,0,1,1,1
4,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
310,1,0,0,0,1,0,1,1,0
311,1,1,0,0,0,1,0,0,1
312,0,0,1,1,1,1,1,1,1
313,1,0,0,0,0,0,0,1,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products.
For this case study, we will min_support=0.2

In [11]:
#Set threshold value untuk digunakan dalam penghitungan support
from mlxtend.frequent_patterns import apriori, association_rules
freqpurchase = apriori(encodeddf, min_support=0.2, use_colnames=True)
freqpurchase.head(33)


Unnamed: 0,support,itemsets
0,0.869841,(nan)
1,0.501587,(Milk)
2,0.406349,(Diaper)
3,0.438095,(Wine)
4,0.504762,(Bread)
5,0.361905,(Pencil)
6,0.438095,(Eggs)
7,0.501587,(Cheese)
8,0.47619,(Meat)
9,0.409524,"(Milk, nan)"


The we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [12]:
assRules = association_rules(freqpurchase, metric="confidence", min_threshold=0.6)
assRules.head(14)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Milk),(nan),0.501587,0.869841,0.409524,0.816456,0.938626,-0.026778,0.709141,-0.115976
1,(Diaper),(nan),0.406349,0.869841,0.31746,0.78125,0.898152,-0.035999,0.595011,-0.160381
2,(Wine),(nan),0.438095,0.869841,0.31746,0.724638,0.833069,-0.063613,0.472682,-0.262869
3,(Bread),(nan),0.504762,0.869841,0.396825,0.786164,0.903801,-0.042237,0.608683,-0.176903
4,(Pencil),(nan),0.361905,0.869841,0.266667,0.736842,0.8471,-0.048133,0.494603,-0.220499
5,(Eggs),(nan),0.438095,0.869841,0.336508,0.768116,0.883053,-0.044565,0.56131,-0.190735
6,(Cheese),(nan),0.501587,0.869841,0.393651,0.78481,0.902245,-0.042651,0.604855,-0.178565
7,(Meat),(nan),0.47619,0.869841,0.368254,0.773333,0.889051,-0.045956,0.57423,-0.192405
8,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
9,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__, __conviction__ and the interpretation from the case above (please use text section)

1. Antecedent Support
Antecedent support refers to the frequency or probability of the occurrence of the items on the left-hand side (antecedent) of a rule in the dataset. It tells you how often the items in the antecedent appear in the database.
2. Consequent Support
Consequent support refers to the frequency or probability of the occurrence of the items on the right-hand side (consequent) of a rule in the dataset. It measures how often the consequent item appears in transactions.
3. Support
Support is a general metric that measures how often a particular itemset (or combination of items) appears in the dataset. It is used to identify frequent itemsets in market basket analysis.
4. Confidence
Confidence is a measure of the likelihood that the consequent item will appear in a transaction, given that the antecedent item is already present. It shows the strength of the association between the antecedent and consequent.
5. Lift
Lift measures how much more likely the consequent item is to appear when the antecedent is present, compared to when the antecedent is absent. It is a ratio that compares the observed support of the rule to the expected support if the two items were independent.
6. Leverage
Leverage quantifies the difference between the observed support of a rule and the expected support under the assumption that the antecedent and consequent are independent.
7. Conviction
Conviction is a metric that measures how likely it is that the rule will hold if the antecedent occurs. It is a measure of how much stronger the rule is than just random chance.
8. Interpretation of the Case: Given these metrics, it would interpret a case involving association rule mining or market basket analysis by looking at the following:
- **Support** tells you how frequent a specific rule or itemset is in your data. High support suggests that the items or rules are common in transactions.
- **Confidence** gives insight into how strong the relationship is between the antecedent and consequent. A higher confidence means that if the antecedent appears, the consequent is likely to appear as well.
- **Lift** shows how much more likely the antecedent and consequent are to co-occur than would be expected by chance. Lift values greater than 1 suggest that the two items are positively correlated.
- **Leverage** quantifies how much more frequent the occurrence of the antecedent and consequent together is compared to their independent occurrences.
- **Conviction** measures the reliability of a rule and can be particularly useful for distinguishing strong rules from weak ones.

