In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze which items are usually purchased together.

You can refer to these articles to learn more about the Apriori algorithm and association rules:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use a dataset of transactions from a store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [2]:
# Read the store transactions from the published CSV file.
dataset_url = 'https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv'
df = pd.read_csv(dataset_url)

# Preview the first five transactions.
df.head()


Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


In [3]:
# Collect every distinct cell value across all item columns.
# Empty item slots are read as NaN, so NaN ends up in the set as well.
products = set().union(*(df[col].unique() for col in df.columns))
print(products)

{'Bagel', 'Cheese', 'Pencil', 'Milk', 'Bread', 'Diaper', 'Meat', 'Wine', 'Eggs', nan}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [4]:
# Build the item universe: every distinct value found in any column
# (this includes the NaN placeholder that marks empty item slots).
products = set()
for column_name in df.columns:
    products |= set(df[column_name].unique())

# Encode each transaction as a {product: 0/1} mapping.
# NOTE(review): the NaN placeholder is kept as a key; element-wise comparison
# against NaN is expected to be False, so its flag should stay 0 — confirm
# before relying on it.
encoded_transactions = []
for _, transaction in df.iterrows():
    basket = transaction.values
    flags = {}
    for product in products:
        flags[product] = 1 if product in basket else 0
    encoded_transactions.append(flags)

# Show the encoding of the first transaction.
encoded_transactions[0]


{'Bagel': 0,
 'Cheese': 1,
 'Pencil': 1,
 'Milk': 0,
 'Bread': 1,
 'Diaper': 1,
 'Meat': 1,
 'Wine': 1,
 'Eggs': 1,
 nan: 0}

In [5]:
# Give empty item slots an explicit placeholder label ('NaN') so they can be
# encoded like any other category.
df_with_nan = df.fillna('NaN')

# Flatten row by row: each transaction contributes one entry per item slot.
flattened_data = df_with_nan.values.flatten()

# Step 1: one-hot encode every individual slot, placeholder included.
# `sparse_output` is the current name of OneHotEncoder's former `sparse` argument.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(flattened_data.reshape(-1, 1))

# Step 2: per-slot one-hot frame (one row per slot, one column per label).
encoded_df = pd.DataFrame(encoded_data, columns=encoder.categories_[0])

# Step 3: collapse the slots of each transaction into a single 0/1 row.
# Flattening is row-major, so each consecutive run of `slots_per_transaction`
# rows in `encoded_df` belongs to one transaction. Taking the max marks a
# label as present when it occurs in any slot. This yields the same frame as
# marking cells one at a time with `.loc`, without the Python-level loop.
slots_per_transaction = df_with_nan.shape[1]
row_numbers = np.repeat(np.arange(len(df_with_nan)), slots_per_transaction)
product_data = (
    encoded_df
    .groupby(row_numbers)
    .max()
    .astype('int64')
    .set_axis(df.index, axis=0)  # restore the original transaction index
)

# Display the final DataFrame.
# (Message text kept as-is; the placeholder column itself is labelled 'NaN'.)
print("Final one-hot encoded dataset with 'No Product':")
product_data.head()

Final one-hot encoded dataset with 'No Product':


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,NaN,Pencil,Wine
0,0,1,1,1,1,1,0,0,1,1
1,0,1,1,1,0,1,1,0,1,1
2,0,0,1,0,1,1,1,1,0,1
3,0,0,1,0,1,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1,1


In [6]:
# The encoded frame carries a column for the empty-slot placeholder ('NaN').
# Remove it by label; errors='ignore' makes this a no-op when the column is
# already gone.
product_data = product_data.drop(columns=['NaN'], errors='ignore')

product_data.head()

Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


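Because the encoded DataFrame contains a column for the empty (NaN) placeholder, we drop that column by name. Selecting the remaining columns by position would also work, but note that the NaN column is not the first column in this DataFrame.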

## Apriori Algorithm

We will use the Apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2.

In [7]:
# Set the threshold value used when computing support.
from mlxtend.frequent_patterns import apriori, association_rules
min_support = 0.2
# mlxtend expects a boolean one-hot frame; recent versions warn that 0/1
# integer frames are slower and may lose support, so cast for this call only.
# The resulting supports are unchanged.
frequent_itemsets = apriori(product_data.astype(bool), min_support=min_support, use_colnames=True)
frequent_itemsets




Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bagel, Bread)"


Then we will generate association rules from the frequent itemsets, based on confidence with threshold=0.6.

In [8]:
confidence_threshold = 0.6
# Keep the rules whose confidence reaches the threshold, then discard the
# zhangs_metric column, which is not part of the metrics discussed below.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=confidence_threshold,
).drop(columns=['zhangs_metric'])
rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
2,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
3,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
4,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
5,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
8,"(Cheese, Meat)",(Eggs),0.32381,0.438095,0.215873,0.666667,1.521739,0.074014,1.685714
9,"(Eggs, Meat)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667


Provide an explanation of __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, and __conviction__, and an interpretation of the case above (please use a text section).

**Antecedent Support** represents the proportion of transactions that include the item on the left-hand side of the rule (the antecedent). For example, in the rule *Bagel → Bread*, the antecedent *Bagel* has a support of 42.54%, meaning *Bagel* appears in 42.54% of all transactions. **Consequent Support** indicates the proportion of transactions containing the item on the right-hand side of the rule (the consequent), such as *Bread*, which has a support of 50.48%.

**Support** measures the proportion of transactions that contain both the antecedent and consequent together. For the rule *Bagel → Bread*, this combination occurs in 27.94% of transactions.  

**Confidence** measures the likelihood of the consequent occurring when the antecedent is present. For the rule *Bagel → Bread*, the confidence is 65.67%, meaning 65.67% of transactions that include *Bagel* also include *Bread*.

**Lift** evaluates the strength of the relationship between the antecedent and consequent; a lift value greater than 1 indicates a positive association. For the rule *Bagel → Bread*, the lift is 1.30, showing a moderate positive relationship.

**Leverage** measures the difference between the actual frequency of the antecedent-consequent pair and the frequency expected if the items were independent. For *Bagel → Bread*, the leverage of 0.065 indicates that this combination occurs slightly more frequently than expected if the items were unrelated.

Lastly, **Conviction** assesses the strength of the rule by considering how rarely the antecedent occurs without the consequent. For *Bagel → Bread*, the conviction is 1.44, reflecting a moderately strong, but not perfect, relationship.  

In this analysis, among the rules displayed above, the rule with the highest *lift* is *Eggs, Meat → Cheese* (lift = 1.61), followed by *Cheese, Meat → Eggs* (lift = 1.52), suggesting a relatively strong positive association between purchasing eggs, meat, and cheese. The highest *confidence* is also found in the rule *Eggs, Meat → Cheese* with a value of 80.95%, meaning that most transactions containing eggs and meat also include cheese. The same rule has the highest *conviction* among the displayed rules (2.62); a conviction of 2.95 for a rule combining milk, cheese, and meat does not appear in the displayed rows and should be checked against the full `rules` table. On the other hand, rules with lower *lift*, such as *Milk → Cheese* (lift = 1.21), indicate weaker relationships. This analysis provides insights into consumer purchasing patterns, which can be leveraged for more effective promotional strategies.

Source : https://www.techtarget.com/searchbusinessanalytics/definition/association-rules-in-data-mining#:~:text=Association%20rules%20are%20if%2Dthen,in%20various%20types%20of%20databases.

