In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.23.3
    Uninstalling mlxtend-0.23.3:
      Successfully uninstalled mlxtend-0.23.3
Successfully installed mlxtend-0.23.1


# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze the items that are usually purchased together.

You can refer to these articles to learn about the apriori algorithm and association rules:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [64]:
# Load the retail transactions dataset and show the first five transactions.
DATA_URL = 'https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv'
df = pd.read_csv(DATA_URL)
df.head()


  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


In [65]:
# Gather the distinct values of every item column into a single set.
# Empty basket slots are read as NaN, so NaN appears as a member of the set too.
products = set().union(*(df[col].unique() for col in df.columns))
print(products)

{'Bagel', nan, 'Wine', 'Cheese', 'Meat', 'Bread', 'Eggs', 'Diaper', 'Milk', 'Pencil'}


  and should_run_async(code)


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [66]:
# Build a one-hot (presence/absence) encoding of every transaction.

# Label of the placeholder column for empty basket slots. The column is kept in
# the encoding (always 0) because the following cell drops it by name.
MISSING_LABEL = 'nan'

# One list of raw cell values per row; empty slots come through as NaN.
transactions = df.apply(lambda row: row.tolist(), axis=1).tolist()

# Distinct items actually present in each transaction.
# pd.isna is required here: missing cells are float NaN, not None, so an
# `is None` check never matches them.
baskets = [
    {str(item) for item in transaction if not pd.isna(item)}
    for transaction in transactions
]

# Sorted column labels: every product seen, plus the placeholder column.
all_items = sorted(set().union(*baskets) | {MISSING_LABEL})

# One dict per transaction mapping each label to 1 (present) or 0 (absent).
# Membership is used instead of a count so that an item repeated within a
# single basket still yields 1, keeping the encoding strictly binary.
encoded_transactions = [
    {item: int(item in basket) for item in all_items}
    for basket in baskets
]

# Display the first encoded transaction
encoded_transactions[0]

  and should_run_async(code)


{'Bagel': 0,
 'Bread': 1,
 'Cheese': 1,
 'Diaper': 1,
 'Eggs': 1,
 'Meat': 1,
 'Milk': 0,
 'Pencil': 1,
 'Wine': 1,
 'nan': 0}

In [67]:
  # create new dataframe from the encoded features
df_new = pd.DataFrame(data=encoded_transactions)

# Preview the first rows of the encoded frame.
df_new.head()

  and should_run_async(code)


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine,nan
0,0,1,1,1,1,1,0,1,1,0
1,0,1,1,1,0,1,1,1,1,0
2,0,0,1,0,1,1,1,0,1,0
3,0,0,1,0,1,1,1,0,1,0
4,0,0,0,0,0,1,0,1,1,0


In [68]:
# The encoded dataframe contains a placeholder 'nan' column for empty basket slots; drop it by name.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second run no longer fails because the column is already gone.
df_new = df_new.drop(columns='nan', errors='ignore')

df_new.head()


  and should_run_async(code)


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


Since the encoded dataframe contains an empty `nan` column, we drop that column by name; alternatively, you can select all columns other than the last one, which is where the `nan` column sits.

## Apriori Algorithm

We will use the apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2

In [71]:
# Set the threshold value used when computing support: an itemset is kept only if
# it appears in at least this fraction of all transactions.
from mlxtend.frequent_patterns import apriori, association_rules

support = 0.2

# apriori works on a one-hot frame of booleans; casting the 0/1 integers to bool
# gives the same itemsets while passing the dtype mlxtend expects.
frequent_itemsets = apriori(df_new.astype(bool), min_support=support, use_colnames=True)

frequent_itemsets


  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bagel, Bread)"


Then we will generate association rules from the frequent itemsets, keeping only the rules whose confidence is at least the threshold of 0.6.

In [73]:
# Minimum confidence a rule must reach to be kept.
threshold = 0.6

# Derive the rules from the frequent itemsets, then leave out the
# 'zhangs_metric' column from the resulting table.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=threshold,
).drop(columns='zhangs_metric')
rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
2,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
3,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
4,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
5,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
8,"(Eggs, Cheese)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773
9,"(Eggs, Meat)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__ and the interpretation from the case above (please use text section)

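The association rules show relationships between items purchased together in the same transaction. The antecedent is the item(s) on the "if" side of the rule, and the consequent is the item(s) on the "then" side: when the antecedent is in a basket, the consequent is likely to be in that basket too. The rule does not imply any order in time in which the items were bought. The other columns describe various metrics used to evaluate the strength and significance of these associations.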

Key Metrics and Their Interpretation:
- Antecedent Support: This measures how frequently the antecedent (the "if" part of the rule) appears in the transactions. For example, in the first rule, Bagel appears in 42.54% of all transactions (0.425397), meaning Bagels are bought in about 42.5% of the total transactions.

- Consequent Support: This shows how frequently the consequent (the "then" part of the rule) appears in the transactions. For instance, Bread appears in 50.48% of transactions, indicating that Bread is bought in half of the transactions.

- Support: This value tells us how often the antecedent and consequent appear together in a transaction. For the rule from Bagel -> Bread, the support is 27.94%, meaning that Bagel and Bread are purchased together in 27.94% of transactions. Support gives a measure of the overall importance of the rule.

- Confidence: Confidence indicates how likely it is that the consequent will appear given that the antecedent has appeared in a transaction. For the Bagel -> Bread rule, the confidence is 65.67%, meaning that if a customer buys Bagels, there is a 65.67% chance they will also buy Bread.

- Lift: Lift compares the confidence of the rule with the overall support of the consequent (lift = confidence / consequent support). A lift greater than 1 suggests a positive relationship: the items appear together more often than would be expected if they were independent. In the case of Bagel -> Bread, the lift is 1.30 (0.656716 / 0.504762), which means that customers who buy Bagels are about 1.30 times as likely to buy Bread as customers in general.

- Leverage: Leverage is a measure of the difference between the observed frequency of the rule and the expected frequency if the items were independent (leverage = support - antecedent support x consequent support). For Bagel -> Bread, the leverage is 0.064641 (0.279365 - 0.425397 x 0.504762), meaning Bagel and Bread appear together in about 6.5 percentage points more transactions than independence would predict. A leverage of 0 indicates independence.

- Conviction: This metric compares how often the rule would be wrong (the antecedent occurring without the consequent) if the items were independent with how often it is actually wrong (conviction = (1 - consequent support) / (1 - confidence)). A conviction of 1 means the items are independent, and values greater than 1 mean the consequent depends on the antecedent. For Bagel -> Bread, the conviction is 1.44 ((1 - 0.504762) / (1 - 0.656716)), indicating a reasonably strong association, meaning if a customer buys Bagels, it is more likely that they will also buy Bread.

Observations from the Rules:

- Eggs -> Cheese has a confidence of 68.12% and lift of 1.36, meaning customers who buy Eggs are likely to buy Cheese with a strong relationship.
- Cheese, Meat -> Eggs has a confidence of 66.67% and a lift of 1.52, which suggests that the combination of Cheese and Meat strongly predicts the purchase of Eggs.
- Milk -> Cheese has a support of 30.48%, confidence of 60.76%, and lift of 1.21, which means Milk and Cheese are bought together reasonably often, though the relationship is weaker compared to other rules.

In general, these rules suggest that certain products are more likely to be purchased together, which can help in product placement, promotions, and targeted marketing strategies. For example, the strong relationship between Eggs and Cheese could be used for bundling them together in promotions or strategically placing them in proximity on store shelves to encourage customers to buy both.