In [1]:
from dataset import read_dataset, split
from pathlib import Path
import pandas as pd
from bundling import rule_based_product_bundle
import logging

In [2]:
# Location of the raw CSV handed to read_dataset (note: this is a file path,
# despite the variable name).
data_dir = Path("data/data.csv")

# Configure the root logger before any project code runs so that the DEBUG
# messages emitted while loading and splitting are shown under this cell.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,  # one of DEBUG, INFO, WARNING, ERROR, CRITICAL
)

# Load the dataset, then partition it into train / validation / test frames
# keyed on the "InvoiceNo" column.
df = read_dataset(path=data_dir)
train_df, valid_df, test_df = split(df, "InvoiceNo")

2023-10-01 18:19:41,959 - root - DEBUG - Loading preprocessed csv from data/data_cleaned.csv
2023-10-01 18:19:42,385 - root - DEBUG - Splits NOT found in: data/splits or some are missing. Splitting...
2023-10-01 18:19:42,385 - root - DEBUG - Identify unique InvoiceNo
2023-10-01 18:19:42,389 - root - DEBUG - Perform a InvoiceNo-level split Train / Valid / Test: 0.6 / 0.2 / 0.2
2023-10-01 18:19:42,392 - root - DEBUG - Split the dataset


In [3]:
# Mine candidate product bundles from the training split only; the thresholds
# below are passed straight through to rule_based_product_bundle.
predicted_bundles = rule_based_product_bundle(
    train_df,
    min_support=0.025,
    metric="confidence",
    min_threshold=0.8,
    min_confidence=0.8,
    min_bundle_size=1,
)

# Display how many bundles were produced.
len(predicted_bundles)

2023-10-01 18:19:49,817 - root - DEBUG - Create a pivot table for market basket analysis using InvoiceNo and ItemID
2023-10-01 18:19:50,435 - root - DEBUG - Perform market basket analysis using Apriori algorithm
2023-10-01 18:19:51,253 - root - DEBUG - Extracted 10 common_bundles from frequent itemsets
2023-10-01 18:19:51,254 - root - DEBUG - Generate association rules from frequent itemsets
2023-10-01 18:19:51,256 - root - DEBUG - Sort the rules by confidence or lift, depending on your preference
2023-10-01 18:19:51,257 - root - DEBUG - Iterate through the sorted rules and extract bundles
2023-10-01 18:19:51,257 - root - DEBUG - Generated 0 bundles using association rules


10

In [4]:
# Ground truth: the set of items bought together on each invoice, keeping only
# invoices that contain more than one distinct item.
# NOTE(review): this is built from the full `df`, which includes the invoices
# in `train_df` that the bundles were mined from. Consider evaluating against
# `test_df` (or `valid_df`) to avoid leakage -- confirm the intended protocol.
setty_df = df.groupby("InvoiceNo")["ItemID"].apply(frozenset)
actual_bundles = set(setty_df[setty_df.map(len) > 1])

# Calculate evaluation metrics via exact set matching: a predicted bundle only
# counts as a hit when it equals the complete item set of some invoice.
true_positives = len(predicted_bundles & actual_bundles)
false_positives = len(predicted_bundles - actual_bundles)
false_negatives = len(actual_bundles - predicted_bundles)

# Guard every denominator: with no predicted bundles, no multi-item invoices,
# or no overlap at all, the plain ratios would raise ZeroDivisionError.
predicted_total = true_positives + false_positives
actual_total = true_positives + false_negatives

precision = true_positives / predicted_total if predicted_total else 0.0
recall = true_positives / actual_total if actual_total else 0.0

if precision + recall:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0


print("Evaluation Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Evaluation Metrics:
Precision: 0.4
Recall: 0.0002363088556743664
F1 Score: 0.0004723386668241129
