In [1]:
import logging
import pickle
from pathlib import Path

from bundling import load_best_bundles
from bundling import rule_based_product_bundle
from dataset import read_dataset, split

In [3]:
# Configure root logging first so DEBUG records emitted during loading/splitting are shown.
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)

# Despite its name, `data_dir` points at the raw CSV file itself.
data_dir = Path("../data/data.csv")

# Load the dataset, then split it into train / validation / test frames keyed on "InvoiceNo".
# NOTE(review): `data_dir` is passed to split() twice (2nd and 4th positional argument);
# confirm against split()'s signature that both are meant to be the CSV path.
df = read_dataset(path=data_dir)
splits = split(df, data_dir, "InvoiceNo", data_dir)
train_df, valid_df, test_df = splits

2023-10-01 20:57:32,653 - root - DEBUG - Loading preprocessed csv from ../data/data_cleaned.csv
2023-10-01 20:57:33,085 - root - DEBUG - Splits NOT found in: ../data or some are missing. Splitting...
2023-10-01 20:57:33,085 - root - DEBUG - Identify unique InvoiceNo
2023-10-01 20:57:33,089 - root - DEBUG - Perform a InvoiceNo-level split Train / Valid / Test: 0.6 / 0.2 / 0.2
2023-10-01 20:57:33,092 - root - DEBUG - Split the dataset


In [4]:
# Parameters forwarded to the rule-based bundler, kept together so they are easy to tune.
bundle_params = dict(
    min_support=0.025,
    metric="confidence",
    min_threshold=0.8,
    min_confidence=0.8,
    min_bundle_size=1,
)

# Mine bundles from the training split only, then show how many were produced.
predicted_bundles = rule_based_product_bundle(train_df, **bundle_params)
len(predicted_bundles)

2023-10-01 20:57:51,562 - root - DEBUG - Create a pivot table for market basket analysis using InvoiceNo and ItemID
2023-10-01 20:57:53,249 - root - DEBUG - Perform market basket analysis using Apriori algorithm
2023-10-01 20:57:54,140 - root - DEBUG - Extracted 10 common_bundles from frequent itemsets
2023-10-01 20:57:54,142 - root - DEBUG - Generate association rules from frequent itemsets
2023-10-01 20:57:54,145 - root - DEBUG - Sort the rules by confidence or lift, depending on your preference
2023-10-01 20:57:54,147 - root - DEBUG - Iterate through the sorted rules and extract bundles
2023-10-01 20:57:54,148 - root - DEBUG - Generated 0 bundles using association rules


10

In [None]:
# Ground truth: the distinct item set of every invoice, keeping only invoices with 2+ items.
# NOTE(review): this is built from the full `df`, which still contains the invoices in
# `train_df` that the bundles were mined from — confirm whether a held-out split
# (e.g. `test_df`) was intended here.
setty_df = df.groupby("InvoiceNo")["ItemID"].apply(frozenset)
actual_bundles = set(setty_df[setty_df.apply(len) > 1].to_list())

# Calculate evaluation metrics (a prediction counts only on an exact item-set match)
true_positives = len(predicted_bundles & actual_bundles)
false_positives = len(predicted_bundles - actual_bundles)
false_negatives = len(actual_bundles - predicted_bundles)

# Guard every ratio: with no predictions, no ground-truth bundles, or no overlap at all,
# the unguarded divisions would raise ZeroDivisionError. The metric is reported as 0.0 instead.
predicted_total = true_positives + false_positives
actual_total = true_positives + false_negatives

precision = true_positives / predicted_total if predicted_total else 0.0
recall = true_positives / actual_total if actual_total else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0


print("Evaluation Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Evaluation Metrics:
Precision: 0.4
Recall: 0.0002363088556743664
F1 Score: 0.0004723386668241129


In [7]:
# Load the precomputed bundles from disk (imports live in the notebook's top import cell).
# NOTE: pickle.load can execute arbitrary code while deserializing, so only ever point
# this at a pickle file produced by this project.
with open("../static/best_bundles.pickle", "rb") as pickle_file:
    static_bundles = pickle.load(pickle_file)

In [8]:
# Item ID (kept as a string) used by the bundle lookups in the following cells.
product_id = "22423"

In [14]:
# Display the bundles loaded from the pickle file for inspection.
# NOTE(review): this renders the whole collection; consider previewing only a small slice.
static_bundles

{frozenset({'22386', '23199'}),
 frozenset({'20725', '22382', '22383'}),
 frozenset({'22423', '23173'}),
 frozenset({'82483', '82486'}),
 frozenset({'20727', '23207'}),
 frozenset({'20727', '85123A'}),
 frozenset({'20719', '22355'}),
 frozenset({'20725', '22384'}),
 frozenset({'21928', '21929'}),
 frozenset({'20723', '22355'}),
 frozenset({'22382', '23206'}),
 frozenset({'20726', '20728', '22383'}),
 frozenset({'22727', '22729'}),
 frozenset({'20728', '23208'}),
 frozenset({'22386', '85099F'}),
 frozenset({'22384', '85123A'}),
 frozenset({'20727', '22382'}),
 frozenset({'21755', '85123A'}),
 frozenset({'21931', '22385'}),
 frozenset({'82482', '85123A'}),
 frozenset({'20728', '23206'}),
 frozenset({'20725', '23206'}),
 frozenset({'20727', '22384'}),
 frozenset({'22551', '22555'}),
 frozenset({'23321', '23322'}),
 frozenset({'20713', '85099B'}),
 frozenset({'82580', '82581'}),
 frozenset({'21931', '23203'}),
 frozenset({'23173', '23175'}),
 frozenset({'20725', '22386', '85099B'}),
 froze

In [9]:
# Every static bundle that contains the selected product.
{bundle for bundle in static_bundles if product_id in bundle}


{frozenset({'22423', '84879'}),
 frozenset({'22423', '23245'}),
 frozenset({'22423', '85123A'}),
 frozenset({'22423', '22698'}),
 frozenset({'22423', '22698', '22699'}),
 frozenset({'22423', '23173'}),
 frozenset({'22423', '22697', '22698'}),
 frozenset({'22423', '22699'}),
 frozenset({'22423', '22697'}),
 frozenset({'22423', '22697', '22698', '22699'}),
 frozenset({'22423', '47566'}),
 frozenset({'22423', '22697', '22699'}),
 frozenset({'22423', '22720'})}

In [13]:
# Collect the static bundles in which the selected product appears.
{candidate for candidate in static_bundles if product_id in candidate}

{frozenset({'22423', '84879'}),
 frozenset({'22423', '23245'}),
 frozenset({'22423', '85123A'}),
 frozenset({'22423', '22698'}),
 frozenset({'22423', '22698', '22699'}),
 frozenset({'22423', '23173'}),
 frozenset({'22423', '22697', '22698'}),
 frozenset({'22423', '22699'}),
 frozenset({'22423', '22697'}),
 frozenset({'22423', '22697', '22698', '22699'}),
 frozenset({'22423', '47566'}),
 frozenset({'22423', '22697', '22699'}),
 frozenset({'22423', '22720'})}

In [15]:
product_id = "22423"

# Keep only the bundles that include the product of interest.
filtered_bundles = list(filter(lambda candidate: product_id in candidate, static_bundles))

# Order a copy largest-first; the sort is stable, so ties keep their original relative order.
sorted_bundles = list(filtered_bundles)
sorted_bundles.sort(key=len, reverse=True)

# Report the first of the largest bundles, or say that the product is in none of them.
if not sorted_bundles:
    print("No bundle found containing productId:", product_id)
else:
    first_longest_bundle = sorted_bundles[0]
    print("First Longest Bundle:", first_longest_bundle)


First Longest Bundle: frozenset({'22699', '22698', '22697', '22423'})


In [34]:
product_id = "22423"

# Rows for this item, ordered by "InvoiceDate" descending, so the first row is the latest one.
item_rows = df.loc[df["ItemID"] == product_id]
latest_row = item_rows.sort_values("InvoiceDate", ascending=False).iloc[0]

# Unit price recorded on that latest row.
latest_row["UnitPrice"]

12.75

In [37]:
# Switch the notebook to INFO-level logging before reloading the dataset.
logging.basicConfig(
    level=logging.INFO,  # Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# basicConfig() does nothing once the root logger already has a handler, and an earlier
# cell already configured it at DEBUG. Set the root level explicitly so the INFO level
# actually takes effect and DEBUG records stop being emitted.
logging.getLogger().setLevel(logging.INFO)

# Despite its name, `data_dir` points at the raw CSV file itself.
data_dir = Path("../data/data.csv")
logging.info("Load and preprocess dataset (load preprocessed file if possible)")
dataset = read_dataset(data_dir, force=False)

2023-10-01 21:18:45,187 - root - INFO - Load and preprocess dataset (load preprocessed file if possible)
2023-10-01 21:18:45,190 - root - DEBUG - Loading preprocessed csv from ../data/data_cleaned.csv


Unnamed: 0,ItemID,UnitPrice
0,85123A,2.55
1,71053,3.39
2,84406B,2.75
3,84029G,3.39
4,84029E,3.39
...,...,...
392727,22613,0.85
392728,22899,2.10
392729,23254,4.15
392730,23255,4.15
