In [1]:
import pandas as pd
import time
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

basket_bool = pd.read_parquet("../data/processed/basket_bool.parquet")

In [2]:
def run_rules(min_support, min_confidence, min_lift):
    freq = fpgrowth(
        basket_bool,
        min_support=min_support,
        use_colnames=True
    )
    
    rules = association_rules(
        freq,
        metric="confidence",
        min_threshold=min_confidence
    )
    
    rules = rules[rules["lift"] >= min_lift]
    
    return {
        "min_support": min_support,
        "min_confidence": min_confidence,
        "min_lift": min_lift,
        "n_rules": rules.shape[0],
        "avg_len": rules["antecedents"].apply(len).mean() if len(rules) > 0 else 0
    }


In [3]:
results_normal = []

for s in [0.02, 0.01, 0.005]:
    for c in [0.3, 0.5]:
        for l in [1.0, 1.2]:
            results_normal.append(run_rules(s, c, l))

df_normal = pd.DataFrame(results_normal)
df_normal




Unnamed: 0,min_support,min_confidence,min_lift,n_rules,avg_len
0,0.02,0.3,1.0,184,1.065217
1,0.02,0.3,1.2,184,1.065217
2,0.02,0.5,1.0,76,1.144737
3,0.02,0.5,1.2,76,1.144737
4,0.01,0.3,1.0,2374,1.603201
5,0.01,0.3,1.2,2374,1.603201
6,0.01,0.5,1.0,1208,1.865066
7,0.01,0.5,1.2,1208,1.865066
8,0.005,0.3,1.0,77125,2.404992
9,0.005,0.3,1.2,77125,2.404992


In [4]:
df = pd.read_csv("../data/processed/cleaned_uk_data.csv")
df["InvoiceValue"] = df["Quantity"] * df["UnitPrice"]

invoice_value = df.groupby("InvoiceNo")["InvoiceValue"].sum()



  df = pd.read_csv("../data/processed/cleaned_uk_data.csv")


In [5]:
def weighted_support(rule, df, invoice_value):
    items = set(rule["antecedents"]).union(set(rule["consequents"]))
    invoices = df[df["Description"].isin(items)]["InvoiceNo"].unique()
    return invoice_value.loc[invoices].sum() / invoice_value.sum()


In [6]:
def run_weighted_rules(min_support, min_confidence, min_weighted_support):
    freq = fpgrowth(
        basket_bool,
        min_support=min_support,
        use_colnames=True
    )
    
    rules = association_rules(
        freq,
        metric="confidence",
        min_threshold=min_confidence
    )
    
    rules["weighted_support"] = rules.apply(
        weighted_support,
        axis=1,
        df=df,
        invoice_value=invoice_value
    )
    
    rules = rules[rules["weighted_support"] >= min_weighted_support]
    
    return {
        "min_support": min_support,
        "min_confidence": min_confidence,
        "min_weighted_support": min_weighted_support,
        "n_rules": rules.shape[0],
        "avg_len": rules["antecedents"].apply(len).mean() if len(rules) > 0 else 0
    }


In [7]:
results_weighted = []

for s in [0.02, 0.01]:
    for c in [0.3, 0.5]:
        for ws in [0.01, 0.02]:
            results_weighted.append(run_weighted_rules(s, c, ws))

df_weighted = pd.DataFrame(results_weighted)
df_weighted




Unnamed: 0,min_support,min_confidence,min_weighted_support,n_rules,avg_len
0,0.02,0.3,0.01,184,1.065217
1,0.02,0.3,0.02,184,1.065217
2,0.02,0.5,0.01,76,1.144737
3,0.02,0.5,0.02,76,1.144737
4,0.01,0.3,0.01,2374,1.603201
5,0.01,0.3,0.02,2374,1.603201
6,0.01,0.5,0.01,1208,1.865066
7,0.01,0.5,0.02,1208,1.865066
