# Market Basket Analysis (Association Rules)

## Basket Analysis

In [None]:
import pandas as pd
import numpy as np
from dask.distributed import Client
from mlxtend.frequent_patterns import fpgrowth, association_rules
import dask.dataframe as dd

from google.colab import drive
drive.mount('/content/drive')

# Market-basket pipeline: load sales, keep the best-selling products, build a
# store x product presence matrix, mine frequent itemsets with FP-Growth and
# export the single-product -> single-product association rules.

# Tunable analysis parameters, kept together so they are easy to adjust.
TOP_N_PRODUCTS = 15  # how many best-selling products (by summed sales) to keep
MIN_SUPPORT = 0.1    # minimum fraction of rows (stores) containing an itemset
MIN_LIFT = 1         # keep rules whose lift is at least this value

# Start a local Dask cluster so the CSV is read and aggregated in parallel.
# The client is intentionally left running for the rest of the session; call
# client.close() once no further Dask work is needed.
client = Client()

# Load only the columns the analysis actually uses.
sales_df = dd.read_csv('redacted.csv', usecols=["store_id", "product_id", "sales"])

# Make product_id categorical, then let Dask scan the data so the category
# set is known before grouping on it.
sales_df["product_id"] = sales_df["product_id"].astype("category")
sales_df = sales_df.categorize(columns=["product_id"])

# Keep only rows belonging to the products with the largest total sales.
top_products = (
    sales_df.groupby("product_id")["sales"]
    .sum()
    .nlargest(TOP_N_PRODUCTS)
    .index.compute()
)
sales_df = sales_df[sales_df["product_id"].isin(top_products)]

# Materialise the filtered data as a pandas DataFrame for the pivot below.
sales_df = sales_df.compute()

# One row per store, one column per product, cell = summed sales.
# observed=True restricts the pivot to the products still present after the
# filter: product_id keeps every original category, and without it pandas
# would also group over all the filtered-out products.
basket_df = sales_df.pivot_table(
    index="store_id",
    columns="product_id",
    values="sales",
    aggfunc="sum",
    observed=True,
).fillna(0)

# Boolean presence matrix: True when the store's summed sales of the product
# are positive. Each "basket" is therefore a store, so support is measured as
# a share of stores.
basket_df = basket_df > 0

# Mine frequent itemsets. Only product pairs are reported below, so cap the
# itemset size at 2 instead of enumerating every larger combination.
frequent_itemsets = fpgrowth(
    basket_df,
    min_support=MIN_SUPPORT,
    use_colnames=True,
    max_len=2,
)

# Derive rules filtered on lift, then keep the ones with exactly one product
# on each side. With max_len=2 this filter is a safety net that documents the
# intended shape of the output.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=MIN_LIFT)
is_single_pair = (rules["antecedents"].apply(len) == 1) & (rules["consequents"].apply(len) == 1)
rules = rules[is_single_pair]

# Persist the rules and show a preview.
rules.to_csv("redacted/market_basket_rules.csv", index=False)

print(rules.head())

Mounted at /content/drive


INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:41677
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:36967'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:36877'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:45483', name: 1, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:45483
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:49174
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:34653', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker co

  antecedents consequents  antecedent support  consequent support   support  \
0     (P0438)     (P0364)            0.906542            0.869159  0.803738   
1     (P0364)     (P0438)            0.869159            0.906542  0.803738   
2     (P0438)     (P0103)            0.906542            0.850467  0.794393   
3     (P0103)     (P0438)            0.850467            0.906542  0.794393   
4     (P0103)     (P0364)            0.850467            0.869159  0.747664   

   confidence      lift  representativity  leverage  conviction  \
0    0.886598  1.020064               1.0  0.015809    1.153781   
1    0.924731  1.020064               1.0  0.015809    1.241656   
2    0.876289  1.030361               1.0  0.023408    1.208723   
3    0.934066  1.030361               1.0  0.023408    1.417445   
4    0.879121  1.011462               1.0  0.008472    1.082413   

   zhangs_metric   jaccard  certainty  kulczynski  
0       0.210465  0.826923   0.133284    0.905665  
1       0.150332  