### in the name of Allah
# Shopping Cart Analysis and Recommender System based on ARM (Association Rule Mining)
------------------
## TASK I: Data Preprocessing

This task focuses on preparing the Instacart e-commerce dataset for Association Rule Mining analysis. The preprocessing pipeline ensures data quality and creates a manageable subset for efficient pattern mining. We address common data issues including missing values and irrelevant transactions, while focusing on multi-item baskets essential for meaningful association rule discovery. By sampling 20,000 users, we balance computational feasibility with sufficient data coverage for robust analysis.

### **Step 1: Loading the Data**
Load CSV files:
- **pandas** for small files: `aisles.csv`, `departments.csv`, `products.csv`
- **dask** for large files: `order_products__train.csv`, `orders.csv`, `order_products__prior.csv`

### **Step 2: Data Cleaning Functions**
Define functions:
- **`remove_nulls(df)`**: Remove rows with missing values.
- **`filter_single_item_orders(df)`**: Remove orders with only one product.
- **`filter_by_order_ids(df, order_ids_set)`**: Filter by order IDs.

### **Step 3: Preprocessing Execution**
1. **Remove nulls** from all tables.
2. **Remove single-item orders** from order products data.
3. **Sample 20,000 users** randomly from orders.
4. **Extract orders** for sampled users.
5. **Filter order products** to include only sampled orders.

### **Step 4: Saving Cleaned Data**
Save cleaned data to `./processed_data/`:
- **`aisles_cleaned.csv`**, **`products_cleaned.csv`**, **`departments_cleaned.csv`**
- **`orders_sampled.csv`**: Orders of 20,000 sampled users.
- **`order_products_train_sampled.csv`**, **`order_products_prior_sampled.csv`**
- **`order_products_combined.csv`**: Combined data for basket analysis.

This preprocessing ensures clean, multi-item basket data ready for Association Rule Mining in subsequent tasks.

In [None]:
import os
import ast
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules

# ============================================================================
# LOADING DATA FROM CSV FILES
# ============================================================================

# The three lookup tables are small, so they are read eagerly with pandas
aisles, products, departments = (
    pd.read_csv(csv_name)
    for csv_name in ('aisles.csv', 'products.csv', 'departments.csv')
)

# The transactional tables are large, so they are opened lazily with dask
orders = dd.read_csv('orders.csv')
order_products_train = dd.read_csv('order_products__train.csv')
order_products_prior = dd.read_csv('order_products__prior.csv')

print("Data loading completed.")

# Row counts are only known once dask actually computes them
for table_label, lazy_table in (
    ("Orders", orders),
    ("Order Products Prior", order_products_prior),
    ("Order Products Train", order_products_train),
):
    print(f"{table_label}: {lazy_table.shape[0].compute():,} rows")

KeyboardInterrupt: 

In [None]:
# ============================================================================
# DATA PREPROCESSING FUNCTIONS
# ============================================================================

def remove_nulls(df):
    """
    Drop every row that has at least one missing value.

    The argument only needs a pandas-style ``dropna()`` method; the result of
    that call is returned unchanged.
    """
    without_missing = df.dropna()
    return without_missing


def filter_single_item_orders(df):
    """
    Keep only the rows that belong to orders with two or more products.

    Basket analysis looks at items bought together, so an order with a
    single row carries no co-occurrence information and is discarded.
    Rows are grouped on the ``order_id`` column.
    """
    # Rows per order; after reset_index() the unnamed size column is labelled 0
    order_sizes = (
        df.groupby('order_id')
        .size()
        .reset_index()
        .rename(columns={0: 'item_count'})
    )

    # IDs of the orders that contain at least two rows
    has_multiple_items = order_sizes['item_count'] > 1
    kept_order_ids = order_sizes[has_multiple_items]['order_id']

    return df[df['order_id'].isin(kept_order_ids)]


def filter_by_order_ids(df, order_ids_set):
    """
    Return the rows of ``df`` whose ``order_id`` is in ``order_ids_set``.

    Used to restrict the order-products tables to the orders placed by the
    20,000 sampled users.

    Args:
        df: Dataframe with an ``order_id`` column.
        order_ids_set: Collection of order IDs to keep.

    Returns:
        The subset of ``df`` rows with a matching ``order_id``.
    """
    return df[df['order_id'].isin(order_ids_set)]

In [None]:
# ============================================================================
# TASK 1: DATA PREPROCESSING EXECUTION
# ============================================================================
# Runs the cleaning pipeline on the tables loaded earlier: drop rows with
# nulls, drop single-item orders, sample users, then keep only the order
# products that belong to the sampled users' orders.

print("\n" + "="*50)
print("TASK 1: DATA PREPROCESSING")
print("="*50)

# Step 1: Remove null values
# The three metadata tables are pandas frames; the three transactional tables
# are dask frames, and persist() asks dask to keep the cleaned result cached
# so the later steps reuse it.
# NOTE(review): dropna() removes a row when ANY column is null, so orders with
# a null in a non-key column are dropped as well — confirm this is intended.
print("\n1. Removing null values...")
aisles_cleaned = remove_nulls(aisles)
products_cleaned = remove_nulls(products)
departments_cleaned = remove_nulls(departments)
orders_cleaned = remove_nulls(orders).persist()
order_products_train_cleaned = remove_nulls(order_products_train).persist()
order_products_prior_cleaned = remove_nulls(order_products_prior).persist()

# Step 2: Remove single-item orders
# Applied separately to the train and prior tables, before any sampling.
print("2. Removing single-item orders...")
order_products_train_filtered = filter_single_item_orders(order_products_train_cleaned).persist()
order_products_prior_filtered = filter_single_item_orders(order_products_prior_cleaned).persist()

# Step 3: Sample 20,000 users
# The 20,000-user target is converted into a sampling fraction of the distinct
# user IDs, capped at 1.0 when there are fewer than 20,000 users.
print("3. Sampling 20,000 random users...")
unique_users_count = orders_cleaned['user_id'].nunique().compute()
frac_value = min(20000 / unique_users_count, 1.0)

# random_state is fixed so the same users are drawn on every run
sampled_users = orders_cleaned['user_id'].drop_duplicates().sample(
    frac=frac_value, 
    random_state=42
)

# Step 4: Get orders for sampled users
# The sampled IDs are materialised into a plain Python list for isin()
sampled_users_list = sampled_users.compute().tolist()
orders_sampled = orders_cleaned[orders_cleaned['user_id'].isin(sampled_users_list)]

# Get order IDs
# Collected into a set, which is handed to filter_by_order_ids below
sampled_order_ids = orders_sampled['order_id'].compute().tolist()
order_ids_set = set(sampled_order_ids)

# Step 5: Filter order products
# (the progress message below is numbered 4 although this is Step 5)
# filter_by_order_ids is applied to each dask partition; meta tells dask that
# the output has the same columns as the input table.
print("4. Filtering order products...")
order_products_train_sampled = order_products_train_filtered.map_partitions(
    filter_by_order_ids, 
    order_ids_set,
    meta=order_products_train_filtered._meta
).persist()

order_products_prior_sampled = order_products_prior_filtered.map_partitions(
    filter_by_order_ids,
    order_ids_set,
    meta=order_products_prior_filtered._meta
).persist()

print("\nPreprocessing completed!")


TASK 1: DATA PREPROCESSING

1. Removing null values...
2. Removing single-item orders...
3. Sampling 20,000 random users...
4. Filtering order products...

Preprocessing completed!


In [None]:
# ============================================================================
# SAVING PROCESSED DATA
# ============================================================================
# Writes the cleaned metadata tables and the sampled transactional tables to
# CSV files under ./processed_data, then prints a short Task 1 summary.

output_folder = './processed_data'
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, replacing the separate exists-check followed by makedirs
os.makedirs(output_folder, exist_ok=True)

print(f"\nSaving data to {output_folder}/")

# Save metadata files (already pandas dataframes)
aisles_cleaned.to_csv(f'{output_folder}/aisles_cleaned.csv', index=False)
products_cleaned.to_csv(f'{output_folder}/products_cleaned.csv', index=False)
departments_cleaned.to_csv(f'{output_folder}/departments_cleaned.csv', index=False)

# Save transactional data: each dask dataframe is computed into a pandas
# dataframe first so that a single CSV file is written per table
orders_sampled_computed = orders_sampled.compute()
orders_sampled_computed.to_csv(f'{output_folder}/orders_sampled.csv', index=False)

order_products_train_sampled_computed = order_products_train_sampled.compute()
order_products_train_sampled_computed.to_csv(
    f'{output_folder}/order_products_train_sampled.csv',
    index=False
)

order_products_prior_sampled_computed = order_products_prior_sampled.compute()
order_products_prior_sampled_computed.to_csv(
    f'{output_folder}/order_products_prior_sampled.csv',
    index=False
)

# Save combined data for basket analysis: train rows followed by prior rows
order_products_combined = dd.concat([order_products_train_sampled, order_products_prior_sampled])
order_products_combined_computed = order_products_combined.compute()
order_products_combined_computed.to_csv(
    f'{output_folder}/order_products_combined.csv',
    index=False
)

# Final summary
print("\n" + "="*50)
print("TASK 1 SUMMARY")
print("="*50)
print(f"Sampled users: {len(sampled_users_list):,}")
print(f"Sampled orders: {orders_sampled_computed.shape[0]:,}")
print(f"Total products for basket analysis: {order_products_combined_computed.shape[0]:,}")
print("\n✓ Task 1 completed - Data ready for basket analysis (Task 2)")
print("="*50)



Saving data to ./processed_data/

TASK 1 SUMMARY
Sampled users: 20,000
Sampled orders: 312,778
Total products for basket analysis: 3,083,695

✓ Task 1 completed - Data ready for basket analysis (Task 2)


## TASK II: Basket Construction and One-Hot Encoding

In [None]:
# ============================================================================
# LOADING PROCESSED DATA (FROM TASK1)
# ============================================================================

# Combined order-products table written at the end of Task 1
order_products = pd.read_csv('./processed_data/order_products_combined.csv')

# Cleaned product table, used to map product IDs to product names
products = pd.read_csv('./processed_data/products_cleaned.csv')

# Quick size check of what was loaded
row_total = order_products.shape[0]
distinct_orders = order_products['order_id'].nunique()
distinct_products = order_products['product_id'].nunique()

print(f"Order products: {row_total:,} rows")
print(f"Unique orders: {distinct_orders:,}")
print(f"Unique products: {distinct_products:,}")

FileNotFoundError: [Errno 2] No such file or directory: './processed_data/cleaned_data.csv'

In [None]:
# ============================================================================
# CREATING SHOPPING BASKETS
# ============================================================================

# One basket per order: the product IDs of each order_id gathered into a list
products_per_order = order_products.groupby('order_id')['product_id']
baskets = products_per_order.apply(list).reset_index(name='products')

basket_lengths = baskets['products'].apply(len)
print(f"Total baskets created: {len(baskets):,}")
print(f"Avg items per basket: {basket_lengths.mean():.2f}")

# List of product-ID lists, the input format used for one-hot encoding
transactions = baskets['products'].tolist()

In [None]:
# ============================================================================
# CREATING ONE-HOT ENCODED MATRIX
# ============================================================================

# Learn the set of distinct products, then encode every basket against it
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# One row per basket, one column per product learned by the encoder
# NOTE(review): the matrix has baskets x products cells and is also written to
# CSV below — presumably large for the full sample; confirm memory/disk budget.
basket_encoded = pd.DataFrame(te_ary, columns=te.columns_)
matrix_shape = basket_encoded.shape

print(f"Binary matrix shape: {matrix_shape}")
print(f"Products (columns): {matrix_shape[1]}")

# Persist the encoded baskets so later runs can reuse them
basket_encoded.to_csv('./processed_data/basket_encoded.csv', index=False)
print("Saved: basket_encoded.csv")

# Final summary
separator = "="*50
print("\n" + separator)
print("TASK 2 SUMMARY")
print(separator)
print(f"Total baskets created: {len(baskets):,}")
print(f"Unique products in baskets: {matrix_shape[1]}")
print(f"Binary matrix shape: {matrix_shape}")
print("\n✓ Task 2 completed - Basket data ready for Apriori algorithm")
print(separator)

## TASK III: Frequent Itemset Mining with Apriori

In [None]:
# ============================================================================
# RUNNING APRIORI WITH DIFFERENT < MIN SUPPORT > VALUES
# ============================================================================

# Thresholds to compare, from 1% to 10% in 1% steps
min_support_values = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
results = []

for min_sup in min_support_values:
    frequent_itemsets = apriori(
        basket_encoded,
        min_support=min_sup,
        use_colnames=True,
        max_len=None,
    )
    itemset_count = len(frequent_itemsets)

    # With no itemsets there is nothing to measure, so the size is reported as 0
    if itemset_count > 0:
        largest_itemset = frequent_itemsets['itemsets'].apply(len).max()
    else:
        largest_itemset = 0

    results.append({
        'min_support': min_sup,
        'total_itemsets': itemset_count,
        'max_itemset_size': largest_itemset,
    })

    print(f"min_support={min_sup:.2f}: {itemset_count:,} itemsets")

# One row per tested threshold
results_df = pd.DataFrame(results)
print("\nResults summary:")
print(results_df.to_string())


Running Apriori with different min_support values...


NameError: name 'apriori' is not defined

In [None]:
# ============================================================================
# CREATING COMPARISON PLOTS
# ============================================================================

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Each panel plots one results_df column against min_support:
# (axis, column, line format, y-axis label, title)
panel_specs = (
    (ax1, 'total_itemsets', 'b-o',
     'Total Frequent Itemsets', 'Itemsets vs Minimum Support'),
    (ax2, 'max_itemset_size', 'r-o',
     'Maximum Itemset Size', 'Itemset Size vs Minimum Support'),
)

for panel, column, line_format, y_label, panel_title in panel_specs:
    panel.plot(results_df['min_support'], results_df[column], line_format)
    panel.set_xlabel('Minimum Support')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

# Save the figure to disk before displaying it
plt.tight_layout()
plt.savefig('./processed_data/apriori_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("Saved: apriori_comparison.png")

In [None]:
# ============================================================================
# RUNNING APRIORI WITH MIN_SUPPORT = 0.05
# ============================================================================

# Mine frequent itemsets at the chosen 5% support threshold
frequent_itemsets_005 = apriori(
    basket_encoded,
    min_support=0.05,
    use_colnames=True,
    max_len=None,
)

# Number of products in each itemset
frequent_itemsets_005['length'] = frequent_itemsets_005['itemsets'].apply(len)
itemset_total = len(frequent_itemsets_005)
print(f"Found {itemset_total:,} frequent itemsets with min_support=0.05")

# How many itemsets exist for each itemset size
print("\nItemset size distribution:")
size_dist = frequent_itemsets_005['length'].value_counts().sort_index()
for itemset_size, itemset_count in size_dist.items():
    print(f"  Size {itemset_size}: {itemset_count:,} itemsets")

# The ten itemsets with the highest support
print("\nTop 10 frequent itemsets by support:")
top_10 = frequent_itemsets_005.nlargest(10, 'support')

# product_id -> product_name lookup; unknown IDs fall back to "Product_<id>"
product_names = products.set_index('product_id')['product_name'].to_dict()
for _, itemset_row in top_10.iterrows():
    names = [
        product_names.get(product_id, f"Product_{product_id}")
        for product_id in itemset_row['itemsets']
    ]
    # Only the first two names are shown; longer itemsets get a trailing "..."
    shown_names = ', '.join(names[:2])
    suffix = '...' if len(names) > 2 else ''
    print(f"  Support: {itemset_row['support']:.4f} - {shown_names}{suffix}")

# Write the itemsets (with their length column) for the next task
frequent_itemsets_005.to_csv('./processed_data/frequent_itemsets_005.csv', index=False)
print("\nSaved: frequent_itemsets_005.csv")

# Final summary
separator = "="*50
print("\n" + separator)
print("TASK 3 SUMMARY")
print(separator)
print(f"Frequent itemsets with min_support=0.05: {itemset_total:,}")
print(f"Itemset size distribution: {dict(size_dist)}")
print("\n✓ Task 3 completed - Frequent itemsets generated for association rules")
print(separator)

## TASK IV: Association Rule Extraction and Analysis

In [None]:
# ============================================================================
# LOADING FREQUENT ITEMSETS FROM TASK 3
# ============================================================================

def _parse_itemset(cell):
    """
    Rebuild a frozenset from one cell of the CSV ``itemsets`` column.

    Non-string values are returned unchanged. Strings may be either a plain
    literal such as ``"{1, 2}"`` / ``"[1, 2]"`` or the ``"frozenset({1, 2})"``
    form that results from writing a frozenset to CSV.
    """
    if not isinstance(cell, str):
        return cell
    text = cell.strip()
    # ast.literal_eval only accepts literals and rejects the frozenset(...)
    # call, so strip that wrapper and evaluate the inner literal alone
    if text.startswith('frozenset(') and text.endswith(')'):
        text = text[len('frozenset('):-1]
    if not text:
        # "frozenset()" is the text form of an empty itemset
        return frozenset()
    return frozenset(ast.literal_eval(text))


# Load frequent itemsets from Task 3
frequent_itemsets = pd.read_csv('./processed_data/frequent_itemsets_005.csv')

# Convert itemsets column from string back to frozenset
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(_parse_itemset)

print(f"Loaded {len(frequent_itemsets):,} frequent itemsets")
print(f"Itemset size range: {frequent_itemsets['length'].min()} to {frequent_itemsets['length'].max()}")

In [None]:
# ============================================================================
# EXTRACTING & ANALYZING ASSOCIATION RULES
# ============================================================================

# Build rules from the frequent itemsets, keeping those with confidence >= 0.1
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.1,
)

print(f"Generated {len(rules):,} association rules")
print(f"\nFirst 3 rules:")
preview_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules[preview_columns].head(3))

# Minimum and maximum of each rule metric
print("Analysis of rule metrics:")
for metric_label, metric_column in (
    ("Support", 'support'),
    ("Confidence", 'confidence'),
    ("Lift", 'lift'),
):
    metric_values = rules[metric_column]
    print(f"{metric_label} range: {metric_values.min():.4f} to {metric_values.max():.4f}")

# Averages, plus the number of rules whose lift exceeds 1
mean_confidence = rules['confidence'].mean()
mean_lift = rules['lift'].mean()
lift_above_one = (rules['lift'] > 1).sum()

print(f"\nAverage confidence: {mean_confidence:.4f}")
print(f"Average lift: {mean_lift:.4f}")
print(f"Rules with lift > 1: {lift_above_one:,}")

In [None]:
# ============================================================================
# FINDING TOP 3 RULES BY LIFT
# ============================================================================

# Get top 3 rules by lift (highest lift first)
top_3_lift = rules.nlargest(3, 'lift')

print("Top 3 association rules by lift:")
print("=" * 50)

# Load product names for better interpretation;
# IDs missing from the table are shown as "Product_<id>"
products = pd.read_csv('./processed_data/products_cleaned.csv')
product_names = products.set_index('product_id')['product_name'].to_dict()

# Number the rules by their rank (1..3). The dataframe index returned by
# iterrows() is the row label inherited from `rules`, not the rank.
for rank, (_, row) in enumerate(top_3_lift.iterrows(), start=1):
    antecedents = list(row['antecedents'])
    consequents = list(row['consequents'])

    ant_names = [product_names.get(item, f"Product_{item}") for item in antecedents]
    cons_names = [product_names.get(item, f"Product_{item}") for item in consequents]

    print(f"\nRule {rank}:")
    print(f"  IF {', '.join(ant_names)}")
    print(f"  THEN {', '.join(cons_names)}")
    print(f"  Support: {row['support']:.4f}")
    print(f"  Confidence: {row['confidence']:.4f}")
    print(f"  Lift: {row['lift']:.4f}")

In [None]:
# ============================================================================
# SAVING RULES AND FINAL SUMMARY
# ============================================================================

# Full rule table
all_rules_path = './processed_data/association_rules.csv'
rules.to_csv(all_rules_path, index=False)
print("\nSaved: association_rules.csv")

# The three highest-lift rules get their own file
top_rules_path = './processed_data/top_3_rules_by_lift.csv'
top_3_lift.to_csv(top_rules_path, index=False)
print("Saved: top_3_rules_by_lift.csv")

# Figures reported in the summary
rule_total = len(rules)
lift_above_one = (rules['lift'] > 1).sum()
best_lift = top_3_lift['lift'].max()

# Final summary
separator = "="*50
print("\n" + separator)
print("TASK 4 SUMMARY")
print(separator)
print(f"Total association rules generated: {rule_total:,}")
print(f"Rules with lift > 1: {lift_above_one:,}")
print(f"Top lift value: {best_lift:.4f}")
print("\n✓ Task 4 completed - Association rules extracted and analyzed")
print(separator)