In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import gc

# Read the CSV in fixed-size pieces rather than in a single read_csv call
chunk_size = 10000  # rows per piece; tune to the memory you have available
chunks = list(pd.read_csv('/content/hackathon_ready_dataset.csv', chunksize=chunk_size))

# Stitch the pieces back together under a fresh 0..n-1 index
df = pd.concat(chunks, ignore_index=True)

# Release the per-chunk list and trigger a collection straight away
del chunks
gc.collect()

0

In [3]:
# Preview the first rows to sanity-check the loaded columns
df.head()

Unnamed: 0,CUSTOMER_ID,STORE_NUMBER,ORDER_CREATED_DATE,ORDER_ID,ORDER_CHANNEL_NAME,ORDER_SUBCHANNEL_NAME,ORDER_OCCASION_NAME,item1,item1_price,item2,...,item3,item3_price,item4,item4_price,item5,item5_price,item6,item6_price,avg_order_price,CUSTOMER_TYPE
0,362204699,2156,2024-07-24,7247194287,Digital,WWT,ToGo,10 pc Grilled Wings Combo,15.29,8 pc Grilled Wings Combo,...,8 pc Spicy Wings Combo,10.99,,0.0,,0.0,,0.0,13.19,Registered
1,269612955,1419,2025-02-15,791214421,Digital,WWT,ToGo,Ranch Dip - Regular,1.59,50 pc Grilled Wings,...,Regular Buffalo Fries,3.49,,0.0,,0.0,,0.0,23.52,Registered
2,585330633,2249,2025-02-15,7575285208,Digital,WWT,ToGo,20pc Spicy Feast Deal,16.99,,...,,0.0,,0.0,,0.0,,0.0,16.99,Guest
3,950661333,2513,2024-03-29,4253875716,Digital,WWT,ToGo,20 pc Grilled Wings,26.59,Ranch Dip - Regular,...,,0.0,,0.0,,0.0,,0.0,14.04,Registered
4,434985772,1754,2024-04-08,7150407872,Digital,WWT,ToGo,6 pc Grilled Wings Combo,11.29,8 pc Grilled Wings Combo,...,,0.0,,0.0,,0.0,,0.0,12.29,Guest


In [4]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd

# Convert your 6-item columns into transaction format
def create_basket_matrix(df):
    """Turn the item1..item6 columns into a one-hot (order x item) boolean matrix.

    Args:
        df: DataFrame with columns ``item1`` .. ``item6``; missing slots are NaN/None.

    Returns:
        Boolean DataFrame with one row per input row (index 0..n-1, regardless of
        ``df``'s own index) and one column per distinct item. Columns are sorted
        by name so the layout is identical from run to run.
    """
    item_cols = [f'item{i}' for i in range(1, 7)]

    # One set of distinct, non-missing items per order
    transactions = [
        {item for item in row if pd.notna(item)}
        for row in df[item_cols].itertuples(index=False, name=None)
    ]

    all_items = set()
    for items in transactions:
        all_items.update(items)
    # Sorting (rather than relying on set order) keeps the column order reproducible
    columns = sorted(all_items, key=str)
    column_pos = {item: pos for pos, item in enumerate(columns)}

    # Fill a plain numpy array; writing through DataFrame.loc once per row is
    # far slower for the same result.
    presence = np.zeros((len(transactions), len(columns)), dtype=bool)
    for row_idx, items in enumerate(transactions):
        for item in items:
            presence[row_idx, column_pos[item]] = True

    return pd.DataFrame(presence, index=range(len(transactions)), columns=columns)

# Market Basket Analysis: one-hot baskets -> frequent itemsets -> association rules
basket_data = create_basket_matrix(df)
frequent_items = apriori(basket_data, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_items, metric="confidence", min_threshold=0.1)

# Show the ten rules with the highest lift
strongest_rules = rules.sort_values('lift', ascending=False).head(10)
print(strongest_rules)

                antecedents              consequents  antecedent support  \
6       (10 pc Spicy Wings)  (Regular Buffalo Fries)            0.041558   
7   (Regular Buffalo Fries)      (10 pc Spicy Wings)            0.070527   
0     (10 pc Grilled Wings)    (Ranch Dip - Regular)            0.047136   
1     (Ranch Dip - Regular)    (10 pc Grilled Wings)            0.213584   
19    (15 pc Grilled Wings)    (Ranch Dip - Regular)            0.024512   
5       (10 pc Spicy Wings)    (Ranch Dip - Regular)            0.041558   
4     (Ranch Dip - Regular)      (10 pc Spicy Wings)            0.213584   
2     (20 pc Grilled Wings)    (Ranch Dip - Regular)            0.022144   
8     (Ranch Dip - Regular)  (Regular Buffalo Fries)            0.213584   
9   (Regular Buffalo Fries)    (Ranch Dip - Regular)            0.070527   

    consequent support   support  confidence      lift  representativity  \
6             0.070527  0.010966    0.263882  3.741582               1.0   
7          

In [5]:
# Create popular items fallback list
def get_popular_items(df, top_n=10):
    """Return the names of the `top_n` items that appear most often in item1..item6."""
    # Lay the six item columns end to end: all of item1, then all of item2, ...
    stacked = pd.concat([df[f'item{slot}'] for slot in range(1, 7)], ignore_index=True)
    frequency = stacked.dropna().value_counts()
    return frequency.index[:top_n].tolist()

# Build the fallback list from the full dataset and print it as a ranked list
popular_items = get_popular_items(df, top_n=15)
print("Top 15 most popular items:")
for rank, name in enumerate(popular_items, start=1):
    print(f"{rank}. {name}")

Top 15 most popular items:
1. Ranch Dip - Regular
2. 20pc Spicy Feast Deal
3. 10 pc Grilled Wings Combo
4. 6 pc Grilled Wings Combo
5. 8 pc Grilled Wings Combo
6. Regular Buffalo Fries
7. 2 pc Crispy Strips
8. Ranch Dip - Large
9. 6 pc Spicy Wings Combo
10. 10 pc Grilled Wings
11. Large Buffalo Fries
12. 8 pc Spicy Wings Combo
13. 10 pc Spicy Wings
14. Fried Corn - Regular
15. Chicken Sub Combo


In [6]:
# 1. Create popular items fallback
# NOTE(review): identical to the get_popular_items(df, top_n=15) call in the
# previous cell, so this recomputes the same list.
popular_items = get_popular_items(df, top_n=15)

# 2. Define the recommendation function (as before)
def wings_r_us_recommender(current_cart, rules_df, popular_fallback, top_n=3):
    """Suggest up to `top_n` add-on items for a cart from association rules.

    A rule fires when at least one of its antecedent items is in the cart.
    Each consequent not already in the cart is scored with
    ``confidence * lift``; an item keeps the best score of any rule that
    proposed it. If fewer than `top_n` items are found, the list is topped
    up from `popular_fallback` (in its given order), skipping anything in
    the cart or already suggested.
    """
    cart_items = set(current_cart)
    best_score = {}

    for rule in rules_df.to_dict('records'):
        # Skip rules whose antecedents share nothing with the cart
        if not cart_items.intersection(rule['antecedents']):
            continue

        strength = rule['confidence'] * rule['lift']
        for candidate in set(rule['consequents']):
            if candidate in current_cart:
                continue
            best_score[candidate] = max(best_score.get(candidate, 0), strength)

    # Highest-scoring candidates first
    ranked = sorted(best_score.items(), key=lambda pair: pair[1], reverse=True)
    final_recs = [candidate for candidate, _ in ranked[:top_n]]

    # Top up with popular items, one pass in fallback order
    for fallback_item in popular_fallback:
        if len(final_recs) >= top_n:
            break
        if fallback_item not in current_cart and fallback_item not in final_recs:
            final_recs.append(fallback_item)

    return final_recs[:top_n]

# 3. Test your scenarios
# The trailing comments name the item each cart's strongest-lift rule points to;
# it is expected somewhere in the top 3, not necessarily in first place.
test_scenarios = [
    ["10 pc Spicy Wings"],      # expect Regular Buffalo Fries among the suggestions
    ["10 pc Grilled Wings"],    # expect Ranch Dip - Regular among the suggestions
    ["Regular Buffalo Fries"],  # expect 10 pc Spicy Wings among the suggestions
    ["15 pc Grilled Wings"],    # expect Ranch Dip - Regular among the suggestions
]

print("=== TESTING RECOMMENDATIONS ===")
for cart in test_scenarios:
    recs = wings_r_us_recommender(cart, rules, popular_items)
    print(f"Cart: {cart}")
    # The arrow was stored as mis-decoded bytes; restored to the intended character
    print(f"→ Recommendations: {recs}")
    print("-" * 50)

=== TESTING RECOMMENDATIONS ===
Cart: ['10 pc Spicy Wings']
→ Recommendations: ['Ranch Dip - Regular', 'Regular Buffalo Fries', '20pc Spicy Feast Deal']
--------------------------------------------------
Cart: ['10 pc Grilled Wings']
→ Recommendations: ['Ranch Dip - Regular', '20pc Spicy Feast Deal', '10 pc Grilled Wings Combo']
--------------------------------------------------
Cart: ['Regular Buffalo Fries']
→ Recommendations: ['Ranch Dip - Regular', '10 pc Spicy Wings', '20pc Spicy Feast Deal']
--------------------------------------------------
Cart: ['15 pc Grilled Wings']
→ Recommendations: ['Ranch Dip - Regular', '20pc Spicy Feast Deal', '10 pc Grilled Wings Combo']
--------------------------------------------------


In [7]:
import random
import numpy as np
from tqdm import tqdm

def evaluate_recall_on_training_data(df, rules, popular_items, sample_size=1000, random_state=42):
    """
    Evaluate Recall@3 on training data using leave-one-out validation

    For each sampled multi-item order, one item is hidden, the recommender is
    run on the rest of the cart, and the order counts as a hit when the hidden
    item is among the 3 suggestions.

    NOTE(review): when `df` is the same data the rules were mined from (as the
    name suggests), the score is likely optimistic.

    Args:
        df: Your cleaned training dataset (columns item1..item6; ``basket_size``
            is used when present and derived from the item columns otherwise)
        rules: Association rules from MBA
        popular_items: Popular items for fallback
        sample_size: Number of orders to test (subset for speed)
        random_state: Seed for both the order sample and the choice of hidden
            item, so repeated runs give the same score. Pass None for a
            different draw on every run.

    Returns:
        recall_score, detailed_results
    """
    item_cols = [f'item{i}' for i in range(1, 7)]

    # Filter orders with 2+ items (need something to remove). Derive the basket
    # size locally when the column is absent so the function works on its own.
    if 'basket_size' in df.columns:
        basket_sizes = df['basket_size']
    else:
        basket_sizes = df[item_cols].notna().sum(axis=1)
    multi_item_orders = df[basket_sizes >= 2]

    # Sample for testing (to avoid long computation)
    if len(multi_item_orders) > sample_size:
        test_orders = multi_item_orders.sample(n=sample_size, random_state=random_state)
    else:
        test_orders = multi_item_orders

    # Private generator: reproducible and independent of the global random state
    rng = random.Random(random_state)

    correct_predictions = 0
    total_predictions = 0
    detailed_results = []

    print(f"Testing on {len(test_orders)} orders from training data...")

    for _, order in tqdm(test_orders.iterrows(), total=len(test_orders)):
        # Get all items in this order
        original_items = [order[col] for col in item_cols if pd.notna(order[col])]

        if len(original_items) < 2:
            continue  # Skip single-item orders

        # Hide one item (this becomes our "target"). Every copy of it is removed
        # from the cart, because the recommender never suggests items already in it.
        target_item = rng.choice(original_items)
        remaining_cart = [item for item in original_items if item != target_item]

        # Generate recommendations based on remaining cart
        recommendations = wings_r_us_recommender(remaining_cart, rules, popular_items, top_n=3)

        # Check if target item is in top 3 recommendations
        is_correct = target_item in recommendations
        if is_correct:
            correct_predictions += 1

        total_predictions += 1

        # Store detailed results
        detailed_results.append({
            'order_id': order.get('ORDER_ID', ''),
            'original_cart': original_items,
            'remaining_cart': remaining_cart,
            'target_item': target_item,
            'recommendations': recommendations,
            'correct': is_correct
        })

    # Calculate recall
    recall_at_3 = correct_predictions / total_predictions if total_predictions > 0 else 0

    print(f"\n=== TRAINING DATA RECALL RESULTS ===")
    print(f"Total test cases: {total_predictions}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Recall@3: {recall_at_3:.3f} ({recall_at_3*100:.1f}%)")

    return recall_at_3, detailed_results

# Derive basket_size (number of filled item slots per order) when it is missing
if 'basket_size' not in df.columns:
    item_columns = [f'item{slot}' for slot in range(1, 7)]
    df['basket_size'] = df[item_columns].notna().sum(axis=1)

# Score the MBA recommender on a 2000-order sample
recall_score, results = evaluate_recall_on_training_data(
    df,
    rules,
    popular_items,
    sample_size=2000,
)

Testing on 2000 orders from training data...


100%|██████████| 2000/2000 [00:02<00:00, 972.48it/s]


=== TRAINING DATA RECALL RESULTS ===
Total test cases: 2000
Correct predictions: 577
Recall@3: 0.288 (28.8%)





In [8]:
# Build customer profiles with rich features
def create_customer_features(df):
    """Aggregate order rows into one feature row per customer.

    Args:
        df: Orders with columns CUSTOMER_ID, avg_order_price, basket_size,
            ORDER_ID and ORDER_CREATED_DATE. Dates may be datetimes or
            parseable strings (as read from CSV); `df` itself is not modified.

    Returns:
        DataFrame with CUSTOMER_ID, flattened ``<column>_<stat>`` aggregates,
        ``days_active`` (days between first and last order) and
        ``orders_per_week``.
    """
    # Work on just the needed columns, with the date parsed. Subtracting raw
    # date strings would raise a TypeError below.
    orders = df[['CUSTOMER_ID', 'avg_order_price', 'basket_size', 'ORDER_ID']].assign(
        ORDER_CREATED_DATE=pd.to_datetime(df['ORDER_CREATED_DATE'])
    )

    customer_features = orders.groupby('CUSTOMER_ID').agg({
        'avg_order_price': ['mean', 'std', 'min', 'max'],
        'basket_size': ['mean', 'std', 'max'],
        'ORDER_ID': 'count',  # frequency
        'ORDER_CREATED_DATE': ['min', 'max']  # recency
    }).reset_index()

    # Flatten the two-level column names: ('ORDER_ID', 'count') -> 'ORDER_ID_count';
    # the key column has an empty second level and keeps its plain name.
    customer_features.columns = ['_'.join(col).strip() if col[1] else col[0]
                               for col in customer_features.columns]

    # Add behavioral segments
    customer_features['days_active'] = (
        customer_features['ORDER_CREATED_DATE_max'] -
        customer_features['ORDER_CREATED_DATE_min']
    ).dt.days

    # The "+ 1" keeps the denominator positive for customers active on one day only
    customer_features['orders_per_week'] = (
        customer_features['ORDER_ID_count'] /
        (customer_features['days_active'] / 7 + 1)
    )

    return customer_features

# Create sophisticated customer segments
def segment_customers(customer_features, n_clusters=5):
    """Cluster customers with KMeans and label each row with its cluster.

    Args:
        customer_features: Per-customer frame with avg_order_price_mean,
            basket_size_mean, ORDER_ID_count and orders_per_week. A
            ``customer_segment`` column is added to this frame in place.
        n_clusters: Number of segments requested (default 5). It is capped at
            the number of customers, since KMeans cannot fit more clusters
            than samples.

    Returns:
        (customer_features, fitted KMeans model)

    Raises:
        ValueError: if `customer_features` has no rows.
    """
    n_customers = len(customer_features)
    if n_customers == 0:
        raise ValueError("segment_customers needs at least one customer row")

    from sklearn.cluster import KMeans

    features_for_clustering = [
        'avg_order_price_mean', 'basket_size_mean',
        'ORDER_ID_count', 'orders_per_week'
    ]

    kmeans = KMeans(n_clusters=min(n_clusters, n_customers), random_state=42)
    # Missing feature values are treated as 0 for clustering
    customer_features['customer_segment'] = kmeans.fit_predict(
        customer_features[features_for_clustering].fillna(0)
    )

    return customer_features, kmeans

In [9]:
# Capture customer order sequences and temporal patterns
def extract_sequential_patterns(df):
    """Build each customer's order history as a date-ordered list of item lists.

    Returns a dict mapping CUSTOMER_ID to a list with one entry per order
    (oldest first); each entry lists the non-missing item1..item6 values.
    """
    item_cols = [f'item{slot}' for slot in range(1, 7)]

    # Chronological order within each customer
    ordered = df.sort_values(['CUSTOMER_ID', 'ORDER_CREATED_DATE'])

    return {
        customer_id: [
            [order[col] for col in item_cols if pd.notna(order[col])]
            for _, order in history.iterrows()
        ]
        for customer_id, history in ordered.groupby('CUSTOMER_ID')
    }

# Build next-item prediction based on sequence patterns
def predict_next_items(customer_sequence, all_sequences, top_n=10):
    """Predict likely next items based on similar customer sequences

    The customer's most recent order is compared (Jaccard similarity) with
    orders in other customers' longer histories. Wherever the similarity
    exceeds 0.3, the items of the order that followed are credited with that
    similarity.

    Args:
        customer_sequence: List of orders (each a list of items), oldest first.
        all_sequences: Dict mapping customer id -> such a list of orders.
        top_n: Maximum number of candidates to return.

    Returns:
        List of (item, accumulated_similarity) tuples, best first. Empty when
        the customer has no orders or the latest order has no items.
    """
    if len(customer_sequence) == 0:
        return []

    history_len = len(customer_sequence)
    last_order = set(customer_sequence[-1])
    if not last_order:
        # An empty basket overlaps with nothing (and would divide by zero below)
        return []

    candidate_items = {}

    for other_sequence in all_sequences.values():
        # `end` is the last position of a window of `history_len` orders that
        # still has a following order; the range is empty for histories no
        # longer than the customer's own.
        for end in range(history_len - 1, len(other_sequence) - 1):
            other_order = set(other_sequence[end])
            # last_order is non-empty here, so the union is never empty
            similarity = len(last_order & other_order) / len(last_order | other_order)

            if similarity > 0.3:  # Similar enough
                for item in other_sequence[end + 1]:
                    if item not in last_order:  # Don't recommend what they already have
                        candidate_items[item] = candidate_items.get(item, 0) + similarity

    # Return top candidates
    return sorted(candidate_items.items(), key=lambda x: x[1], reverse=True)[:top_n]

In [10]:
# Collaborative filtering with advanced techniques
from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix

def build_user_item_matrix(df):
    """Create user-item interaction matrix

    Args:
        df: Orders with CUSTOMER_ID and item1..item6 columns.

    Returns:
        matrix: scipy CSR matrix, shape (n_users, n_items); entry (u, i) is the
            number of times item i appears in user u's orders.
        user_to_idx: dict CUSTOMER_ID -> row index (first-appearance order).
        item_to_idx: dict item -> column index.
        unique_items: item names in column order, sorted by name so the column
            layout is the same on every run.
    """
    item_cols = [f'item{i}' for i in range(1, 7)]
    item_values = df[item_cols].to_numpy(dtype=object)

    # Sorted rather than set-ordered: set order of strings changes between processes
    unique_items = sorted(
        {item for item in item_values.ravel() if pd.notna(item)}, key=str
    )

    # Create mapping dictionaries
    user_to_idx = {user: idx for idx, user in enumerate(df['CUSTOMER_ID'].unique())}
    item_to_idx = {item: idx for idx, item in enumerate(unique_items)}

    # Collect one (row, col) pair per item occurrence
    rows, cols = [], []
    for user, order_items in zip(df['CUSTOMER_ID'].to_numpy(), item_values):
        user_idx = user_to_idx[user]
        for item in order_items:
            if pd.notna(item):
                rows.append(user_idx)
                cols.append(item_to_idx[item])

    data = np.ones(len(rows), dtype=float)  # Could weight by price or quantity
    # Repeated (row, col) pairs are summed when the CSR matrix is built
    matrix = csr_matrix((data, (rows, cols)),
                       shape=(len(user_to_idx), len(item_to_idx)))

    return matrix, user_to_idx, item_to_idx, unique_items

def _factorize_user_item_matrix(matrix, n_components=50, random_state=42):
    """Return the NMF factors (W, H) of `matrix`, reusing the last result for the same object.

    Only the most recent matrix is remembered and it is matched by identity,
    so the cache does not notice a matrix that is modified in place between calls.
    """
    cached = getattr(_factorize_user_item_matrix, '_last', None)
    if cached is not None and cached[0] is matrix:
        return cached[1], cached[2]

    nmf = NMF(n_components=n_components, random_state=random_state)
    W = nmf.fit_transform(matrix)
    H = nmf.components_
    # Keeping a reference to the matrix guarantees the identity check stays valid
    _factorize_user_item_matrix._last = (matrix, W, H)
    return W, H


def collaborative_filtering_recommendations(user_id, matrix, user_to_idx, item_to_idx, unique_items, top_n=10):
    """Generate recommendations using matrix factorization

    Args:
        user_id: Customer to score items for.
        matrix: User-item interaction matrix (rows follow `user_to_idx`,
            columns follow `unique_items`).
        user_to_idx: dict customer id -> row index.
        item_to_idx: dict item -> column index (unused here; kept for the
            existing call signature).
        unique_items: Item names in column order.
        top_n: Number of items to return.

    Returns:
        List of (item_name, score) tuples, highest score first; empty for an
        unknown user or a non-positive `top_n`. Items the user already ordered
        are not filtered out.
    """
    if user_id not in user_to_idx or top_n <= 0:
        return []

    # The factorization depends only on the matrix, so it is fitted once per
    # matrix instead of on every call.
    W, H = _factorize_user_item_matrix(matrix)

    user_profile = W[user_to_idx[user_id]]

    # Calculate scores for all items
    item_scores = user_profile @ H

    # Indices of the top_n best-scoring items, best first
    top_items_idx = item_scores.argsort()[::-1][:top_n]

    return [(unique_items[item_idx], item_scores[item_idx]) for item_idx in top_items_idx]

In [11]:
def ultimate_recommender(customer_id, current_cart, df, rules, popular_items,
                        customer_features, sequences, matrix, user_to_idx,
                        item_to_idx, unique_items, top_n=3):
    """
    Blend five recommendation signals into one ranked list of `top_n` items.

    Each signal adds points to a shared per-item tally: association rules,
    popularity inside the customer's segment, sequence-based next-item
    prediction, collaborative filtering and hand-written category rules.
    The items with the highest totals are returned.
    """
    tally = {}

    def credit(item, points):
        tally[item] = tally.get(item, 0) + points

    # 1. Association rules: rank-based points, 3.0 for first place, then 2.7, ...
    for rank, item in enumerate(
            wings_r_us_recommender(current_cart, rules, popular_items, top_n=10)):
        credit(item, (10 - rank) * 0.3)

    # 2. Most popular items among customers in the same segment
    if customer_id in customer_features['CUSTOMER_ID'].values:
        own_rows = customer_features[customer_features['CUSTOMER_ID'] == customer_id]
        segment_label = own_rows['customer_segment'].iloc[0]

        same_segment = customer_features['customer_segment'] == segment_label
        peer_ids = customer_features[same_segment]['CUSTOMER_ID'].tolist()
        peer_orders = df[df['CUSTOMER_ID'].isin(peer_ids)]

        for rank, item in enumerate(get_popular_items(peer_orders, top_n=10)):
            if item not in current_cart:
                credit(item, (10 - rank) * 0.2)

    # 3. Next-item prediction from the customer's order history
    if customer_id in sequences:
        for item, seq_score in predict_next_items(sequences[customer_id], sequences, top_n=10):
            if item not in current_cart:
                credit(item, seq_score * 0.25)

    # 4. Matrix-factorization scores
    for item, cf_score in collaborative_filtering_recommendations(
            customer_id, matrix, user_to_idx, item_to_idx, unique_items, top_n=10):
        if item not in current_cart:
            credit(item, cf_score * 0.25)

    # 5. Hand-written complementary-item rules: a flat 1.0 each
    for item in get_category_recommendations(current_cart, top_n=5):
        if item not in current_cart:
            credit(item, 1.0)

    # Highest total first
    ranked_items = sorted(tally, key=tally.get, reverse=True)
    return ranked_items[:top_n]

def get_category_recommendations(current_cart, top_n=5):
    """Rule-based category recommendations

    Each cart item contributes complements according to the first keyword its
    name contains (checked in the order 'wings', 'spicy', 'dip'), so an item
    containing 'wings' never reaches the 'spicy' rule.

    Args:
        current_cart: Iterable of item name strings.
        top_n: Maximum number of suggestions.

    Returns:
        Up to `top_n` distinct suggestions, in the order they were first
        produced. Items already in the cart are not removed here.
    """
    recommendations = []

    for item in current_cart:
        item_lower = item.lower()
        if 'wings' in item_lower:
            recommendations.extend(['Ranch Dip - Regular', 'Blue Cheese Dip - Regular',
                                  '20 Oz Soda', '32 Oz Soda'])
        elif 'spicy' in item_lower:
            recommendations.extend(['Regular Buffalo Fries', 'Ranch Dip - Regular',
                                  'Large Fruit Punch'])
        elif 'dip' in item_lower:
            recommendations.extend(['10 pc Grilled Wings', '8 pc Grilled Wings',
                                  'Regular Buffalo Fries'])

    # De-duplicate while keeping first-seen order. Going through a set would
    # make the [:top_n] cut keep an arbitrary subset that can differ between runs.
    return list(dict.fromkeys(recommendations))[:top_n]

In [12]:
# Start with smaller samples for rapid iteration
def test_ensemble_performance_fast(df, sample_size=500, random_state=42):  # Reduced from 2000
    """Faster version for initial testing

    Builds the ensemble components on a subset of `df`, then compares
    Recall@3 of the ensemble and the plain MBA recommender on the same
    held-out items.

    Side effect: `df['ORDER_CREATED_DATE']` is converted to datetime in place.

    Args:
        df: Orders with item1..item6, CUSTOMER_ID, ORDER_CREATED_DATE and
            basket_size columns.
        sample_size: Number of multi-item orders to test (capped at the number
            available).
        random_state: Seed for the subsets and for the choice of hidden item.

    Returns:
        (ensemble_recall, mba_recall, customer_features, sequences, matrix,
         user_to_idx, item_to_idx, unique_items)
    """

    print("Preparing ensemble components...")

    # Convert date column to datetime
    df['ORDER_CREATED_DATE'] = pd.to_datetime(df['ORDER_CREATED_DATE'])

    # Use smaller subset for component building (speeds up significantly)
    df_subset = df.sample(n=min(50000, len(df)), random_state=random_state)  # Max 50k for components

    # 1. Customer features and segments (on subset)
    customer_features = create_customer_features(df_subset)
    customer_features, _ = segment_customers(customer_features)

    # 2. Sequential patterns (on subset)
    sequences = extract_sequential_patterns(df_subset.head(10000))  # Limit sequences

    # 3. User-item matrix (on subset)
    matrix, user_to_idx, item_to_idx, unique_items = build_user_item_matrix(df_subset)

    # Test on smaller sample; never ask for more orders than exist
    eligible_orders = df[df['basket_size'] >= 2]
    multi_item_orders = eligible_orders.sample(
        n=min(sample_size, len(eligible_orders)), random_state=random_state
    )

    # Private generator so the hidden-item choice is reproducible
    rng = random.Random(random_state)

    ensemble_correct = 0
    mba_correct = 0
    total_tests = 0
    ensemble_failures = 0
    first_error = None

    print(f"Testing ensemble on {len(multi_item_orders)} orders...")

    for _, order in tqdm(multi_item_orders.iterrows(), total=len(multi_item_orders)):
        original_items = [order[f'item{i}'] for i in range(1, 7) if pd.notna(order[f'item{i}'])]

        if len(original_items) < 2:
            continue

        target_item = rng.choice(original_items)
        remaining_cart = [item for item in original_items if item != target_item]
        customer_id = order['CUSTOMER_ID']

        # Plain MBA recommendations (also the fallback if the ensemble fails)
        mba_recs = wings_r_us_recommender(remaining_cart, rules, popular_items, top_n=3)

        # Ensemble recommendations. Failures are counted and reported below, so
        # an ensemble that always falls back cannot pass for a real result.
        try:
            ensemble_recs = ultimate_recommender(
                customer_id, remaining_cart, df_subset, rules, popular_items,
                customer_features, sequences, matrix, user_to_idx,
                item_to_idx, unique_items, top_n=3
            )
        except Exception as exc:
            ensemble_failures += 1
            if first_error is None:
                first_error = exc
            ensemble_recs = mba_recs

        # Check if target found
        if target_item in ensemble_recs:
            ensemble_correct += 1
        if target_item in mba_recs:
            mba_correct += 1

        total_tests += 1

    ensemble_recall = ensemble_correct / total_tests if total_tests else 0.0
    mba_recall = mba_correct / total_tests if total_tests else 0.0
    improvement = ensemble_recall - mba_recall

    print(f"\n=== FAST ENSEMBLE VS MBA COMPARISON ===")
    print(f"Original MBA Recall@3: {mba_recall:.3f} ({mba_recall*100:.1f}%)")
    print(f"Ensemble Recall@3: {ensemble_recall:.3f} ({ensemble_recall*100:.1f}%)")
    # The "+" format flag prints the correct sign (the old text produced "+-0.004")
    print(f"Improvement: {improvement:+.3f} ({improvement*100:+.1f} percentage points)")
    if ensemble_failures:
        print(f"Ensemble fell back to MBA on {ensemble_failures} of {total_tests} orders "
              f"(first error: {first_error!r})")

    return ensemble_recall, mba_recall, customer_features, sequences, matrix, user_to_idx, item_to_idx, unique_items


In [13]:
# Faster customer features creation
def create_customer_features_fast(df):
    """Optimized customer feature creation

    Same idea as create_customer_features, with fewer aggregates.

    Args:
        df: Orders with columns CUSTOMER_ID, avg_order_price, basket_size,
            ORDER_ID and ORDER_CREATED_DATE. Dates may be datetimes or
            parseable strings; `df` itself is not modified.

    Returns:
        DataFrame with CUSTOMER_ID, flattened ``<column>_<stat>`` aggregates,
        ``days_active`` and ``orders_per_week``.
    """
    print("Creating customer features...")

    # Parse the date up front; subtracting raw date strings would raise a TypeError
    orders = df[['CUSTOMER_ID', 'avg_order_price', 'basket_size', 'ORDER_ID']].assign(
        ORDER_CREATED_DATE=pd.to_datetime(df['ORDER_CREATED_DATE'])
    )

    customer_features = orders.groupby('CUSTOMER_ID').agg({
        'avg_order_price': ['mean', 'std'],  # Removed min/max for speed
        'basket_size': ['mean', 'max'],      # Reduced metrics
        'ORDER_ID': 'count',
        'ORDER_CREATED_DATE': ['min', 'max']
    }).reset_index()

    # Flatten column names: ('ORDER_ID', 'count') -> 'ORDER_ID_count'
    customer_features.columns = ['_'.join(col).strip() if col[1] else col[0]
                               for col in customer_features.columns]

    # Days between first and last order; a missing span is filled with 1
    customer_features['days_active'] = (
        customer_features['ORDER_CREATED_DATE_max'] -
        customer_features['ORDER_CREATED_DATE_min']
    ).dt.days.fillna(1)

    customer_features['orders_per_week'] = (
        customer_features['ORDER_ID_count'] /
        (customer_features['days_active'] / 7 + 1)
    )

    return customer_features

# Faster sequential patterns (limited depth)
def extract_sequential_patterns_fast(df, max_customers=5000, max_orders=10):
    """Extract sequential patterns with limits for speed

    Args:
        df: Orders with CUSTOMER_ID, ORDER_CREATED_DATE and item1..item6.
        max_customers: Keep only this many customers, chosen by order count.
        max_orders: Keep only each customer's most recent `max_orders` orders.

    Returns:
        Dict mapping CUSTOMER_ID -> list of orders (oldest first), each a list
        of item names. Orders without items and customers without any
        non-empty order are left out.
    """
    print("Extracting sequential patterns...")

    item_cols = [f'item{i}' for i in range(1, 7)]

    # Limit to top customers by order count for speed
    top_customers = df['CUSTOMER_ID'].value_counts().head(max_customers).index
    df_filtered = df[df['CUSTOMER_ID'].isin(top_customers)]

    # Sort by customer and date
    df_sorted = df_filtered.sort_values(['CUSTOMER_ID', 'ORDER_CREATED_DATE'])

    sequences = {}
    for customer_id, group in df_sorted.groupby('CUSTOMER_ID'):
        customer_sequence = []
        # Rows are in ascending date order, so the most recent orders are at
        # the tail; head() would keep the oldest ones instead.
        for _, order in group.tail(max_orders).iterrows():
            items = [order[col] for col in item_cols if pd.notna(order[col])]
            if items:  # Only add non-empty orders
                customer_sequence.append(items)
        if customer_sequence:  # Only add customers with orders
            sequences[customer_id] = customer_sequence

    return sequences

# Optimized user-item matrix
def build_user_item_matrix_fast(df, max_items=100, max_users=10000):
    """Create user-item matrix with popular items only

    Args:
        df: Orders with CUSTOMER_ID and item1..item6 columns.
        max_items: Keep only the `max_items` most frequent items as columns.
        max_users: Keep only the first `max_users` distinct customers (in
            order of first appearance) as rows.

    Returns:
        matrix: scipy CSR matrix, shape (n_users, n_items); entry (u, i) counts
            how often item i appears in user u's orders.
        user_to_idx: dict CUSTOMER_ID -> row index.
        item_to_idx: dict item -> column index.
        top_items: item names in column order (most frequent first).
    """
    print("Building user-item matrix...")

    item_cols = [f'item{i}' for i in range(1, 7)]

    # Get top N most popular items only (for speed)
    all_items = []
    for col in item_cols:
        all_items.extend(df[col].dropna().tolist())

    item_counts = pd.Series(all_items).value_counts()
    top_items = item_counts.head(max_items).index.tolist()

    # Create mapping dictionaries
    users = df['CUSTOMER_ID'].unique()[:max_users]  # Limit users for speed
    user_to_idx = {user: idx for idx, user in enumerate(users)}
    item_to_idx = {item: idx for idx, item in enumerate(top_items)}

    # Only rows of the retained users remain, so every lookup below succeeds
    df_filtered = df[df['CUSTOMER_ID'].isin(users)]

    rows, cols = [], []
    for user, order_items in zip(df_filtered['CUSTOMER_ID'].to_numpy(),
                                 df_filtered[item_cols].to_numpy(dtype=object)):
        user_idx = user_to_idx[user]
        for item in order_items:
            # Missing slots and items outside the top list have no column
            item_idx = item_to_idx.get(item)
            if item_idx is not None:
                rows.append(user_idx)
                cols.append(item_idx)

    data = np.ones(len(rows), dtype=float)
    # Repeated (row, col) pairs are summed when the CSR matrix is built
    matrix = csr_matrix((data, (rows, cols)),
                       shape=(len(user_to_idx), len(item_to_idx)))

    return matrix, user_to_idx, item_to_idx, top_items

In [14]:
def ultimate_recommender_fast(customer_id, current_cart, df, rules, popular_items,
                             customer_features, sequences, matrix, user_to_idx,
                             item_to_idx, unique_items, top_n=3):
    """Faster ensemble recommender with simplified logic

    Combines three signals into one score per item: association-rule
    recommendations, a popularity boost for customers present in
    `customer_features`, and hand-written category rules. Each signal is
    best-effort: if it raises, it is skipped.

    `df`, `sequences`, `matrix`, `user_to_idx`, `item_to_idx` and
    `unique_items` are accepted for signature compatibility with
    ultimate_recommender but are not used. The customer's segment label is not
    consulted either; the boost uses the global popular list.

    Returns:
        List of up to `top_n` item names, best first.
    """

    all_recommendations = {}

    def add_score(item, score):
        all_recommendations[item] = all_recommendations.get(item, 0) + score

    # `except Exception` (not a bare except) keeps the best-effort behavior while
    # still letting KeyboardInterrupt / SystemExit stop a long evaluation loop.

    # 1. Market Basket Analysis (primary - 50% weight)
    try:
        mba_recs = wings_r_us_recommender(current_cart, rules, popular_items, top_n=6)
        for i, item in enumerate(mba_recs):
            add_score(item, (6 - i) * 0.5)  # 50% weight to MBA
    except Exception:
        pass

    # 2. Popularity boost for known customers (30% weight)
    try:
        if (customer_features['CUSTOMER_ID'] == customer_id).any():
            # Simplified - use global popular items rather than per-segment ones
            for i, item in enumerate(popular_items[:5]):
                if item not in current_cart:
                    add_score(item, (5 - i) * 0.3)
    except Exception:
        pass

    # 3. Category-based rules (20% weight)
    try:
        for item in get_category_recommendations(current_cart, top_n=3):
            if item not in current_cart:
                add_score(item, 1.0)
    except Exception:
        pass

    # Sort and return top N
    if not all_recommendations:
        # Fallback to MBA only
        return wings_r_us_recommender(current_cart, rules, popular_items, top_n)

    final_recommendations = sorted(all_recommendations.items(),
                                 key=lambda x: x[1], reverse=True)

    return [item for item, score in final_recommendations[:top_n]]


In [15]:
print("=== PROGRESSIVE PERFORMANCE TESTING ===")

# Step 1: Test MBA baseline (should be fast)
print("1. Testing MBA baseline...")
recall_mba_only = evaluate_recall_on_training_data(
    df, rules, popular_items, sample_size=500
)[0]
print(f"MBA Baseline: {recall_mba_only:.3f}")

# Step 2: Test fast ensemble
print("\n2. Testing fast ensemble...")
ensemble_recall_fast, mba_recall_fast, *components = test_ensemble_performance_fast(
    df, sample_size=500
)

# Step 3: If improvement is significant, scale up.
# The gate uses the MBA recall measured inside the ensemble test (same orders,
# same hidden items). The Step 1 baseline hides different random items, so
# comparing against it mixes sampling noise into the decision.
paired_gain = ensemble_recall_fast - mba_recall_fast
if paired_gain > 0.05:  # more than 5 percentage points
    print(f"\n✅ Ensemble shows {paired_gain*100:.1f}% improvement!")
    print("3. Testing on larger sample...")

    final_ensemble_recall, final_mba_recall, *final_components = test_ensemble_performance_fast(
        df, sample_size=1500  # Larger sample
    )

    print(f"Final Ensemble Recall@3: {final_ensemble_recall:.3f} ({final_ensemble_recall*100:.1f}%)")

else:
    print(f"\n⚠️  Ensemble improvement minimal. Stick with MBA baseline.")


=== PROGRESSIVE PERFORMANCE TESTING ===
1. Testing MBA baseline...
Testing on 500 orders from training data...


100%|██████████| 500/500 [00:00<00:00, 950.34it/s]



=== TRAINING DATA RECALL RESULTS ===
Total test cases: 500
Correct predictions: 144
Recall@3: 0.288 (28.8%)
MBA Baseline: 0.288

2. Testing fast ensemble...
Preparing ensemble components...
Testing ensemble on 500 orders...


100%|██████████| 500/500 [08:45<00:00,  1.05s/it]


=== FAST ENSEMBLE VS MBA COMPARISON ===
Original MBA Recall@3: 0.320 (32.0%)
Ensemble Recall@3: 0.316 (31.6%)
Improvement: +-0.004 (+-0.4 percentage points)

⚠️  Ensemble improvement minimal. Stick with MBA baseline.





In [16]:
# 1. First, define the advanced association rules function
def advanced_association_rules(df, min_support=0.005, min_confidence=0.05):
    """Extract more comprehensive association rules

    Mines frequent itemsets of up to 4 items, derives rules that pass either
    the confidence threshold or a lift threshold of 1.2, and ranks them by a
    weighted blend of confidence, lift and support.

    Args:
        df: Orders with item1..item6 columns.
        min_support: Minimum support for frequent itemsets.
        min_confidence: Minimum confidence for the confidence-based rule set.

    Returns:
        DataFrame of distinct rules with an extra ``composite_score`` column,
        sorted by that score (highest first).
    """

    from mlxtend.frequent_patterns import apriori, association_rules

    # Create basket matrix
    basket_data = create_basket_matrix(df)
    frequent_items = apriori(basket_data, min_support=min_support, use_colnames=True, max_len=4)

    # Generate rules with different metrics
    rules_confidence = association_rules(frequent_items, metric="confidence", min_threshold=min_confidence)
    rules_lift = association_rules(frequent_items, metric="lift", min_threshold=1.2)

    # Combine the two sets. A rule is identified by its (antecedents, consequents)
    # pair, so de-duplicate on that pair; ignore_index/reset_index keep row labels
    # unique, since both inputs number their rows from 0.
    all_rules = (
        pd.concat([rules_confidence, rules_lift], ignore_index=True)
        .drop_duplicates(subset=['antecedents', 'consequents'])
        .reset_index(drop=True)
    )

    # Create composite scoring
    all_rules = all_rules.assign(
        composite_score=(
            all_rules['confidence'] * 0.4 +
            all_rules['lift'] * 0.3 +
            all_rules['support'] * 0.3
        )
    )

    return all_rules.sort_values('composite_score', ascending=False)

# 2. Define the item hierarchy function
def create_item_hierarchy():
    """Build the two-level menu taxonomy used by the hierarchy-based recommender.

    Returns:
        A newly built dict mapping main category -> sub-category -> list of
        menu item names. Categories and sub-categories keep a fixed order.
    """

    # Main category
    wings = {
        'Grilled': ['10 pc Grilled Wings', '15 pc Grilled Wings', '20 pc Grilled Wings', '8 pc Grilled Wings'],
        'Spicy': ['10 pc Spicy Wings', '6 pc Boneless Mild', '20pc Spicy Feast Deal'],
        'Family': ['40pc Family Wings', '50 pc Grilled Wings', '24pc Family Wings'],
        'Combos': ['10 pc Grilled Wings Combo', '8 pc Spicy Wings Combo'],
    }

    # Secondary category
    dips = {
        'Ranch': ['Ranch Dip - Regular', 'Ranch Dip - Large'],
        'Blue_Cheese': ['Blue Cheese Dip - Regular', 'Blue Cheese Dip - Large'],
        'Other': ['Buffalo Dip', 'BBQ Dip'],
    }

    # Complementary category
    sides = {
        'Fries': ['Regular Buffalo Fries', 'Large Cheese Fries', 'Medium Buffalo Fries'],
        'Corn': ['Fried Corn - Regular', 'Fried Corn - Large'],
        'Other': ['Veggie Sticks', 'Onion Rings'],
    }

    # Essential category
    beverages = {
        'Soda': ['20 Oz Soda', '32 Oz Soda'],
        'Specialty': ['Large Fruit Punch', 'Sweet Tea', 'Lemonade'],
    }

    return {
        'Wings': wings,
        'Dips': dips,
        'Sides': sides,
        'Beverages': beverages,
    }

# 3. Define the context-aware recommender
def context_aware_recommender(current_cart, customer_data, order_context, rules, popular_items, top_n=3):
    """Rank add-on items from MBA output plus time-of-day, weekday and channel boosts.

    Args:
        current_cart: Items already in the basket; boosted items found here are skipped.
        customer_data: Accepted for interface compatibility; not read here.
        order_context: Mapping that may hold 'hour', 'day_of_week' and 'channel'.
        rules: Association rules forwarded to ``wings_r_us_recommender``.
        popular_items: Popular-item fallback forwarded to ``wings_r_us_recommender``.
        top_n: Number of item names to return.

    Returns:
        Up to ``top_n`` item names, highest accumulated score first.
    """

    scores = {}

    def boost(candidates, bonus):
        # Only items the customer has not already picked receive the bonus
        for candidate in candidates:
            if candidate not in current_cart:
                scores[candidate] = scores.get(candidate, 0) + bonus

    # Baseline: the six best MBA suggestions, worth 6 points down to 1
    for rank, candidate in enumerate(wings_r_us_recommender(current_cart, rules, popular_items, top_n=6)):
        scores[candidate] = scores.get(candidate, 0) + (6 - rank) * 1.0

    hour = order_context.get('hour', 12)
    day_of_week = order_context.get('day_of_week', 1)

    if 11 <= hour <= 14:
        # Lunch window: favour quick items
        boost(['10 pc Grilled Wings', '8 pc Grilled Wings', '20 Oz Soda', 'Ranch Dip - Regular'], 2.0)
    elif 17 <= hour <= 21:
        # Dinner window: favour larger items
        boost(['20pc Spicy Feast Deal', '15 pc Grilled Wings', '40pc Family Wings', 'Large Cheese Fries'], 2.5)

    # Friday/Saturday: favour family-size items
    if day_of_week in [4, 5]:
        boost(['40pc Family Wings', '50 pc Grilled Wings'], 1.5)

    # Channel-specific add-ons
    if order_context.get('channel', 'Digital') == 'Delivery':
        boost(['32 Oz Soda', 'Large Fruit Punch', 'Blue Cheese Dip - Large'], 1.0)

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _ in ranked[:top_n]]

# 4. Define the hierarchical recommender
def hierarchical_recommender(current_cart, rules, popular_items, hierarchy, top_n=3):
    """Rank add-on items from MBA output plus cross-category suggestions.

    Args:
        current_cart: Items already in the basket.
        rules: Association rules forwarded to ``wings_r_us_recommender``.
        popular_items: Popular-item fallback forwarded to ``wings_r_us_recommender``.
        hierarchy: Mapping main category -> sub-category -> list of item names.
        top_n: Number of item names to return.

    Returns:
        Up to ``top_n`` item names, highest accumulated score first.
    """

    # Which main categories complement which
    complements = {
        'Wings': ['Dips', 'Beverages'],
        'Dips': ['Wings', 'Sides'],
        'Sides': ['Beverages', 'Dips'],
        'Beverages': ['Wings', 'Sides']
    }

    tally = {}

    # Standard MBA: five suggestions worth 10, 8, 6, 4, 2
    for position, suggestion in enumerate(wings_r_us_recommender(current_cart, rules, popular_items, top_n=5)):
        tally[suggestion] = tally.get(suggestion, 0) + (5 - position) * 2.0

    # Main categories represented in the cart (exact name match against the hierarchy)
    cart_categories = set()
    for cart_item in current_cart:
        for family, groups in hierarchy.items():
            for members in groups.values():
                if cart_item in members:
                    cart_categories.add(family)

    # Suggest the first two items of every sub-category in each complementary category
    for family in cart_categories:
        for partner in complements.get(family, []):
            if partner not in hierarchy:
                continue
            for members in hierarchy[partner].values():
                for suggestion in members[:2]:
                    if suggestion not in current_cart:
                        tally[suggestion] = tally.get(suggestion, 0) + 1.5

    ordered = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return [suggestion for suggestion, _ in ordered[:top_n]]

# 5. Define customer journey analysis
def analyze_customer_journeys(df):
    """Collect each repeat customer's orders in date order.

    Args:
        df: Order-level DataFrame with CUSTOMER_ID, ORDER_CREATED_DATE and
            item1..item6 columns.

    Returns:
        Dict mapping customer id -> list of ``{'items': [...], 'date': ...}``
        dicts, one per non-empty order, for customers with at least two such
        orders.
    """

    item_columns = [f'item{slot}' for slot in range(1, 7)]
    chronological = df.sort_values(['CUSTOMER_ID', 'ORDER_CREATED_DATE'])

    journeys = {}

    for customer_id, history in chronological.groupby('CUSTOMER_ID'):
        # A single order cannot form a journey
        if len(history) < 2:
            continue

        baskets = []
        for _, row in history.iterrows():
            basket = [row[column] for column in item_columns if pd.notna(row[column])]
            if not basket:
                continue
            baskets.append({'items': basket, 'date': row['ORDER_CREATED_DATE']})

        # Orders without any item were skipped above, so check again
        if len(baskets) >= 2:
            journeys[customer_id] = baskets

    return journeys

# 6. Define journey-based recommender
def journey_based_recommender(current_cart, customer_id, customer_patterns, rules, popular_items, top_n=3):
    """Rank add-on items from MBA output plus the customer's own order history.

    Args:
        current_cart: Items already in the basket.
        customer_id: Key looked up in ``customer_patterns``.
        customer_patterns: Mapping customer id -> list of past orders, each a
            dict with an 'items' list.
        rules: Association rules forwarded to ``wings_r_us_recommender``.
        popular_items: Popular-item fallback forwarded to ``wings_r_us_recommender``.
        top_n: Number of item names to return.

    Returns:
        Up to ``top_n`` item names, highest accumulated score first.
    """

    scores = {}

    def credit(name, amount):
        scores[name] = scores.get(name, 0) + amount

    # Base MBA recommendations: four suggestions worth 6, 4.5, 3, 1.5
    for rank, name in enumerate(wings_r_us_recommender(current_cart, rules, popular_items, top_n=4)):
        credit(name, (4 - rank) * 1.5)

    if customer_id in customer_patterns:
        history = customer_patterns[customer_id]

        # Items that accompanied any current cart item in an earlier order
        for earlier in history:
            earlier_items = set(earlier['items'])
            if set(current_cart).isdisjoint(earlier_items):
                continue
            for name in earlier_items:
                if name not in current_cart:
                    credit(name, 1.0)

        # The customer's five most frequent items, weighted by purchase count
        purchased = [name for earlier in history for name in earlier['items']]
        if purchased:
            favourites = pd.Series(purchased).value_counts().head(5)
            for name, count in favourites.items():
                if name not in current_cart:
                    credit(name, count * 0.5)

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked[:top_n]]

In [17]:
# 7. Define the ultimate improved recommender, which blends the MBA, context, hierarchy and journey recommenders
def ultimate_improved_recommender(current_cart, customer_id, customer_data, order_context,
                                advanced_rules, hierarchy, customer_patterns, popular_items, top_n=3):
    """Blend the MBA, context, hierarchy and journey recommenders into one ranking.

    Every component contributes a ranked list; an item at position ``i`` of a
    list of depth ``d`` earns ``(d - i) * weight`` points, and points are
    summed across components.

    Returns:
        Up to ``top_n`` item names, highest combined score first.
    """

    blended = {}

    def absorb(ranked, depth, weight):
        # Earlier positions in a component's list earn more points
        for position, candidate in enumerate(ranked):
            blended[candidate] = blended.get(candidate, 0) + (depth - position) * weight

    # 1. Advanced MBA (40% weight)
    absorb(wings_r_us_recommender(current_cart, advanced_rules, popular_items, top_n=5), 5, 2.0)

    # 2. Context-aware (25% weight)
    absorb(
        context_aware_recommender(current_cart, customer_data, order_context, advanced_rules, popular_items, top_n=4),
        4, 1.25,
    )

    # 3. Hierarchical (20% weight)
    absorb(hierarchical_recommender(current_cart, advanced_rules, popular_items, hierarchy, top_n=4), 4, 1.0)

    # 4. Customer journey (15% weight)
    absorb(
        journey_based_recommender(current_cart, customer_id, customer_patterns, advanced_rules, popular_items, top_n=3),
        3, 0.75,
    )

    leaderboard = sorted(blended, key=blended.get, reverse=True)
    return leaderboard[:top_n]

# 8. Evaluation harness: compares the combined recommender with the MBA baseline (falls back to MBA on errors)
def test_ultimate_improved_model(df, sample_size=1000):
    """Test all improvements combined"""

    print("Preparing advanced components...")

    # Prepare components with error handling
    try:
        advanced_rules = advanced_association_rules(df)
        print(f"‚úÖ Advanced rules created: {len(advanced_rules)} rules")
    except Exception as e:
        print(f"‚ö†Ô∏è Using original rules due to error: {e}")
        advanced_rules = rules

    hierarchy = create_item_hierarchy()
    print("‚úÖ Item hierarchy created")

    customer_patterns = analyze_customer_journeys(df)
    print(f"‚úÖ Customer patterns analyzed: {len(customer_patterns)} customers")

    multi_item_orders = df[df['basket_size'] >= 2].sample(n=sample_size, random_state=42)

    improved_correct = 0
    mba_correct = 0
    total_tests = 0

    print(f"Testing ultimate improved model on {sample_size} orders...")

    for _, order in tqdm(multi_item_orders.iterrows(), total=len(multi_item_orders)):
        original_items = [order[f'item{i}'] for i in range(1, 7) if pd.notna(order[f'item{i}'])]

        if len(original_items) < 2:
            continue

        target_item = random.choice(original_items)
        remaining_cart = [item for item in original_items if item != target_item]
        customer_id = order['CUSTOMER_ID']

        # Create context
        order_datetime = pd.to_datetime(order['ORDER_CREATED_DATE'])
        context = {
            'hour': order_datetime.hour,
            'day_of_week': order_datetime.dayofweek,
            'channel': order.get('ORDER_CHANNEL_NAME', 'Digital')
        }

        # Test improved model
        try:
            improved_recs = ultimate_improved_recommender(
                remaining_cart, customer_id, None, context,
                advanced_rules, hierarchy, customer_patterns, popular_items, top_n=3
            )
        except Exception as e:
            # Fallback to MBA if improved model fails
            improved_recs = wings_r_us_recommender(remaining_cart, rules, popular_items, top_n=3)

        # Test baseline MBA
        mba_recs = wings_r_us_recommender(remaining_cart, rules, popular_items, top_n=3)

        if target_item in improved_recs:
            improved_correct += 1
        if target_item in mba_recs:
            mba_correct += 1

        total_tests += 1

    improved_recall = improved_correct / total_tests
    mba_recall = mba_correct / total_tests
    improvement = improved_recall - mba_recall

    print(f"\n=== ULTIMATE IMPROVED MODEL RESULTS ===")
    print(f"Original MBA Recall@3: {mba_recall:.3f} ({mba_recall*100:.1f}%)")
    print(f"Improved Model Recall@3: {improved_recall:.3f} ({improved_recall*100:.1f}%)")
    print(f"Improvement: +{improvement:.3f} (+{improvement*100:.1f} percentage points)")

    if improved_recall > mba_recall:
        print(f"üéâ SUCCESS! Improved model beats MBA baseline!")
    else:
        print(f"‚ö†Ô∏è Improved model didn't beat baseline - stick with MBA")

    return improved_recall, mba_recall

# 9. Run the evaluation on `df`, the order DataFrame loaded at the top of the notebook.
print("Starting ultimate improved model test...")
improved_recall, baseline_recall = test_ultimate_improved_model(df)  # returns (improved recall, MBA baseline recall)

Starting ultimate improved model test...
Preparing advanced components...
‚úÖ Advanced rules created: 102 rules
‚úÖ Item hierarchy created
‚úÖ Customer patterns analyzed: 230612 customers
Testing ultimate improved model on 1000 orders...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:24<00:00, 40.68it/s]


=== ULTIMATE IMPROVED MODEL RESULTS ===
Original MBA Recall@3: 0.296 (29.6%)
Improved Model Recall@3: 0.315 (31.5%)
Improvement: +0.019 (+1.9 percentage points)
üéâ SUCCESS! Improved model beats MBA baseline!





In [18]:
def _merge_ranked_scores(score_board, ranked_items, depth, weight):
    """Add ``(depth - position) * weight`` to ``score_board`` for each ranked item."""
    for position, item in enumerate(ranked_items):
        score_board[item] = score_board.get(item, 0) + (depth - position) * weight


def optimized_ultimate_recommender(current_cart, customer_id, customer_data, order_context,
                                 advanced_rules, hierarchy, customer_patterns, popular_items,
                                 weights, top_n=3):
    """Blend five recommenders into one ranking using caller-supplied weights.

    Args:
        current_cart: Items already in the basket.
        customer_id: Customer key forwarded to the journey recommender.
        customer_data: Forwarded to the context recommender.
        order_context: Mapping with optional 'hour', 'day_of_week', 'channel'.
        advanced_rules: Association rules used by every MBA-based component.
        hierarchy: Menu taxonomy for the hierarchical recommender.
        customer_patterns: Per-customer order history for the journey recommender.
        popular_items: Popular-item fallback list.
        weights: Mapping with 'mba', 'context', 'hierarchical' and 'journey'
            multipliers; 'category' is optional and defaults to 0.5.
        top_n: Number of item names to return.

    Returns:
        Up to ``top_n`` item names, highest combined score first. When no
        component produced a score, the plain MBA recommendation is returned.
    """
    import logging

    logger = logging.getLogger(__name__)
    all_recommendations = {}

    # (weight key, list depth, deferred call) for the four fallible components
    components = (
        ('mba', 6,
         lambda: wings_r_us_recommender(current_cart, advanced_rules, popular_items, top_n=6)),
        ('context', 5,
         lambda: context_aware_recommender(current_cart, customer_data, order_context,
                                           advanced_rules, popular_items, top_n=5)),
        ('hierarchical', 5,
         lambda: hierarchical_recommender(current_cart, advanced_rules, popular_items, hierarchy, top_n=5)),
        ('journey', 4,
         lambda: journey_based_recommender(current_cart, customer_id, customer_patterns,
                                           advanced_rules, popular_items, top_n=4)),
    )

    for name, depth, produce in components:
        # Best-effort: a failing component is skipped, but the failure is
        # logged and KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            _merge_ranked_scores(all_recommendations, produce(), depth, weights[name])
        except Exception as exc:
            logger.warning("Recommender component %r skipped: %r", name, exc)

    # 5. Enhanced category boost
    category_boost = enhanced_category_recommender(current_cart, popular_items, top_n=3)
    _merge_ranked_scores(all_recommendations, category_boost, 3, weights.get('category', 0.5))

    # Nothing scored at all: fall back to plain MBA
    if not all_recommendations:
        return wings_r_us_recommender(current_cart, advanced_rules, popular_items, top_n)

    final_recs = sorted(all_recommendations.items(), key=lambda x: x[1], reverse=True)
    return [item for item, score in final_recs[:top_n]]

def enhanced_category_recommender(current_cart, popular_items, top_n=3):
    """Suggest items that complete missing categories or upsize cart items.

    Args:
        current_cart: Item names already in the basket.
        popular_items: Accepted for interface compatibility; not read here.
        top_n: Maximum number of suggestions to return.

    Returns:
        Up to ``top_n`` unique item names that are not already in the cart,
        category-completion suggestions first, size upgrades after.
    """

    def cart_mentions(*keywords):
        # Case-insensitive substring match against every cart item
        return any(
            any(keyword in cart_item.lower() for keyword in keywords)
            for cart_item in current_cart
        )

    wings_present = cart_mentions('wings')
    dips_present = cart_mentions('dip')
    sides_present = cart_mentions('fries', 'corn')
    drinks_present = cart_mentions('soda', 'oz', 'punch')
    spicy_present = cart_mentions('spicy')

    # (condition, suggestions) pairs evaluated in order
    completion_rules = (
        (wings_present and not dips_present, ['Ranch Dip - Regular', 'Blue Cheese Dip - Regular']),
        (wings_present and not drinks_present, ['20 Oz Soda', '32 Oz Soda']),
        (spicy_present and not sides_present, ['Regular Buffalo Fries', 'Large Cheese Fries']),
        (dips_present and not wings_present, ['10 pc Grilled Wings', '8 pc Grilled Wings']),
    )

    candidates = []
    for applies, additions in completion_rules:
        if applies:
            candidates.extend(additions)

    # Size upgrades: only the first matching step is applied to each cart item
    size_steps = (('8 pc', '10 pc'), ('10 pc', '15 pc'), ('Regular', 'Large'))
    for cart_item in current_cart:
        for smaller, bigger in size_steps:
            if smaller in cart_item:
                upsized = cart_item.replace(smaller, bigger)
                if upsized not in current_cart:
                    candidates.append(upsized)
                break

    # Keep first occurrences only and drop anything already in the cart
    picked = []
    for candidate in candidates:
        if candidate in current_cart or candidate in picked:
            continue
        picked.append(candidate)

    return picked[:top_n]

def test_weight_combinations(df, sample_size=800):
    """Test multiple weight combinations to find optimal"""

    # Prepare components once
    print("Preparing optimization components...")
    advanced_rules = advanced_association_rules(df)
    hierarchy = create_item_hierarchy()
    customer_patterns = analyze_customer_journeys(df)

    # Define weight combinations to test
    weight_combinations = [
        # Current weights (baseline)
        {'mba': 2.0, 'context': 1.25, 'hierarchical': 1.0, 'journey': 0.75, 'category': 0.5},

        # MBA-focused
        {'mba': 2.5, 'context': 1.0, 'hierarchical': 0.8, 'journey': 0.6, 'category': 0.4},

        # Context-focused
        {'mba': 1.8, 'context': 1.8, 'hierarchical': 1.0, 'journey': 0.8, 'category': 0.6},

        # Balanced approach
        {'mba': 2.2, 'context': 1.4, 'hierarchical': 1.2, 'journey': 0.9, 'category': 0.7},

        # Journey-enhanced
        {'mba': 2.0, 'context': 1.2, 'hierarchical': 0.9, 'journey': 1.2, 'category': 0.5},

        # Category-boosted
        {'mba': 1.9, 'context': 1.3, 'hierarchical': 1.1, 'journey': 0.8, 'category': 1.0},

        # High-performance mix
        {'mba': 2.3, 'context': 1.6, 'hierarchical': 1.3, 'journey': 1.0, 'category': 0.8}
    ]

    best_recall = 0
    best_weights = None
    best_results = []

    multi_item_orders = df[df['basket_size'] >= 2].sample(n=sample_size, random_state=42)

    for idx, weights in enumerate(weight_combinations):
        print(f"\nüîç Testing weight combination {idx+1}/{len(weight_combinations)}")
        print(f"Weights: {weights}")

        correct = 0
        total = 0

        for _, order in tqdm(multi_item_orders.iterrows(), total=len(multi_item_orders), desc=f"Testing weights {idx+1}"):
            original_items = [order[f'item{i}'] for i in range(1, 7) if pd.notna(order[f'item{i}'])]

            if len(original_items) < 2:
                continue

            target_item = random.choice(original_items)
            remaining_cart = [item for item in original_items if item != target_item]
            customer_id = order['CUSTOMER_ID']

            # Create context
            order_datetime = pd.to_datetime(order['ORDER_CREATED_DATE'])
            context = {
                'hour': order_datetime.hour,
                'day_of_week': order_datetime.dayofweek,
                'channel': order.get('ORDER_CHANNEL_NAME', 'Digital')
            }

            # Test with these weights
            try:
                recommendations = optimized_ultimate_recommender(
                    remaining_cart, customer_id, None, context,
                    advanced_rules, hierarchy, customer_patterns,
                    popular_items, weights, top_n=3
                )

                if target_item in recommendations:
                    correct += 1

            except Exception as e:
                # Fallback
                fallback_recs = wings_r_us_recommender(remaining_cart, rules, popular_items, top_n=3)
                if target_item in fallback_recs:
                    correct += 1

            total += 1

        recall = correct / total if total > 0 else 0
        print(f"Recall@3: {recall:.3f} ({recall*100:.1f}%)")

        best_results.append({
            'weights': weights,
            'recall': recall,
            'improvement': recall - 0.314  # Your current baseline
        })

        if recall > best_recall:
            best_recall = recall
            best_weights = weights
            print(f"üéâ NEW BEST! Improvement: +{(recall - 0.314)*100:.1f} percentage points")

    # Sort results
    best_results.sort(key=lambda x: x['recall'], reverse=True)

    print(f"\n=== WEIGHT OPTIMIZATION RESULTS ===")
    print(f"üèÜ BEST PERFORMANCE: {best_recall:.3f} ({best_recall*100:.1f}%)")
    print(f"üöÄ IMPROVEMENT: +{(best_recall - 0.314)*100:.1f} percentage points over 31.4% baseline")
    print(f"‚öôÔ∏è  OPTIMAL WEIGHTS: {best_weights}")

    print(f"\nüìä TOP 3 WEIGHT COMBINATIONS:")
    for i, result in enumerate(best_results[:3], 1):
        print(f"{i}. Recall: {result['recall']:.3f} (+{result['improvement']*100:.1f}%) | Weights: {result['weights']}")

    return best_recall, best_weights, best_results

# Run weight optimization on `df`; the call returns (best recall, best weight dict, all per-combination results)
print("üî¨ Starting comprehensive weight optimization...")
best_recall, optimal_weights, all_results = test_weight_combinations(df)

üî¨ Starting comprehensive weight optimization...
Preparing optimization components...

üîç Testing weight combination 1/7
Weights: {'mba': 2.0, 'context': 1.25, 'hierarchical': 1.0, 'journey': 0.75, 'category': 0.5}


Testing weights 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:21<00:00, 36.99it/s]


Recall@3: 0.309 (30.9%)
üéâ NEW BEST! Improvement: +-0.5 percentage points

üîç Testing weight combination 2/7
Weights: {'mba': 2.5, 'context': 1.0, 'hierarchical': 0.8, 'journey': 0.6, 'category': 0.4}


Testing weights 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:17<00:00, 46.20it/s]


Recall@3: 0.302 (30.2%)

üîç Testing weight combination 3/7
Weights: {'mba': 1.8, 'context': 1.8, 'hierarchical': 1.0, 'journey': 0.8, 'category': 0.6}


Testing weights 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:17<00:00, 45.90it/s]


Recall@3: 0.312 (31.2%)
üéâ NEW BEST! Improvement: +-0.2 percentage points

üîç Testing weight combination 4/7
Weights: {'mba': 2.2, 'context': 1.4, 'hierarchical': 1.2, 'journey': 0.9, 'category': 0.7}


Testing weights 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:17<00:00, 46.16it/s]


Recall@3: 0.297 (29.8%)

üîç Testing weight combination 5/7
Weights: {'mba': 2.0, 'context': 1.2, 'hierarchical': 0.9, 'journey': 1.2, 'category': 0.5}


Testing weights 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:17<00:00, 46.03it/s]


Recall@3: 0.321 (32.1%)
üéâ NEW BEST! Improvement: +0.7 percentage points

üîç Testing weight combination 6/7
Weights: {'mba': 1.9, 'context': 1.3, 'hierarchical': 1.1, 'journey': 0.8, 'category': 1.0}


Testing weights 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:18<00:00, 42.50it/s]


Recall@3: 0.299 (29.9%)

üîç Testing weight combination 7/7
Weights: {'mba': 2.3, 'context': 1.6, 'hierarchical': 1.3, 'journey': 1.0, 'category': 0.8}


Testing weights 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:18<00:00, 43.41it/s]


Recall@3: 0.329 (32.9%)
üéâ NEW BEST! Improvement: +1.5 percentage points

=== WEIGHT OPTIMIZATION RESULTS ===
üèÜ BEST PERFORMANCE: 0.329 (32.9%)
üöÄ IMPROVEMENT: +1.5 percentage points over 31.4% baseline
‚öôÔ∏è  OPTIMAL WEIGHTS: {'mba': 2.3, 'context': 1.6, 'hierarchical': 1.3, 'journey': 1.0, 'category': 0.8}

üìä TOP 3 WEIGHT COMBINATIONS:
1. Recall: 0.329 (+1.5%) | Weights: {'mba': 2.3, 'context': 1.6, 'hierarchical': 1.3, 'journey': 1.0, 'category': 0.8}
2. Recall: 0.321 (+0.7%) | Weights: {'mba': 2.0, 'context': 1.2, 'hierarchical': 0.9, 'journey': 1.2, 'category': 0.5}
3. Recall: 0.312 (+-0.2%) | Weights: {'mba': 1.8, 'context': 1.8, 'hierarchical': 1.0, 'journey': 0.8, 'category': 0.6}


In [35]:
def generate_final_test_submission(test_path="/content/test_data_question (1).csv",
                                   output_path="Wings_R_Us_Final_1000_Customers_Submission.xlsx",
                                   weights=None):
    """Generate the test-set submission file with the weighted ensemble recommender.

    Reads the test orders, produces exactly three recommendations per order and
    writes them to an Excel file in the submission layout.

    Relies on notebook globals defined in earlier cells: ``df`` (training
    orders), ``rules`` and ``popular_items``.

    Args:
        test_path: CSV file with the test orders.
        output_path: Excel file to write the submission to.
        weights: Ensemble weights for ``optimized_ultimate_recommender``; when
            None, the best set found by the weight search is used.

    Returns:
        The submission DataFrame with columns CUSTOMER_ID, ORDER_ID,
        Item1..Item4 and RECOMMENDATION 1..3.
    """

    import pandas as pd
    from tqdm import tqdm

    # Load test data
    test_data = pd.read_csv(test_path)
    print(f"üìã Loaded test data: {len(test_data)} customers")

    # Default: best weight set from the weight search (32.9% on the training sample)
    optimal_weights = weights if weights is not None else {
        'mba': 2.3,
        'context': 1.6,
        'hierarchical': 1.3,
        'journey': 1.0,
        'category': 0.8
    }

    print("üöÄ Preparing optimized components...")

    # Prepare all advanced components from the training frame `df`
    try:
        advanced_rules = advanced_association_rules(df)
        print(f"‚úÖ Advanced rules: {len(advanced_rules)}")
    except Exception as e:
        print(f"‚ö†Ô∏è Using original rules due to error: {e}")
        advanced_rules = rules
        print(f"‚úÖ Fallback to original rules: {len(advanced_rules)}")

    try:
        hierarchy = create_item_hierarchy()
        print("‚úÖ Item hierarchy created")
    except Exception as e:
        hierarchy = {}
        print(f"‚ö†Ô∏è Empty hierarchy fallback due to error: {e}")

    try:
        customer_patterns = analyze_customer_journeys(df)
        print(f"‚úÖ Customer patterns: {len(customer_patterns)}")
    except Exception as e:
        customer_patterns = {}
        print(f"‚ö†Ô∏è Empty customer patterns fallback due to error: {e}")

    print(f"üéØ Generating predictions for {len(test_data)} test customers...")

    # Dynamically identify item columns in test data
    test_item_cols = [col for col in test_data.columns if col.startswith('item') and not col.endswith('_price')]
    print(f"Detected test item columns: {test_item_cols}")

    required_cols = ['CUSTOMER_ID', 'ORDER_ID', 'Item1', 'Item2', 'Item3', 'Item4',
                     'RECOMMENDATION 1', 'RECOMMENDATION 2', 'RECOMMENDATION 3']
    submission_data = []

    for idx, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Processing customers"):

        # Extract customer and order info
        customer_id = row.get('CUSTOMER_ID', f'CUST_{idx+1}')
        order_id = row.get('ORDER_ID', f'ORD_{idx+1}')

        # Extract current cart using only available item columns
        current_cart = [row[col] for col in test_item_cols if pd.notna(row[col])]

        # Order context; defaults apply when the date is missing or unparseable
        channel = row.get('ORDER_CHANNEL_NAME', 'Digital')
        context = {'hour': 12, 'day_of_week': 1, 'channel': channel}
        try:
            order_datetime = pd.to_datetime(row.get('ORDER_CREATED_DATE'))
            # A plain truthiness test would accept NaN (missing CSV cell) and
            # yield NaT with NaN hour/day, so check the parsed value instead.
            if pd.notna(order_datetime):
                context = {
                    'hour': order_datetime.hour,
                    'day_of_week': order_datetime.dayofweek,
                    'channel': channel
                }
        except Exception as e:
            print(f"‚ö†Ô∏è Date parsing error for order {order_id}: {e}. Using default context.")

        # Generate recommendations, falling back to the MBA baseline on failure
        try:
            recommendations = optimized_ultimate_recommender(
                current_cart, customer_id, None, context,
                advanced_rules, hierarchy, customer_patterns,
                popular_items, optimal_weights, top_n=3
            )
        except Exception as e:
            print(f"‚ö†Ô∏è Fallback for customer {customer_id}, order {order_id}: {e}")
            recommendations = wings_r_us_recommender(current_cart, rules, popular_items, top_n=3)

        # Pad to exactly 3: popular items first (single pass), then empty strings
        recommendations = list(recommendations)
        for item in popular_items:
            if len(recommendations) >= 3:
                break
            if item not in current_cart and item not in recommendations:
                recommendations.append(item)
        recommendations.extend([''] * (3 - len(recommendations)))

        # Submission row: ids, up to 4 cart items, exactly 3 recommendations
        row_data = {'CUSTOMER_ID': customer_id, 'ORDER_ID': order_id}
        shown_items = current_cart[:4]
        for j in range(4):
            row_data[f'Item{j+1}'] = shown_items[j] if j < len(shown_items) else ''
        for j, rec in enumerate(recommendations[:3], 1):
            row_data[f'RECOMMENDATION {j}'] = rec

        submission_data.append(row_data)

    # `columns=` fixes the column order and keeps the columns for an empty test file
    submission_df = pd.DataFrame(submission_data, columns=required_cols)

    # Save to Excel file
    submission_df.to_excel(output_path, index=False)

    print("üèÜ FINAL SUBMISSION COMPLETE!")
    print(f"üìä Generated predictions for {len(submission_df)} customers")
    print(f"üìÅ Saved as: {output_path}")
    # Note: the 32.9% performance was measured on training data, test performance may vary
    print(f"üéØ Model Used: Optimized Ensemble with custom weights")

    # Display first few rows to verify format
    print("\nüìã SUBMISSION PREVIEW (First 5 rows):")
    display(submission_df.head())

    # Summary statistics
    print(f"\nüìà SUBMISSION SUMMARY:")
    total_rows = len(submission_df)
    print(f"Total customers: {total_rows}")
    for rank in (1, 2, 3):
        filled = (submission_df[f'RECOMMENDATION {rank}'] != '').sum()
        share = filled / total_rows * 100 if total_rows else 0.0
        print(f"Non-empty Recommendation {rank}: {filled} ({share:.1f}%)")

    return submission_df

# Build the final test-set submission. The call writes
# "Wings_R_Us_Final_1000_Customers_Submission.xlsx" to the current working
# directory, prints a preview plus a non-empty-recommendation summary, and
# returns the submission DataFrame (kept here as `final_submission`).
# Prerequisite: the components it relies on (rules, popular_items, etc.) must
# already be defined in the kernel - run cells AM47W0qthp8-, 3d37a0aa,
# VS5P6MCzyduX, and xhm08gXZgfZ_ first if they have not been executed.
final_submission = generate_final_test_submission()

📋 Loaded test data: 1000 customers
🚀 Preparing optimized components...
✅ Advanced rules: 102
✅ Item hierarchy created
✅ Customer patterns: 230612
🎯 Generating predictions for 1000 test customers...
Detected test item columns: ['item1', 'item2', 'item3']


Processing customers: 100%|██████████| 1000/1000 [00:23<00:00, 42.65it/s]


🏆 FINAL SUBMISSION COMPLETE!
📊 Generated predictions for 1000 customers
📁 Saved as: Wings_R_Us_Final_1000_Customers_Submission.xlsx
🎯 Model Used: Optimized Ensemble with custom weights

📋 SUBMISSION PREVIEW (First 5 rows):


Unnamed: 0,CUSTOMER_ID,ORDER_ID,Item1,Item2,Item3,Item4,RECOMMENDATION 1,RECOMMENDATION 2,RECOMMENDATION 3
0,997177535,9351345556,Chicken Sub Combo,Ranch Dip - Regular,10 pc Spicy Wings Combo,,Regular Buffalo Fries,10 pc Spicy Wings,10 pc Grilled Wings
1,345593831,3595377080,Regular Buffalo Fries,10 pc Spicy Wings,3 pc Crispy Strips Combo,,Ranch Dip - Regular,10 pc Grilled Wings,15 pc Grilled Wings
2,160955031,4071757785,Large Buffalo Fries,10 pc Spicy Wings,Ranch Dip - Regular,,Regular Buffalo Fries,10 pc Grilled Wings,Ranch Dip - Large
3,890671991,3931766769,6 pc Grilled Wings Combo,20 pc Grilled Wings,Fried Corn - Large,,Ranch Dip - Regular,Regular Buffalo Fries,20pc Spicy Feast Deal
4,73989021,3739700809,Regular Buffalo Fries,20 pc Grilled Wings,Ranch Dip - Large,,Ranch Dip - Regular,10 pc Spicy Wings,10 pc Grilled Wings



📈 SUBMISSION SUMMARY:
Total customers: 1000
Non-empty Recommendation 1: 1000 (100.0%)
Non-empty Recommendation 2: 1000 (100.0%)
Non-empty Recommendation 3: 1000 (100.0%)


In [37]:
# Reload the saved submission to verify it round-trips through Excel.
# Read from the same relative filename that `to_excel` wrote to, instead of a
# hard-coded '/content/...' path: the absolute path only matches when the
# working directory happens to be /content, and otherwise either fails or
# silently picks up a stale file from an earlier run.
dt = pd.read_excel('Wings_R_Us_Final_1000_Customers_Submission.xlsx')

In [38]:
# Preview the first rows of the frame re-read from the Excel file, to check
# the saved submission's column layout and contents.
dt.head()

Unnamed: 0,CUSTOMER_ID,ORDER_ID,Item1,Item2,Item3,Item4,RECOMMENDATION 1,RECOMMENDATION 2,RECOMMENDATION 3
0,997177535,9351345556,Chicken Sub Combo,Ranch Dip - Regular,10 pc Spicy Wings Combo,,Regular Buffalo Fries,10 pc Spicy Wings,10 pc Grilled Wings
1,345593831,3595377080,Regular Buffalo Fries,10 pc Spicy Wings,3 pc Crispy Strips Combo,,Ranch Dip - Regular,10 pc Grilled Wings,15 pc Grilled Wings
2,160955031,4071757785,Large Buffalo Fries,10 pc Spicy Wings,Ranch Dip - Regular,,Regular Buffalo Fries,10 pc Grilled Wings,Ranch Dip - Large
3,890671991,3931766769,6 pc Grilled Wings Combo,20 pc Grilled Wings,Fried Corn - Large,,Ranch Dip - Regular,Regular Buffalo Fries,20pc Spicy Feast Deal
4,73989021,3739700809,Regular Buffalo Fries,20 pc Grilled Wings,Ranch Dip - Large,,Ranch Dip - Regular,10 pc Spicy Wings,10 pc Grilled Wings
