# Food Demand Forecasting

## Problem Statement
Predict demand for the next 10 weeks (146-155) for center-meal combinations.

## Evaluation Metric
**100 × RMSLE** (Root Mean Squared Logarithmic Error)

## Approach
Use a **sklearn Pipeline with DictVectorizer** for a production-ready model, with **interactive Plotly visualizations** for the exploratory analysis.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer
import warnings
warnings.filterwarnings('ignore')

## 2. Load Data

In [None]:
# Read the training rows, the test rows and the two lookup tables.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test_QoiMO9B.csv')
fulfillment_center = pd.read_csv('data/fulfilment_center_info.csv')
meal_info = pd.read_csv('data/meal_info.csv')

# Report the (rows, columns) shape of every frame that was loaded.
for label, frame in (
    ("Train", train),
    ("Test", test),
    ("Centers", fulfillment_center),
    ("Meals", meal_info),
):
    print(f"{label}: {frame.shape}")

In [None]:
# Show the first rows of the order history and of both lookup tables.
for frame in (train, fulfillment_center, meal_info):
    display(frame.head())

## 3. Combine Train and Test

In [None]:
# Tag each row with its origin so the stacked frame can be split again later.
train['is_train'] = 1
test['is_train'] = 0

# Stack train and test, then attach center and meal attributes via left joins
# so every original row is kept.
combined = (
    pd.concat([train, test], ignore_index=True)
    .merge(fulfillment_center, on='center_id', how='left')
    .merge(meal_info, on='meal_id', how='left')
)
print(f"Combined: {combined.shape}")

## 4. Exploratory Data Analysis (Interactive Plotly Visualizations)

In [None]:
# Labelled rows only; copied so later EDA work cannot touch `combined`.
train_data = combined[combined['is_train'] == 1].copy()

# Headline statistics of the target column.
order_stats = train_data['num_orders'].agg(['mean', 'median', 'std'])
print(f"Mean orders: {order_stats['mean']:.2f}")
print(f"Median orders: {order_stats['median']:.2f}")
print(f"Std orders: {order_stats['std']:.2f}")

In [None]:
# Distribution plots: raw order counts next to their log1p transform.
# Each entry: (subplot title, values, trace name, x-axis label).
panels = [
    ('Distribution of Orders', train_data['num_orders'],
     'Orders', "Number of Orders"),
    ('Log-Transformed Distribution', np.log1p(train_data['num_orders']),
     'Log(Orders+1)', "Log(Orders + 1)"),
]

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=tuple(panel[0] for panel in panels)
)

for col_idx, (_, values, trace_name, x_label) in enumerate(panels, start=1):
    fig.add_trace(
        go.Histogram(x=values, nbinsx=50, name=trace_name),
        row=1, col=col_idx
    )
    fig.update_xaxes(title_text=x_label, row=1, col=col_idx)
    fig.update_yaxes(title_text="Frequency", row=1, col=col_idx)

fig.update_layout(
    height=400,
    showlegend=False,
    title_text="Order Distribution Analysis"
)
fig.show()

In [None]:
# Weekly trends: mean and total orders per week, one subplot each.
weekly = (
    train_data.groupby('week')['num_orders']
    .agg(['mean', 'sum'])
    .reset_index()
)

fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('Average Orders per Week', 'Total Orders per Week'),
    vertical_spacing=0.12
)

# Each entry: (aggregate column, trace name, line colour, y-axis label).
weekly_panels = [
    ('mean', 'Avg Orders', 'blue', "Average Orders"),
    ('sum', 'Total Orders', 'orange', "Total Orders"),
]
for row_idx, (stat, trace_name, colour, y_label) in enumerate(weekly_panels, start=1):
    fig.add_trace(
        go.Scatter(x=weekly['week'], y=weekly[stat], mode='lines+markers',
                   name=trace_name, line=dict(color=colour, width=2)),
        row=row_idx, col=1
    )
    fig.update_yaxes(title_text=y_label, row=row_idx, col=1)

# Only the bottom subplot carries the x-axis title.
fig.update_xaxes(title_text="Week", row=2, col=1)
fig.update_layout(height=700, showlegend=False, title_text="Weekly Order Trends")
fig.show()

In [None]:
# Category analysis - Center Type
# Mean orders per center type, highest first.
center_orders = (
    train_data.groupby('center_type')['num_orders']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

fig = px.bar(
    center_orders,
    x='center_type',
    y='num_orders',
    title='Average Orders by Center Type',
    labels={'num_orders': 'Average Orders', 'center_type': 'Center Type'},
    color='num_orders',
    color_continuous_scale='Blues',
)
fig.update_layout(height=400, showlegend=False)
fig.show()

In [None]:
# Category analysis - Meal Category
# Mean orders per meal category, highest first.
category_orders = (
    train_data.groupby('category')['num_orders']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

fig = px.bar(
    category_orders,
    x='category',
    y='num_orders',
    title='Average Orders by Meal Category',
    labels={'num_orders': 'Average Orders', 'category': 'Category'},
    color='num_orders',
    color_continuous_scale='Oranges',
)
fig.update_layout(height=400, showlegend=False)
fig.show()

In [None]:
# Category analysis - Cuisine
# Mean orders per cuisine, highest first.
cuisine_orders = (
    train_data.groupby('cuisine')['num_orders']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

fig = px.bar(
    cuisine_orders,
    x='cuisine',
    y='num_orders',
    title='Average Orders by Cuisine',
    labels={'num_orders': 'Average Orders', 'cuisine': 'Cuisine'},
    color='num_orders',
    color_continuous_scale='Greens',
)
fig.update_layout(height=400, showlegend=False)
fig.show()

In [None]:
# Promotion analysis
# Mean orders for every (emailer, homepage) flag combination.
promo_data = train_data.groupby(['emailer_for_promotion', 'homepage_featured'])['num_orders'].mean().reset_index()

# Build the bar labels column-wise. A row-wise apply(axis=1) would hand each
# row over as a single float Series (num_orders is float), so the flags would
# render as "0.0"/"1.0" instead of "0"/"1".
email_flag = promo_data['emailer_for_promotion'].astype(int).astype(str)
homepage_flag = promo_data['homepage_featured'].astype(int).astype(str)
promo_data['promotion_type'] = "Email: " + email_flag + ", Homepage: " + homepage_flag

fig = px.bar(promo_data, x='promotion_type', y='num_orders',
             title='Average Orders by Promotion Type',
             labels={'num_orders': 'Average Orders', 'promotion_type': 'Promotion'},
             color='num_orders', color_continuous_scale='Purples')
fig.update_layout(height=400, showlegend=False)
fig.show()

## 5. Feature Engineering

**Key point**: Keep categorical variables as strings so that DictVectorizer one-hot encodes them (numeric values are passed through unchanged).

In [None]:
def create_features(df):
    """Return a copy of ``df`` with engineered price, promotion and week features.

    Adds ``discount``, ``discount_percentage``, ``total_promotion`` and
    ``week_mod_4`` / ``week_mod_13`` / ``week_mod_52``, and casts the
    identifier/category columns that are present to ``str`` so a
    DictVectorizer treats them as categorical.

    Args:
        df: DataFrame containing ``base_price``, ``checkout_price``,
            ``emailer_for_promotion``, ``homepage_featured`` and ``week``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    df = df.copy()

    # Price features
    df['discount'] = df['base_price'] - df['checkout_price']
    # A zero base_price makes the ratio +/-inf (or NaN for 0/0). Both mean
    # "percentage undefined", so map them to 0 instead of letting
    # non-finite values reach the estimators.
    df['discount_percentage'] = (
        ((df['discount'] / df['base_price']) * 100)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

    # Promotional features: 0, 1 or 2 active promotion channels
    df['total_promotion'] = df['emailer_for_promotion'] + df['homepage_featured']

    # Time-based features: position of the week within 4-, 13- and 52-week cycles
    df['week_mod_4'] = df['week'] % 4
    df['week_mod_13'] = df['week'] % 13
    df['week_mod_52'] = df['week'] % 52

    # Convert categorical to strings (for DictVectorizer)
    categorical_cols = ['center_id', 'meal_id', 'city_code', 'region_code',
                        'center_type', 'category', 'cuisine']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].astype(str)

    return df

# Engineer features on the stacked train+test frame so both get the same columns.
combined_features = create_features(combined)
print(f"Features created: {combined_features.shape}")

## 6. Prepare Train/Validation/Test Sets

In [None]:
# Separate the labelled and unlabelled rows again using the origin flag.
is_train_flag = combined_features['is_train']
train_df = combined_features.loc[is_train_flag == 1].copy()
test_df = combined_features.loc[is_train_flag == 0].copy()

# Everything except the row id, the target and the origin flag is a model input.
exclude_cols = ['id', 'num_orders', 'is_train']
feature_cols = list(train_df.columns.drop(exclude_cols, errors='ignore'))

print(f"Features: {len(feature_cols)}")
print(feature_cols)

In [None]:
# Feature matrix and target for the labelled rows
X = train_df[feature_cols]
y = train_df['num_orders']

# NOTE(review): this is a random row-level hold-out. Since the goal is to
# forecast future weeks, a split on `week` may give a more realistic
# validation score - confirm before relying on these numbers.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_test = test_df[feature_cols]

# Sample for faster tuning
SAMPLE_SIZE = 50000
if len(X_train) > SAMPLE_SIZE:
    # Seeded generator so the subsample (and anything tuned on it) is
    # reproducible, matching the random_state=42 used everywhere else.
    rng = np.random.default_rng(42)
    sample_indices = rng.choice(len(X_train), SAMPLE_SIZE, replace=False)
    X_train_sample = X_train.iloc[sample_indices].copy()
    y_train_sample = y_train.iloc[sample_indices].copy()
    print(f"Created sample of {SAMPLE_SIZE} records")
else:
    # Training split is already small enough; use it as-is
    X_train_sample = X_train
    y_train_sample = y_train

print(f"Train: {X_train.shape}")
print(f"Sample: {X_train_sample.shape}")
print(f"Val: {X_val.shape}")
print(f"Test: {X_test.shape}")

## 7. Convert to Dictionary Format

DictVectorizer requires a list of dictionaries as input, one dictionary per row.

In [None]:
# One {column: value} dict per row, the input format DictVectorizer consumes.
X_train_dict = X_train.to_dict(orient='records')
X_train_sample_dict = X_train_sample.to_dict(orient='records')
X_val_dict = X_val.to_dict(orient='records')

print("Converted to dict format")

## 8. Define RMSLE Metric

In [None]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Negative predictions are clipped to 0 before the ``log1p`` transform.
    """
    clipped_pred = np.maximum(y_pred, 0)
    log_residual = np.log1p(y_true) - np.log1p(clipped_pred)
    mean_squared_log_error = np.mean(log_residual ** 2)
    return np.sqrt(mean_squared_log_error)

# Wrap rmsle as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, so the scorer reports the negated value and "higher is better" holds
# inside model-selection utilities.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

## 9. Model Comparison with DictVectorizer Pipelines

In [None]:
# Candidate regressors, keyed by the display name used in the results table.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(alpha=1.0, random_state=42),
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, n_jobs=-1)
}

results = []
print(f"{'Model':<20} {'Train RMSLE':<15} {'Val RMSLE':<15} {'100*RMSLE':<15}")
print("=" * 70)

for name, model in models.items():
    # A failing model is reported and skipped so the remaining ones still run.
    try:
        pipeline = Pipeline(steps=[
            ('dict_vectorizer', DictVectorizer(sparse=False)),
            ('model', model),
        ])
        pipeline.fit(X_train_dict, y_train)

        # Score on both splits to expose over-fitting alongside validation quality.
        y_train_pred = pipeline.predict(X_train_dict)
        y_val_pred = pipeline.predict(X_val_dict)
        train_rmsle = rmsle(y_train, y_train_pred)
        val_rmsle = rmsle(y_val, y_val_pred)

        result_row = {
            'Model': name,
            'Train RMSLE': train_rmsle,
            'Val RMSLE': val_rmsle,
            'Score (100*RMSLE)': 100 * val_rmsle,
            'Pipeline': pipeline,
        }
        results.append(result_row)

        print(f"{name:<20} {train_rmsle:<15.4f} {val_rmsle:<15.4f} {100*val_rmsle:<15.4f}")
    except Exception as e:
        print(f"{name:<20} Error: {e}")

In [None]:
# Rank the candidates by validation error, best (lowest) first.
results_df = pd.DataFrame(results).sort_values(by='Val RMSLE')

print("\nBest Model:")
summary_columns = ['Model', 'Val RMSLE', 'Score (100*RMSLE)']
display(results_df[summary_columns].head())

In [None]:
# Interactive model comparison chart
score_column = 'Score (100*RMSLE)'

fig = px.bar(
    results_df,
    x='Model',
    y=score_column,
    title='Model Comparison (Lower is Better)',
    labels={score_column: '100 × RMSLE'},
    color=score_column,
    color_continuous_scale='RdYlGn_r',
    text=score_column,
)

# Print each score above its bar, rounded to two decimals.
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(height=500, showlegend=False)
fig.show()

## 10. Hyperparameter Tuning for Best Model

In [None]:
# results_df is sorted by validation RMSLE, so the first row is the winner.
best_model_name = results_df['Model'].iloc[0]
print(f"Best Model: {best_model_name}")

In [None]:
from sklearn.base import clone

# Define param grids for models worth tuning.
# Keys use the "model__" prefix because the estimator is the pipeline step
# named 'model'.
param_grids = {
    'XGBoost': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [5, 7],
        'model__learning_rate': [0.05, 0.1],
        'model__subsample': [0.8, 0.9]
    },
    'Random Forest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [10, 15, 20],
        'model__min_samples_split': [2, 5]
    },
    'Gradient Boosting': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [3, 5],
        'model__learning_rate': [0.05, 0.1]
    },
    'Decision Tree': {
        'model__max_depth': [5, 10, 15, 20],
        'model__min_samples_split': [2, 5, 10]
    }
}

# Check if best model has a tuning grid
if best_model_name in param_grids:
    print(f"Found tuning grid for {best_model_name}")

    # Unfitted copy of the winning estimator, so neither the search nor the
    # refit below touches the fitted model stored in `results`.
    base_model = clone(models[best_model_name])

    # Create fresh pipeline with the base model
    pipeline = Pipeline([
        ('dict_vectorizer', DictVectorizer(sparse=False)),
        ('model', base_model)
    ])

    # Randomized search over the grid, scored with (negated) RMSLE
    grid = RandomizedSearchCV(
        pipeline,
        param_grids[best_model_name],
        n_iter=20,
        random_state=42,
        cv=3,
        scoring=rmsle_scorer,
        n_jobs=-1,
        verbose=1
    )

    # Search on the subsample that was prepared for faster tuning rather
    # than on the full training split.
    print("Starting random search...")
    grid.fit(X_train_sample_dict, y_train_sample)

    print("\nBest params:", grid.best_params_)

    # Refit the winning configuration on the full training split so the
    # comparison below is against a model trained on the same data as the
    # untuned baseline.
    best_pipeline = clone(pipeline).set_params(**grid.best_params_)
    best_pipeline.fit(X_train_dict, y_train)

    # Show improvement
    original_val_rmsle = results_df.iloc[0]['Val RMSLE']
    tuned_val_pred = best_pipeline.predict(X_val_dict)
    tuned_val_rmsle = rmsle(y_val, tuned_val_pred)
    improvement = ((original_val_rmsle - tuned_val_rmsle) / original_val_rmsle) * 100
    print(f"\nOriginal RMSLE: {original_val_rmsle:.4f}")
    print(f"Tuned RMSLE: {tuned_val_rmsle:.4f}")
    print(f"Improvement: {improvement:.2f}%")
else:
    # No grid for this model: fall back to the pipeline fitted during comparison
    print(f"No tuning grid defined for {best_model_name}")
    print("Using default parameters from comparison")
    best_pipeline = results_df.iloc[0]['Pipeline']
    print("\nTo add hyperparameter tuning for this model:")
    print(f"Add '{best_model_name}' to param_grids dict above")

## 11. Evaluate Best Pipeline

In [None]:
# Score the selected pipeline on the hold-out split.
y_val_pred = best_pipeline.predict(X_val_dict)
val_rmsle_final = rmsle(y_val, y_val_pred)

print(f"Final Validation RMSLE: {val_rmsle_final:.4f}")
print(f"Final Score (100*RMSLE): {100*val_rmsle_final:.4f}")

# Pipeline details: step names and how many columns the vectorizer produced
# from the raw input features.
dict_vec = best_pipeline.named_steps['dict_vectorizer']
step_names = list(best_pipeline.named_steps)
print(f"\nPipeline Steps: {step_names}")
print(f"Input features: {len(feature_cols)}")
print(f"Vectorized features: {len(dict_vec.feature_names_)}")

## 12. Feature Importance (if available)

In [None]:
model = best_pipeline.named_steps['model']

if not hasattr(model, 'feature_importances_'):
    print("Model doesn't have feature_importances_")
else:
    # Pair every vectorized column name with the importance the model reports.
    dict_vec = best_pipeline.named_steps['dict_vectorizer']
    importance_table = pd.DataFrame({
        'feature': dict_vec.feature_names_,
        'importance': model.feature_importances_,
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)

    # Interactive feature importance plot
    top_n = 20
    fig = px.bar(
        feature_importance.head(top_n),
        x='importance',
        y='feature',
        orientation='h',
        title=f'Top {top_n} Most Important Features',
        labels={'importance': 'Importance', 'feature': 'Feature'},
        color='importance',
        color_continuous_scale='Viridis',
    )

    # 'total ascending' ordering places the largest bar at the top.
    fig.update_layout(height=600, yaxis={'categoryorder':'total ascending'}, showlegend=False)
    fig.show()

    print("\nTop 15 Features:")
    display(feature_importance.head(15))

## 13. Retrain on Full Data and Make Predictions

In [None]:
# Retrain on full training data (training + validation rows) before
# predicting on the test set.
X_full_dict = X.to_dict(orient='records')
best_pipeline.fit(X_full_dict, y)
print("✓ Retrained on full data")

In [None]:
# Make predictions; negative outputs are clipped to zero.
X_test_dict = X_test.to_dict(orient='records')
predictions = np.maximum(best_pipeline.predict(X_test_dict), 0)

print(f"Predictions: {len(predictions)}")
for label, value in (
    ("Min", predictions.min()),
    ("Max", predictions.max()),
    ("Mean", predictions.mean()),
):
    print(f"{label}: {value:.2f}")

In [None]:
# Visualize prediction distribution
prediction_hist = go.Histogram(x=predictions, nbinsx=50, name='Test Predictions')
fig = go.Figure(data=[prediction_hist])
fig.update_layout(
    height=400,
    title='Distribution of Test Predictions',
    xaxis_title='Predicted Orders',
    yaxis_title='Frequency',
)
fig.show()

## 14. Save Pipeline and Create Submission

In [None]:
import pickle

# (output path, object to pickle, confirmation message) for each artifact.
artifacts = [
    ('final_model.pkl', best_pipeline, "✓ Pipeline saved as 'final_model.pkl'"),
    ('feature_cols.pkl', feature_cols, "✓ Feature columns saved"),
]

for path, obj, message in artifacts:
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
    print(message)

In [None]:
# Pair each test row id with its prediction, rounded to a whole order count.
submission = pd.DataFrame({'id': test_df['id']})
submission['num_orders'] = predictions.round().astype(int)

submission.to_csv('submission.csv', index=False)
print("✓ Submission saved")
display(submission.head(10))

## Summary

### Key Points:
1. ✅ Used **DictVectorizer** for automatic one-hot encoding
2. ✅ **Pipeline** encapsulates entire workflow
3. ✅ Compared **7 different models**
4. ✅ **Interactive Plotly visualizations** for better EDA
5. ✅ Selected best model and tuned hyperparameters
6. ✅ Saved complete pipeline in **single pickle file**

### Production Ready:
- `train.py` uses this exact approach
- `predict.py` loads pipeline and serves predictions
- Single artifact for deployment

### Visualizations:
- All plots are **interactive** (hover, zoom, pan)
- Professional appearance
- Better for presentations and reports