# SmartInventory - Demand Forecasting Exploration

This notebook explores the sales data and develops the demand forecasting model for SmartInventory.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
# Suppress every warning category for the rest of the session so that cell
# output is not cluttered by library notices.
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the style sheet first, then the palette, because the
# palette call overrides the colour cycle set by the style sheet.
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

print("Libraries imported successfully!")

## 1. Data Loading and Exploration

In [None]:
# Read the sample sales extract (the path is relative to the notebook's
# working directory).
df = pd.read_csv('../sample_sales_data.csv')

# Report dimensions and column names, then leave df.head() as the final
# expression so the notebook renders the preview table.
for line in (
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    f"\nFirst few rows:",
):
    print(line)
df.head()

In [None]:
# Basic statistics
# df.info() prints its own report (it does not return a table), so it is
# called as a statement rather than left as the cell's final expression.
print("Dataset Info:")
df.info()

# df.describe() is the last expression of the cell so the notebook renders
# the resulting summary table.
print("\nNumerical Statistics:")
df.describe()

In [None]:
# Count nulls per column and show only the columns that actually have gaps.
print("Missing Values:")
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

# Convert the date column to datetimes in place (later cells use the .dt
# accessor on it), then report the covered period and the entity counts.
df['date'] = pd.to_datetime(df['date'])
for line in (
    f"\nDate range: {df['date'].min()} to {df['date'].max()}",
    f"Number of unique stores: {df['store_id'].nunique()}",
    f"Number of unique products: {df['sku_id'].nunique()}",
):
    print(line)

## 2. Exploratory Data Analysis

In [None]:
# Sales distribution
# 2x2 grid: histograms of sales, price and on-hand inventory, plus the share
# of promoted vs non-promoted rows.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Sales histogram
axes[0, 0].hist(df['sales'], bins=50, alpha=0.7)
axes[0, 0].set_title('Sales Distribution')
axes[0, 0].set_xlabel('Sales')
axes[0, 0].set_ylabel('Frequency')

# Price histogram
axes[0, 1].hist(df['price'], bins=50, alpha=0.7, color='orange')
axes[0, 1].set_title('Price Distribution')
axes[0, 1].set_xlabel('Price')
axes[0, 1].set_ylabel('Frequency')

# Inventory histogram
axes[1, 0].hist(df['on_hand'], bins=50, alpha=0.7, color='green')
axes[1, 0].set_title('Inventory Distribution')
axes[1, 0].set_xlabel('On Hand')
axes[1, 0].set_ylabel('Frequency')

# Promotions pie chart
# value_counts() orders by frequency, not by flag value, so a fixed label list
# would be swapped whenever promoted rows are the majority and would have the
# wrong length if only one class is present. Sort by flag and derive each
# wedge's label from its own flag value instead.
# NOTE(review): assumes promotions_flag is boolean or 0/1 - confirm.
promo_counts = df['promotions_flag'].value_counts().sort_index()
promo_labels = ['Promotion' if flag else 'No Promotion' for flag in promo_counts.index]
axes[1, 1].pie(promo_counts.values, labels=promo_labels, autopct='%1.1f%%')
axes[1, 1].set_title('Promotion Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Time series analysis
# One row per calendar date with the total, mean and row count of `sales`.
daily_sales = df.groupby('date')['sales'].agg(['sum', 'mean', 'count']).reset_index()

fig, axes = plt.subplots(3, 1, figsize=(15, 12))

# (aggregate column, extra line styling, panel title, y-axis label) per panel,
# top to bottom. The first panel keeps the default line colour.
panels = [
    ('sum', {}, 'Total Daily Sales', 'Total Sales'),
    ('mean', {'color': 'orange'}, 'Average Daily Sales per Transaction', 'Average Sales'),
    ('count', {'color': 'green'}, 'Number of Daily Transactions', 'Transaction Count'),
]
for ax, (stat, line_kwargs, title, ylabel) in zip(axes, panels):
    ax.plot(daily_sales['date'], daily_sales[stat], **line_kwargs)
    ax.set_title(title)
    ax.set_ylabel(ylabel)

# Only the bottom panel carries the shared x-axis label.
axes[2].set_xlabel('Date')

plt.tight_layout()
plt.show()

In [None]:
# Seasonal patterns
# Calendar keys derived from the parsed date column (dayofweek: Monday=0).
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Day of week pattern
# Reindex to all seven weekdays so the bar heights always line up with the
# seven x positions and tick labels; a weekday with no rows becomes NaN (an
# empty bar) instead of causing a length mismatch in bar().
dow_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
dow_sales = df.groupby('day_of_week')['sales'].mean().reindex(range(7))
axes[0].bar(range(7), dow_sales.values)
axes[0].set_xticks(range(7))
axes[0].set_xticklabels(dow_labels)
axes[0].set_title('Average Sales by Day of Week')
axes[0].set_ylabel('Average Sales')

# Monthly pattern
# Bars are placed at the month numbers actually present in the data.
monthly_sales = df.groupby('month')['sales'].mean()
axes[1].bar(monthly_sales.index, monthly_sales.values, color='orange')
axes[1].set_title('Average Sales by Month')
axes[1].set_xlabel('Month')
axes[1].set_ylabel('Average Sales')

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
def prepare_features(df):
    """Build the modelling feature table from the raw sales frame.

    The input is copied, so the caller's frame is not modified. Rows are
    ordered by store, SKU and date, and every history-based feature is
    computed within one (store_id, sku_id) series.

    Every sales-derived feature uses only rows strictly before the current
    one. The target is ``sales`` of the current row, so rolling means are
    taken over the one-step-lagged series and ``inventory_ratio`` uses the
    previous row's sales; otherwise the target would leak into its own
    predictors.

    Args:
        df: DataFrame with columns ``store_id``, ``sku_id``, ``date``,
            ``sales``, ``price``, ``on_hand`` and ``promotions_flag``.

    Returns:
        Tuple ``(features, label_encoders)``: the enriched copy of ``df`` and
        a dict mapping ``'store_id'`` / ``'sku_id'`` to the fitted
        ``LabelEncoder`` used for the ``*_encoded`` columns.
    """
    series_keys = ['store_id', 'sku_id']

    df = df.copy()
    # Idempotent when the column is already datetime; guarantees .dt works.
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(series_keys + ['date'])

    # Encode categorical variables
    label_encoders = {}
    for col in series_keys:
        label_encoders[col] = LabelEncoder()
        df[f'{col}_encoded'] = label_encoders[col].fit_transform(df[col])

    # Time-based features
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['day_of_month'] = df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter

    # Lag features (shifted within each store/SKU series)
    for lag in [1, 7, 14, 30]:
        df[f'sales_lag_{lag}'] = df.groupby(series_keys)['sales'].shift(lag)

    # Rolling averages of *past* sales: rolling over sales_lag_1 means the
    # window ending at row t covers rows t-window .. t-1 and never the target.
    # A two-key groupby().rolling() result is indexed by
    # (store_id, sku_id, original index); both group levels must be dropped
    # for the values to align back onto the frame's own index.
    for window in [7, 14, 30]:
        df[f'sales_rolling_{window}'] = (
            df.groupby(series_keys)['sales_lag_1']
            .rolling(window=window)
            .mean()
            .reset_index(level=[0, 1], drop=True)
        )

    # Price features
    # A previous price of 0 makes pct_change +/-inf, which dropna() would not
    # remove downstream; mark those rows as missing instead.
    df['price_change'] = (
        df.groupby(series_keys)['price']
        .pct_change()
        .replace([np.inf, -np.inf], np.nan)
    )
    df['price_rolling_7'] = (
        df.groupby(series_keys)['price']
        .rolling(window=7)
        .mean()
        .reset_index(level=[0, 1], drop=True)
    )

    # Inventory features
    # Previous row's sales relative to current stock (+1 avoids division by
    # zero). The current row's sales is the target and must not be used here.
    df['inventory_ratio'] = df['sales_lag_1'] / (df['on_hand'] + 1)

    # Promotion features
    df['promotions_flag'] = df['promotions_flag'].astype(int)

    return df, label_encoders

# Prepare features
# prepare_features works on a copy, so `df` keeps its original columns and
# the comparison below lists exactly the columns the function added.
# `encoders` holds the fitted LabelEncoder objects keyed by source column.
df_features, encoders = prepare_features(df)

print(f"Features prepared. Shape: {df_features.shape}")
print(f"New columns: {[col for col in df_features.columns if col not in df.columns]}")

In [None]:
# Model inputs, assembled group by group. The resulting order is significant:
# it fixes the column order of the feature matrix and therefore the order of
# the fitted model's feature importances.
identifier_features = ['store_id_encoded', 'sku_id_encoded']
calendar_features = ['day_of_week', 'month', 'day_of_month', 'quarter']
price_features = ['price', 'price_change', 'price_rolling_7']
stock_and_promo_features = ['on_hand', 'inventory_ratio', 'promotions_flag']
lag_features = [f'sales_lag_{lag}' for lag in (1, 7, 14, 30)]
rolling_features = [f'sales_rolling_{window}' for window in (7, 14, 30)]

feature_columns = [
    *identifier_features,
    *calendar_features,
    *price_features,
    *stock_and_promo_features,
    *lag_features,
    *rolling_features,
]

# Keep only rows where every model input and the target are present; the
# lag and rolling features are NaN at the start of each series.
required_columns = [*feature_columns, 'sales']
df_clean = df_features.dropna(subset=required_columns)

print(f"Clean dataset shape: {df_clean.shape}")
print(f"Removed {len(df_features) - len(df_clean)} rows with missing values")

## 4. Model Development

In [None]:
# Prepare training data
# prepare_features() leaves the rows ordered by store, then SKU, then date, and
# dropna() preserves that order. An unshuffled split on that order would hold
# out the last stores/SKUs rather than the most recent period, so order the
# rows chronologically first. A stable sort keeps the existing store/SKU order
# within each date.
df_model = df_clean.sort_values('date', kind='mergesort')
X = df_model[feature_columns]
y = df_model['sales']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Split data (time-based split): with shuffle=False the final 20% of the
# chronologically ordered rows becomes the test set. random_state is omitted
# because it only affects shuffling.
# NOTE(review): rows sharing the boundary date can land on both sides of the
# cut; switch to an explicit date cutoff if that matters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Baseline: gradient boosting with fixed, untuned hyperparameters.
baseline_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'random_state': 42,
}
baseline_model = GradientBoostingRegressor(**baseline_params)

print("Training baseline model...")
baseline_model.fit(X_train, y_train)

# Predict on both splits (train first, then test).
train_pred, test_pred = (baseline_model.predict(part) for part in (X_train, X_test))

# MAE and RMSE per split; RMSE is the square root of the mean squared error.
train_mae = mean_absolute_error(y_train, train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_mae = mean_absolute_error(y_test, test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))

for line in (
    f"\nBaseline Model Performance:",
    f"Train MAE: {train_mae:.3f}",
    f"Test MAE: {test_mae:.3f}",
    f"Train RMSE: {train_rmse:.3f}",
    f"Test RMSE: {test_rmse:.3f}",
):
    print(line)

In [None]:
# Pair each input column with the baseline model's importance score and rank
# from most to least important.
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': baseline_model.feature_importances_
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features.
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance.iloc[:15], x='importance', y='feature')
plt.title('Top 15 Feature Importances')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

print("Top 10 Most Important Features:")
print(feature_importance.iloc[:10])

In [None]:
# Hyperparameter tuning
# Exhaustive search over 3 * 3 * 3 * 2 * 2 = 108 parameter combinations, each
# scored by 3-fold cross-validated MAE.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 6, 8],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [4, 8]
}

print("Starting hyperparameter tuning...")
grid_search = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',  # sklearn maximises scores, hence negated MAE
    n_jobs=-1,
    verbose=1
)

# Use a subset for faster tuning.
# Draw it from a dedicated, seeded generator so the selected rows, and with
# them the chosen hyperparameters, are reproducible across runs, in line with
# the fixed random_state used for the models.
sample_size = min(10000, len(X_train))
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(len(X_train), size=sample_size, replace=False)
X_sample = X_train.iloc[sample_indices]
y_sample = y_train.iloc[sample_indices]

grid_search.fit(X_sample, y_sample)

print(f"\nBest parameters: {grid_search.best_params_}")
# best_score_ is negated MAE; flip the sign to report a positive error.
print(f"Best CV score: {-grid_search.best_score_:.3f}")

In [None]:
# Take the grid-search winner and fit it on the complete training split
# (the search itself only saw a subsample).
optimized_model = grid_search.best_estimator_
optimized_model.fit(X_train, y_train)

# Predict on both splits (train first, then test).
opt_train_pred, opt_test_pred = (
    optimized_model.predict(part) for part in (X_train, X_test)
)

# Same metrics as for the baseline so the two models are directly comparable.
opt_train_mae = mean_absolute_error(y_train, opt_train_pred)
opt_train_rmse = np.sqrt(mean_squared_error(y_train, opt_train_pred))
opt_test_mae = mean_absolute_error(y_test, opt_test_pred)
opt_test_rmse = np.sqrt(mean_squared_error(y_test, opt_test_pred))

for line in (
    f"\nOptimized Model Performance:",
    f"Train MAE: {opt_train_mae:.3f}",
    f"Test MAE: {opt_test_mae:.3f}",
    f"Train RMSE: {opt_train_rmse:.3f}",
    f"Test RMSE: {opt_test_rmse:.3f}",
):
    print(line)

# Relative reduction in test error versus the baseline, in percent
# (positive means the tuned model is better).
for line in (
    f"\nImprovement over baseline:",
    f"MAE improvement: {((test_mae - opt_test_mae) / test_mae * 100):.1f}%",
    f"RMSE improvement: {((test_rmse - opt_test_rmse) / test_rmse * 100):.1f}%",
):
    print(line)

## 5. Model Evaluation

In [None]:
# Predicted-vs-actual scatter for both models, side by side. Points on the
# dashed red diagonal are perfect predictions.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (predictions, extra scatter styling, panel title), left to right. The
# baseline panel keeps the default marker colour.
panels = [
    (test_pred, {}, f'Baseline Model (MAE: {test_mae:.3f})'),
    (opt_test_pred, {'color': 'orange'}, f'Optimized Model (MAE: {opt_test_mae:.3f})'),
]
for ax, (predicted, scatter_kwargs, title) in zip(axes, panels):
    ax.scatter(y_test, predicted, alpha=0.5, **scatter_kwargs)
    # Identity line spanning the observed range of actual sales.
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    ax.set_xlabel('Actual Sales')
    ax.set_ylabel('Predicted Sales')
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Residual = actual minus predicted, for the optimized model on the test split.
residuals = y_test - opt_test_pred

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
scatter_ax, hist_ax = axes

# Left: residuals against predictions, with a dashed zero-error reference line.
scatter_ax.scatter(opt_test_pred, residuals, alpha=0.5)
scatter_ax.axhline(y=0, color='r', linestyle='--')
scatter_ax.set_xlabel('Predicted Sales')
scatter_ax.set_ylabel('Residuals')
scatter_ax.set_title('Residuals vs Predicted')

# Right: distribution of the residuals.
hist_ax.hist(residuals, bins=50, alpha=0.7)
hist_ax.set_xlabel('Residuals')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Residuals Distribution')

plt.tight_layout()
plt.show()

for line in (
    f"Residuals statistics:",
    f"Mean: {residuals.mean():.3f}",
    f"Std: {residuals.std():.3f}",
    f"Min: {residuals.min():.3f}",
    f"Max: {residuals.max():.3f}",
):
    print(line)

## 6. Model Insights and Conclusions

In [None]:
# Performance by different segments
# X_test.index holds index *labels* carried over from df_clean, so the rows
# must be selected with .loc. Positional .iloc would pick unrelated rows, or
# run out of bounds, once dropna() has removed rows. The selection follows
# X_test's row order, which is also the order of opt_test_pred.
# NOTE(review): assumes df_clean has a unique index (e.g. read_csv's default
# RangeIndex) - confirm.
test_data = df_clean.loc[X_test.index].copy()
test_data['predictions'] = opt_test_pred
test_data['residuals'] = test_data['sales'] - test_data['predictions']
test_data['abs_error'] = test_data['residuals'].abs()

# Performance by store (mean absolute error, worst stores first)
store_performance = test_data.groupby('store_id')['abs_error'].mean().sort_values(ascending=False)

print("Top 10 stores with highest prediction error:")
print(store_performance.head(10))

# Performance by promotion
# .get() with a NaN default keeps the report working when the test split
# happens to contain only promoted or only non-promoted rows.
promo_performance = test_data.groupby('promotions_flag')['abs_error'].mean()
print(f"\nPrediction error by promotion:")
print(f"No promotion: {promo_performance.get(0, np.nan):.3f}")
print(f"With promotion: {promo_performance.get(1, np.nan):.3f}")

In [None]:
# Model summary
# The promotion insight is derived from the measured per-segment errors, so
# the summary cannot contradict the numbers reported for the test split.
no_promo_mae = promo_performance.get(0, np.nan)
promo_mae = promo_performance.get(1, np.nan)
if pd.isna(no_promo_mae) or pd.isna(promo_mae):
    promo_insight = "Promotion effect could not be compared (a segment is missing from the test set)"
elif no_promo_mae < promo_mae:
    promo_insight = "Model performs better on non-promotional periods"
elif no_promo_mae > promo_mae:
    promo_insight = "Model performs better on promotional periods"
else:
    promo_insight = "Model performs equally on promotional and non-promotional periods"

print("=" * 50)
print("MODEL SUMMARY")
print("=" * 50)
print("Algorithm: Gradient Boosting Regressor")
print(f"Best parameters: {grid_search.best_params_}")
print()
print("Performance Metrics:")
print(f"- Test MAE: {opt_test_mae:.3f}")
print(f"- Test RMSE: {opt_test_rmse:.3f}")
# Relative reduction in test MAE versus the baseline model, in percent.
print(f"- Improvement over baseline: {((test_mae - opt_test_mae) / test_mae * 100):.1f}%")
print()
print("Key Insights:")
# Ranked by the baseline model's feature importances.
print(f"- Most important features: {', '.join(feature_importance.head(3)['feature'].tolist())}")
print(f"- {promo_insight}")
print("- Some stores are harder to predict than others")
print()
print("Recommendations:")
print("- Consider store-specific models for high-error stores")
print("- Improve promotion effect modeling")
print("- Collect additional features (weather, events, etc.)")

## 7. Next Steps

1. **Production Implementation**: Use the `ml/scripts/train.py` script for production training
2. **Model Monitoring**: Track forecast accuracy in production and trigger automated retraining when performance degrades
3. **Feature Enhancement**: Add external data sources (weather, holidays, events)
4. **Advanced Models**: Experiment with time series models (ARIMA, Prophet) or deep learning approaches
5. **Business Integration**: Implement inventory optimization based on predictions