# Walmart Sales Data - Comprehensive Analysis
## EDA, AI, ML, and Deep Learning

This notebook performs comprehensive analysis of Walmart sales data including:
- Exploratory Data Analysis (EDA)
- Traditional Machine Learning Models
- Deep Learning Models for Time Series Prediction
- Feature Engineering and Model Optimization

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Reproducibility: the notebook trains Keras networks (random weight
# initialisation, dropout) further down, so seed NumPy and TensorFlow once,
# right after the imports, to make Restart & Run All as repeatable as these
# libraries allow.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("Libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")

## 1. Data Loading and Initial Exploration

In [None]:
# Read the raw sales table from the working directory
df = pd.read_csv('walmart.csv')

print(f"Dataset Shape: {df.shape}")
print("\nFirst few rows:")
df.head(n=10)

In [None]:
# Schema, null counts and duplicate-row count, each under a 50-char rule
print("Dataset Info:", "=" * 50, sep="\n")
df.info()

print("", "=" * 50, "Missing Values:", df.isna().sum(), sep="\n")

print("", "=" * 50, sep="\n")
print(f"Duplicate Rows: {df.duplicated().sum()}")

In [None]:
# Descriptive statistics, transposed so each column of `df` becomes a row
print("Statistical Summary:", "=" * 50, sep="\n")
df.describe().transpose()

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Parse the day-first date strings, then derive the calendar parts in one step
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
date_parts = df['Date'].dt
df = df.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Week=date_parts.isocalendar().week,
    Day=date_parts.day,
    DayOfWeek=date_parts.dayofweek,
    Quarter=date_parts.quarter,
)

# Order rows by store, then by date, and renumber the index from zero
df = df.sort_values(by=['Store', 'Date'], ignore_index=True)

new_columns = ['Year', 'Month', 'Week', 'Day', 'DayOfWeek', 'Quarter']
print("Feature engineering completed!")
print("\nNew columns added:", new_columns)
print("\nDataset shape after feature engineering:", df.shape)
df.head()

In [None]:
# Create lag features for time series: the same store's sales 1-4 weeks earlier
for lag in [1, 2, 3, 4]:
    df[f'Sales_Lag_{lag}'] = df.groupby('Store')['Weekly_Sales'].shift(lag)

# Rolling statistics over the four weeks *before* the current row.
# Weekly_Sales is the prediction target, so a window that contains the current
# week would hand the model part of the answer. shift(1) moves the window back
# by one week so only past sales are used. The first rows of each store come out
# as NaN; they are the same rows for which Sales_Lag_4 is already NaN.
df['Sales_Rolling_Mean_4'] = df.groupby('Store')['Weekly_Sales'].transform(
    lambda sales: sales.shift(1).rolling(4, min_periods=1).mean()
)
df['Sales_Rolling_Std_4'] = df.groupby('Store')['Weekly_Sales'].transform(
    lambda sales: sales.shift(1).rolling(4, min_periods=1).std()
)

# Store statistics (one row per store, kept for inspection)
store_stats = df.groupby('Store')['Weekly_Sales'].agg(['mean', 'std']).reset_index()
store_stats.columns = ['Store', 'Store_Mean_Sales', 'Store_Std_Sales']

# Broadcast the per-store values back onto the rows with transform() rather than
# merge(): re-running this cell then overwrites the two columns instead of
# producing Store_Mean_Sales_x / Store_Mean_Sales_y duplicates.
# NOTE(review): these aggregates use each store's full history, so weeks that
# later land in the test split contribute to them. Recompute them from training
# rows only if a strict hold-out is required.
df['Store_Mean_Sales'] = df.groupby('Store')['Weekly_Sales'].transform('mean')
df['Store_Std_Sales'] = df.groupby('Store')['Weekly_Sales'].transform('std')

print("Time series features created!")
print("\nNew features:", [col for col in df.columns if 'Lag' in col or 'Rolling' in col or 'Store_' in col])

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Distribution of Weekly Sales: four-panel overview
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
(ax_hist, ax_box), (ax_holiday, ax_trend) = axes
panel_title = {'fontsize': 14, 'fontweight': 'bold'}

# Top-left: histogram of the target
ax_hist.hist(df['Weekly_Sales'], bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_title('Distribution of Weekly Sales', **panel_title)
ax_hist.set_xlabel('Weekly Sales')
ax_hist.set_ylabel('Frequency')

# Top-right: box plot of the target
ax_box.boxplot(df['Weekly_Sales'])
ax_box.set_title('Box Plot of Weekly Sales', **panel_title)
ax_box.set_ylabel('Weekly Sales')

# Bottom-left: mean sales grouped by the holiday flag
holiday_sales = df.groupby('Holiday_Flag')['Weekly_Sales'].mean()
ax_holiday.bar(['Non-Holiday', 'Holiday'], holiday_sales.values, color=['skyblue', 'coral'])
ax_holiday.set_title('Average Sales: Holiday vs Non-Holiday', **panel_title)
ax_holiday.set_ylabel('Average Weekly Sales')

# Bottom-right: mean weekly sales per calendar month
monthly_sales = df.groupby(df['Date'].dt.to_period('M'))['Weekly_Sales'].mean()
month_labels = monthly_sales.index.astype(str)
ax_trend.plot(month_labels, monthly_sales.values, marker='o', linewidth=2)
ax_trend.set_title('Average Weekly Sales Over Time', **panel_title)
ax_trend.set_xlabel('Month')
ax_trend.set_ylabel('Average Weekly Sales')
ax_trend.tick_params(axis='x', rotation=45)
ax_trend.set_xticks(ax_trend.get_xticks()[::6])  # keep every 6th tick

plt.tight_layout()
plt.savefig('eda_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

holiday_mean, regular_mean = holiday_sales[1], holiday_sales[0]
print(f"\nAverage Sales on Holidays: ${holiday_mean:,.2f}")
print(f"Average Sales on Non-Holidays: ${regular_mean:,.2f}")
print(f"Percentage Increase on Holidays: {((holiday_mean/regular_mean)-1)*100:.2f}%")

In [None]:
# Correlation Analysis
plt.figure(figsize=(14, 10))

# Columns that enter the pairwise correlation table
numeric_cols = [
    'Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price',
    'CPI', 'Unemployment', 'Month', 'Quarter',
]
correlation_matrix = df.loc[:, numeric_cols].corr()

heatmap_options = dict(
    annot=True, cmap='coolwarm', center=0, square=True,
    linewidths=1, cbar_kws={"shrink": 0.8}, fmt='.2f',
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Rank every column by its correlation with the target, highest first
sales_correlations = correlation_matrix['Weekly_Sales'].sort_values(ascending=False)
print("\nCorrelation with Weekly Sales:", "=" * 50, sales_correlations, sep="\n")

In [None]:
# Store-wise analysis
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
panel_title = {'fontsize': 14, 'fontweight': 'bold'}

# Top-left: stores ranked by their mean weekly sales
store_avg_sales = df.groupby('Store')['Weekly_Sales'].mean().sort_values(ascending=False)
rank_ax = axes[0, 0]
rank_ax.bar(range(len(store_avg_sales)), store_avg_sales.values, color='steelblue')
rank_ax.set_title('Average Weekly Sales by Store', **panel_title)
rank_ax.set_xlabel('Store (sorted by sales)')
rank_ax.set_ylabel('Average Weekly Sales')

# Remaining panels: one scatter of weekly sales against each external driver.
# Each entry is (axes, column, x label, title, extra scatter keyword arguments).
scatter_panels = [
    (axes[0, 1], 'Temperature', 'Temperature', 'Temperature vs Weekly Sales', {}),
    (axes[1, 0], 'Fuel_Price', 'Fuel Price', 'Fuel Price vs Weekly Sales', {'color': 'orange'}),
    (axes[1, 1], 'Unemployment', 'Unemployment Rate', 'Unemployment vs Weekly Sales', {'color': 'green'}),
]
for scatter_ax, column, x_label, title, extra in scatter_panels:
    scatter_ax.scatter(df[column], df['Weekly_Sales'], alpha=0.3, s=10, **extra)
    scatter_ax.set_title(title, **panel_title)
    scatter_ax.set_xlabel(x_label)
    scatter_ax.set_ylabel('Weekly Sales')

plt.tight_layout()
plt.savefig('feature_relationships.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 5 Stores by Average Sales:")
print(store_avg_sales.head(5))

In [None]:
# Seasonal patterns: mean weekly sales by month, quarter and day of week
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Monthly patterns
monthly_avg = df.groupby('Month')['Weekly_Sales'].mean()
axes[0].plot(monthly_avg.index, monthly_avg.values, marker='o', linewidth=2, markersize=8)
axes[0].set_title('Average Sales by Month', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Month')
axes[0].set_ylabel('Average Weekly Sales')
axes[0].grid(True, alpha=0.3)

# Quarterly patterns
quarterly_avg = df.groupby('Quarter')['Weekly_Sales'].mean()
axes[1].bar(quarterly_avg.index, quarterly_avg.values, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A'])
axes[1].set_title('Average Sales by Quarter', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Quarter')
axes[1].set_ylabel('Average Weekly Sales')

# Day of week patterns.
# The groupby only yields the weekdays that actually occur in 'Date'. Plotting
# those values against a fixed range(7) either raises a shape error or, when
# there is a single weekday, broadcasts its mean onto all seven bars.
# Reindexing to 0..6 (Mon..Sun, matching .dt.dayofweek) pins each mean to its
# own weekday; weekdays with no data are NaN and draw no bar.
dow_avg = df.groupby('DayOfWeek')['Weekly_Sales'].mean().reindex(range(7))
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
axes[2].bar(dow_avg.index, dow_avg.values, color='coral')
axes[2].set_title('Average Sales by Day of Week', fontsize=14, fontweight='bold')
axes[2].set_xlabel('Day of Week')
axes[2].set_ylabel('Average Weekly Sales')
axes[2].set_xticks(range(7))
axes[2].set_xticklabels(day_names)

plt.tight_layout()
plt.savefig('seasonal_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Prepare Data for Machine Learning

In [None]:
# Remove rows with NaN values (from lag features)
df_ml = df.dropna().copy()

# Select features for modeling
feature_cols = ['Store', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                'Year', 'Month', 'Week', 'Quarter', 'DayOfWeek',
                'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4',
                'Sales_Rolling_Mean_4', 'Sales_Rolling_Std_4',
                'Store_Mean_Sales', 'Store_Std_Sales']

X = df_ml[feature_cols]
y = df_ml['Weekly_Sales']

# Split data chronologically (80-20 split).
# The frame is ordered by (Store, Date), so slicing the first 80% of rows would
# hold out the last stores rather than the most recent weeks. Instead, take the
# date that sits 80% of the way through the distinct dates as the cutoff:
# every store trains on weeks before it and is tested on weeks from it onwards.
modelling_dates = df_ml['Date'].drop_duplicates().sort_values().reset_index(drop=True)
cutoff_date = modelling_dates.iloc[int(len(modelling_dates) * 0.8)]
is_train = (df_ml['Date'] < cutoff_date).to_numpy()

X_train, X_test = X.loc[is_train], X.loc[~is_train]
y_train, y_test = y.loc[is_train], y.loc[~is_train]

# Every row must land in exactly one of the two splits
assert len(X_train) + len(X_test) == len(X)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"\nFeatures used: {len(feature_cols)}")
print(f"Feature names: {feature_cols}")

In [None]:
# Standardise features: learn mean/variance on the training rows only,
# then apply that same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")

## 5. Traditional Machine Learning Models

In [None]:
# Function to evaluate models
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Args:
        name: Label stored under the ``'Model'`` key of the result.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train, X_test: Feature matrices for the two splits.
        y_train, y_test: Matching targets.

    Returns:
        dict with ``'Model'``, then ``'Train ...'``/``'Test ...'`` entries for
        R², RMSE and MAE (in that order), and ``'Predictions'`` holding the
        test-split predictions.
    """
    model.fit(X_train, y_train)

    actual = {'Train': y_train, 'Test': y_test}
    predicted = {'Train': model.predict(X_train), 'Test': model.predict(X_test)}

    # (column suffix, scoring function) in the order the columns should appear
    scorers = [
        ('R²', r2_score),
        ('RMSE', lambda truth, guess: np.sqrt(mean_squared_error(truth, guess))),
        ('MAE', mean_absolute_error),
    ]

    results = {'Model': name}
    for metric_name, scorer in scorers:
        for split in ('Train', 'Test'):
            results[f'{split} {metric_name}'] = scorer(actual[split], predicted[split])
    results['Predictions'] = predicted['Test']

    return results

print("Model evaluation function defined!")

In [None]:
# Candidate regressors, keyed by the name shown in the result tables
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=10),
    'Lasso Regression': Lasso(alpha=100),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, max_depth=15, random_state=42, n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100, max_depth=5, random_state=42
    ),
}

# One result dict per model, in the same order as `models`
results_list = []

print("Training models...\n")
for name, model in models.items():
    print(f"Training {name}...")
    results = evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)
    results_list.append(results)
    train_r2, test_r2, test_rmse = (results[key] for key in ('Train R²', 'Test R²', 'Test RMSE'))
    print(f"  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}")
    print(f"  Test RMSE: ${test_rmse:,.2f}\n")

print("All models trained!")

In [None]:
# Compare models: tabulate the scalar metrics (the prediction arrays are left out).
# The sort keeps the original row labels, which still match positions in results_list.
metric_rows = [{key: r[key] for key in r if key != 'Predictions'} for r in results_list]
results_df = pd.DataFrame(metric_rows).sort_values('Test R²', ascending=False)

rule = "=" * 80
print("\n" + rule)
print("MODEL COMPARISON RESULTS")
print(rule)
print(results_df.to_string(index=False))

# Visualize results: grouped train/test bars, one panel per metric
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
x = np.arange(len(results_df))
width = 0.35

# (axes, metric suffix, y label, title)
metric_panels = [
    (axes[0], 'R²', 'R² Score', 'R² Score Comparison'),
    (axes[1], 'RMSE', 'RMSE', 'RMSE Comparison'),
]
for panel_ax, metric, y_label, title in metric_panels:
    panel_ax.bar(x - width/2, results_df[f'Train {metric}'], width, label=f'Train {metric}', alpha=0.8)
    panel_ax.bar(x + width/2, results_df[f'Test {metric}'], width, label=f'Test {metric}', alpha=0.8)
    panel_ax.set_xlabel('Model')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_xticks(x)
    panel_ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Feature importance from Random Forest (the instance fitted in the training loop)
rf_model = models['Random Forest']
feature_importance = (
    pd.DataFrame({'Feature': feature_cols, 'Importance': rf_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

bar_positions = range(len(feature_importance))
plt.figure(figsize=(12, 8))
plt.barh(bar_positions, feature_importance['Importance'])
plt.yticks(bar_positions, feature_importance['Feature'])
plt.xlabel('Importance')
plt.title('Feature Importance (Random Forest)', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # most important feature at the top
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 10 Most Important Features:")
print(feature_importance.head(n=10))

In [None]:
# Predictions vs Actual for best model.
# results_df kept its original row labels when it was sorted, so the label
# returned by idxmax() is also the model's position in results_list.
best_model_idx = results_df['Test R²'].idxmax()
best_model_name = results_df.loc[best_model_idx, 'Model']
best_predictions = results_list[best_model_idx]['Predictions']

sample_positions = range(len(y_test))
line_style = {'alpha': 0.7, 'linewidth': 2}

plt.figure(figsize=(12, 6))
plt.plot(sample_positions, y_test.values, label='Actual', **line_style)
plt.plot(sample_positions, best_predictions, label='Predicted', **line_style)
plt.xlabel('Sample Index')
plt.ylabel('Weekly Sales')
plt.title(f'Actual vs Predicted Sales - {best_model_name}', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('predictions_vs_actual.png', dpi=300, bbox_inches='tight')
plt.show()

best_test_r2 = results_df.loc[best_model_idx, 'Test R²']
print(f"\nBest Model: {best_model_name}")
print(f"Test R² Score: {best_test_r2:.4f}")

## 6. Deep Learning Models

In [None]:
# Neural Network Model
def create_nn_model(input_dim):
    """Build and compile the feed-forward regression network.

    The stack is Dense(256) -> Dense(128) -> Dense(64) -> Dense(32) -> Dense(1),
    with batch normalisation after the first three hidden layers and dropout
    after all four.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001, MSE
        loss, MAE metric).
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object rather than the
        # legacy `input_dim=` argument on the first Dense layer, which newer
        # Keras releases warn about.
        keras.Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(32, activation='relu'),
        Dropout(0.2),

        # Single linear unit: the network outputs one real value per sample
        Dense(1)
    ])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse',
                  metrics=['mae'])
    return model

# Build the network, sized to the number of scaled feature columns
n_input_features = X_train_scaled.shape[1]
nn_model = create_nn_model(n_input_features)

print("Neural Network Architecture:", "=" * 50, sep="\n")
nn_model.summary()

In [None]:
# Callbacks: stop once val_loss has not improved for 15 epochs (restoring the
# best weights), and halve the learning rate after 5 epochs without improvement
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7)

nn_fit_options = dict(
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Train the model
print("Training Neural Network...\n")
history = nn_model.fit(X_train_scaled, y_train, **nn_fit_options)

print("\nNeural Network training completed!")

In [None]:
# Evaluate Neural Network: predict() returns one column, so flatten it to 1-D
nn_train_pred = nn_model.predict(X_train_scaled, verbose=0).reshape(-1)
nn_test_pred = nn_model.predict(X_test_scaled, verbose=0).reshape(-1)

# Same three metrics as the traditional models, for both splits
nn_train_r2, nn_test_r2 = r2_score(y_train, nn_train_pred), r2_score(y_test, nn_test_pred)
nn_train_rmse = np.sqrt(mean_squared_error(y_train, nn_train_pred))
nn_test_rmse = np.sqrt(mean_squared_error(y_test, nn_test_pred))
nn_train_mae, nn_test_mae = (
    mean_absolute_error(y_train, nn_train_pred),
    mean_absolute_error(y_test, nn_test_pred),
)

nn_report = [
    "\n" + "="*50,
    "NEURAL NETWORK RESULTS",
    "="*50,
    f"Train R²: {nn_train_r2:.4f}",
    f"Test R²: {nn_test_r2:.4f}",
    f"Train RMSE: ${nn_train_rmse:,.2f}",
    f"Test RMSE: ${nn_test_rmse:,.2f}",
    f"Train MAE: ${nn_train_mae:,.2f}",
    f"Test MAE: ${nn_test_mae:,.2f}",
]
for report_line in nn_report:
    print(report_line)

In [None]:
# Plot training history: one panel per tracked quantity, train vs validation
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (history key, short name used in legend/title, y-axis label)
nn_curves = [
    ('loss', 'Loss', 'Loss (MSE)'),
    ('mae', 'MAE', 'MAE'),
]
for curve_ax, (key, short_name, y_label) in zip(axes, nn_curves):
    curve_ax.plot(history.history[key], label=f'Train {short_name}', linewidth=2)
    curve_ax.plot(history.history[f'val_{key}'], label=f'Validation {short_name}', linewidth=2)
    curve_ax.set_xlabel('Epoch')
    curve_ax.set_ylabel(y_label)
    curve_ax.set_title(f'Training and Validation {short_name}', fontsize=14, fontweight='bold')
    curve_ax.legend()
    curve_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('nn_training_history.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Prepare data for LSTM
def create_sequences(data, target, seq_length):
    """Slice ``data`` into overlapping windows paired with the next target.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``data`` and is
    paired with ``target[i + seq_length]``, i.e. the value right after the
    window.

    Args:
        data: Array-like of shape ``(n_rows, ...)``.
        target: Array-like with the same number of rows as ``data``. It is
            indexed by position, so a pandas Series with a non-default index
            is handled correctly.
        seq_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        ``(X_seq, y_seq)`` where ``X_seq`` has shape
        ``(n_windows, seq_length, ...)`` and ``y_seq`` has ``n_windows``
        entries, with ``n_windows = max(n_rows - seq_length, 0)``. When there
        are too few rows for a single window, the arrays are empty but keep
        their trailing dimensions, so they can still be concatenated with
        non-empty results.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1 or ``data`` and
            ``target`` differ in length.
    """
    data = np.asarray(data)
    target = np.asarray(target)

    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")
    if len(data) != len(target):
        raise ValueError(
            f"data and target must have the same length, got {len(data)} and {len(target)}"
        )

    n_windows = max(len(data) - seq_length, 0)
    if n_windows == 0:
        # np.array([]) would collapse to shape (0,); keep the window/feature axes
        X_empty = np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)
        y_empty = np.empty((0,) + target.shape[1:], dtype=target.dtype)
        return X_empty, y_empty

    X_seq = np.stack([data[start:start + seq_length] for start in range(n_windows)])
    # The target of window i sits at position i + seq_length
    y_seq = target[seq_length:seq_length + n_windows].copy()
    return X_seq, y_seq

# Create sequences
seq_length = 10


def _sequences_by_store(features, targets, store_ids, seq_length):
    """Build LSTM windows one store at a time and stack the results.

    The scaled feature matrix holds the rows of all stores one after another.
    Sliding a single window over it would produce sequences that start in one
    store and end in (or predict for) the next. Windowing each store's rows
    separately keeps every sequence and its target inside one store.

    Args:
        features: 2-D array of scaled features, one row per sample.
        targets: 1-D array of targets aligned with ``features``.
        store_ids: 1-D array giving the store of each row.
        seq_length: Window length passed on to ``create_sequences``.

    Returns:
        ``(X_seq, y_seq)`` concatenated over all stores that have more than
        ``seq_length`` rows.

    Raises:
        ValueError: If no store has enough rows for a single window.
    """
    X_parts, y_parts = [], []
    for store in pd.unique(store_ids):
        rows = np.flatnonzero(store_ids == store)
        if len(rows) <= seq_length:
            continue  # too little history for even one window
        X_part, y_part = create_sequences(features[rows], targets[rows], seq_length)
        X_parts.append(X_part)
        y_parts.append(y_part)
    if not X_parts:
        raise ValueError(f"No store has more than {seq_length} rows; cannot build sequences")
    return np.concatenate(X_parts), np.concatenate(y_parts)


# Rows within a store are already in date order (the frame was sorted by Store, Date)
X_train_seq, y_train_seq = _sequences_by_store(
    X_train_scaled, y_train.values, X_train['Store'].to_numpy(), seq_length
)
X_test_seq, y_test_seq = _sequences_by_store(
    X_test_scaled, y_test.values, X_test['Store'].to_numpy(), seq_length
)

print(f"LSTM input shape: {X_train_seq.shape}")
print(f"LSTM output shape: {y_train_seq.shape}")

In [None]:
# Create LSTM Model
def create_lstm_model(seq_length, n_features):
    """Build and compile the stacked LSTM regression network.

    Three LSTM layers (128, 64 and 32 units) with dropout after each, followed
    by a Dense(16) layer and a single linear output unit. The first two LSTM
    layers return full sequences so the next LSTM layer receives one vector
    per time step.

    Args:
        seq_length: Number of time steps in each input window.
        n_features: Number of features per time step.

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001, MSE
        loss, MAE metric).
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object rather than the
        # legacy `input_shape=` argument on the first LSTM layer, which newer
        # Keras releases warn about.
        keras.Input(shape=(seq_length, n_features)),
        LSTM(128, activation='tanh', return_sequences=True),
        Dropout(0.3),

        LSTM(64, activation='tanh', return_sequences=True),
        Dropout(0.3),

        # Last recurrent layer returns only its final state
        LSTM(32, activation='tanh'),
        Dropout(0.2),

        Dense(16, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse',
                  metrics=['mae'])
    return model

# Build the LSTM for windows of `seq_length` steps over the scaled feature columns
n_lstm_features = X_train_scaled.shape[1]
lstm_model = create_lstm_model(seq_length, n_lstm_features)

print("LSTM Architecture:", "=" * 50, sep="\n")
lstm_model.summary()

In [None]:
# Train LSTM on the windowed data, using the early_stopping / reduce_lr
# callback objects created for the dense network
lstm_fit_options = dict(
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

print("Training LSTM Model...\n")
lstm_history = lstm_model.fit(X_train_seq, y_train_seq, **lstm_fit_options)

print("\nLSTM training completed!")

In [None]:
# Evaluate LSTM: predict() returns one column, so flatten it to 1-D
lstm_train_pred = lstm_model.predict(X_train_seq, verbose=0).reshape(-1)
lstm_test_pred = lstm_model.predict(X_test_seq, verbose=0).reshape(-1)

# Scores are computed against the windowed targets, not the full y_train / y_test
lstm_train_r2, lstm_test_r2 = (
    r2_score(y_train_seq, lstm_train_pred),
    r2_score(y_test_seq, lstm_test_pred),
)
lstm_train_rmse = np.sqrt(mean_squared_error(y_train_seq, lstm_train_pred))
lstm_test_rmse = np.sqrt(mean_squared_error(y_test_seq, lstm_test_pred))
lstm_train_mae, lstm_test_mae = (
    mean_absolute_error(y_train_seq, lstm_train_pred),
    mean_absolute_error(y_test_seq, lstm_test_pred),
)

lstm_report = [
    "\n" + "="*50,
    "LSTM MODEL RESULTS",
    "="*50,
    f"Train R²: {lstm_train_r2:.4f}",
    f"Test R²: {lstm_test_r2:.4f}",
    f"Train RMSE: ${lstm_train_rmse:,.2f}",
    f"Test RMSE: ${lstm_test_rmse:,.2f}",
    f"Train MAE: ${lstm_train_mae:,.2f}",
    f"Test MAE: ${lstm_test_mae:,.2f}",
]
for report_line in lstm_report:
    print(report_line)

In [None]:
# Plot LSTM training history: one panel per tracked quantity, train vs validation
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (history key, short name used in legend/title, y-axis label)
lstm_curves = [
    ('loss', 'Loss', 'Loss (MSE)'),
    ('mae', 'MAE', 'MAE'),
]
for curve_ax, (key, short_name, y_label) in zip(axes, lstm_curves):
    curve_ax.plot(lstm_history.history[key], label=f'Train {short_name}', linewidth=2)
    curve_ax.plot(lstm_history.history[f'val_{key}'], label=f'Validation {short_name}', linewidth=2)
    curve_ax.set_xlabel('Epoch')
    curve_ax.set_ylabel(y_label)
    curve_ax.set_title(f'LSTM Training and Validation {short_name}', fontsize=14, fontweight='bold')
    curve_ax.legend()
    curve_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('lstm_training_history.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Final Model Comparison and Insights

In [None]:
# Compare all models including deep learning: stack the test metrics of the
# traditional models with one row each for the two networks, best R² first
test_metric_cols = ['Model', 'Test R²', 'Test RMSE', 'Test MAE']
traditional_rows = results_df[test_metric_cols]
deep_learning_rows = pd.DataFrame([
    {'Model': 'Neural Network', 'Test R²': nn_test_r2,
     'Test RMSE': nn_test_rmse, 'Test MAE': nn_test_mae},
    {'Model': 'LSTM', 'Test R²': lstm_test_r2,
     'Test RMSE': lstm_test_rmse, 'Test MAE': lstm_test_mae},
])
all_models_results = (
    pd.concat([traditional_rows, deep_learning_rows])
    .sort_values('Test R²', ascending=False)
    .reset_index(drop=True)
)

rule = "=" * 80
print("\n" + rule)
print("FINAL MODEL COMPARISON")
print(rule)
print(all_models_results.to_string(index=False))

# Visualize final comparison
plt.figure(figsize=(14, 6))
x = np.arange(len(all_models_results))
test_r2_scores = all_models_results['Test R²']
plt.bar(x, test_r2_scores, color='steelblue', alpha=0.8)
plt.xlabel('Model', fontsize=12)
plt.ylabel('Test R² Score', fontsize=12)
plt.title('Final Model Comparison - Test R² Scores', fontsize=16, fontweight='bold')
plt.xticks(x, all_models_results['Model'], rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')

# Add value labels on bars, slightly above each bar top
for bar_position, score in enumerate(test_r2_scores):
    plt.text(bar_position, score + 0.01, f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('final_model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Residual analysis for best model (highest test R² across all models)
best_overall_idx = all_models_results['Test R²'].idxmax()
best_overall_model = all_models_results.loc[best_overall_idx, 'Model']

# Residual = actual - predicted. The LSTM is scored against its windowed
# targets; the other models use the full test targets.
if best_overall_model == 'LSTM':
    residuals = y_test_seq - lstm_test_pred
elif best_overall_model == 'Neural Network':
    residuals = y_test.values - nn_test_pred
else:
    # results_df kept its original row labels, which index into results_list
    model_idx = results_df.index[results_df['Model'] == best_overall_model][0]
    residuals = y_test.values - results_list[model_idx]['Predictions']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
scatter_ax, hist_ax, qq_ax = axes
panel_title = {'fontsize': 14, 'fontweight': 'bold'}

# Residual plot
scatter_ax.scatter(range(len(residuals)), residuals, alpha=0.5, s=20)
scatter_ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
scatter_ax.set_xlabel('Sample Index')
scatter_ax.set_ylabel('Residuals')
scatter_ax.set_title(f'Residual Plot - {best_overall_model}', **panel_title)
scatter_ax.grid(True, alpha=0.3)

# Residual distribution
hist_ax.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Residuals')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Residual Distribution', **panel_title)
hist_ax.grid(True, alpha=0.3)

# Q-Q plot against a normal distribution
from scipy import stats
stats.probplot(residuals, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot', **panel_title)
qq_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('residual_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nResidual Statistics for {best_overall_model}:")
for label, statistic in (('Mean', np.mean), ('Std', np.std), ('Min', np.min), ('Max', np.max)):
    print(f"{label}: ${statistic(residuals):,.2f}")

## 8. Key Insights and Conclusions

In [None]:
# Text summary of the analysis. Every figure is read from objects built in
# earlier cells (df, quarterly_avg, correlation_matrix, all_models_results,
# feature_importance), so this cell must run after them.
print("\n" + "="*80)
print("KEY INSIGHTS FROM WALMART SALES ANALYSIS")
print("="*80)

print("\n1. DATA CHARACTERISTICS:")
print(f"   - Dataset contains {len(df)} records across {df['Store'].nunique()} stores")
print(f"   - Time period: {df['Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')}")
print(f"   - Average weekly sales: ${df['Weekly_Sales'].mean():,.2f}")
print(f"   - Sales standard deviation: ${df['Weekly_Sales'].std():,.2f}")

print("\n2. BUSINESS INSIGHTS:")
# Relative difference between mean holiday-week and mean non-holiday-week sales, in percent
holiday_impact = ((df[df['Holiday_Flag']==1]['Weekly_Sales'].mean() / 
                   df[df['Holiday_Flag']==0]['Weekly_Sales'].mean()) - 1) * 100
print(f"   - Holiday weeks show {holiday_impact:.2f}% {'higher' if holiday_impact > 0 else 'lower'} sales")
print(f"   - Best performing quarter: Q{quarterly_avg.idxmax()} with ${quarterly_avg.max():,.2f} avg sales")
print(f"   - Worst performing quarter: Q{quarterly_avg.idxmin()} with ${quarterly_avg.min():,.2f} avg sales")

print("\n3. CORRELATION INSIGHTS:")
# Rank the other columns by absolute correlation with the target
top_correlations = correlation_matrix['Weekly_Sales'].drop('Weekly_Sales').abs().sort_values(ascending=False)
print(f"   - Strongest predictor: {top_correlations.index[0]} (correlation: {top_correlations.values[0]:.4f})")
print(f"   - Temperature shows {'positive' if correlation_matrix.loc['Temperature', 'Weekly_Sales'] > 0 else 'negative'} correlation with sales")
print(f"   - Unemployment shows {'positive' if correlation_matrix.loc['Unemployment', 'Weekly_Sales'] > 0 else 'negative'} correlation with sales")

print("\n4. MODEL PERFORMANCE:")
best_r2 = all_models_results['Test R²'].max()
best_model_name = all_models_results.loc[all_models_results['Test R²'].idxmax(), 'Model']
best_rmse = all_models_results.loc[all_models_results['Test R²'].idxmax(), 'Test RMSE']
print(f"   - Best performing model: {best_model_name}")
print(f"   - Best R² score: {best_r2:.4f} (explains {best_r2*100:.2f}% of variance)")
print(f"   - Best RMSE: ${best_rmse:,.2f}")
print(f"   - Average prediction error: {(best_rmse/df['Weekly_Sales'].mean())*100:.2f}% of mean sales")

print("\n5. FEATURE IMPORTANCE:")
print(f"   - Top 3 features: {', '.join(feature_importance.head(3)['Feature'].tolist())}")

print("\n6. RECOMMENDATIONS:")
print("   - Focus inventory and staffing on high-performing stores")
# Name the quarter that actually has the highest average sales instead of
# assuming it is Q4, so this line cannot contradict section 2 above.
print(f"   - Prepare for seasonal variations, especially Q{quarterly_avg.idxmax()} peaks")
if holiday_impact > 0:
    print("   - Increase inventory and promotions during holiday weeks")
print("   - Monitor unemployment and CPI as economic indicators for sales forecasting")
print(f"   - Use {best_model_name} for accurate sales predictions")

print("\n" + "="*80)

In [None]:
# Save models
import joblib

# Save best traditional ML model: results_df is sorted by test R², so its first
# row names the winner; the fitted scaler is stored alongside it
best_trad_name = results_df.iloc[0]['Model']
best_trad_model = models[best_trad_name]
joblib.dump(best_trad_model, 'best_traditional_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Save deep learning models
nn_model.save('neural_network_model.h5')
lstm_model.save('lstm_model.h5')

print("\nModels saved successfully!")
print("Files created:")
for saved_file in ('best_traditional_model.pkl', 'scaler.pkl',
                   'neural_network_model.h5', 'lstm_model.h5'):
    print(f"  - {saved_file}")

## Summary

This comprehensive analysis of Walmart sales data includes:

### 1. Exploratory Data Analysis (EDA)
- Statistical summaries and distributions
- Correlation analysis
- Time series patterns and seasonality
- Store-wise performance analysis

### 2. Traditional Machine Learning Models
- Linear Regression (baseline)
- Ridge and Lasso Regression (regularized)
- Decision Tree
- Random Forest
- Gradient Boosting

### 3. Deep Learning Models
- Deep Neural Network with dropout and batch normalization
- LSTM for time series prediction

### 4. Feature Engineering
- Temporal features (year, month, week, quarter, day of week)
- Lag features for time series
- Rolling statistics
- Store-level aggregations

### 5. Model Evaluation
- R² score for variance explained
- RMSE for prediction accuracy
- MAE for average error
- Residual analysis

The analysis provides actionable insights for business decision-making and accurate sales forecasting capabilities.