# BigMart Sales Prediction - Test Data Production Pipeline

This notebook uses the production-ready ensemble pipeline to generate predictions on test data.

## Overview
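- Load the production pipeline and models
- Load test data from code folder
- Generate predictions using the champion ensemble model
- Post-process any negative predictions so that no predicted sales value is below zero
- Save predictions for submission
- Validate the final predictions (row count, NaN/infinite values) before submitting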

In [1]:
print("Loading necessary libraries and production pipeline...")

# Standard data science libraries
import pandas as pd
import numpy as np
import os
import sys
from datetime import datetime

# Make the production pipeline module importable.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
production_pipeline_dir = '../production_models_final'
if production_pipeline_dir not in sys.path:
    sys.path.append(production_pipeline_dir)

# Import the production pipeline
from bigmart_production_pipeline import BigMartProductionPipeline, predict_bigmart_sales

print("Libraries loaded successfully")
print("Production pipeline imported successfully")

Loading necessary libraries and production pipeline...
Libraries loaded successfully
Production pipeline imported successfully


In [2]:
print("Loading test data from code folder...")

# Read the test set from its CSV file
test_data_path = "../code/test_AbJTz2l.csv"
test_data = pd.read_csv(test_data_path)

n_rows, n_cols = test_data.shape

print("Test data loaded successfully")
print(f"Test data shape: {test_data.shape}")
print(f"Test data columns: {list(test_data.columns)}")

# Basic size summary
print("\nTest data info:")
print(f"Number of samples: {n_rows}")
print(f"Number of features: {n_cols}")

# Report only the columns that actually contain missing values
print("\nMissing values per column:")
missing_counts = test_data.isnull().sum()
for col, count in missing_counts[missing_counts > 0].items():
    print(f"   {col}: {count} ({count/len(test_data)*100:.1f}%)")

# Preview of the raw rows
print("\nFirst 5 rows of test data:")
print(test_data.head())

Loading test data from code folder...
Test data loaded successfully
Test data shape: (5681, 11)
Test data columns: ['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

Test data info:
Number of samples: 5681
Number of features: 11

Missing values per column:
   Item_Weight: 976 (17.2%)
   Outlet_Size: 1606 (28.3%)

First 5 rows of test data:
  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility    Item_Type  \
0           FDW58       20.750          Low Fat         0.007565  Snack Foods   
1           FDW14        8.300              reg         0.038428        Dairy   
2           NCN55       14.600          Low Fat         0.099575       Others   
3           FDQ58        7.315          Low Fat         0.015388  Snack Foods   
4           FDY38          NaN          Regular         0.118599        Dairy   

   Item_MRP Outlet

In [3]:
print("Initializing production pipeline...")

# Set up paths to production models
models_directory = "../production_models_final"

# Find the ensemble configuration files.
# Only ``.json`` entries are considered so that a stray backup file or a
# directory sharing the ``ensemble_config_`` prefix can never be selected.
config_files = [
    f for f in os.listdir(models_directory)
    if f.startswith('ensemble_config_') and f.endswith('.json')
]
if not config_files:
    raise ValueError(f"No ensemble config found in {models_directory}")

# Lexicographic max picks the newest file as long as the names embed a
# zero-padded timestamp (as in ensemble_config_20250907_121520.json).
latest_config = max(config_files)
config_path = os.path.join(models_directory, latest_config)

print(f"Using configuration: {latest_config}")
print(f"Models directory: {models_directory}")

# Initialize the production pipeline
pipeline = BigMartProductionPipeline()

# Load the pipeline with trained models
pipeline.load_pipeline(config_path, models_directory)

# Report what was loaded and the performance recorded at training time
pipeline_info = pipeline.get_pipeline_info()
print("\nPipeline Information:")
print(f"   Loaded models: {pipeline_info['loaded_models']}")
print(f"   Best strategy: {pipeline_info['best_strategy']}")
print(f"   Training R2: {pipeline_info['training_performance']['r2_score']:.6f}")
print(f"   Training RMSE: {pipeline_info['training_performance']['rmse']:.4f}")
print(f"   Sophisticated ensembles: {pipeline_info['sophisticated_ensembles']}")

print("\nProduction pipeline ready for predictions!")

Initializing production pipeline...
Using configuration: ensemble_config_20250907_121520.json
Models directory: ../production_models_final
Loading production pipeline from: ../production_models_final
✓ Configuration loaded
✓ Preprocessor loaded
✓ et_optimized_advanced loaded
✓ gb_optimized_advanced loaded
✓ xgb_optimized_advanced loaded
✓ rf_optimized_advanced loaded
✓ Sophisticated ensembles loaded: 11 models
✓ Pipeline fully loaded and ready for production!

Pipeline Information:
   Loaded models: ['et_optimized_advanced', 'gb_optimized_advanced', 'xgb_optimized_advanced', 'rf_optimized_advanced']
   Best strategy: equal_weights
   Training R2: 0.991305
   Training RMSE: 158.6118
   Sophisticated ensembles: ['stacking_neural_network', 'stacking_elastic_net', 'stacking_ridge', 'stacking_svr', 'voting_performance_squared', 'voting_performance_cubed', 'voting_softmax_performance', 'voting_rank_based_exponential', 'high_performer_ensemble', 'neural_adaptive', 'tree_adaptive']

Production

In [4]:
print("Generating predictions on test data...")

# Run the complete prediction path of the production pipeline on the test set
test_results = pipeline.predict_complete(test_data)

print("Predictions generated successfully")
print(f"Processed data shape: {test_results['data_shape']}")

# Pull the individual prediction sets out of the result dictionary
ensemble_output = test_results['weighted_ensemble']
weighted_ensemble_predictions = ensemble_output['ensemble_prediction']
neural_adaptive_predictions = test_results['neural_adaptive']
individual_predictions = ensemble_output['individual_predictions']

has_neural_adaptive = neural_adaptive_predictions is not None

print("\nPrediction Methods Available:")
print(f"   Weighted Ensemble: {len(weighted_ensemble_predictions)} predictions")
print(f"   Neural Adaptive: {len(neural_adaptive_predictions) if has_neural_adaptive else 'Not available'} predictions")
print(f"   Individual models: {list(individual_predictions)}")

# Prefer the Neural Adaptive output; fall back to the weighted ensemble
# when the pipeline did not return one
if has_neural_adaptive:
    final_predictions = neural_adaptive_predictions
    prediction_method = "Neural Adaptive"
else:
    final_predictions = weighted_ensemble_predictions
    prediction_method = "Weighted Ensemble"

print(f"\nUsing {prediction_method} for final predictions")

# Summary statistics of the selected predictions
print("\nPrediction Statistics:")
for stat_label, stat_func in (("Mean", np.mean), ("Std", np.std), ("Min", np.min), ("Max", np.max)):
    print(f"   {stat_label} prediction: {stat_func(final_predictions):.2f}")

# Count predictions outside the range treated as plausible here
unusual_low = np.sum(final_predictions < 0)
unusual_high = np.sum(final_predictions > 10000)

print("\nPrediction Quality Check:")
print(f"   Negative predictions: {unusual_low}")
print(f"   Very high predictions (>10,000): {unusual_high}")

needs_review = unusual_low > 0 or unusual_high > 0
if needs_review:
    print("   Note: Some predictions may need review")
else:
    print("   All predictions within reasonable range")

Generating predictions on test data...
Transforming data with BigMartPreprocessor...
Handling missing values with smart imputation...
  - Imputing Item_Weight using multi-level groupby strategy...
  - Imputing Item_Weight using multi-level groupby strategy...
    ✓ Item_Weight imputed (remaining NaNs: 0)
  - Imputing Outlet_Size using outlet type and location patterns...
    ✓ Outlet_Size imputed (remaining NaNs: 0)
  - Checking for other missing values...
    - Found 353 zero Item_Visibility values, replacing with Item_Type median...
    ✓ Item_Visibility zeros handled (remaining zeros: 0)
     Smart missing value imputation completed!
  Creating engineered features...
Adding statistical features...
Encoding categorical variables...
Final data cleanup...
Transformation complete! Final shape: (5681, 48)
Predictions generated successfully
Processed data shape: (5681, 48)

Prediction Methods Available:
   Weighted Ensemble: 5681 predictions
   Neural Adaptive: 5681 predictions
   Individ



In [7]:
print("Post-processing predictions to handle negative values...")


def correct_negative_predictions(predictions, ensemble_predictions, model_predictions):
    """Return a copy of ``predictions`` with every negative value replaced.

    For each negative entry the replacement is chosen in this order:

    1. the weighted-ensemble prediction at the same position, if positive;
    2. the median of the positive individual-model predictions at that
       position, if any exist;
    3. the 10th percentile of all positive values in ``predictions``
       (``0.0`` when there is no positive value at all).

    Parameters
    ----------
    predictions : array-like
        Predictions to correct; converted with ``np.asarray``.
    ensemble_predictions : array-like
        Weighted-ensemble predictions, positionally aligned with ``predictions``.
    model_predictions : mapping
        Model name -> per-row predictions, positionally indexable.

    Returns
    -------
    numpy.ndarray
        Corrected copy; the input is not modified.
    """
    predictions = np.asarray(predictions)
    ensemble_predictions = np.asarray(ensemble_predictions)
    corrected = predictions.copy()

    negative_positions = np.flatnonzero(predictions < 0)
    if negative_positions.size == 0:
        return corrected

    # Last-resort value, computed once instead of inside the loop. The guard
    # avoids calling np.percentile on an empty array when nothing is positive.
    positive_values = predictions[predictions > 0]
    last_resort = np.percentile(positive_values, 10) if positive_values.size else 0.0

    # Only the negative rows are visited, not the full prediction vector
    for i in negative_positions.tolist():
        if ensemble_predictions[i] > 0:
            corrected[i] = ensemble_predictions[i]
            continue

        row_preds = [preds[i] for preds in model_predictions.values()]
        positive_preds = [p for p in row_preds if p > 0]
        corrected[i] = np.median(positive_preds) if positive_preds else last_resort

    return corrected


# Identify negative predictions
current_predictions = np.asarray(final_predictions)
negative_mask = current_predictions < 0
num_negative = np.sum(negative_mask)

print(f"Found {num_negative} negative predictions that need correction")

if num_negative > 0:
    print("Negative prediction statistics:")
    negative_predictions = current_predictions[negative_mask]
    print(f"   Most negative: {np.min(negative_predictions):.2f}")
    print(f"   Mean of negatives: {np.mean(negative_predictions):.2f}")

    corrected_predictions = correct_negative_predictions(
        current_predictions,
        weighted_ensemble_predictions,
        individual_predictions,
    )

    # Verify correction
    remaining_negative = np.sum(corrected_predictions < 0)
    print("\nPost-processing results:")
    print(f"   Original negative predictions: {num_negative}")
    print(f"   Remaining negative predictions: {remaining_negative}")

    # Keep the corrected values in every case: even a partial correction is
    # better than carrying the original negatives into the submission.
    final_predictions = corrected_predictions

    if remaining_negative == 0:
        print("   Status: All negative predictions successfully corrected")

        # Update prediction statistics
        print("\nUpdated prediction statistics:")
        print(f"   Mean prediction: {np.mean(final_predictions):.2f}")
        print(f"   Min prediction: {np.min(final_predictions):.2f}")
        print(f"   Max prediction: {np.max(final_predictions):.2f}")

        # Show correction details
        print("\nCorrection details:")
        print(f"   Number of corrections: {num_negative}")
        corrected_values = final_predictions[negative_mask]
        print(f"   Corrected values range: [{np.min(corrected_values):.2f}, {np.max(corrected_values):.2f}]")
    else:
        # Defensive check: every fallback above is non-negative, so this
        # branch is not expected to run.
        print(f"   Warning: {remaining_negative} predictions still negative - additional review needed")

else:
    print("No negative predictions found - no correction needed")

print("Post-processing completed")

Post-processing predictions to handle negative values...
Found 15 negative predictions that need correction
Negative prediction statistics:
   Most negative: -143.76
   Mean of negatives: -22.84

Post-processing results:
   Original negative predictions: 15
   Remaining negative predictions: 0
   Status: All negative predictions successfully corrected

Updated prediction statistics:
   Mean prediction: 2130.27
   Min prediction: 13.54
   Max prediction: 9574.84

Correction details:
   Number of corrections: 15
   Corrected values range: [31.50, 237.21]
Post-processing completed


In [8]:
print("Creating submission file...")

# The submission is keyed on the item and outlet identifiers of the test set
required_columns = ['Item_Identifier', 'Outlet_Identifier']
missing_cols = [col for col in required_columns if col not in test_data.columns]

if not missing_cols:
    # Identifier columns present: build the proper submission layout
    submission_df = pd.DataFrame({
        'Item_Identifier': test_data['Item_Identifier'],
        'Outlet_Identifier': test_data['Outlet_Identifier'],
        'Item_Outlet_Sales': final_predictions
    })
else:
    print(f"Warning: Missing required columns for submission: {missing_cols}")
    # Fall back to a positional ID when the identifiers are unavailable
    submission_df = pd.DataFrame({
        'ID': range(len(final_predictions)),
        'Item_Outlet_Sales': final_predictions
    })

# One timestamp shared by every file written from this cell
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Main submission file, named after the prediction method in use
method_slug = prediction_method.lower().replace(' ', '_')
submission_file = f"bigmart_predictions_{method_slug}_{timestamp}.csv"
submission_df.to_csv(submission_file, index=False)

print(f"Submission file saved: {submission_file}")
print(f"Prediction method used: {prediction_method}")
print(f"Number of predictions: {len(final_predictions)}")

# Preview of the submission
print("\nFirst 10 predictions:")
print(submission_df.head(10))

# Side-by-side table of every prediction source, kept for later analysis
if neural_adaptive_predictions is not None:
    neural_adaptive_column = neural_adaptive_predictions
else:
    neural_adaptive_column = np.nan

detailed_results = pd.DataFrame({
    'Weighted_Ensemble': weighted_ensemble_predictions,
    'Neural_Adaptive': neural_adaptive_column,
    'Final_Prediction': final_predictions
})

# One extra column per individual model
for model_name, predictions in individual_predictions.items():
    detailed_results[f'Individual_{model_name}'] = predictions

detailed_file = f"detailed_predictions_{timestamp}.csv"
detailed_results.to_csv(detailed_file, index=False)

print(f"Detailed predictions saved: {detailed_file}")

# Confidence scores are optional in the pipeline result
has_confidence = 'confidence' in test_results['weighted_ensemble']
if has_confidence:
    confidence_scores = test_results['weighted_ensemble']['confidence']
    confidence_df = pd.DataFrame({
        'Prediction_Confidence': confidence_scores,
        'Final_Prediction': final_predictions
    })

    confidence_file = f"prediction_confidence_{timestamp}.csv"
    confidence_df.to_csv(confidence_file, index=False)

    print(f"Confidence scores saved: {confidence_file}")
    print(f"Mean confidence: {np.mean(confidence_scores):.4f}")

# Numbered list of everything written above
created_files = [
    (submission_file, "main submission"),
    (detailed_file, "detailed analysis"),
]
if has_confidence:
    created_files.append((confidence_file, "confidence scores"))

print("\nPrediction generation complete!")
print("Files created:")
for position, (file_name, role) in enumerate(created_files, start=1):
    print(f"   {position}. {file_name} ({role})")

Creating submission file...
Submission file saved: bigmart_predictions_neural_adaptive_20250907_123308.csv
Prediction method used: Neural Adaptive
Number of predictions: 5681

First 10 predictions:
  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1474.690858
1           FDW14            OUT017        1410.542196
2           NCN55            OUT010        1117.430953
3           FDQ58            OUT017        2500.559481
4           FDY38            OUT027        6206.035277
5           FDH56            OUT046        2393.738943
6           FDL48            OUT018         208.861138
7           FDC48            OUT027        2116.205241
8           FDN33            OUT045         856.188666
9           FDA36            OUT017        2274.243682
Detailed predictions saved: detailed_predictions_20250907_123308.csv
Confidence scores saved: prediction_confidence_20250907_123308.csv
Mean confidence: 0.0066

Prediction generation complete!
File

In [9]:
print("Final validation and summary...")

# Validation checks
print("VALIDATION SUMMARY:")
print("=" * 40)

# Check data consistency: exactly one prediction per test row
counts_match = len(test_data) == len(final_predictions)
print("Data consistency checks:")
print(f"   Test data samples: {len(test_data)}")
print(f"   Generated predictions: {len(final_predictions)}")
print(f"   Data consistency: {'PASS' if counts_match else 'FAIL'}")

# Check prediction reasonableness
pred_mean = np.mean(final_predictions)
pred_median = np.median(final_predictions)
pred_std = np.std(final_predictions)

print("\nPrediction statistics:")
# Report the method that actually produced final_predictions; the pipeline's
# best_strategy (shown further down) describes the training-time ensemble
# weighting and is not necessarily the method used here.
print(f"   Method used: {prediction_method}")
print(f"   Mean: {pred_mean:.2f}")
print(f"   Median: {pred_median:.2f}")
print(f"   Standard deviation: {pred_std:.2f}")
print(f"   Range: [{np.min(final_predictions):.2f}, {np.max(final_predictions):.2f}]")

# Model performance reference
training_performance = pipeline_info['training_performance']
print("\nModel performance reference (from training):")
print(f"   Training R2: {training_performance['r2_score']:.6f}")
print(f"   Training RMSE: {training_performance['rmse']:.4f}")
print(f"   Best strategy: {pipeline_info['best_strategy']}")

# Check for any data quality issues
print("\nData quality checks:")
nan_predictions = np.sum(np.isnan(final_predictions))
inf_predictions = np.sum(np.isinf(final_predictions))
# Negative sales are not meaningful; this catches the case where the
# post-processing step was skipped or did not remove every negative value.
negative_predictions_remaining = np.sum(np.asarray(final_predictions) < 0)
quality_ok = (
    nan_predictions == 0
    and inf_predictions == 0
    and negative_predictions_remaining == 0
)

print(f"   NaN predictions: {nan_predictions}")
print(f"   Infinite predictions: {inf_predictions}")
print(f"   Negative predictions: {negative_predictions_remaining}")
print(f"   Data quality: {'PASS' if quality_ok else 'FAIL'}")

# Final status
ready_for_submission = counts_match and quality_ok
if ready_for_submission:
    print("\nSTATUS: PREDICTIONS READY FOR SUBMISSION")
    print(f"Main submission file: {submission_file}")
else:
    print("\nSTATUS: REVIEW REQUIRED")
    print("Please check data quality issues above")

# The closing message now follows the validation outcome instead of
# announcing success unconditionally.
if ready_for_submission:
    print("\nProduction pipeline execution completed successfully!")
else:
    print("\nProduction pipeline execution completed with issues - review required before submission")

Final validation and summary...
VALIDATION SUMMARY:
Data consistency checks:
   Test data samples: 5681
   Generated predictions: 5681
   Data consistency: PASS

Prediction statistics:
   Mean: 2130.27
   Median: 1946.19
   Standard deviation: 1401.78
   Range: [13.54, 9574.84]

Model performance reference (from training):
   Training R2: 0.991305
   Training RMSE: 158.6118
   Method used: equal_weights

Data quality checks:
   NaN predictions: 0
   Infinite predictions: 0
   Data quality: PASS

STATUS: PREDICTIONS READY FOR SUBMISSION
Main submission file: bigmart_predictions_neural_adaptive_20250907_123308.csv

Production pipeline execution completed successfully!
