# Store Sales STGAT Project - Phase 1: Data Foundation Implementation

**Objective**: Data-driven evaluation case selection for Corporación Favorita retail forecasting

**Key Goals**:
- Comprehensive data exploration and quality assessment
- Data-driven selection of 10 evaluation cases (store–family combinations chosen by measured criteria rather than picked arbitrarily)
- Establish quality-based evaluation framework
- Create production-ready data modules

**Methodology**: Multi-criteria selection ensuring statistical validity and pattern diversity

In [1]:
# Setup and imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Any
from scipy import stats
from sklearn.preprocessing import StandardScaler

# Notebook-wide configuration.
# Every warning category is silenced so cell output stays readable.
warnings.filterwarnings('ignore')
# The matplotlib style sheet is applied first and the seaborn palette second,
# so the palette is the last of the two calls to run.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Make sure the output folders exist (paths are relative to the current
# working directory); an already-existing folder is not an error.
for _output_dir in ('results', 'src/data'):
    os.makedirs(_output_dir, exist_ok=True)

# Banner: title line followed by a 60-character rule.
print("📊 Store Sales STGAT Project - Phase 1: Data Foundation", "=" * 60, sep="\n")

📊 Store Sales STGAT Project - Phase 1: Data Foundation


In [None]:
class FavoritaDataExplorer:
    """
    Data exploration and quality assessment for the Corporación Favorita dataset.

    The class holds the raw CSV tables as pandas DataFrames plus the results of
    later analysis steps. Only construction and loading are defined here;
    further methods are attached to the class in subsequent notebook cells.

    Attributes:
        data_path: Directory containing the raw CSV files.
        results_path: Directory where results are meant to be written.
        sales_data: Contents of ``train.csv`` with ``date`` parsed to datetime,
            or ``None`` until a load succeeds.
        stores_data: Contents of ``stores.csv``, or ``None``.
        oil_data: Contents of ``oil.csv`` with ``date`` parsed, or ``None``.
        holidays_data: Contents of ``holidays_events.csv`` with ``date``
            parsed, or ``None``.
        quality_metrics: Initialised to ``None``; filled in by
            ``comprehensive_data_assessment`` once that method is attached.
        combination_metrics: Initialised to ``None``; presumably filled in by a
            later analysis step (not defined in this class body).
        selected_cases: Initialised to ``None``; presumably filled in by a
            later selection step (not defined in this class body).
    """

    def __init__(self, data_path='../data/raw', results_path='../results/'):
        """
        Store the input/output locations and start with nothing loaded.

        Args:
            data_path: Directory holding the raw CSV files.
            results_path: Directory for generated results.
        """
        self.data_path = data_path
        self.results_path = results_path
        self.sales_data = None
        self.stores_data = None
        self.oil_data = None
        self.holidays_data = None
        self.quality_metrics = None
        self.combination_metrics = None
        self.selected_cases = None

        print(f"🔧 Initialized FavoritaDataExplorer")
        print(f"   Data path: {data_path}")
        print(f"   Results path: {results_path}")

    def load_datasets(self) -> bool:
        """
        Load the four Favorita CSV files and parse their date columns.

        Loading is all-or-nothing: the tables are read into local variables
        and only published on ``self`` once every file has been read and every
        date column converted. A failure therefore never leaves the explorer
        half-loaded (e.g. sales present with unparsed string dates); whatever
        state existed before the call is kept.

        Returns:
            True when all four tables were loaded, False otherwise. On failure
            the error and the list of expected files are printed; I/O errors,
            missing columns and parse errors are reported this way rather than
            raised.
        """
        print("\n📁 Loading Corporación Favorita datasets...")

        try:
            # os.path.join avoids a doubled separator when data_path already
            # ends with a slash.
            sales = pd.read_csv(os.path.join(self.data_path, 'train.csv'))
            stores = pd.read_csv(os.path.join(self.data_path, 'stores.csv'))
            oil = pd.read_csv(os.path.join(self.data_path, 'oil.csv'))
            holidays = pd.read_csv(os.path.join(self.data_path, 'holidays_events.csv'))

            # Convert date columns
            sales['date'] = pd.to_datetime(sales['date'])
            oil['date'] = pd.to_datetime(oil['date'])
            holidays['date'] = pd.to_datetime(holidays['date'])
        except (OSError, KeyError, ValueError) as e:
            # OSError: missing/unreadable file. KeyError: no 'date' column.
            # ValueError: pandas parser, empty-file and date-parsing errors.
            print(f"❌ Error loading datasets: {e}")
            print(f"📋 Expected files in {self.data_path}:")
            print("   • train.csv (sales data)")
            print("   • stores.csv (store metadata)")
            print("   • oil.csv (oil prices)")
            print("   • holidays_events.csv (holidays)")
            return False

        # Everything succeeded - publish the tables together.
        self.sales_data = sales
        self.stores_data = stores
        self.oil_data = oil
        self.holidays_data = holidays

        # Display dataset overview (date bounds computed once; train.csv is large)
        first_date = sales['date'].min()
        last_date = sales['date'].max()
        print(f"✅ Sales data: {len(sales):,} records")
        print(f"   • Date range: {first_date} to {last_date}")
        print(f"   • Stores: {sales['store_nbr'].nunique()}")
        print(f"   • Product families: {sales['family'].nunique()}")
        print(f"   • Total days: {(last_date - first_date).days}")

        print(f"✅ Stores metadata: {len(stores)} stores")
        print(f"✅ Oil prices: {len(oil)} records")
        print(f"✅ Holidays data: {len(holidays)} events")

        return True

    # Additional methods will be added in subsequent cells...

## 1. Setup and Data Loading

In [4]:
def comprehensive_data_assessment(self):
    """
    Compute and print data-quality metrics for the loaded sales table.

    Reads ``self.sales_data`` (columns ``date``, ``store_nbr``, ``family`` and
    ``sales`` are used) and stores the result on ``self.quality_metrics``.

    Returns:
        A nested dict with three sections, or ``None`` when no sales data has
        been loaded or the table has no rows:

        - ``dataset_overview``: record count, date range (start, end,
          ``total_days`` = end minus start in days), store/family counts and
          the number of distinct store-family pairs.
        - ``data_quality``: per-column missing counts, zero-sales count and
          percentage, negative-sales count and ``describe()`` statistics.
        - ``temporal_coverage``: average records per calendar day (inclusive
          span), the expected records per day (stores x families) and their
          ratio.

        Counts are plain Python ``int``/``float`` so the dict can be passed to
        ``json`` (apart from the two date bounds, which stay pandas
        Timestamps).
    """
    print("\n🔍 Comprehensive Data Quality Assessment")
    print("-" * 50)

    if self.sales_data is None:
        print("❌ Please load datasets first using load_datasets()")
        return None

    sales_df = self.sales_data
    if sales_df.empty:
        # With no rows the date bounds are NaT and every ratio is undefined.
        print("❌ Sales dataset is empty - nothing to assess")
        return None

    # Scan each column once; the sales table can hold millions of rows.
    first_date = sales_df['date'].min()
    last_date = sales_df['date'].max()
    span_days = (last_date - first_date).days
    n_stores = sales_df['store_nbr'].nunique()
    n_families = sales_df['family'].nunique()
    sales = sales_df['sales']
    is_zero = sales == 0

    # +1 because both the first and the last day carry records.
    records_per_day = len(sales_df) / (span_days + 1)
    expected_daily = n_stores * n_families
    # nunique() ignores NaN, so an all-NaN key column yields 0 here.
    coverage_ratio = records_per_day / expected_daily if expected_daily else float('nan')

    # Core data quality metrics
    quality_metrics = {
        'dataset_overview': {
            'total_records': len(sales_df),
            'date_range': {
                'start': first_date,
                'end': last_date,
                'total_days': span_days
            },
            'stores_count': n_stores,
            'families_count': n_families,
            'unique_combinations': sales_df.groupby(['store_nbr', 'family']).ngroups
        },

        'data_quality': {
            'missing_values': sales_df.isnull().sum().to_dict(),
            # int()/float() turn numpy scalars into JSON-serialisable values
            'zero_sales_records': int(is_zero.sum()),
            'zero_sales_percentage': float(is_zero.mean() * 100),
            'negative_sales': int((sales < 0).sum()),
            'sales_statistics': sales.describe().to_dict()
        },

        'temporal_coverage': {
            'records_per_day': records_per_day,
            'expected_records_per_day': expected_daily,
            'coverage_ratio': coverage_ratio
        }
    }

    # Display key findings
    overview = quality_metrics['dataset_overview']
    quality = quality_metrics['data_quality']
    print(f"📊 Dataset Overview:")
    print(f"   • Total records: {overview['total_records']:,}")
    print(f"   • Date range: {overview['date_range']['total_days']} days")
    print(f"   • Store-family combinations: {overview['unique_combinations']:,}")

    print(f"\n📈 Data Quality:")
    print(f"   • Zero sales: {quality['zero_sales_percentage']:.1f}%")
    print(f"   • Negative sales: {quality['negative_sales']:,} records")
    print(f"   • Average daily sales: {quality['sales_statistics']['mean']:.2f}")
    print(f"   • Coverage ratio: {coverage_ratio:.3f}")

    self.quality_metrics = quality_metrics
    return quality_metrics

# Attach the function above to FavoritaDataExplorer as a method. The class body
# was defined in an earlier cell, so new methods are bound onto it after the
# fact; instances look methods up on the class, so this also applies to
# explorers that already exist.
FavoritaDataExplorer.comprehensive_data_assessment = comprehensive_data_assessment

## 2. Comprehensive Data Explorer Class

In [5]:
# Create the explorer with its default data and results paths; the constructor
# only records the paths and prints them - no files are read yet.
explorer = FavoritaDataExplorer()

🔧 Initialized FavoritaDataExplorer
   Data path: data/raw/
   Results path: results/


In [6]:
# Load the raw tables and report the outcome in one line; load_datasets()
# returns True on success and False (after printing the error) on failure.
print(
    "✅ Data loaded successfully!"
    if explorer.load_datasets()
    else "❌ Check data files in data/raw/"
)


📁 Loading Corporación Favorita datasets...
❌ Error loading datasets: [Errno 2] No such file or directory: 'data/raw//train.csv'
📋 Expected files in data/raw/:
   • train.csv (sales data)
   • stores.csv (store metadata)
   • oil.csv (oil prices)
   • holidays_events.csv (holidays)
❌ Check data files in data/raw/


## 3. Execute Comprehensive Data Exploration

## 4. Data Quality Assessment

## 5. Store-Family Combination Analysis

In [None]:
# Restart kernel and run this cell to verify Phase 1 completion.
#
# Two things are checked: that the expected artefact files exist, and that the
# production case manager can be imported and returns the expected number of
# evaluation cases. Phase 1 is reported complete only when BOTH checks pass.

print("🎯 FINAL PHASE 1 VERIFICATION")
print("=" * 50)

import os
import sys

# Make the project's src/ directory importable; guarded so that re-running the
# cell does not stack duplicate entries on sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

# Number of evaluation cases Phase 1 is required to produce.
EXPECTED_CASE_COUNT = 10


def format_count(value):
    """Return numbers with thousands separators; any other value as plain text.

    Metadata entries fall back to the string 'N/A' when absent, and applying
    the ',' format spec to a string would raise ValueError.
    """
    if isinstance(value, (int, float)):
        return f"{value:,}"
    return f"{value}"


# Check file existence
files_to_check = {
    'JSON evaluation cases': '../results/evaluation_cases.json',
    'Production case manager': '../src/data/evaluation_cases.py',
    'Data module init': '../src/data/__init__.py',
    'Source package init': '../src/__init__.py'
}

all_files_exist = True
for name, filepath in files_to_check.items():
    if os.path.exists(filepath):
        size = os.path.getsize(filepath)
        print(f"✅ {name}: {size:,} bytes")
    else:
        print(f"❌ Missing {name}: {filepath}")
        all_files_exist = False

# Test production module
module_check_passed = False
try:
    from data.evaluation_cases import EvaluationCaseManager

    manager = EvaluationCaseManager()
    cases = manager.get_cases_list()
    metadata = manager.get_metadata()

    print(f"\n✅ Production Module Test:")
    print(f"   • Loaded {len(cases)} evaluation cases")
    print(f"   • Manager initialized successfully")
    print(f"   • Selection method: {metadata.get('selection_method', 'N/A')}")
    print(f"   • Total candidates: {format_count(metadata.get('total_candidates', 'N/A'))}")
    print(f"   • Final selected: {format_count(metadata.get('final_selected', 'N/A'))}")

    module_check_passed = len(cases) == EXPECTED_CASE_COUNT
    if not module_check_passed:
        print(f"❌ Expected {EXPECTED_CASE_COUNT} evaluation cases, found {len(cases)}")

except Exception as e:
    # Deliberately broad: this cell is a top-level diagnostic and should report
    # any import or runtime failure of the module instead of aborting.
    print(f"❌ Production module test failed: {e}")
    # Kept from the original cell, which also cleared this flag on failure.
    all_files_exist = False

# Final verdict: a missing file must not be reported as a pass just because
# the module happened to load the right number of cases.
phase_complete = all_files_exist and module_check_passed
if phase_complete:
    print("\n✅ PHASE 1 COMPLETE - All verification checks passed!")
else:
    print("\n⚠️  PHASE 1 NEEDS ATTENTION - Please resolve issues above")