# Data Exploration for Budget Forecasting

This notebook explores monthly store-level profit data to understand its patterns and to prepare the data for forecast modeling.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import sys
sys.path.append('..')

from src.utils import calculate_seasonal_metrics, plot_error_distribution

%matplotlib inline
plt.style.use('seaborn')
sns.set_theme(style="whitegrid")

## 1. Load and Inspect Data

In [None]:
# Read the processed profit workbook into `df`; every later cell works
# from this frame.
data_path = '../data/processed/Cleaned_Combined_Profit_Data_2023_2024.xlsx'
df = pd.read_excel(data_path)

# Quick orientation: dimensions, column labels, then a preview of the rows.
frame_shape = df.shape
column_labels = df.columns.to_list()
print("Dataset Shape:", frame_shape)
print("\nColumns:", column_labels)
print("\nFirst few rows:")
df.head(n=5)

## 2. Data Summary Statistics

In [None]:
# Descriptive statistics for every column after the first.
# NOTE(review): assumes column 0 is the store identifier ('Store Name') and
# all remaining columns are monthly profit figures — confirm against the file.
value_columns = df.iloc[:, 1:]
monthly_stats = value_columns.describe()
print("Monthly Statistics:")
monthly_stats

## 3. Data Quality Analysis

In [None]:
# --- Missing values: null count per column, showing only columns with any ---
missing_values = df.isna().sum()
print("Missing Values per Column:")
print(missing_values.loc[missing_values.gt(0)])

# --- Duplicates: rows whose 'Store Name' repeats an earlier row ---
duplicates = df.duplicated(subset='Store Name').sum()
print(f"\nDuplicate Store Names: {duplicates}")

# --- Negative profits: count of values below zero in each column after the
# first, showing only the columns that contain at least one ---
negative_profits = df.iloc[:, 1:].lt(0).sum()
print("\nNegative Profit Counts per Month:")
print(negative_profits.loc[negative_profits.gt(0)])

## 4. Profit Distribution Analysis

In [None]:
# Months to inspect; this list is reused by the boxplot in the outlier section.
sample_months = ['Jan2023', 'Jun2023', 'Dec2023', 'Jun2024']

# One histogram (with KDE overlay) per sampled month, laid out side by side.
fig, axes = plt.subplots(1, 4, figsize=(15, 5))
for ax, month in zip(axes, sample_months):
    # Missing values are dropped before plotting.
    sns.histplot(df[month].dropna(), kde=True, ax=ax)
    ax.set_title(f'{month} Profit Distribution')
    ax.set_xlabel('Profit')

fig.tight_layout()

## 5. Seasonal Analysis

In [None]:
# Mean of each column after the first (one point per month column),
# drawn as a line chart.
monthly_means = df.iloc[:, 1:].mean()

fig, ax = plt.subplots(figsize=(12, 6))
monthly_means.plot.line(marker='o', ax=ax)
ax.set_title('Average Monthly Profits Across All Stores')
ax.set_xlabel('Month')
ax.set_ylabel('Average Profit')
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()

# Seasonal metrics come from the project helper in src.utils; only a
# describe() summary of its result is shown here.
# NOTE(review): the helper's output schema is not visible in this notebook.
seasonal_metrics = calculate_seasonal_metrics(df)
print("\nSeasonal Metrics Summary:")
print(seasonal_metrics.describe())

## 6. Store Performance Analysis

In [None]:
# Per-store summary metrics computed across all columns after the first:
#   - Average Profit:    row-wise mean
#   - Profit Volatility: row-wise standard deviation
#   - Growth Rate:       percentage change from Jan2023 to Jun2024
monthly_profits = df.iloc[:, 1:]

# A zero Jan2023 baseline would turn the growth rate into +/-inf (or NaN for
# 0/0). dropna() does not remove inf, so mask zero baselines to NaN up front;
# those stores are then excluded from the growth-rate histogram.
# NOTE(review): a negative Jan2023 baseline flips the sign of the growth rate;
# confirm how loss-making stores should be treated.
baseline_profit = df['Jan2023'].where(df['Jan2023'] != 0)

store_metrics = pd.DataFrame({
    'Store Name': df['Store Name'],
    'Average Profit': monthly_profits.mean(axis=1),
    'Profit Volatility': monthly_profits.std(axis=1),
    'Growth Rate': (df['Jun2024'] - df['Jan2023']) / baseline_profit * 100
})

# Left panel: average profit vs. volatility per store.
# Right panel: distribution of growth rates (undefined rates dropped).
fig, (ax_profile, ax_growth) = plt.subplots(1, 2, figsize=(12, 4))

sns.scatterplot(data=store_metrics, x='Average Profit', y='Profit Volatility', ax=ax_profile)
ax_profile.set_title('Store Performance Profile')

sns.histplot(store_metrics['Growth Rate'].dropna(), kde=True, ax=ax_growth)
ax_growth.set_title('Growth Rate Distribution')

fig.tight_layout()

## 7. Correlation Analysis

In [None]:
# Pairwise correlation between every pair of columns after the first.
correlation_matrix = df.iloc[:, 1:].corr()

# Annotated heatmap with a diverging palette centred on zero, so positive
# and negative correlations get opposite colours.
heatmap_options = dict(annot=True, cmap='coolwarm', center=0, fmt='.2f')
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Between Monthly Profits')
fig.tight_layout()

## 8. Outlier Analysis

In [None]:
def _column_zscore(column):
    """Standardise one column: subtract its mean and divide by its std."""
    return (column - column.mean()) / column.std()


# Z-score every column after the first, column by column.
z_scores = df.iloc[:, 1:].apply(_column_zscore)

# Per column, count the values whose absolute Z-score exceeds 3.
outliers = z_scores.abs().gt(3).sum()

print("Number of potential outliers per month:")
print(outliers)

# Boxplots for the months listed in `sample_months` (defined in the
# profit-distribution cell).
fig, ax = plt.subplots(figsize=(15, 6))
df[sample_months].boxplot(ax=ax)
ax.set_title('Profit Distribution Boxplots')
plt.xticks(rotation=45)
ax.set_ylabel('Profit')
fig.tight_layout()

## 9. Summary and Insights

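Key findings from the data exploration (TODO: replace each placeholder below with the concrete result observed in the corresponding section above):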

1. Data Quality:
   - Missing values distribution
   - Outlier patterns
   - Data consistency issues

2. Profit Patterns:
   - Seasonal trends
   - Growth patterns
   - Store performance variations

3. Model Considerations:
   - Features to consider
   - Preprocessing steps needed
   - Potential challenges