In [None]:
# %%
# Walmart Sales Forecasting - Exploratory Data Analysis & Feature Importance
# ---------------------------------------------------------------------------
# This notebook explores the processed Walmart sales dataset and visualizes 
# patterns, seasonality, and model feature importance.

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import lightgbm as lgb
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# %%
# Paths: where the processed datasets live and where model artifacts are kept.
DATA_DIR = Path("data/processed")
MODEL_DIR = Path("models")

# Load the processed train / validation splits.
train = pd.read_parquet(DATA_DIR / "train.parquet")
val = pd.read_parquet(DATA_DIR / "val.parquet")

print(f"Train shape: {train.shape}, Val shape: {val.shape}")
# A bare `train.head()` is only rendered when it is the last statement of the
# executed notebook cell. More code follows it in this cell, so the preview is
# shown explicitly. `display` is provided by the IPython/Jupyter kernel.
display(train.head())

# %%
# Basic statistics: per-column summary (all dtypes), transposed so that each
# row describes one column.
# The summary is passed to display() because a bare expression is only
# rendered when it is the final statement of the executed notebook cell, and
# more code follows this one. `display` is provided by the IPython/Jupyter kernel.
display(train.describe(include='all').T)

# %%
# Report the time span covered by the training data and how many distinct
# stores and departments it contains.
first_date, last_date = train['Date'].min(), train['Date'].max()
print(f"Date range: {first_date} to {last_date}")

n_stores = train['Store'].nunique()
n_depts = train['Dept'].nunique()
print(f"Stores: {n_stores}, Departments: {n_depts}")


In [None]:

# %%
# Total sales per date across every store/department, drawn as a time series.
agg_sales = train.groupby("Date", as_index=False)["Weekly_Sales"].sum()

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(agg_sales["Date"], agg_sales["Weekly_Sales"], color="tab:blue")
ax.set_title("Total Weekly Sales Over Time", fontsize=14)
ax.set_xlabel("Date")
ax.set_ylabel("Total Weekly Sales")
ax.grid(alpha=0.3)
plt.show()


In [None]:

# %%
# Derive calendar features from Date, then chart the mean weekly sales per month.
# NOTE: both columns are added to `train` itself, so every later step that reads
# `train` (e.g. the numeric correlation matrix) sees them as well.
date_parts = train["Date"].dt
train["month"] = date_parts.month
train["weekofyear"] = date_parts.isocalendar().week.astype(int)

monthly_sales = train.groupby("month")["Weekly_Sales"].mean()

fig, ax = plt.subplots(figsize=(10, 5))
monthly_sales.plot(kind="bar", color="tab:green", ax=ax)
ax.set_title("Average Weekly Sales by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Avg Weekly Sales")
plt.show()


In [None]:

# %%
# Average sales by store (top 10), ranked from highest to lowest mean.
top_stores = (
    train.groupby("Store")["Weekly_Sales"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
plt.figure(figsize=(10,5))
# Pass `order` explicitly. Left to itself seaborn chooses the category order
# (sorted values for a numeric x, the full category list for a categorical x),
# which would discard the descending ranking computed above.
sns.barplot(
    x=top_stores.index,
    y=top_stores.values,
    order=list(top_stores.index),
    palette="Blues_d",
)
plt.title("Top 10 Stores by Average Weekly Sales")
plt.xlabel("Store")
plt.ylabel("Avg Weekly Sales")
plt.show()


In [None]:

# %%
# Compare the mean weekly sales of holiday weeks against non-holiday weeks.
holiday_sales = train.groupby("IsHoliday")["Weekly_Sales"].mean()

# sns.barplot returns the Axes it drew on; label the chart through that handle.
ax = sns.barplot(x=holiday_sales.index, y=holiday_sales.values, palette="coolwarm")
ax.set_title("Holiday vs Non-Holiday Weekly Sales")
ax.set_xlabel("IsHoliday")
ax.set_ylabel("Average Weekly Sales")
plt.show()

# %%
# Pairwise correlation between all numeric columns of `train`, shown as a heatmap.
# `corr` is kept as a notebook-level name; it is written to CSV further down.
numeric_part = train.select_dtypes(include=np.number)
num_cols = numeric_part.columns
corr = numeric_part.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

# %%
# Histogram (50 bins) with a KDE overlay of the Weekly_Sales target.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train["Weekly_Sales"], bins=50, kde=True, ax=ax)
ax.set(
    title="Distribution of Weekly Sales",
    xlabel="Weekly Sales",
    ylabel="Frequency",
)
plt.show()


In [None]:

# %%
# Load the trained LightGBM model and the feature list saved with it
# (loaded in this order: model first, then the feature list).
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.
model, features = (
    joblib.load(MODEL_DIR / artifact_name)
    for artifact_name in ("lgb_model.joblib", "feature_list.joblib")
)

# %%
# Feature importance visualization
TOP_N_FEATURES = 15  # number of bars shown in each importance chart


def _rank_features(feature_names, scores):
    """Pair feature names with importance scores, sorted from most to least important.

    Args:
        feature_names: Sequence of feature names, one per score.
        scores: Importance values in the same order as ``feature_names``.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        ``importance`` in descending order.

    Raises:
        ValueError: If the number of names and scores differ.
    """
    if len(feature_names) != len(scores):
        raise ValueError(
            f"Feature list has {len(feature_names)} entries but the model "
            f"reports {len(scores)} importance values."
        )
    ranked = pd.DataFrame({"feature": feature_names, "importance": scores})
    return ranked.sort_values("importance", ascending=False)


def _plot_top_importance(ranked, label, palette, top_n=TOP_N_FEATURES):
    """Draw a horizontal bar chart of the first ``top_n`` rows of ``ranked``.

    Args:
        ranked: DataFrame with ``feature`` and ``importance`` columns, already
            sorted in the order the bars should appear.
        label: Text placed in parentheses at the end of the chart title.
        palette: Seaborn palette name used for the bars.
        top_n: Number of leading rows to plot.
    """
    plt.figure(figsize=(10,6))
    sns.barplot(data=ranked.head(top_n), x="importance", y="feature", palette=palette)
    plt.title(f"Top {top_n} Feature Importances ({label})")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.show()


# Gain-based importance; `importance_df` is written to CSV further down.
importance = model.feature_importance(importance_type="gain")
importance_df = _rank_features(features, importance)
_plot_top_importance(importance_df, "LightGBM Gain", "viridis")

# %%
# Feature importance (split count)
split_importance = model.feature_importance(importance_type="split")
split_df = _rank_features(features, split_importance)
_plot_top_importance(split_df, "Split Count", "mako")

# %%
# Weekly pattern for a single store/department
# The dtype of Store/Dept is not visible in this notebook. Comparing an integer
# column with the string "1" matches no rows and yields an empty chart, so the
# keys are compared on their string form, which works for integer and string
# columns alike.
is_store_1 = train["Store"].astype(str) == "1"
is_dept_1 = train["Dept"].astype(str) == "1"
# sort_values returns a new frame; ordering by Date keeps the line from
# zig-zagging if the rows are not already chronological.
example = train[is_store_1 & is_dept_1].sort_values("Date")
plt.figure(figsize=(12,5))
plt.plot(example["Date"], example["Weekly_Sales"], color="tab:orange")
plt.title("Example: Store 1 Dept 1 Weekly Sales")
plt.xlabel("Date")
plt.ylabel("Weekly Sales")
plt.grid(alpha=0.3)
plt.show()

# %%
# Lag relationship check: current week's sales against the lag_1 feature.
SCATTER_SAMPLE_SIZE = 5000  # upper bound on plotted points, keeps the scatter readable
# DataFrame.sample raises when asked for more rows than exist (sampling without
# replacement), so the request is capped at the size of `train`.
scatter_sample = train.sample(
    n=min(SCATTER_SAMPLE_SIZE, len(train)),
    random_state=42,  # fixed seed so the plotted sample is reproducible
)
plt.figure(figsize=(8,6))
sns.scatterplot(x="lag_1", y="Weekly_Sales", data=scatter_sample, alpha=0.3)
plt.title("Weekly Sales vs Lag_1 Feature")
plt.xlabel("Previous Week Sales (lag_1)")
plt.ylabel("Current Week Sales")
plt.show()

# %%
# Persist the gain-importance table and the correlation matrix for reporting.
importance_csv = MODEL_DIR / "feature_importance.csv"
importance_df.to_csv(importance_csv, index=False)  # row index is not written

correlation_csv = MODEL_DIR / "correlation_matrix.csv"
corr.to_csv(correlation_csv)  # index kept: it holds the row labels of the matrix

print("Saved feature_importance.csv and correlation_matrix.csv in models/")

# %%
print("EDA and Feature Importance analysis completed.")
