In [1]:
import os
import numpy as np
import pandas as pd
from typing import Tuple

In [3]:
# Modeling
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

# One seed for the whole notebook: it is handed to NumPy's global RNG here and
# reused as the random_state / seed of the CV splitter and the models.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

### 1) Load Data 

In [11]:
# --- Load data ---
# The folder holding the two competition CSVs can be overridden with the
# BIGMART_DATA_DIR environment variable so the notebook runs on any machine.
# When the variable is unset, the original local folder is used, so existing
# setups keep working unchanged.
DATA_DIR = os.environ.get(
    "BIGMART_DATA_DIR",
    "C:/Users/hp/Documents/Analytics_Vidhya/BigMart_SalesPrediction",
)
train = pd.read_csv(os.path.join(DATA_DIR, "train_v9rqX0R.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test_AbJTz2l.csv"))

In [14]:
# Identifier columns that are copied verbatim into the submission file.
id_cols = [
    "Item_Identifier",
    "Outlet_Identifier",
]

### 2) Combine & Clean

In [15]:
# Tag each frame with its origin so the rows can be separated again later.
for frame, origin in ((train, "train"), (test, "test")):
    frame["Source"] = origin

# The test set has no target; add a NaN placeholder so both frames share columns.
test["Item_Outlet_Sales"] = np.nan

# Stack train on top of test with a fresh 0..n-1 index.
df = pd.concat((train, test), ignore_index=True)

In [16]:
# --- Normalize Item_Fat_Content ---
# Map the abbreviated / lower-case spellings onto the two canonical labels;
# values not listed here are left untouched.
fat_label_fixes = {"LF": "Low Fat", "low fat": "Low Fat", "reg": "Regular"}
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(fat_label_fixes)

In [17]:
# The first two characters of the item ID are used as a coarse category code
# (FD=Food, DR=Drinks, NC=Non-Consumable).
df["Item_Category"] = df["Item_Identifier"].str.slice(0, 2)

In [18]:
# A fat-content label is meaningless for non-consumable items, so rows whose
# category code is "NC" get a dedicated "Non-Edible" label instead.
non_edible_mask = df["Item_Category"].eq("NC")
df.loc[non_edible_mask, "Item_Fat_Content"] = "Non-Edible"

In [19]:
# --- Impute Item_Weight with progressively coarser medians ---
# Order matters: each fallback median is computed AFTER the previous fill.

# 1) Median weight of the same Item_Identifier.
w_by_item = df.groupby("Item_Identifier")["Item_Weight"].median()
item_level_fill = df["Item_Identifier"].map(w_by_item)
df["Item_Weight"] = df["Item_Weight"].fillna(item_level_fill)

# 2) Median weight of the same Item_Type, broadcast back to row level.
w_by_type = df.groupby("Item_Type")["Item_Weight"].transform("median")
df["Item_Weight"] = df["Item_Weight"].fillna(w_by_type)

# 3) Median over all rows as the last resort.
overall_weight_median = df["Item_Weight"].median()
df["Item_Weight"] = df["Item_Weight"].fillna(overall_weight_median)

In [20]:
# --- Fix Item_Visibility zeros & NAs ---
# A visibility of exactly 0 is treated as missing. Each zero is replaced by
# the mean of the strictly positive visibilities recorded for the same
# Item_Identifier. Anything still missing afterwards (items with no positive
# reading, or NaNs already present in the column) gets the global mean of the
# positive values.
vis_nonzero = df.loc[df["Item_Visibility"] > 0, "Item_Visibility"]
vis_global_mean = vis_nonzero.mean()

# Per-item mean over positive values only; non-positive entries are masked to
# NaN first so that mean() skips them.
vis_by_item = (
    df["Item_Visibility"]
    .where(df["Item_Visibility"] > 0)
    .groupby(df["Item_Identifier"])
    .mean()
    .rename("iv")
)

# Vectorised replacement of the zeros (previously a row-wise df.apply with a
# Python lambda, which is far slower and misbehaves on an empty frame).
zero_vis_mask = df["Item_Visibility"].eq(0)
df.loc[zero_vis_mask, "Item_Visibility"] = (
    df.loc[zero_vis_mask, "Item_Identifier"].map(vis_by_item)
)

df["Item_Visibility"] = df["Item_Visibility"].fillna(vis_global_mean)

In [21]:
# --- Impute Outlet_Size via Outlet_Type mode ---
def _first_mode_or_nan(values):
    """Return the first mode of ``values``, or NaN when no mode exists."""
    modes = values.mode()
    return np.nan if modes.empty else modes.iat[0]


# Most frequent size per outlet type, then mapped onto every row of that type.
size_mode_by_type = df.groupby("Outlet_Type")["Outlet_Size"].agg(_first_mode_or_nan)
type_level_size = df["Outlet_Type"].map(size_mode_by_type)
df["Outlet_Size"] = df["Outlet_Size"].fillna(type_level_size)

# Fallback: anything still missing gets the overall most frequent size.
if df["Outlet_Size"].isna().any():
    global_mode = df["Outlet_Size"].mode()
    if not global_mode.empty:
        df["Outlet_Size"] = df["Outlet_Size"].fillna(global_mode.iat[0])


### 3) Feature Engineering

In [22]:
# Outlet age in years, measured from the reference year 2013
# (the year used per the competition statement).
REFERENCE_YEAR = 2013
establishment_year = df["Outlet_Establishment_Year"].astype(int)
df["Outlet_Age"] = REFERENCE_YEAR - establishment_year

In [23]:
# Visibility features:
#   Visibility_Mean_Item - mean visibility of the item across all its rows
#   Visibility_Ratio     - row visibility relative to that item-level mean
item_mean_visibility = df.groupby("Item_Identifier")["Item_Visibility"].transform("mean")
df["Visibility_Mean_Item"] = item_mean_visibility
# The small epsilon keeps the division finite when the item mean is 0.
df["Visibility_Ratio"] = df["Item_Visibility"].div(item_mean_visibility + 1e-9)


In [24]:
# log1p-transformed copies of price and visibility (captures non-linear effects).
# Each source column <name> produces a new column <name>_log.
for source_col in ("Item_MRP", "Item_Visibility"):
    df[f"{source_col}_log"] = np.log1p(df[source_col])

In [25]:
# Quartile bins of Item_MRP.
# labels=False makes qcut return integer bin codes (0..k-1) instead of
# Interval labels; duplicates="drop" merges repeated bin edges, so k can be
# smaller than 4 if the quantile edges are not unique.
df["MRP_Bin"] = pd.qcut(df["Item_MRP"], q=4, labels=False, duplicates="drop")

In [26]:
# Frequency encodings: the number of rows in the combined frame that share the
# same identifier. This gives a compact numeric signal without one-hot
# encoding the high-cardinality item ID.
for key_col, count_col in (
    ("Item_Identifier", "Item_ID_Count"),
    ("Outlet_Identifier", "Outlet_ID_Count"),
):
    df[count_col] = df.groupby(key_col)[key_col].transform("count")

In [27]:
# Squared price as an optional second-order polynomial term.
df["Item_MRP_sq"] = df["Item_MRP"].pow(2)

### 4) Encode categoricals (one-hot on full combined data to keep aligned)

In [28]:
# Columns that are one-hot encoded before modelling.
cat_cols = [
    "Item_Fat_Content",
    "Item_Type",
    "Outlet_Identifier",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
    "Item_Category",
    "MRP_Bin",
]

# Columns passed to the models as plain numeric features.
num_cols = [
    "Item_Weight",
    "Item_Visibility",
    "Item_MRP",
    "Outlet_Age",
    "Visibility_Mean_Item",
    "Visibility_Ratio",
    "Item_MRP_log",
    "Item_Visibility_log",
    "Item_ID_Count",
    "Outlet_ID_Count",
    "Item_MRP_sq",
]

In [29]:
# MRP_Bin holds integer codes; cast it to a categorical dtype so get_dummies
# expands it into indicator columns instead of passing it through as a number.
df["MRP_Bin"] = df["MRP_Bin"].astype("category")

# Columns that must not be used as features. NOTE: this list is informational
# only - it is never applied below; the feature matrix is built by explicitly
# selecting cat_cols + num_cols, none of which appear here.
drop_cols = ["Item_Outlet_Sales", "Outlet_Establishment_Year", "Source", "Item_Identifier"]

# One-hot encode on the combined train+test frame so both parts end up with
# exactly the same set of indicator columns.
feature_cols = cat_cols + num_cols
X_full = pd.get_dummies(df.loc[:, feature_cols], drop_first=False)

In [30]:
# Split the encoded matrix back into its train and test parts via the Source tag.
mask_train = df["Source"].eq("train")
X = X_full[mask_train].copy()
X_test = X_full[~mask_train].copy()

# Raw target on the original sales scale, taken from the untouched train
# frame. Its row order matches X because train was concatenated first.
y_raw = train["Item_Outlet_Sales"].to_numpy(dtype=float, copy=True)

In [31]:
# Target: log-transform for stability.
# The models are fit on log1p(sales); the CV loop maps predictions back to
# the original sales scale with np.expm1 before blending and scoring.
y = np.log1p(y_raw)

### 5) KFold CV + Models (LGB + XGB, fallback to HistGB)

In [33]:
# Optional gradient-boosting backends. Each flag records whether the library
# could be imported; when neither is available the CV loop falls back to
# sklearn's HistGradientBoostingRegressor.
# Any Exception (not only ImportError) is treated as "not available".
HAS_LGB = True
try:
    import lightgbm as lgb
except Exception:
    HAS_LGB = False

HAS_XGB = True
try:
    import xgboost as xgb
except Exception:
    HAS_XGB = False

# Shuffled 5-fold splitter, seeded for reproducibility.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Out-of-fold predictions (original sales scale) and the running sum of the
# per-fold test predictions, both initialised to zero.
n_train_rows, n_test_rows = len(X), len(X_test)
oof_pred_linear = np.zeros(n_train_rows)
test_pred_linear_accum = np.zeros(n_test_rows)

# Per-fold blended predictions, collected for later inspection / blending.
oof_parts, test_parts = [], []

# --- Cross-validated training loop ---
# For every fold: fit each available model on the log1p target, predict the
# validation fold and the full test set, convert predictions back to the
# sales scale with expm1, blend them, then store the out-of-fold predictions
# and accumulate the test predictions.
#
# Early stopping is configured through the current library APIs:
#   * LightGBM: callbacks (`early_stopping_rounds=` / `verbose_eval=` were
#     removed from `lgb.train` in LightGBM 4.0).
#   * XGBoost: constructor arguments (`eval_metric=` / `early_stopping_rounds=`
#     on `fit` are deprecated since XGBoost 1.6 and rejected by recent
#     releases). This form requires xgboost >= 1.6.
fold_idx = 1
for tr_idx, va_idx in kf.split(X):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    # Per-model predictions for this fold, on the original sales scale.
    # Order matters for the blend below: LightGBM first, then XGBoost.
    fold_preds_linear = []
    fold_test_preds_linear = []

    # --- LightGBM ---
    if HAS_LGB:
        lgb_train = lgb.Dataset(X_tr, label=y_tr)
        lgb_valid = lgb.Dataset(X_va, label=y_va)
        lgb_params = {
            "objective": "regression",
            "metric": "rmse",
            "learning_rate": 0.035,
            "num_leaves": 31,
            "feature_fraction": 0.85,
            "bagging_fraction": 0.85,
            "bagging_freq": 1,
            "min_data_in_leaf": 20,
            "lambda_l1": 0.0,
            "lambda_l2": 1.0,
            "verbosity": -1,
            "seed": RANDOM_STATE
        }
        lgb_model = lgb.train(
            lgb_params, lgb_train,
            num_boost_round=5000,
            valid_sets=[lgb_valid],
            callbacks=[
                # Stop once validation RMSE has not improved for 200 rounds.
                lgb.early_stopping(stopping_rounds=200, verbose=False),
                # period=0 disables per-iteration evaluation logging.
                lgb.log_evaluation(period=0),
            ],
        )
        va_pred_log = lgb_model.predict(X_va, num_iteration=lgb_model.best_iteration)
        te_pred_log = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
        fold_preds_linear.append(np.expm1(va_pred_log))
        fold_test_preds_linear.append(np.expm1(te_pred_log))

    # --- XGBoost ---
    if HAS_XGB:
        xgb_model = xgb.XGBRegressor(
            n_estimators=5000,
            learning_rate=0.03,
            max_depth=7,
            subsample=0.85,
            colsample_bytree=0.85,
            reg_alpha=0.0,
            reg_lambda=1.0,
            min_child_weight=1.0,
            gamma=0.0,
            tree_method="hist",
            random_state=RANDOM_STATE,
            # Evaluation metric and early stopping are estimator settings.
            eval_metric="rmse",
            early_stopping_rounds=200,
        )
        xgb_model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False,
        )
        va_pred_log = xgb_model.predict(X_va)
        te_pred_log = xgb_model.predict(X_test)
        fold_preds_linear.append(np.expm1(va_pred_log))
        fold_test_preds_linear.append(np.expm1(te_pred_log))

    # --- Fallback: HistGradientBoosting (sklearn) ---
    # Used only when neither LightGBM nor XGBoost could be imported.
    if not HAS_LGB and not HAS_XGB:
        hgb = HistGradientBoostingRegressor(
            max_depth=None,
            learning_rate=0.06,
            max_iter=1200,
            min_samples_leaf=20,
            l2_regularization=0.0,
            random_state=RANDOM_STATE
        )
        hgb.fit(X_tr, y_tr)
        va_pred_log = hgb.predict(X_va)
        te_pred_log = hgb.predict(X_test)
        fold_preds_linear.append(np.expm1(va_pred_log))
        fold_test_preds_linear.append(np.expm1(te_pred_log))

    # Blend within-fold (weighted average). With both boosters present the
    # list order is [LightGBM, XGBoost], so LightGBM gets the 0.6 weight.
    # With a single model its predictions are used as-is.
    if len(fold_preds_linear) == 2:
        va_blend = 0.6 * fold_preds_linear[0] + 0.4 * fold_preds_linear[1]
        te_blend = 0.6 * fold_test_preds_linear[0] + 0.4 * fold_test_preds_linear[1]
    else:
        va_blend = fold_preds_linear[0]
        te_blend = fold_test_preds_linear[0]

    # Write the OOF slice for this fold and add this fold's test prediction
    # to the running sum (averaged over folds after the loop).
    oof_pred_linear[va_idx] = va_blend
    test_pred_linear_accum += te_blend
    oof_parts.append(va_blend)
    test_parts.append(te_blend)

    fold_idx += 1

# Turn the accumulated sum of per-fold test predictions into their mean.
n_folds = kf.get_n_splits()
test_pred_linear = test_pred_linear_accum / n_folds

# Cross-validated RMSE, computed on the original sales scale from the
# out-of-fold predictions.
cv_mse = mean_squared_error(y_raw, oof_pred_linear)
rmse = np.sqrt(cv_mse)
print(f"CV RMSE: {rmse:.4f}")


CV RMSE: 1216.1019


### 6) Save Submission

In [36]:
# NOTE: superseded by the timestamped export in the next cell; kept commented out for reference only.
# sub = df.loc[~mask_train, id_cols].copy()
# sub["Item_Outlet_Sales"] = test_pred_linear
# sub.to_csv("bigmart_submission.csv", index=False)
# print("Submission saved to: bigmart_submission.csv")

# pd.DataFrame({"Item_Outlet_Sales_OOF": oof_pred_linear}).to_csv("oof_predictions.csv", index=False)
# print("OOF predictions saved to: oof_predictions.csv")


Submission saved to: bigmart_submission.csv
OOF predictions saved to: oof_predictions.csv


In [38]:
from datetime import datetime

# Build the submission frame: the identifier columns of the test rows plus
# the fold-averaged predictions on the original sales scale.
id_cols = ["Item_Identifier", "Outlet_Identifier"]
sub = df.loc[~mask_train, id_cols].assign(Item_Outlet_Sales=test_pred_linear)

# A timestamp suffix keeps repeated runs from overwriting earlier files.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Write the submission to the current working directory.
sub_filename = f"bigmart_submission_{timestamp}.csv"
sub.to_csv(sub_filename, index=False)
print(f"Submission saved to: {sub_filename}")

# Optionally persist the out-of-fold predictions alongside it.
oof_filename = f"oof_predictions_{timestamp}.csv"
oof_frame = pd.DataFrame({"Item_Outlet_Sales_OOF": oof_pred_linear})
oof_frame.to_csv(oof_filename, index=False)
print(f"OOF predictions saved to: {oof_filename}")


Submission saved to: bigmart_submission_20250817_163008.csv
OOF predictions saved to: oof_predictions_20250817_163008.csv
