In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool
# --- Step 1: Load the raw CSVs and work on copies ---
# df_train / df_test2 keep the data exactly as read from disk; every later
# step mutates only the copies df / df_test.
df_train = pd.read_csv("train.csv")
df_test2 = pd.read_csv("test.csv")
df = df_train.copy()
df_test = df_test2.copy()

# --- Step 2: Fill Missing Item_Weight ---
# Lookup table keyed by (Item_Identifier, Item_Type) holding the mean
# Item_Weight of the matching training rows.
mean_weight_per_item = df.groupby(['Item_Identifier', 'Item_Type'])['Item_Weight'].mean()
item_weight_map = mean_weight_per_item.to_dict()

# Function to fill missing Item_Weight
def fill_item_weight(row):
    """Return the Item_Weight to use for one row.

    A weight that is already present is returned unchanged. A missing weight
    is looked up in the module-level ``item_weight_map`` under the key
    ``(Item_Identifier, Item_Type)``; NaN is returned when the map has no
    entry for that key.
    """
    weight = row['Item_Weight']
    if not pd.isna(weight):
        return weight
    lookup_key = (row['Item_Identifier'], row['Item_Type'])
    return item_weight_map.get(lookup_key, np.nan)

# Impute train first: per-item lookup, then the mean of the resulting column
# for anything still missing.
train_weights = df.apply(fill_item_weight, axis=1)
df['Item_Weight'] = train_weights.fillna(train_weights.mean())

# Test rows use the same lookup; leftovers get the mean of the (already
# imputed) training column, so no test statistics are involved.
test_weights = df_test.apply(fill_item_weight, axis=1)
df_test['Item_Weight'] = test_weights.fillna(df['Item_Weight'].mean())
# --- Step 3: Fill Missing Outlet_Size ---
# A missing Outlet_Size is replaced by the most common size among training
# rows with the same Outlet_Type. When that is unavailable (the type has no
# recorded size in training, or the type does not occur in training at all)
# the overall training mode is used instead of raising.
def _most_common_or_nan(values):
    """Return the most frequent non-null entry of ``values``, or NaN if there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Both statistics are taken from the training data before any imputation.
outlet_type_size_mode = df.groupby('Outlet_Type')['Outlet_Size'].agg(_most_common_or_nan)
global_outlet_size_mode = _most_common_or_nan(df['Outlet_Size'])

for frame in (df, df_test):
    # Vectorised lookup: an Outlet_Type missing from the training statistics
    # maps to NaN here and then falls through to the global mode.
    imputed_size = frame['Outlet_Type'].map(outlet_type_size_mode).fillna(global_outlet_size_mode)
    # Only rows whose Outlet_Size is missing are touched.
    frame['Outlet_Size'] = frame['Outlet_Size'].fillna(imputed_size)
# --- Step 3 (alternative, currently disabled): Fill Missing Outlet_Size ---
# Fills Outlet_Size with the mode of each (Outlet_Type, Outlet_Location_Type) group,
# falling back to the global mode when a group has no entry.
# outlet_type_size_mode = (
#     df.groupby(['Outlet_Type', 'Outlet_Location_Type'])['Outlet_Size']
#     .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
#     .to_dict()
# )

# global_mode = df['Outlet_Size'].mode()[0]

# def fill_outlet_size(row):
#     if pd.isna(row['Outlet_Size']):
#         key = (row['Outlet_Type'], row['Outlet_Location_Type'])
#         return outlet_type_size_mode.get(key, global_mode)  # fallback
#     return row['Outlet_Size']

# df['Outlet_Size'] = df.apply(fill_outlet_size, axis=1)
# df_test['Outlet_Size'] = df_test.apply(fill_outlet_size, axis=1)

# # 🔑 Fix: ensure no NaN and force string type
# df['Outlet_Size'] = df['Outlet_Size'].astype(str).fillna(global_mode)
# df_test['Outlet_Size'] = df_test['Outlet_Size'].astype(str).fillna(global_mode)



# --- Step 4: Standardize Fat Content ---
# Collapse the alternative spellings onto the two canonical labels; values
# not listed in the mapping are left as they are.
fat_map = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
for frame in (df, df_test):
    frame['Item_Fat_Content'] = frame['Item_Fat_Content'].replace(fat_map)

# --- Step 5: Feature Engineering ---
# The same derived columns are added, in the same order, to train and test.
REFERENCE_YEAR = 2025
MRP_BAND_EDGES = [0, 70, 140, 200, 300]
MRP_BAND_LABELS = ['Low', 'Medium', 'High', 'Premium']

for frame in (df, df_test):
    # Years between the outlet's establishment and the reference year.
    frame['Outlet_Age'] = REFERENCE_YEAR - frame['Outlet_Establishment_Year']

    # Price tier; an Item_MRP outside (0, 300] gets no band (NaN).
    frame['MRP_Band'] = pd.cut(frame['Item_MRP'], bins=MRP_BAND_EDGES, labels=MRP_BAND_LABELS)

    frame['Price_per_Unit'] = frame['Item_MRP'] / frame['Item_Weight']

    # String keys combining outlet attributes, used as categorical features.
    mrp_band_text = frame['MRP_Band'].astype(str)
    frame['Outlet_Item_Interaction'] = frame['Outlet_Type'] + '_' + mrp_band_text

    outlet_age_text = frame['Outlet_Age'].astype(str)
    frame['Outlet_Item_Interaction1'] = (
        frame['Outlet_Size'] + '_' + outlet_age_text + '_' + frame['Outlet_Identifier']
    )

# --- Step 5b: Feature Interactions ---
# Small constant added to the denominator so a visibility of 0 does not
# divide by zero.
VISIBILITY_EPSILON = 1e-5

for frame in (df, df_test):
    frame['Weight_Visibility'] = frame['Item_Weight'] * frame['Item_Visibility']
    frame['MRP_per_Visibility'] = frame['Item_MRP'] / (frame['Item_Visibility'] + VISIBILITY_EPSILON)


# df_test['Outlet_Item_Interaction_TE'] = df_test['Outlet_Item_Interaction'].map(df['Outlet_Item_Interaction_TE'])
# df_test['Outlet_Item_Interaction_TE'] = df_test['Outlet_Item_Interaction_TE'].fillna(global_mean)

# Count how often each Item_Type occurs in the training data
item_type_counts = df['Item_Type'].value_counts()

# Map the training counts onto both frames. Types without a training count
# get 0, and the column is forced to int64 so it has the same integer dtype
# in train and test (it is later passed to CatBoost as a categorical
# feature, which must not be a float column).
df['Item_Type_Count'] = df['Item_Type'].map(item_type_counts).fillna(0).astype('int64')
df_test['Item_Type_Count'] = df_test['Item_Type'].map(item_type_counts).fillna(0).astype('int64')

# Frequency bands. The last edge is open-ended so that counts above 1000 are
# labelled 'Frequent' rather than falling outside every bin and turning into
# the string 'nan'; include_lowest=True puts a count of 0 into 'Rare' for
# the same reason.
ITEM_TYPE_BAND_EDGES = [0, 100, 300, 600, np.inf]
ITEM_TYPE_BAND_LABELS = ['Rare', 'Occasional', 'Common', 'Frequent']

for frame in (df, df_test):
    frame['Item_Type_Band'] = pd.cut(
        frame['Item_Type_Count'],
        bins=ITEM_TYPE_BAND_EDGES,
        labels=ITEM_TYPE_BAND_LABELS,
        include_lowest=True,
    ).astype(str)

# NOTE: zero visibilities are not treated specially below; a group whose mean
# visibility is 0 produces NaN (0 / 0) in the corresponding ratio.

# Visibility relative to the mean visibility of the same item. Train and test
# each use group means computed from their own rows.
item_vis_mean = df.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
item_vis_mean_test = df_test.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
df['Visibility_MeanRatio'] = df['Item_Visibility'].div(item_vis_mean)
df_test['Visibility_MeanRatio'] = df_test['Item_Visibility'].div(item_vis_mean_test)

# Visibility relative to the mean visibility within the same outlet.
item_avg_visibility = df.groupby('Outlet_Identifier')['Item_Visibility'].transform('mean')
item_avg_visibility_test = df_test.groupby('Outlet_Identifier')['Item_Visibility'].transform('mean')
df['Visibility_MeanRatio_2'] = df['Item_Visibility'].div(item_avg_visibility)
df_test['Visibility_MeanRatio_2'] = df_test['Item_Visibility'].div(item_avg_visibility_test)

# --- Step 6: Prepare Features ---
# Column order here is the column order of the model input.
features = [
    # numeric columns, raw and derived
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Establishment_Year',
    'Outlet_Age',
    'Visibility_MeanRatio',
    'Price_per_Unit',
    # string interaction keys
    'Outlet_Item_Interaction',
    'Outlet_Item_Interaction1',
    # more numeric interactions
    'Weight_Visibility',
    'MRP_per_Visibility',
    'Visibility_MeanRatio_2',
    # label-like columns
    'Item_Fat_Content',
    'Outlet_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
    'MRP_Band',
    'Item_Type',
    'Item_Type_Band',
    'Item_Type_Count',
]
# Reduced feature set tried earlier (disabled):
# features = [
#     'Outlet_Type','Item_MRP','MRP_per_Visibility', 'Item_Weight']

# Model inputs for train and test, plus the regression target.
X = df.loc[:, features]
X_test_final = df_test.loc[:, features]
y = df['Item_Outlet_Sales']

# Columns handed to CatBoost as categorical. Item_Type_Count is a count but
# is deliberately listed here as well.
# cat_features = [ 'Outlet_Type']
cat_features = [
    'Item_Fat_Content',
    'Outlet_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
    'MRP_Band',
    'Item_Type',
    'Item_Type_Band',
    'Item_Type_Count',
    'Outlet_Item_Interaction',
    'Outlet_Item_Interaction1',
]

# --- Step 7: K-Fold Cross Validation ---
# Trains one CatBoost model per fold, records the fold RMSE, and averages the
# per-fold test predictions into y_pred_test_total.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred_test_total = np.zeros(X_test_final.shape[0])
val_scores = []

# The test features are identical for every fold, so their Pool is built once
# instead of once per iteration.
test_pool = Pool(X_test_final, cat_features=cat_features)

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)

    model = CatBoostRegressor(
        iterations=3000,
        learning_rate=0.04,
        depth=6,
        l2_leaf_reg=10,
        bagging_temperature=0.82,
        eval_metric='RMSE',
        random_seed=42,
        early_stopping_rounds=50,
        verbose=100,
    )

    # NOTE: the validation fold drives early stopping / best-iteration
    # selection and is also the fold scored below, so the reported RMSE is
    # measured on data the stopping rule has already seen.
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    # Score against the fold's own target slice (y_val is the same data as
    # df['Item_Outlet_Sales'].iloc[val_idx]).
    y_val_pred = model.predict(val_pool)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    val_scores.append(val_rmse)

    # Negative test predictions are clipped to 0 before averaging; the
    # validation predictions above are scored unclipped.
    y_pred_test_total += np.maximum(model.predict(test_pool), 0) / kf.n_splits

print("Average CV RMSE:", np.mean(val_scores))

# --- Step 8: Submission ---
# One row per test record: the two identifier columns plus the fold-averaged
# prediction, written without the index column.
submission_columns = {
    'Item_Identifier': df_test['Item_Identifier'],
    'Outlet_Identifier': df_test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_pred_test_total,
}
submission_df = pd.DataFrame(submission_columns)

submission_df.to_csv('submission_catboost_final.csv', index=False)
print(submission_df.head())

Fold 1
0:	learn: 1682.4522348	test: 1614.1551215	best: 1614.1551215 (0)	total: 217ms	remaining: 10m 50s


In [None]:
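# # --- Step 7 (alternative, currently disabled): Hyperparameter Tuning with Optuna ---
# # NOTE: re-enabling this cell requires `import optuna`, which is not among the imports visible at the top of this notebook.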
# def objective(trial):
#     params = {
#         'iterations': trial.suggest_int('iterations', 1000, 4000),
#         'depth': trial.suggest_int('depth', 4, 10),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
#         'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
#         'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
#         'random_seed': 42,
#         'eval_metric': 'RMSE',
#         'early_stopping_rounds': 50,
#         'verbose': 0
#     }

#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     val_scores = []

#     for train_idx, val_idx in kf.split(X):
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         train_pool = Pool(X_train, y_train, cat_features=cat_features)
#         val_pool = Pool(X_val, y_val, cat_features=cat_features)

#         model = CatBoostRegressor(**params)
#         model.fit(train_pool, eval_set=val_pool, use_best_model=True)
#         y_val_pred = model.predict(val_pool)
#         val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
#         val_scores.append(val_rmse)

#     return np.mean(val_scores)

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20)  # adjust n_trials for more tuning

# print("Best params:", study.best_params)
# print("Best CV RMSE:", study.best_value)

# # --- Step 8: Train final model with best params ---
# best_params = study.best_params
# best_params.update({
#     'random_seed': 42,
#     'eval_metric': 'RMSE',
#     'early_stopping_rounds': 50,
#     'verbose': 100
# })

# final_model = CatBoostRegressor(**best_params)
# final_model.fit(Pool(X, y, cat_features=cat_features))

# # --- Step 9: Predict on test set ---
# test_pool = Pool(X_test_final, cat_features=cat_features)
# y_pred_test = np.maximum(final_model.predict(test_pool), 0)
