In [75]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [76]:
train = pd.read_csv("data/train_v9rqX0R.csv", low_memory=False)
test = pd.read_csv("data/test_AbJTz2l.csv", low_memory=False)
test['Item_Outlet_Sales'] = np.nan

In [77]:

data = pd.concat([train, test], ignore_index=True)

In [78]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({
    'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'
})

In [79]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [80]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                11765 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                10188 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.3+ MB


In [81]:
data.isna().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
dtype: int64

In [82]:
# Fill missing Item_Weight with mean
data['Item_Weight'].fillna(data['Item_Weight'].median(), inplace=True)
data['Item_Weight'].isna().sum()


0

In [83]:
data['Price_per_kg'] = data['Item_MRP'] / data['Item_Weight']

In [84]:
data['Outlet_Size'].unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [85]:
data['Outlet_Type'].unique()

array(['Supermarket Type1', 'Supermarket Type2', 'Grocery Store',
       'Supermarket Type3'], dtype=object)

In [86]:
# Fill missing Outlet_Size using mode per Outlet_Type
data['Outlet_Size'].fillna(data.groupby('Outlet_Type')['Outlet_Size'].transform(lambda x: x.mode()[0]), inplace=True)
data['Outlet_Size'].isna().sum()

0

In [87]:
# Create New Feature: Outlet_Years
data['Outlet_Years'] = 2025 - data['Outlet_Establishment_Year']

In [88]:
data['Item_Type'].unique()

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [89]:
## make groups for item type
perishables = ['Dairy', 'Fruits and Vegetables', 'Meat', 'Frozen Foods', 'Seafood', 'Breads', 'Breakfast']
non_perishables = ['Canned', 'Baking Goods', 'Starchy Foods', 'Household', 'Health and Hygiene']
consumables = ['Snack Foods', 'Soft Drinks', 'Hard Drinks']
other = ['Others']

def group_item_type(x):
    if x in perishables:
        return 'Perishable'
    elif x in non_perishables:
        return 'Non-Perishable'
    elif x in consumables:
        return 'Consumables'
    else:
        return 'Others'

data['Item_Category_Grouped'] = data['Item_Type'].apply(group_item_type)



In [90]:
data['Item_Category_Grouped'].unique()

array(['Perishable', 'Consumables', 'Non-Perishable', 'Others'],
      dtype=object)

In [91]:
data['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular'], dtype=object)

In [92]:
data['Outlet_Location_Type'].unique()

array(['Tier 1', 'Tier 3', 'Tier 2'], dtype=object)

In [93]:

# Create New Feature: Item_Visibility_MeanRatio
data['Item_Visibility_MeanRatio'] = data['Item_Visibility'] / data.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
data['Item_Visibility_MeanRatio'].replace([np.inf, -np.inf], 0, inplace=True)
data['Item_Visibility_MeanRatio'].fillna(0, inplace=True)

In [94]:
# Encode Categorical Variables
le = LabelEncoder()
categorical_cols = ['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Size', 'Outlet_Type', 'Item_Category_Grouped']

# for col in categorical_cols:
#     data[col] = le.fit_transform(data[col])

In [95]:
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Price_per_kg,Outlet_Years,Item_Category_Grouped,Item_Visibility_MeanRatio
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,26.861204,26,Perishable,0.931078
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,8.153581,16,Consumables,0.933420
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,8.092457,26,Perishable,0.960069
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,Small,Tier 3,Grocery Store,732.3800,9.484115,27,Perishable,0.000000
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,6.031512,38,Non-Perishable,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,13.458610,28,Consumables,0.874729
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,22.255895,16,Non-Perishable,0.878292
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,Small,Tier 2,Supermarket Type1,,11.874400,23,Non-Perishable,1.162245
14202,FDJ26,15.30,Regular,0.000000,Canned,214.6218,OUT017,2007,Small,Tier 2,Supermarket Type1,,14.027569,18,Non-Perishable,0.000000


In [96]:
ID_columns = ['Item_Identifier', 'Outlet_Identifier']

In [97]:
train_df = data[~data['Item_Outlet_Sales'].isna()]
### outlet specific metrics

# outlet_avg_sales = train_df.groupby('Outlet_Identifier')['Item_Outlet_Sales'].mean().to_dict()
# data['Outlet_Avg_Sales'] = data['Outlet_Identifier'].map(outlet_avg_sales)

# train_df = data[~data['Item_Outlet_Sales'].isna()]
test_df = data[data['Item_Outlet_Sales'].isna()]

In [98]:
# Final list of features
features = [
    'Item_Weight', 'Item_Visibility', 'Item_MRP',
    'Outlet_Years',
    'Item_Fat_Content', 'Outlet_Location_Type',
    'Outlet_Size', 'Outlet_Type', 'Item_Category_Grouped',
    # 'Outlet_Avg_Sales'
]

# features = [
#     'Item_Weight', 'Item_Visibility', 'Item_MRP',
#     'Outlet_Years',
#     'Item_Fat_Content', 'Outlet_Location_Type',
#     'Outlet_Size', 'Outlet_Type', 'Item_Type']

In [237]:
#### now further split train into train and validation

X = train_df[features]
y = train_df['Item_Outlet_Sales']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [238]:

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "ElasticNet": ElasticNet(alpha=0.01, l1_ratio=0.5),
    
    "Decision Tree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "Extra Trees": ExtraTreesRegressor(n_estimators=100, random_state=42),
    
    "AdaBoost": AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42),
    
    "K-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5),
    "Support Vector Regressor": SVR(kernel='rbf', C=1.0, epsilon=0.2)
}

In [239]:

# metrics = []
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     train_preds = model.predict(X_train)
#     val_preds = model.predict(X_val)
    
#     rmse_train = np.sqrt(mean_squared_error(y_train, train_preds))
#     r2_train = r2_score(y_train, train_preds)
#     rmse_val = np.sqrt(mean_squared_error(y_val, val_preds))
#     r2_val = r2_score(y_val, val_preds)
    
#     metrics.append({
#         'Model': name,
#         'RMSE_Train': round(rmse_train, 2),
#         'R2_Train': round(r2_train, 2),
#         'RMSE_val': round(rmse_val, 2),
#         'R2_val': round(r2_val, 2)
#     })

# # Convert to DataFrame
# metrics_df = pd.DataFrame(metrics)

In [240]:
# metrics_df

In [241]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import RandomizedSearchCV

# # Define parameter grid for RandomizedSearchCV
# param_dist = {
#     'n_estimators': [100, 200, 300, 500],
#     'max_depth': [None, 5, 10, 15, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# # Instantiate the base model
# rf = RandomForestRegressor(random_state=42)

# # RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=rf,
#     param_distributions=param_dist,
#     n_iter=30,  # number of combinations to try
#     cv=5,
#     scoring='neg_root_mean_squared_error',
#     n_jobs=-1,
#     verbose=2,
#     random_state=42
# )

# # Fit on training set

# # Combine X_train and X_val
# X_full_train = pd.concat([X_train, X_val], axis=0)
# y_full_train = pd.concat([y_train, y_val], axis=0)

# random_search.fit(X_full_train, y_full_train)

# # Best model and params
# best_rf = random_search.best_estimator_
# print("\nBest Parameters found:")
# print(random_search.best_params_)

# # Evaluate on validation set
# train_preds = best_rf.predict(X_train)
# val_preds = best_rf.predict(X_val)

# rmse_train = np.sqrt(mean_squared_error(y_train, train_preds))
# r2_train = r2_score(y_train, train_preds)
# rmse_val = np.sqrt(mean_squared_error(y_val, val_preds))
# r2_val = r2_score(y_val, val_preds)

# print(f"\n Tuned Random Forest Performance:")
# print(f"RMSE Train: {rmse_train:.2f}, R2 Train: {r2_train:.2f}")
# print(f"RMSE Validation: {rmse_val:.2f}, R2 Validation: {r2_val:.2f}")


In [242]:
# # Predict on test set using the best tuned model
# final_test_preds = best_rf.predict(test_df[features])

# # Attach predictions to test set
# test_df['Item_Outlet_Sales'] = final_test_preds

# # Create submission file
# submission = test_df[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
# submission.to_csv("BigMart_Prediction_Submission.csv", index=False)


In [243]:
# from xgboost import XGBRegressor
# from sklearn.model_selection import RandomizedSearchCV

# param_grid_xgb = {
#     'n_estimators': [100, 200, 300, 500],
#     'max_depth': [3, 5, 7, 10],
#     'learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0],
#     'reg_alpha': [0, 0.01, 0.1, 1],
#     'reg_lambda': [1, 1.5, 2, 3]
# }

# xgb = XGBRegressor(random_state=42)

# random_search_xgb = RandomizedSearchCV(
#     estimator=xgb,
#     param_distributions=param_grid_xgb,
#     n_iter=30,
#     cv=5,
#     scoring='neg_root_mean_squared_error',
#     verbose=2,
#     n_jobs=-1,
#     random_state=42
# )

# random_search_xgb.fit(X_full_train, y_full_train)

# best_xgb = random_search_xgb.best_estimator_
# print("\nBest Parameters (XGBoost):")
# print(random_search_xgb.best_params_)


In [244]:
# # Predict on the final test set using tuned XGBoost model
# test_preds = best_xgb.predict(test_df[features])

# # Prepare submission DataFrame
# submission = test[['Item_Identifier', 'Outlet_Identifier']].copy()
# submission['Item_Outlet_Sales'] = test_preds


# submission['Item_Outlet_Sales'].clip(lower=0, inplace=True)
# # Save to CSV
# submission.to_csv('submission_xgboost.csv', index=False)

# print("✅ Submission file 'submission_xgboost.csv' saved.")


In [245]:
### ANN

In [246]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras import backend as K
import tensorflow as tf

tf.keras.utils.set_random_seed(42)

In [247]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, QuantileTransformer, FunctionTransformer
from sklearn.pipeline import Pipeline

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
        ('num', RobustScaler(), [col for col in X_train.columns if col not in categorical_cols])
    ]
)


In [248]:
X_test  = test_df[features]
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)
# X_test_scaled = scaler.transform(X_test)


# Fit on training data
X_train_encoded = preprocessor.fit_transform(X_train)
X_val_encoded = preprocessor.transform(X_val)
X_test_encoded = preprocessor.transform(X_test)



# X_train_scaled = scaler.fit_transform(X_full_train)
# # X_val_scaled = scaler.transform(X_val)
# # X_test_scaled = scaler.transform(X_test)



In [259]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau



# early_stop = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.8,        # reduce by half
    patience=50,        # if val_loss doesn’t improve for 5 epochs
    min_lr=1e-6,
    verbose=1
)

def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

model = Sequential([
    Dense(64, input_dim=X_train_encoded.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)  # Output layer for regression
])

model.compile(optimizer=Adam(learning_rate=0.001),    loss=rmse,     
    metrics=[rmse])


In [260]:
# early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    X_train_encoded, y_train,
    validation_data=(X_val_encoded, y_val),
    epochs=250,
    batch_size=64,
    callbacks=[reduce_lr],
    verbose=1, 
)


Epoch 1/250
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 2767.2991 - rmse: 2767.3286 - val_loss: 2639.2170 - val_rmse: 2640.1516 - learning_rate: 0.0010
Epoch 2/250
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2648.6875 - rmse: 2648.6685 - val_loss: 2128.3611 - val_rmse: 2130.2087 - learning_rate: 0.0010
Epoch 3/250
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1783.2035 - rmse: 1783.1779 - val_loss: 1181.5394 - val_rmse: 1179.3330 - learning_rate: 0.0010
Epoch 4/250
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1288.7994 - rmse: 1288.7979 - val_loss: 1141.5710 - val_rmse: 1139.2694 - learning_rate: 0.0010
Epoch 5/250
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1214.9454 - rmse: 1214.9390 - val_loss: 1067.9230 - val_rmse: 1065.9745 - learning_rate: 0.0010
Epoch 6/250
[1m107/107[0m [32m━━━━━━━━━━━━

In [256]:
# # early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# history = model.fit(
#     X_train_scaled, y_train,
#     validation_data=(X_val_scaled, y_val),
#     epochs=200,
#     batch_size=64,
#     callbacks=[reduce_lr],
#     verbose=1
# )


In [257]:
test_preds_ann = model.predict(X_test_encoded).flatten()
submission_ann = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': test_preds_ann
})
submission_ann.to_csv("submission_ann.csv", index=False)


[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
