# All ML Pipeline Combined 

In [77]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# 1 : Exploratory Data Analysis

In [78]:
# 1. Load the train and test splits from disk
train = pd.read_csv("data/train_v9rqX0R.csv")
test = pd.read_csv("data/test_AbJTz2l.csv")
output_file = 'ABB_submission.csv'

# 2. Combine train and test for consistent preprocessing.
# Test rows receive a NaN target, which is what later separates them from train rows.
test['Item_Outlet_Sales'] = np.nan
# ignore_index=True gives every combined row a unique label. Without it the train and
# test labels both start at 0, and duplicate labels make index-aligned operations
# (fillna with a Series, .loc assignment with a Series) fragile.
data = pd.concat([train, test], sort=False, ignore_index=True)

In [79]:
# 3. Standardize Item_Fat_Content
# Collapse the abbreviated / differently-cased spellings onto the two canonical labels.
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(
    ['LF', 'low fat', 'reg'],
    ['Low Fat', 'Low Fat', 'Regular'],
)

In [80]:
# 4. Create Item_Type_Combined from the first character of Item_Identifier
# ('F' -> Food, 'D' -> Drinks, 'N' -> Non-Consumable, anything else -> Other).
# The vectorised .str/.map form builds the lookup once instead of once per row, and a
# missing or empty identifier falls through to 'Other' instead of raising inside a lambda.
data['Item_Type_Combined'] = (
    data['Item_Identifier']
    .str[0]
    .map({'F': 'Food', 'D': 'Drinks', 'N': 'Non-Consumable'})
    .fillna('Other')
)

# 2 : Feature Engineering

In [81]:
# 5. Non-consumable items have no meaningful fat content, so label them 'Non-Edible'
data['Item_Fat_Content'] = data['Item_Fat_Content'].mask(
    data['Item_Type_Combined'].eq('Non-Consumable'),
    'Non-Edible',
)


In [82]:
# 6. Impute Item_Weight: missing weights take the mean weight of their Item_Type_Combined group
data['Item_Weight'] = data['Item_Weight'].fillna(
    data['Item_Weight'].groupby(data['Item_Type_Combined']).transform('mean')
)

In [83]:
# 7. Zero visibility: record a 0/1 flag, blank the zeros out to NaN, then fill every
#    missing visibility with the median of its (Item_Type_Combined, Outlet_Type) group
data['Zero_Visibility_Flag'] = data['Item_Visibility'].eq(0).astype(int)
data['Item_Visibility'] = data['Item_Visibility'].mask(data['Item_Visibility'].eq(0))
data['Item_Visibility'] = data['Item_Visibility'].fillna(
    data.groupby(['Item_Type_Combined', 'Outlet_Type'])['Item_Visibility'].transform('median')
)

In [84]:
# 8. Impute Outlet_Size
def fill_outlet_size(row):
    """Return the most common Outlet_Size among comparable outlets.

    Comparable means rows of the global ``data`` frame that share the given row's
    ``Outlet_Type`` and ``Outlet_Location_Type``. When no such row has a known size,
    'Medium' is returned.
    """
    same_type = data['Outlet_Type'] == row['Outlet_Type']
    same_location = data['Outlet_Location_Type'] == row['Outlet_Location_Type']
    candidates = data.loc[same_type & same_location, 'Outlet_Size'].mode()
    if candidates.empty:
        return 'Medium'
    # mode() may return several tied values; take the first one
    return candidates.iloc[0]
# Keep known sizes as they are; only rows with a missing Outlet_Size go through fill_outlet_size
data['Outlet_Size'] = data.apply(
    lambda record: record['Outlet_Size']
    if not pd.isna(record['Outlet_Size'])
    else fill_outlet_size(record),
    axis=1,
)

In [85]:
# 9. Outlet age in years, counted back from 2013
data['Outlet_Years'] = data['Outlet_Establishment_Year'].rsub(2013)

# 10. log(1 + x) versions of price and visibility
data['Item_MRP_log'] = np.log1p(data['Item_MRP'])
data['Item_Visibility_log'] = np.log1p(data['Item_Visibility'])

# 11. Visibility relative to the mean visibility of the same item within the same
#     outlet type (the 1e-8 keeps the denominator away from zero)
data['Item_Visibility_MeanRatio'] = data['Item_Visibility'].div(
    data.groupby(['Item_Identifier', 'Outlet_Type'])['Item_Visibility']
    .transform('mean')
    .add(1e-8)
)

# 12. Pairwise interactions of price with outlet age, visibility and weight
data['MRP_Outlet_Years'] = data['Item_MRP'].mul(data['Outlet_Years'])
data['MRP_Visibility'] = data['Item_MRP'].mul(data['Item_Visibility'])
data['MRP_Weight'] = data['Item_MRP'].mul(data['Item_Weight'])

# 13. Item category = first two characters of the identifier; also remember which rows
#     carry a target (the train rows) and the mean target over those rows
data['Item_Category'] = data['Item_Identifier'].str.slice(stop=2)
train_mask = data['Item_Outlet_Sales'].notna()
overall_mean = data.loc[train_mask, 'Item_Outlet_Sales'].mean()


# 3 : Encoding Categorical Variable

In [86]:
# 14. Target encoding of Item_Category, smoothed towards the overall mean.
# smoothed = (count * mean + 10 * overall_mean) / (count + 10), computed on train rows only.
category_stats = (
    data.loc[train_mask]
    .groupby('Item_Category')['Item_Outlet_Sales']
    .agg(['mean', 'count'])
)
category_stats['smoothed'] = (
    category_stats['count'].mul(category_stats['mean']).add(10 * overall_mean)
    .div(category_stats['count'].add(10))
)
# Categories without a smoothed value fall back to the overall mean
data['Item_Category_TargetEncoded'] = (
    data['Item_Category']
    .map(category_stats['smoothed'])
    .fillna(overall_mean)
)

In [87]:
# 15. Ordinal encoding: rank categories by their mean sales (lowest mean -> 0)
sorted_means = category_stats['mean'].sort_values()
category_to_ordinal = dict(zip(sorted_means.index, range(len(sorted_means))))
# Categories missing from the mapping get the next rank after the largest known one
data['Item_Category_Ordinal'] = (
    data['Item_Category']
    .map(category_to_ordinal)
    .fillna(len(category_to_ordinal))
)

In [88]:
# 16. Mean sales encoding: each row gets the mean train-row sales of its
#     combined item type, its outlet, and its item category
data['Type_Combined_MeanSales'] = data['Item_Type_Combined'].map(
    data.loc[train_mask].groupby('Item_Type_Combined')['Item_Outlet_Sales'].mean()
)
data['Outlet_MeanSales'] = data['Outlet_Identifier'].map(
    data.loc[train_mask].groupby('Outlet_Identifier')['Item_Outlet_Sales'].mean()
)
data['Category_MeanSales'] = data['Item_Category'].map(
    data.loc[train_mask].groupby('Item_Category')['Item_Outlet_Sales'].mean()
)

In [89]:
# 17. Label encode the remaining categorical features (values are cast to str first)
le_cols = [
    'Item_Fat_Content',
    'Outlet_Location_Type',
    'Outlet_Size',
    'Outlet_Type',
    'Item_Type_Combined',
    'Outlet_Identifier',
]
for col in le_cols:
    # A fresh encoder per column: each column gets its own integer codes
    data[col] = LabelEncoder().fit_transform(
        data[col].astype(str)
    )

In [90]:
# 18. Split the combined frame back into train and test:
#     rows with a target are train, rows with a NaN target are test
has_target = data['Item_Outlet_Sales'].notna()
train_clean = data.loc[has_target].copy()
test_clean = data.loc[~has_target].copy()


# 4 : Feature Selection

In [91]:
# 19. Final feature list, after dropping the less relevant / unimportant features
features = [
    # item attributes
    'Item_Weight',
    'Item_Visibility',
    'Item_Fat_Content',
    'Item_Type_Combined',
    'Item_MRP',
    # outlet attributes
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Outlet_Years',
    # visibility-derived features
    'Item_Visibility_MeanRatio',
    'Zero_Visibility_Flag',
    # log transforms
    'Item_MRP_log',
    'Item_Visibility_log',
    # interactions with price
    'MRP_Outlet_Years',
    'MRP_Visibility',
    'MRP_Weight',
    # target-derived encodings
    'Type_Combined_MeanSales',
    'Outlet_MeanSales',
    'Category_MeanSales',
    'Item_Category_TargetEncoded',
    'Item_Category_Ordinal',
]

# 5 : Data Imputation and Scaling

In [92]:
# 20. Imputation using KNN
# Build the model matrices from the selected features, fit the imputer on the
# training rows only, and apply that same fitted imputer to the test rows.
X, y = train_clean[features], train_clean['Item_Outlet_Sales']
X_test = test_clean[features]

imputer = KNNImputer(n_neighbors=3)
imputer.fit(X)

# The imputer returns plain arrays, so re-attach the column names
X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)


In [93]:
# 21. Scale the data with a Min-Max scaler fitted on the training rows;
#     the test rows are transformed with the same fitted scaler
scaler = MinMaxScaler()
scaler.fit(X_imputed)
X_scaled = scaler.transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


# 6 : Splitting into training and validation data

In [94]:

# 22. Train-validation split: hold out 20% of the scaled training rows for validation
#     (fixed random_state so the split is reproducible)
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)


# 7 : ML Modelling with different regression models

In [95]:
# 23. Linear Regression baseline: fit on the training split, report RMSE on validation
lr = LinearRegression().fit(X_train, y_train)
lr_preds = lr.predict(X_val)
lr_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=lr_preds))
print(f'Linear Regression RMSE: {lr_rmse:.2f}')

Linear Regression RMSE: 1065.44


In [96]:
# 24. Ridge Regression baseline (default settings), scored by validation RMSE
ridge = Ridge().fit(X_train, y_train)
ridge_preds = ridge.predict(X_val)
ridge_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=ridge_preds))
print(f'Ridge Regression RMSE: {ridge_rmse:.2f}')

Ridge Regression RMSE: 1064.84


In [97]:
# 25. Lasso Regression baseline (default settings), scored by validation RMSE
lasso = Lasso().fit(X_train, y_train)
lasso_preds = lasso.predict(X_val)
lasso_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=lasso_preds))
print(f'Lasso Regression RMSE: {lasso_rmse:.2f}')


Lasso Regression RMSE: 1063.91


In [98]:
# 26. ElasticNet baseline (default settings), scored by validation RMSE
enet = ElasticNet().fit(X_train, y_train)
enet_preds = enet.predict(X_val)
enet_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=enet_preds))
print(f'ElasticNet RMSE: {enet_rmse:.2f}')


ElasticNet RMSE: 1441.71


In [99]:
# 27. Decision Tree baseline (seeded for reproducibility), scored by validation RMSE
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_preds = dt.predict(X_val)
dt_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=dt_preds))
print(f'Decision Tree RMSE: {dt_rmse:.2f}')

Decision Tree RMSE: 1501.42


In [100]:
# 28. Baseline Random Forest (untuned, seeded): train, then report validation RMSE
rf_base = RandomForestRegressor(random_state=42).fit(X_train, y_train)
base_preds = rf_base.predict(X_val)
base_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=base_preds))
print(f'Baseline RF RMSE: {base_rmse:.2f}')

Baseline RF RMSE: 1063.89


In [101]:
# 29. Gradient Boosting baseline (untuned, seeded), scored by validation RMSE
gbr = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
gbr_preds = gbr.predict(X_val)
gbr_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=gbr_preds))
print(f'Gradient Boosting RMSE: {gbr_rmse:.2f}')

Gradient Boosting RMSE: 1033.29


In [102]:
# 30. AdaBoost baseline (untuned, seeded), scored by validation RMSE
adb = AdaBoostRegressor(random_state=42).fit(X_train, y_train)
adb_preds = adb.predict(X_val)
adb_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=adb_preds))
print(f'AdaBoost RMSE: {adb_rmse:.2f}')

AdaBoost RMSE: 1297.63


In [103]:
# 31. K-Nearest-Neighbours regressor baseline (default settings), scored by validation RMSE
knn = KNeighborsRegressor().fit(X_train, y_train)
knn_preds = knn.predict(X_val)
knn_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=knn_preds))
print(f'KNN Regressor RMSE: {knn_rmse:.2f}')

KNN Regressor RMSE: 1120.96


In [104]:
# 32. Support Vector Regressor baseline (default settings), scored by validation RMSE
svr = SVR().fit(X_train, y_train)
svr_preds = svr.predict(X_val)
svr_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=svr_preds))
print(f'SVR RMSE: {svr_rmse:.2f}')

SVR RMSE: 1569.34


In [105]:
# 33. XGBoost baseline (untuned, seeded, library logging silenced), scored by validation RMSE
xgb = XGBRegressor(random_state=42, verbosity=0)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_val)
xgb_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=xgb_preds))
print(f'XGBoost RMSE: {xgb_rmse:.2f}')

XGBoost RMSE: 1121.04


### Based on the observations above, Random Forest and Gradient Boosting perform better than the rest of the models, so I'll take these two as the reference, tune their hyperparameters, and check the results

In [106]:
# 34. Hyperparameter tuning for Random Forest
# 2 x 2 x 2 x 2 = 16 candidate settings, each scored by 3-fold CV on (negated) RMSE.
grid_params = {
    'n_estimators': [300, 400],
    'max_depth': [6, 8],
    'min_samples_split': [10, 12],
    'min_samples_leaf': [2, 4],
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

grid = GridSearchCV(
    estimator=rf,
    param_grid=grid_params,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Score the fitted search object on the held-out validation split
val_preds = grid.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=val_preds))
print(f'Random Forest Validation RMSE: {val_rmse:.2f}')

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Random Forest Validation RMSE: 1020.94


In [107]:
# 35. Hyperparameter tuning for Gradient Boosting
# 2^5 = 32 candidate settings, each scored by 3-fold CV on (negated) RMSE.
grid_params = {
    'n_estimators': [300, 400],
    'max_depth': [6, 8],
    'min_samples_split': [10, 12],
    'min_samples_leaf': [2, 4],
    'learning_rate': [0.05, 0.1],
}

gbr = GradientBoostingRegressor(random_state=42)

grid = GridSearchCV(
    estimator=gbr,
    param_grid=grid_params,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Score the fitted search object on the held-out validation split
val_preds = grid.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=val_preds))
print(f'Gradient Boosting Validation RMSE: {val_rmse:.2f}')

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Gradient Boosting Validation RMSE: 1068.45


# After the initial submission I tried hyperparameter tuning; the most stable result came from 3-fold cross-validation over the grid_params combinations, where Random Forest gave the best validation RMSE of 1020.94

# 8 : Final Model Fitting

In [108]:
# 36. Final selection: Random Forest
# The grid holds a single candidate, so the search only re-validates this one
# configuration and exposes it through best_params_.
grid_params = {
    'n_estimators': [400],
    'max_depth': [6],
    'min_samples_split': [10],
    'min_samples_leaf': [2],
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

grid = GridSearchCV(
    estimator=rf,
    param_grid=grid_params,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Refit a fresh forest with the selected parameters on ALL scaled training rows
# (train + validation), then predict the test rows
rf_final = RandomForestRegressor(random_state=42, n_jobs=-1, **grid.best_params_)
rf_final.fit(X_scaled, y)
final_preds = rf_final.predict(X_test_scaled)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


# 9 : Final Submission

In [109]:
# 37. Final submission
# Identifiers come from the original (un-encoded) test frame; predictions are attached
# positionally, so the rows are in test order.
submission = (
    test[['Item_Identifier', 'Outlet_Identifier']]
    .reset_index(drop=True)
    .assign(Item_Outlet_Sales=final_preds)
)

submission.to_csv(output_file, index=False)