In [None]:
from dython.nominal import associations
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis,randint,uniform

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV,train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
# Load the training and test splits from CSV files in the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
# Preview the first five training rows.
df_train.head(5)

In [None]:
# Print the (rows, columns) shape of the training and test frames.
print(df_train.shape)
print(df_test.shape)

In [None]:
# Column dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Summary statistics of the numerical columns of the training data.
df_train.describe()

In [None]:
# Summary of the object/category columns of the training data.
df_train.select_dtypes(include=['object', 'category']).describe()

In [None]:
# Turn the establishment year into the outlet's age in years.
# The column keeps its original name, so from this cell onwards
# 'Outlet_Establishment_Year' holds an age rather than a calendar year.
# NOTE: running this cell a second time applies the subtraction again
# and restores the original years.
REFERENCE_YEAR = 2025
for frame in (df_train, df_test):
    frame['Outlet_Establishment_Year'] = REFERENCE_YEAR - frame['Outlet_Establishment_Year']
df_train.describe()

In [None]:
# Function that calculates the percentage of missing values
def calc_percent_NAs(df):
    """Return the share of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column with at least one missing value, indexed by
        column name and sorted by descending missing count. The single
        column ``'percent'`` holds the missing *fraction* (0-1 scale, i.e.
        missing count divided by ``len(df)``), not a value out of 100.
    """
    missing_counts = df.isnull().sum().sort_values(ascending=False)
    missing_share = (missing_counts / len(df)).to_frame('percent')
    return missing_share.loc[lambda table: table['percent'] > 0]
# Report the missing-value share of every incomplete column, for both splits.
print('Training data missing data percentage \n')
print(calc_percent_NAs(df_train))
print('\nTest data missing data percentage \n')
print(calc_percent_NAs(df_test))

In [None]:
#Function to analyse Numerical Column

def analyze_numerical_columns(df):
    """Print shape statistics and plot every numeric column of ``df``.

    For each numeric column the skewness and kurtosis of its non-null
    values are printed, followed by a 30-bin histogram with a KDE overlay
    and a boxplot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are analysed.

    Returns
    -------
    None
        Everything is printed or plotted.
    """
    numeric_names = df.select_dtypes(include=['number']).columns

    print(f"\n Numerical columns found: {list(numeric_names)}")

    for name in numeric_names:
        observed = df[name].dropna()

        print(f"\n Analysis for: {name}")
        print("-" * 40)
        print(f"Skewness: {skew(observed):.2f}")
        print(f"Kurtosis: {kurtosis(observed):.2f}")

        # Histogram with KDE overlay
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(observed, kde=True, bins=30, ax=ax)
        ax.set_title(f'Distribution of {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Frequency')
        ax.grid(True)
        fig.tight_layout()
        plt.show()

        # Boxplot of the same column
        fig, ax = plt.subplots(figsize=(8, 3))
        sns.boxplot(x=df[name], ax=ax)
        ax.set_title(f'Boxplot of {name}')
        fig.tight_layout()
        plt.show()

In [None]:
# Run the numerical analysis (skewness, kurtosis, histogram, boxplot) on the training data.
analyze_numerical_columns(df_train)

In [None]:
##Function to analyse the categorical column
def analyze_categorical_columns(df, target='Item_Outlet_Sales', exclude=('Item_Identifier',)):
    """Print value counts and plot every categorical column of ``df``.

    For each object/category column (except the excluded ones) this prints
    the value counts (missing values included), shows a count plot and,
    when the target column is available, a bar plot of the mean target per
    category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose categorical columns are analysed.
    target : str or None, default 'Item_Outlet_Sales'
        Numeric column averaged per category. When it is ``None`` or not a
        column of ``df`` (for example on the test data) the target plot is
        skipped instead of raising.
    exclude : str or iterable of str, default ('Item_Identifier',)
        Categorical columns to skip, typically identifiers whose number of
        unique values is too high to plot.

    Returns
    -------
    None
        Everything is printed or plotted.
    """
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    print(f"\n Categorical columns: {list(categorical_cols)}")

    # Accept a single column name as well as a collection of names.
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude)
    plotted_cols = [col for col in categorical_cols if col not in skipped]

    has_target = target is not None and target in df.columns

    for cat in plotted_cols:
        print(f"\n Categorical Column: {cat}")
        print(df[cat].value_counts(dropna=False))

        # Number of records per category
        plt.figure(figsize=(8, 4))
        sns.countplot(x=cat, data=df)
        plt.title(f'Count of records by {cat}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        if not has_target:
            continue

        # Mean of the target per category
        plt.figure(figsize=(8, 4))
        sns.barplot(x=cat, y=target, data=df, estimator='mean')
        plt.title(f"Average {target.replace('_', ' ')} by {cat}")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()



In [None]:
# Run the categorical analysis (value counts, count plot, mean sales per category) on the training data.
analyze_categorical_columns(df_train)

Imputation of Missing Values

1. Outlet size: Outlet size is related to outlet information

In [None]:
# Most frequent Outlet_Size for every (Outlet_Type, Outlet_Location_Type) pair.
def _most_common_size(sizes):
    """Return the first mode of ``sizes``, or None when it has no mode."""
    modes = sizes.mode()
    return None if modes.empty else modes.iloc[0]


mode_per_group = (
    df_train
    .groupby(['Outlet_Type', 'Outlet_Location_Type'])['Outlet_Size']
    .agg(_most_common_size)
    .reset_index()
    .rename(columns={'Outlet_Size': 'Mode_Outlet_Size'})
)

print(mode_per_group)
# Insight: if the outlet is a Grocery Store then its size is Small

In [None]:
# Impute a missing Outlet_Size as 'Small' when the outlet is a Grocery Store (train and test).
for frame in (df_train, df_test):
    grocery_without_size = frame['Outlet_Size'].isna() & (frame['Outlet_Type'] == 'Grocery Store')
    frame.loc[grocery_without_size, 'Outlet_Size'] = 'Small'

In [None]:
# For every other outlet type, fill a missing Outlet_Size with the mode of its
# (Outlet_Type, Outlet_Location_Type) group, computed on the training data.
group_keys = ['Outlet_Type', 'Outlet_Location_Type']


def _first_mode_or_nan(sizes):
    """Return the first mode of ``sizes``, or NaN when it has no mode."""
    modes = sizes.mode()
    return np.nan if modes.empty else modes.iloc[0]


# Step 1: mode of Outlet_Size per group, Grocery Store rows excluded
non_grocery_train = df_train[df_train.Outlet_Type != 'Grocery Store']
mode_df = (
    non_grocery_train
    .groupby(group_keys)['Outlet_Size']
    .agg(_first_mode_or_nan)
    .reset_index()
    .rename(columns={'Outlet_Size': 'Outlet_Size_mode'})
)


# Step 2: attach the group mode, fill the gaps, then drop the helper column
def _fill_outlet_size(frame):
    """Return ``frame`` with missing non-grocery Outlet_Size values filled from ``mode_df``."""
    merged = frame.merge(mode_df, on=group_keys, how='left')
    needs_fill = merged['Outlet_Size'].isna() & (merged['Outlet_Type'] != 'Grocery Store')
    merged['Outlet_Size'] = np.where(needs_fill, merged['Outlet_Size_mode'], merged['Outlet_Size'])
    return merged.drop('Outlet_Size_mode', axis=1)


df_train = _fill_outlet_size(df_train)
df_test = _fill_outlet_size(df_test)

2. Item Weight: Item weight must be related to Item Identifier

In [None]:
# Check the relation between Item_Identifier and Item_Weight: does any
# identifier carry more than one distinct (non-missing) weight?
df_new = df_train.dropna(subset=['Item_Weight'])
unique_weights = df_new.groupby('Item_Identifier')['Item_Weight'].nunique()

# Keep only the identifiers with more than one distinct weight
inconsistent_weights = unique_weights.loc[unique_weights.gt(1)]

print(inconsistent_weights)

# Insight: each item identifier maps to a single item weight

In [None]:
# Fill a missing Item_Weight with the known weight of the same Item_Identifier.
# The lookup is built from the training rows only and applied to both splits.
known_weights = df_train.dropna(subset=['Item_Weight'])
weight_map = known_weights.groupby('Item_Identifier')['Item_Weight'].mean()

for frame in (df_train, df_test):
    frame['Item_Weight'] = frame['Item_Weight'].fillna(frame['Item_Identifier'].map(weight_map))

In [None]:
# Columns of the training data that still contain missing values at this point.
calc_percent_NAs(df_train)

In [None]:
# A few identifiers have no known weight at all, so their Item_Weight is still missing.
# Fill those with the mean Item_Weight of the same Item_Type, computed on the training data.
# NOTE(review): this step was previously described as a median; the code computes the mean.
weight_map1 = df_train[df_train.Item_Weight.notna()].groupby('Item_Type')['Item_Weight'].mean()
df_train['Item_Weight'] = df_train['Item_Weight'].fillna(df_train['Item_Type'].map(weight_map1))
df_test['Item_Weight'] = df_test['Item_Weight'].fillna(df_test['Item_Type'].map(weight_map1))

In [None]:

# Columns of the training data that still contain missing values at this point.
calc_percent_NAs(df_train)

Feature Engineering

In [None]:
# Item_Visibility_MeanRatio: a row's visibility divided by the mean visibility
# of the same Item_Identifier in the training data.
item_visibility_avg = df_train.groupby('Item_Identifier')['Item_Visibility'].mean()

for frame in (df_train, df_test):
    per_row_avg = frame['Item_Identifier'].map(item_visibility_avg)
    frame['Item_Visibility_MeanRatio'] = frame['Item_Visibility'] / per_row_avg

In [None]:
# Normalise the alternative spellings of Item_Fat_Content to 'Regular' / 'Low Fat'.
FAT_CONTENT_ALIASES = {'reg': 'Regular', 'LF': 'Low Fat', 'low fat': 'Low Fat'}

for frame in (df_train, df_test):
    frame['Item_Fat_Content'] = frame['Item_Fat_Content'].replace(FAT_CONTENT_ALIASES)

In [None]:
# Association heatmap over all columns of df_train, drawn on `ax`:
# Spearman for numeric pairs, correlation ratio for nominal-numeric pairs,
# Cramér's V for nominal pairs.
fig,ax =plt.subplots(figsize = (8,8))
heatmap = associations(df_train,num_num_assoc='spearman',nom_num_assoc='correlation_ratio',nom_nom_assoc='cramer',ax=ax,cmap='viridis')

# Restyle the colorbar of the heatmap: smaller tick labels, an axis label and
# a fixed aspect for the last axes of the figure.
cbar=ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=8)
cbar.ax.set_ylabel("Correlation")
cbar.ax.figure.axes[-1].set_aspect(20)
# NOTE(review): this call requests another colorbar on the same figure — confirm it is intended.
cbar.ax.figure.colorbar(cbar.ax.collections[0],shrink=0.5)

plt.show()

In [None]:
# Names of the object-dtype (categorical) columns that still need encoding.
# Deliberately not stored under the name `object`, which would shadow the
# Python builtin for the rest of the kernel session.
categorical_cols = df_train.select_dtypes(include='object').columns
categorical_cols

In [None]:
# Outlet_Size is ordinal and Item_Fat_Content has only two categories, so both
# are mapped to integers directly.
FAT_CONTENT_CODES = {'Regular': 0, 'Low Fat': 1}
OUTLET_SIZE_RANKS = {'Small': 1, 'Medium': 2, 'High': 3}

for frame in (df_train, df_test):
    frame['Item_Fat_Content'] = frame['Item_Fat_Content'].map(FAT_CONTENT_CODES)

for frame in (df_train, df_test):
    # astype(int) raises when an Outlet_Size is missing or not in the mapping
    frame['Outlet_Size'] = frame['Outlet_Size'].map(OUTLET_SIZE_RANKS).astype(int)

In [None]:
# Snapshot both frames before Item_Identifier is label-encoded and the nominal
# columns are one-hot encoded; df_train1 / df_test1 keep those columns as they are now.
df_train1 = df_train.copy()
df_test1 = df_test.copy()

In [None]:
# Integer-encode Item_Identifier (too many unique values for one-hot encoding).
# The vocabulary is learned from the training data only.
encoder = LabelEncoder()
df_train['Item_Identifier'] = encoder.fit_transform(df_train['Item_Identifier'])

# Encode the test identifiers against the training vocabulary. LabelEncoder's
# transform raises ValueError on a label it has not seen during fit, so the
# codes are looked up through a Categorical instead: known identifiers get the
# same code as in training, identifiers absent from the training data get -1.
test_codes = pd.Categorical(df_test['Item_Identifier'], categories=encoder.classes_).codes
df_test['Item_Identifier'] = test_codes.astype('int64')

In [None]:
# One-hot encode the remaining nominal columns.
features_label = ['Item_Type','Outlet_Location_Type','Outlet_Identifier','Outlet_Type']
df_train = pd.get_dummies(df_train, columns=features_label, drop_first=True)

# The test frame is encoded with every level kept and then aligned to the
# training feature columns. This guarantees the same columns in the same order
# as the models are fitted on, even when a category is missing from one split:
# levels unseen in training (and the training baseline level) are dropped,
# levels absent from the test data become all-zero columns.
feature_columns = [col for col in df_train.columns if col != 'Item_Outlet_Sales']
df_test = (
    pd.get_dummies(df_test, columns=features_label)
    .reindex(columns=feature_columns, fill_value=0)
)

In [None]:
# Feature matrix (every column except the target) and target vector.
X = df_train.drop('Item_Outlet_Sales', axis=1)
y = df_train['Item_Outlet_Sales']

In [None]:
# Split into a training set and a hold-out set (80% / 20%).
# The seed is fixed so that the split, and every score computed on it,
# is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline comparison of untuned regressors on the hold-out split.
# Every model is seeded so the comparison is reproducible.
models = [
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    XGBRegressor(random_state=42),
    LGBMRegressor(random_state=42),
]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # The square root of the MSE is reported, so the value is labelled RMSE.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'{type(model).__name__} RMSE: {rmse:.4f}')

In [None]:
# Random Forest baseline: default hyperparameters, fixed seed
RF_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred1 = RF_model.predict(X_test)

# Hold-out metrics: MSE, its square root (RMSE) and R²
mse = mean_squared_error(y_test, y_pred1)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred1)

for label, value in (("MSE", mse), ("RMSE", rmse), ("R²", r2)):
    print(f"{label}: {value:.4f}")

In [None]:
# Feature importances of the baseline Random Forest, most important first
importances = RF_model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart; the y axis is inverted so the top feature is drawn first
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel("Feature Importance (MSE Reduction)")
ax.set_title("Random Forest Regressor Feature Importances")
ax.invert_yaxis()
plt.show()


Hyperparameter tuning of Random Forest using RandomizedSearchCV

In [None]:
# Base estimator for the search, seeded for repeatable fits
rf = RandomForestRegressor(random_state=42)

# Hyperparameter search space
# (randint(400, 500) draws integers from 400 up to, but not including, 500)
param_dist = {
    'n_estimators': randint(400, 500),
    'max_depth': [None, 10, 20],
    'min_samples_split': [5, 10,20],
    'min_samples_leaf': [2, 4],
    'max_features': ['sqrt', 'log2',None]
}

# Randomized search: 20 sampled candidates, each scored by 5-fold
# cross-validation on negative RMSE, using all available cores
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit on the training split only; the hold-out split is not used here
random_search.fit(X_train, y_train)

In [None]:
# Best estimator found by the randomized search and its hyperparameters
best_model = random_search.best_estimator_
print("Best parameters:", random_search.best_params_)

# Evaluate the tuned model on the hold-out split (X_test / y_test)
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Validation RMSE:", rmse)

In [None]:
# Hold-out predictions of the tuned Random Forest and their RMSE
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Validation RMSE:", rmse)

# Actual vs predicted scatter; the dashed red diagonal marks a perfect prediction
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color='dodgerblue', alpha=0.6, ax=ax)
value_range = [y_test.min(), y_test.max()]
ax.plot(value_range, value_range, 'r--')
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title(f"Actual vs Predicted (R² = {r2_score(y_test, y_pred):.2f})")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# LightGBM baseline: default hyperparameters, fixed seed
model = LGBMRegressor(random_state=42)
LGB_model = model.fit(X_train, y_train)
y_pred1 = LGB_model.predict(X_test)

# Hold-out metrics: MSE, its square root (RMSE) and R²
mse = mean_squared_error(y_test, y_pred1)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred1)

for label, value in (("MSE", mse), ("RMSE", rmse), ("R²", r2)):
    print(f"{label}: {value:.4f}")

Hyperparameter tuning of LGBM using RandomizedSearchCV

In [None]:
# Search space for LightGBM.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to, but not including, high.
param_dist = {
    'num_leaves': randint(20, 150),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.05),  # 0.01 to 0.06
    'n_estimators': randint(300, 500),
    'min_child_samples': randint(10, 50),
    'subsample': uniform(0.6, 0.4),  # 0.6 to 1.0
    'colsample_bytree': uniform(0.6, 0.4),  # 0.6 to 1.0
    'reg_alpha': uniform(0.0, 1.0),
    'reg_lambda': uniform(0.0, 1.0)
}

# subsample_freq=1 switches row bagging on (re-sampled at every iteration).
# With LightGBM's default subsample_freq=0 bagging is disabled, so the sampled
# 'subsample' values would be ignored and that search dimension wasted.
model = LGBMRegressor(random_state=42, subsample_freq=1)

random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings sampled
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit on the training split only; the hold-out split is used below for evaluation
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)

best_model1 = random_search.best_estimator_

# Predict and evaluate on the hold-out split
y_pred = best_model1.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²:", r2_score(y_test, y_pred))

# Actual vs predicted scatter; the dashed red diagonal marks a perfect prediction
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, color='dodgerblue', alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title(f"Actual vs Predicted (R² = {r2_score(y_test, y_pred):.2f})")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Submission 1: predictions of the tuned Random Forest
y_prediction = best_model.predict(df_test)
df_pred = pd.DataFrame({'Item_Outlet_Sales': y_prediction})

# The identifier columns are re-read from the raw file because df_test holds
# their encoded versions by now
df_test2 = pd.read_csv('test.csv')[['Item_Identifier', 'Outlet_Identifier']]

df_combined = pd.concat([df_test2, df_pred], axis=1)
df_combined.to_csv('Submission1.csv', index=False)

In [None]:
# Submission 2: predictions of the tuned LightGBM model.
# A gradient-boosted regressor can return negative values, which are not valid
# sales figures, so the predictions are floored at zero before being written.
y_prediction = np.clip(best_model1.predict(df_test), 0, None)
df_pred = pd.DataFrame(y_prediction,columns=['Item_Outlet_Sales'])

# The identifier columns are re-read from the raw file because df_test holds
# their encoded versions by now
df_test2 = pd.read_csv('test.csv')
df_test2 = df_test2[['Item_Identifier','Outlet_Identifier']]
df_combined = pd.concat([df_test2, df_pred], axis=1)
df_combined.to_csv('Submission2.csv', index=False)

In [None]:
# Submission 3: equal-weight blend of the tuned Random Forest and LightGBM models
y_prediction1 = best_model.predict(df_test)
y_prediction2 = best_model1.predict(df_test)
df_pred = pd.DataFrame({
    'Item_Outlet_Sales_RF': y_prediction1,
    'Item_Outlet_Sales_LGB': y_prediction2
})

# Where LightGBM predicts a negative value, use the Random Forest prediction instead
lgb_sales = df_pred['Item_Outlet_Sales_LGB']
df_pred['Item_Outlet_Sales_LGB'] = lgb_sales.mask(lgb_sales < 0, df_pred['Item_Outlet_Sales_RF'])
df_pred['Item_Outlet_Sales'] = 0.5*df_pred['Item_Outlet_Sales_LGB'] + 0.5*df_pred['Item_Outlet_Sales_RF']

# Attach the raw identifiers and keep only the submission columns
df_test2 = pd.read_csv('test.csv')
df_combined = pd.concat([df_test2, df_pred], axis=1)[
    ['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']
]
df_combined.to_csv('Submission3.csv', index=False)

Implement CatBoost

In [None]:
# Columns passed to CatBoost as categorical features
features_label = ['Item_Type','Item_Identifier','Outlet_Location_Type','Outlet_Identifier','Outlet_Type']

# CatBoost is trained on the snapshot taken before label / one-hot encoding.
# NOTE: this rebinds X, y and the train/hold-out split used by the earlier models.
X = df_train1.drop('Item_Outlet_Sales', axis=1)
y = df_train1['Item_Outlet_Sales']
# Seeded so the split, and the score computed on it, is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# CatBoost regressor on the raw categorical columns; seeded for reproducible fits
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=10,
    loss_function='RMSE',
    cat_features=list(features_label),
    nan_mode='Min',
    random_seed=42,
)

cat_model.fit(X_train, y_train, logging_level='Silent')

# Hold-out score: the square root of the MSE is reported, so it is labelled RMSE
y_pred = cat_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'CatBoostRegressor RMSE: {rmse:.4f}')

In [None]:
# Submission 4: predictions of the CatBoost model on the un-encoded test snapshot
y_prediction = cat_model.predict(df_test1)
df_pred = pd.DataFrame({'Item_Outlet_Sales': y_prediction})

# Identifier columns taken from the raw test file
df_test2 = pd.read_csv('test.csv')[['Item_Identifier', 'Outlet_Identifier']]

df_combined = pd.concat([df_test2, df_pred], axis=1)
df_combined.to_csv('Submission4.csv', index=False)