In [73]:
# Import necessary libraries
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [75]:
# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

In [77]:
# Load the training and test datasets
big_mart_df_train = pd.read_csv("C:/Users/DELL/Downloads/train_v9rqX0R(2).csv")
big_mart_df_test = pd.read_csv("C:/Users/DELL/Downloads/test_AbJTz2l(3).csv")

In [79]:
# Combine both training and test datasets for preprocessing
big_mart_df = pd.concat([big_mart_df_train, big_mart_df_test], axis=0)

In [81]:
# Display first few rows of the dataset for inspection
print("First few rows of the dataset:")
big_mart_df.head()

First few rows of the dataset:


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


Data Cleaning and Preprocessing

In [83]:
# Structure of data
print(big_mart_df_train.shape, big_mart_df_test.shape, big_mart_df.shape)

(8523, 12) (5681, 11) (14204, 12)


In [85]:
#Information of variables to check their data types.
big_mart_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14204 entries, 0 to 5680
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                11765 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                10188 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.4+ MB


In [87]:
#calculate the percentage of missing values in uber_df.
round(big_mart_df.isna().sum()/len(big_mart_df)*100,2).sort_values()

Item_Identifier               0.00
Item_Fat_Content              0.00
Item_Visibility               0.00
Item_Type                     0.00
Item_MRP                      0.00
Outlet_Identifier             0.00
Outlet_Establishment_Year     0.00
Outlet_Location_Type          0.00
Outlet_Type                   0.00
Item_Weight                  17.17
Outlet_Size                  28.27
Item_Outlet_Sales            40.00
dtype: float64

In [7]:
# Convert all string columns (object type) to uppercase
big_mart_df = big_mart_df.apply(lambda x: x.str.upper() if x.dtype == 'O' else x)

In [8]:
# Correct values in 'Item_Fat_Content' column: standardize 'LF' and 'REG'
big_mart_df['Item_Fat_Content'].replace({'LF': 'LOW FAT', 'REG': 'REGULAR'}, inplace=True)

In [9]:
# Fill missing values in the 'Item_Weight' column with the mean
big_mart_df['Item_Weight'].fillna(big_mart_df['Item_Weight'].mean(),inplace=True)

In [10]:
# Fill missing values in the 'Outlet_Size' column with 'OTHERS'
big_mart_df['Outlet_Size'].fillna('OTHERS', inplace=True)

In [11]:
# Create a copy of the test data with 'Item_Identifier' and 'Outlet_Identifier'
big_mart_df_test_with_ids = big_mart_df_test[['Item_Identifier', 'Outlet_Identifier']].copy()

In [12]:
# Drop 'Item_Identifier' and 'Outlet_Identifier' columns as they are not needed for modeling
big_mart_df.drop(['Item_Identifier', 'Outlet_Identifier'], axis=1, inplace=True)

One-hot Encoding

In [14]:
# Perform one-hot encoding for categorical variables (drop the first column to avoid multicollinearity)
big_mart_df = pd.get_dummies(big_mart_df, drop_first=True, dtype=int)

In [15]:
# Split the combined data back into training and test datasets
big_mart_df_train = big_mart_df.loc[~big_mart_df["Item_Outlet_Sales"].isna(),]
big_mart_df_test = big_mart_df.loc[big_mart_df["Item_Outlet_Sales"].isna(),]

In [16]:
print("Shape of training data:", big_mart_df_train.shape)
print("Shape of test data:", big_mart_df_test.shape)

Shape of training data: (8523, 29)
Shape of test data: (5681, 29)


Feature and Target Separation

In [18]:
# Define the features (X) and target variable (y) for training the model
X = big_mart_df_train.drop(["Item_Outlet_Sales"], axis=1)  # Drop target column from features
y = big_mart_df_train["Item_Outlet_Sales"]  # Target column

In [19]:
# Split the training data into training and testing sets (for cross-validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

print(f"Training data shapes: {X_train.shape}, {y_train.shape}")
print(f"Testing data shapes: {X_test.shape}, {y_test.shape}")


Training data shapes: (6392, 28), (6392,)
Testing data shapes: (2131, 28), (2131,)


Model Initialization and Training

In [21]:
# Initialize a Random Forest Regressor model
RF_regressor = RandomForestRegressor(random_state=10)

In [22]:
# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [3, 5, 7, 10, None],  # Maximum depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # Minimum samples required at a leaf node
    'max_features': ['auto', 'sqrt', 'log2', None],  # Number of features to consider for the best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

In [43]:
# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(estimator=RF_regressor, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')


In [45]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


In [47]:
# Get the best parameters from GridSearchCV
print(f"Best Hyperparameters: {grid_search.best_params_}")

Best Hyperparameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


In [49]:
# Train the model using the best hyperparameters
best_RF_regressor = grid_search.best_estimator_

In [51]:
# Predict on training data and evaluate model performance
y_pred_train = best_RF_regressor.predict(X_train)
MAE = mean_absolute_error(y_train, y_pred_train)
R2 = r2_score(y_train, y_pred_train)
MSE = mean_squared_error(y_train, y_pred_train)
RMSE = np.sqrt(MSE)

In [53]:
# Print evaluation metrics for training data
print("Training Data Evaluation Metrics:")
print(f"R^2: {R2}")
print(f"MAE: {MAE}")
print(f"MSE: {MSE}")
print(f"RMSE: {RMSE}")

Training Data Evaluation Metrics:
R^2: 0.6202350338540674
MAE: 738.1329236373928
MSE: 1104988.7701224189
RMSE: 1051.18446055981


In [55]:
# Predict on test data and evaluate model performance
y_pred_test = best_RF_regressor.predict(X_test)
MAE_test = mean_absolute_error(y_test, y_pred_test)
R2_test = r2_score(y_test, y_pred_test)
MSE_test = mean_squared_error(y_test, y_pred_test)
RMSE_test = np.sqrt(MSE_test)

In [57]:
# Print evaluation metrics for testing data
print("Testing Data Evaluation Metrics:")
print(f"R^2: {R2_test}")
print(f"MAE: {MAE_test}")
print(f"MSE: {MSE_test}")
print(f"RMSE: {RMSE_test}")

Testing Data Evaluation Metrics:
R^2: 0.5953880181048771
MAE: 761.8254225563404
MSE: 1180666.5847474185
RMSE: 1086.5848263009284


In [59]:
# Ensure that the test dataset has the same feature columns as the training dataset
X_test_final = big_mart_df_test.drop(["Item_Outlet_Sales"], axis=1)  # Remove the target column from test data


In [61]:
# Predict using the trained regression model
RF_predictions = best_RF_regressor.predict(X_test_final)

In [63]:
big_mart_df_test.loc[ : , 'Item_Outlet_Sales'] = RF_predictions

big_mart_df_test.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Fat_Content_REGULAR,Item_Type_BREADS,Item_Type_BREAKFAST,Item_Type_CANNED,Item_Type_DAIRY,...,Item_Type_SOFT DRINKS,Item_Type_STARCHY FOODS,Outlet_Size_MEDIUM,Outlet_Size_OTHERS,Outlet_Size_SMALL,Outlet_Location_Type_TIER 2,Outlet_Location_Type_TIER 3,Outlet_Type_SUPERMARKET TYPE1,Outlet_Type_SUPERMARKET TYPE2,Outlet_Type_SUPERMARKET TYPE3
0,20.75,0.007565,107.8622,1999,1698.988567,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,8.3,0.038428,87.3198,2007,1428.570304,1,0,0,0,1,...,0,0,0,1,0,1,0,1,0,0
2,14.6,0.099575,241.7538,1998,550.08664,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,7.315,0.015388,155.034,2007,2504.117045,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
4,12.792854,0.118599,234.23,1985,5558.461011,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,1


In [65]:
big_mart_df_test['Item_Outlet_Sales']

0       1698.988567
1       1428.570304
2        550.086640
3       2504.117045
4       5558.461011
           ...     
5676    2110.225085
5677    2518.875599
5678    2045.384403
5679    3783.323079
5680    1402.653221
Name: Item_Outlet_Sales, Length: 5681, dtype: float64

In [67]:
# Prepare the final test data for submission
big_mart_df_test_with_ids['Item_Outlet_Sales'] = RF_predictions  # Add predicted sales


In [69]:
big_mart_df_test_with_ids.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1698.988567
1,FDW14,OUT017,1428.570304
2,NCN55,OUT010,550.08664
3,FDQ58,OUT017,2504.117045
4,FDY38,OUT027,5558.461011


In [71]:
# Save the final output (predictions) to a CSV file
big_mart_df_test_with_ids.to_csv('BM_submission_RF-15-02-25.csv', index=False)