#### **IMPORTING LIBRARIES**

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pickle

#### **DATA PREPROCESSING**

In [35]:
# Defining a function to preprocess the data

def data_process(data):
    
    # Step 1: Handling Missing values
    # Finding the mean of 'Item_Visibility' where it is greater than 0, as a placeholder for missing values
    non_zero_mean = data.loc[data['Item_Visibility'] > 0, 'Item_Visibility'].mean()
    data['Item_Visibility'] = data['Item_Visibility'].replace(0, non_zero_mean)

    # Filling missing 'Item_Weight' based on median of 'Item_Type'.
    data['Item_Weight'] = data['Item_Weight'].fillna(data.groupby('Item_Type')['Item_Weight'].transform('median'))

    # Filling missing 'Outlet_Size' with the mode of each 'Outlet_Type' group.
    data['Outlet_Size'] = data['Outlet_Size'].fillna(
        data.groupby('Outlet_Type')['Outlet_Size'].transform(lambda x: x.mode()[0] if not x.mode().empty else 'Unknown')
    )

    # The column has inconsistent labels for 'Low Fat' and 'Regular', so standardizing these values.
    data.replace({'Item_Fat_Content': {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}}, inplace=True)

    # Step 2: Feature Derivation
    # Creating a new column 'Outlet_age' to calculate the store's age based on its establishment year
    data['Outlet_age'] = 2024 - data['Outlet_Establishment_Year']

    # Step 3: Encoding
    # Target Encoding for high-cardinality categorical features, assigns a mean 'Item_Outlet_Sales' value to each category.
    high_cardinality_columns = ['Item_Identifier', 'Outlet_Identifier']
    target_encoder = TargetEncoder(cols=high_cardinality_columns)
    data = target_encoder.fit_transform(data, data['Item_Outlet_Sales'])

    # Identifying categorical and numerical columns for encoding and scaling
    nominal_columns = ['Item_Fat_Content', 'Item_Type', 'Outlet_Location_Type', 'Outlet_Type']
    ordinal_columns = ['Outlet_Size']
    numerical_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP']

    # Mapping for ordinal encoding for 'Outlet_Size'
    Outlet_Size_mapping = ['Small', 'Medium', 'High']
    ohe = OneHotEncoder(drop='first', sparse_output=False)  # One-Hot Encoder for nominal columns
    ode = OrdinalEncoder(categories=[Outlet_Size_mapping])  # Ordinal Encoder for 'Outlet_Size'
    scaler = StandardScaler()  # Standard Scaler for numerical columns

    # Step 4: Column Transformer setup
    ct = make_column_transformer(
        (ohe, nominal_columns),
        (ode, ordinal_columns),
        (scaler, numerical_columns),
        remainder='passthrough'  # Remaining columns are kept as-is
    )
    ct.set_output(transform='pandas')
    df_encoded = ct.fit_transform(data)

    # We clean up encoded column names for easier interpretation and further analysis.
    df_encoded.columns = [
        col.replace("onehotencoder__", "")
           .replace("ordinalencoder__", "")
           .replace("standardscaler__", "")
           .replace("remainder__", "")
        for col in df_encoded.columns
    ]

    # Step 5: Outlier Detection and Handling
    # Z-score to indentify outliers and using cap method to handle them instead of removing
    continuous_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']
    z_threshold = 2.5  # as 3 results with 0 outliers & 2 with many so fixed T as 2.5

    for col in continuous_columns:
        df_encoded[col + '_zscore'] = np.abs((df_encoded[col] - df_encoded[col].mean()) / df_encoded[col].std())
        upper_bound = df_encoded[col].mean() + z_threshold * df_encoded[col].std()
        lower_bound = df_encoded[col].mean() - z_threshold * df_encoded[col].std()
        df_encoded[col] = np.where(df_encoded[col] > upper_bound, upper_bound,
                                   np.where(df_encoded[col] < lower_bound, lower_bound, df_encoded[col]))

    # Step 6: Feature Scaling using Min-Max Normalization 
    min_max_scaler = MinMaxScaler()
    df_encoded[['Item_MRP', 'Item_Visibility']] = min_max_scaler.fit_transform(df_encoded[['Item_MRP', 'Item_Visibility']])

    # Save encoders and scalers
    with open('target_encoder.pkl', 'wb') as file:
        pickle.dump(target_encoder, file)
    with open('onehot_encoder.pkl', 'wb') as file:
        pickle.dump(ohe, file)
    with open('ordinal_encoder.pkl', 'wb') as file:
        pickle.dump(ode, file)
    with open('scaler.pkl', 'wb') as file:
        pickle.dump(scaler, file)
    with open('minmax_scaler.pkl', 'wb') as file:
        pickle.dump(min_max_scaler, file)

    #print("Encoders and scalers have been saved successfully.")
    
    # Return the processed DataFrame
    return df_encoded
    

# Loading the raw data
data = pd.read_csv('C:\\Users\\Kamlesh P Panchal\\Documents\\Infosys Internship\\train_og\\Train.csv')

# Splittig into training and testing sets

from sklearn.model_selection import train_test_split
training, testing = train_test_split(data)

# Passing the training and testing data
training_data_processed = data_process(training)
testing_data_processed = data_process(testing)

print(training_data_processed.head())
print(testing_data_processed.head())

      Item_Fat_Content_Regular  Item_Type_Breads  Item_Type_Breakfast  \
7095                       1.0               0.0                  0.0   
2936                       1.0               0.0                  0.0   
2231                       0.0               0.0                  0.0   
2135                       0.0               0.0                  0.0   
724                        1.0               0.0                  0.0   

      Item_Type_Canned  Item_Type_Dairy  Item_Type_Frozen Foods  \
7095               0.0              0.0                     1.0   
2936               0.0              0.0                     0.0   
2231               1.0              0.0                     0.0   
2135               0.0              0.0                     0.0   
724                0.0              0.0                     0.0   

      Item_Type_Fruits and Vegetables  Item_Type_Hard Drinks  \
7095                              0.0                    0.0   
2936                          

In [36]:
# Shape of the Training & Testing Data
print("Shape of the Training Data:",training_data_processed.shape)
print("Shape of the Testing Data:",testing_data_processed.shape)

Shape of the Training Data: (6392, 34)
Shape of the Testing Data: (2131, 34)


In [37]:
# Checking again for missing values
print("Train Data:", training_data_processed.isnull().sum())

Train Data: Item_Fat_Content_Regular           0
Item_Type_Breads                   0
Item_Type_Breakfast                0
Item_Type_Canned                   0
Item_Type_Dairy                    0
Item_Type_Frozen Foods             0
Item_Type_Fruits and Vegetables    0
Item_Type_Hard Drinks              0
Item_Type_Health and Hygiene       0
Item_Type_Household                0
Item_Type_Meat                     0
Item_Type_Others                   0
Item_Type_Seafood                  0
Item_Type_Snack Foods              0
Item_Type_Soft Drinks              0
Item_Type_Starchy Foods            0
Outlet_Location_Type_Tier 2        0
Outlet_Location_Type_Tier 3        0
Outlet_Type_Supermarket Type1      0
Outlet_Type_Supermarket Type2      0
Outlet_Type_Supermarket Type3      0
Outlet_Size                        0
Item_Weight                        0
Item_Visibility                    0
Item_MRP                           0
Item_Identifier                    0
Outlet_Identifier         

In [38]:
print("Test Data:",testing_data_processed.isnull().sum())

Test Data: Item_Fat_Content_Regular           0
Item_Type_Breads                   0
Item_Type_Breakfast                0
Item_Type_Canned                   0
Item_Type_Dairy                    0
Item_Type_Frozen Foods             0
Item_Type_Fruits and Vegetables    0
Item_Type_Hard Drinks              0
Item_Type_Health and Hygiene       0
Item_Type_Household                0
Item_Type_Meat                     0
Item_Type_Others                   0
Item_Type_Seafood                  0
Item_Type_Snack Foods              0
Item_Type_Soft Drinks              0
Item_Type_Starchy Foods            0
Outlet_Location_Type_Tier 2        0
Outlet_Location_Type_Tier 3        0
Outlet_Type_Supermarket Type1      0
Outlet_Type_Supermarket Type2      0
Outlet_Type_Supermarket Type3      0
Outlet_Size                        0
Item_Weight                        0
Item_Visibility                    0
Item_MRP                           0
Item_Identifier                    0
Outlet_Identifier          

#### **TRAINING & TESTING THE DATA**

In [39]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import pandas as pd

# Separating features and target variable
X_train = training_data_processed.drop('Item_Outlet_Sales', axis=1)
y_train = training_data_processed['Item_Outlet_Sales']

X_test = testing_data_processed.drop('Item_Outlet_Sales', axis=1)
y_test = testing_data_processed['Item_Outlet_Sales']

# Initializing models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Regression': SVR(),
    'KNN Regression': KNeighborsRegressor(),
    'XGBoost Regression': XGBRegressor(),
    'Lasso Regression': Lasso(),
    'Ridge Regression': Ridge(),
    'AdaBoost Regression': AdaBoostRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor()
}

# Dictionary to store R² scores
r2_scores = {'Model': [], 'Training R²': [], 'Testing R²': []}

# Training the models and calculating R² scores
for model_name, model in models.items():

    model.fit(X_train, y_train)

    # Predicting on training and testing data
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    # Calculating R² scores
    train_r2 = r2_score(y_train, train_preds)
    test_r2 = r2_score(y_test, test_preds)

    # Storing results
    r2_scores['Model'].append(model_name)
    r2_scores['Training R²'].append(train_r2)
    r2_scores['Testing R²'].append(test_r2)

# Converting results to a DataFrame 
r2_scores_df = pd.DataFrame(r2_scores)

print(r2_scores_df)

                          Model  Training R²  Testing R²
0             Linear Regression     0.766201    0.786412
1                 Random Forest     0.985079    0.915124
2     Support Vector Regression     0.052654    0.050052
3                KNN Regression     0.763015    0.673817
4            XGBoost Regression     0.983446    0.859878
5              Lasso Regression     0.765896    0.786240
6              Ridge Regression     0.766170    0.786079
7           AdaBoost Regression     0.824312    0.811841
8  Gradient Boosting Regression     0.901413    0.912770


In [40]:
# Import necessary libraries
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
import numpy as np

# ----------------------------------
# Random Forest Regressor
# ----------------------------------
from sklearn.ensemble import RandomForestRegressor

# Define parameter grid for Random Forest
param_dist_rf = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]  # Valid options for max_features
}

# Initialize Random Forest model
rf = RandomForestRegressor()

# Perform RandomizedSearchCV for Random Forest
random_search_rf = RandomizedSearchCV(rf, param_distributions=param_dist_rf, n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search_rf.fit(X_train, y_train)

# Best Random Forest model
best_rf = random_search_rf.best_estimator_

# Predictions for Random Forest
y_train_pred_rf = best_rf.predict(X_train)
y_test_pred_rf = best_rf.predict(X_test)

# Calculate R² scores for Random Forest
train_r2_rf = r2_score(y_train, y_train_pred_rf)
test_r2_rf = r2_score(y_test, y_test_pred_rf)

# Print Random Forest results
print("Best Parameters for Random Forest:", random_search_rf.best_params_)
print(f"R² on Training Data (Random Forest): {train_r2_rf:.4f}")
print(f"R² on Testing Data (Random Forest): {test_r2_rf:.4f}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters for Random Forest: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 10}
R² on Training Data (Random Forest): 0.9377
R² on Testing Data (Random Forest): 0.9202


In [41]:
# ----------------------------------
# XGBoost Regressor
# ----------------------------------

# Define parameter grid for XGBoost
param_dist_xgb = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Initialize XGBoost model
xgb = XGBRegressor()

# Perform RandomizedSearchCV for XGBoost
random_search_xgb = RandomizedSearchCV(xgb, param_distributions=param_dist_xgb, n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search_xgb.fit(X_train, y_train)

# Best XGBoost model
best_xgb = random_search_xgb.best_estimator_

# Predictions for XGBoost
y_train_pred_xgb = best_xgb.predict(X_train)
y_test_pred_xgb = best_xgb.predict(X_test)

# Calculate R² scores for XGBoost
train_r2_xgb = r2_score(y_train, y_train_pred_xgb)
test_r2_xgb = r2_score(y_test, y_test_pred_xgb)

# Print XGBoost results
print("Best Parameters for XGBoost:", random_search_xgb.best_params_)
print(f"R² on Training Data (XGBoost): {train_r2_xgb:.4f}")
print(f"R² on Testing Data (XGBoost): {test_r2_xgb:.4f}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters for XGBoost: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.9}
R² on Training Data (XGBoost): 0.9280
R² on Testing Data (XGBoost): 0.8745


In [42]:

# ----------------------------------
# Gradient Boosting Regressor
# ----------------------------------

# Define parameter grid for Gradient Boosting
param_dist_gb = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]
}

# Initialize Gradient Boosting model
gb = GradientBoostingRegressor()

# Perform RandomizedSearchCV for Gradient Boosting
random_search_gb = RandomizedSearchCV(gb, param_distributions=param_dist_gb, n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search_gb.fit(X_train, y_train)

# Best Gradient Boosting model
best_gb = random_search_gb.best_estimator_

# Predictions for Gradient Boosting
y_train_pred_gb = best_gb.predict(X_train)
y_test_pred_gb = best_gb.predict(X_test)

# Calculate R² scores for Gradient Boosting
train_r2_gb = r2_score(y_train, y_train_pred_gb)
test_r2_gb = r2_score(y_test, y_test_pred_gb)

# Print Gradient Boosting results
print("Best Parameters for Gradient Boosting:", random_search_gb.best_params_)
print(f"R² on Training Data (Gradient Boosting): {train_r2_gb:.4f}")
print(f"R² on Testing Data (Gradient Boosting): {test_r2_gb:.4f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters for Gradient Boosting: {'subsample': 1.0, 'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.1}
R² on Training Data (Gradient Boosting): 0.9442
R² on Testing Data (Gradient Boosting): 0.9178


In [43]:
import joblib

# Save the best XGBoost model
joblib.dump(best_xgb, 'best_xgb_model.pkl')

print("Model saved as 'best_xgb_model.pkl'")

Model saved as 'best_xgb_model.pkl'
