In [None]:
# 1. Importing necessary libraries for data preprocessing, modeling, and evaluation.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCVx
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import joblib

In [3]:
# Step 1: Preprocessing function to handle missing values, derive new features, and transform data.
def preprocess_data(data):
    data['Item_Weight'] = data['Item_Weight'].fillna(data.groupby('Item_Type')['Item_Weight'].transform('mean'))
    data['Outlet_Size'] = data['Outlet_Size'].fillna(data['Outlet_Size'].mode()[0])

    # Feature derivation
    data['Outlet_Age'] = 2024 - data['Outlet_Establishment_Year']
    data['Price_Per_Unit_Weight'] = data['Item_MRP'] / data['Item_Weight']
    
    # Simplifying Item_Fat_Content
    data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'})

    # Log Transformation for Item Visibility
    data['Item_Visibility_Log'] = np.log1p(data['Item_Visibility'])
    
    # MRP Categorization
    min_value = data['Item_MRP'].min()
    max_value = data['Item_MRP'].max()
    range_value = max_value - min_value
    data['MRP_Tier'] = data['Item_MRP'].apply(lambda x: 'Low' if x <= min_value + 0.33 * range_value else
                                              'Medium' if x <= min_value + 0.66 * range_value else 'High')

    return data

# Step 2: Loading data and splitting into training and testing sets.
data = pd.read_csv('C:\\Users\\Kamlesh P Panchal\\Documents\\Infosys Internship\\train_og\\Train.csv')
training_data, testing_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Preprocessing training and testing datasets.
training_data = preprocess_data(training_data)
testing_data = preprocess_data(testing_data)

# Step 4: Log-transforming the target variable for normalization.
training_data['Item_Outlet_Sales'] = np.log1p(training_data['Item_Outlet_Sales'])
testing_data['Item_Outlet_Sales'] = np.log1p(testing_data['Item_Outlet_Sales'])

In [4]:
data.head(1)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138


In [5]:
# Step 5: Function for encoding categorical and scaling numerical features.

def encode_data(data, is_training=True, encoders=None):
    # feature groups
    numeric_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age', 'Price_Per_Unit_Weight']
    ordinal_features = ['Outlet_Size', 'MRP_Tier']
    nominal_features = ['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Type']
    label_features = ['Item_Identifier', 'Outlet_Identifier']

    # Filtering the columns existing in the dataset
    numeric_features = [col for col in numeric_features if col in data.columns]
    ordinal_features = [col for col in ordinal_features if col in data.columns]
    nominal_features = [col for col in nominal_features if col in data.columns]
    label_features = [col for col in label_features if col in data.columns]

    if is_training:
        # Fitting encoders on training data
        encoders = {
            'ordinal': OrdinalEncoder().fit(data[ordinal_features]) if ordinal_features else None,
            'nominal': OneHotEncoder(sparse_output=False, drop='first').fit(data[nominal_features]) if nominal_features else None,
            'label': {col: LabelEncoder().fit(data[col]) for col in label_features},
            'scaler': StandardScaler().fit(data[numeric_features]) if numeric_features else None
        }

    # Applying transformations
    if numeric_features:
        data[numeric_features] = encoders['scaler'].transform(data[numeric_features])
    if ordinal_features:
        data[ordinal_features] = encoders['ordinal'].transform(data[ordinal_features])
    if nominal_features:
        nominal_encoded = encoders['nominal'].transform(data[nominal_features])
        nominal_cols = encoders['nominal'].get_feature_names_out(nominal_features)
        data = pd.concat([data.reset_index(drop=True), pd.DataFrame(nominal_encoded, columns=nominal_cols)], axis=1)
        data.drop(columns=nominal_features, inplace=True)

    # Label encode ID columns
    for label_feature in label_features:
        le = encoders['label'][label_feature]
        data[label_feature] = data[label_feature].map(
            lambda x: le.transform([x])[0] if x in le.classes_ else -1
        )
        
    # Saving encoders to reuse them for new data.
    joblib.dump(encoders['ordinal'], 'ordinal_encoder.pkl')
    joblib.dump(encoders['nominal'], 'onehot_encoder.pkl')
    joblib.dump(encoders['scaler'], 'standard_scaler.pkl')

    # Save label encoders for ID features
    for label_feature in label_features:
        joblib.dump(encoders['label'][label_feature], f'{label_feature}_label_encoder.pkl')

    return (data, encoders) if is_training else data

# Step 4: Encode training and testing data
training_data, encoders = encode_data(training_data, is_training=True)  # Encoding on training data

In [6]:
# Loading encoders and encoding testing data.   
ordinal_encoder = joblib.load('ordinal_encoder.pkl')
onehot_encoder = joblib.load('onehot_encoder.pkl')
scaler = joblib.load('standard_scaler.pkl')

item_identifier_encoder = joblib.load('Item_Identifier_label_encoder.pkl')
outlet_identifier_encoder = joblib.load('Outlet_Identifier_label_encoder.pkl')

testing_data = encode_data(testing_data, is_training=False, encoders=encoders)


In [7]:
training_data.dtypes

Item_Identifier                      int32
Item_Weight                        float64
Item_Visibility                    float64
Item_MRP                           float64
Outlet_Identifier                    int32
Outlet_Establishment_Year            int64
Outlet_Size                        float64
Item_Outlet_Sales                  float64
Outlet_Age                         float64
Price_Per_Unit_Weight              float64
Item_Visibility_Log                float64
MRP_Tier                           float64
Item_Fat_Content_Regular           float64
Outlet_Location_Type_Tier 2        float64
Outlet_Location_Type_Tier 3        float64
Outlet_Type_Supermarket Type1      float64
Outlet_Type_Supermarket Type2      float64
Outlet_Type_Supermarket Type3      float64
Item_Type_Breads                   float64
Item_Type_Breakfast                float64
Item_Type_Canned                   float64
Item_Type_Dairy                    float64
Item_Type_Frozen Foods             float64
Item_Type_F

In [8]:
# Step 6: Splitting datasets into features (X) and target variable (y).

X_train = training_data.drop('Item_Outlet_Sales', axis=1)
y_train = training_data['Item_Outlet_Sales']
X_test = testing_data.drop('Item_Outlet_Sales', axis=1)
y_test = testing_data['Item_Outlet_Sales']

# Step 7: Training models and evaluating performance.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor()
}

results = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    train_r2 = r2_score(y_train, model.predict(X_train))
    test_r2 = r2_score(y_test, model.predict(X_test))
    results.append({'Model': model_name, 'Train R²': train_r2, 'Test R²': test_r2})

results_df = pd.DataFrame(results)
print(results_df)


               Model  Train R²   Test R²
0  Linear Regression  0.727077  0.736358
1      Random Forest  0.958539  0.719032
2            XGBoost  0.907070  0.702246
3              Lasso  0.000041 -0.003547
4              Ridge  0.726909  0.736102
5  Gradient Boosting  0.756606  0.741898
6           AdaBoost  0.576584  0.589278


In [9]:
# Step 8: Hyperparameter tuning using GridSearchCV or RandomizedSearchCV.

param_grids = {
    'Linear Regression': {},
    'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    'XGBoost': {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.3], 'max_depth': [3, 5, 7], 'subsample': [0.8, 0.9, 1.0]},
    'Lasso': {'alpha': [0.1, 1, 10, 100]},
    'Ridge': {'alpha': [0.1, 1, 10, 100]},
    'Gradient Boosting': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.05, 0.1], 'max_depth': [3, 5, 7], 'subsample': [0.8, 0.9, 1.0]},
    'AdaBoost': {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1.0]}
}

results = []
for model_name, model in models.items():
    print(f"Training {model_name}...")
    if model_name in ['Linear Regression', 'Lasso', 'Ridge']:
        search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=3, scoring='r2', n_jobs=-1)
    else:
        search = RandomizedSearchCV(estimator=model, param_distributions=param_grids[model_name], n_iter=10, cv=3, scoring='r2', random_state=42, n_jobs=-1)
        
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    train_r2 = r2_score(y_train, best_model.predict(X_train))
    test_r2 = r2_score(y_test, best_model.predict(X_test))
    results.append({'Model': model_name, 'Best Params': search.best_params_, 'Train R²': train_r2, 'Test R²': test_r2})

results_df = pd.DataFrame(results)
print(results_df)


Training Linear Regression...
Training Random Forest...
Training XGBoost...
Training Lasso...
Training Ridge...
Training Gradient Boosting...
Training AdaBoost...




               Model                                        Best Params  \
0  Linear Regression                                                 {}   
1      Random Forest  {'n_estimators': 100, 'min_samples_split': 2, ...   
2            XGBoost  {'subsample': 0.9, 'n_estimators': 100, 'max_d...   
3              Lasso                                     {'alpha': 0.1}   
4              Ridge                                     {'alpha': 0.1}   
5  Gradient Boosting  {'subsample': 0.9, 'n_estimators': 100, 'max_d...   
6           AdaBoost        {'n_estimators': 100, 'learning_rate': 0.1}   

   Train R²   Test R²  
0  0.727077  0.736358  
1  0.805910  0.738656  
2  0.754794  0.744645  
3  0.380920  0.381786  
4  0.726965  0.736347  
5  0.744495  0.740051  
6  0.649401  0.657255  


In [10]:
# Extracting best parameters from the results and retraining the models
best_params = {
    "Linear Regression": {},
    "XGBoost": {'subsample': 0.9, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1},
    "Ridge": {'alpha': 0.1},
    "Gradient Boosting": {'subsample': 0.9, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1},
    "AdaBoost": {'n_estimators': 100, 'learning_rate': 0.1}
}

# Models to be trained
model_classes = {
    "Linear Regression": LinearRegression,
    "XGBoost": XGBRegressor,
    "Ridge": Ridge,
    "Gradient Boosting": GradientBoostingRegressor,
    "AdaBoost": AdaBoostRegressor
}

trained_models = {}

for model_name, params in best_params.items():
    print(f"Training {model_name} with best parameters: {params}...")
    model = model_classes[model_name](**params)  # Instantiate model with best parameters
    model.fit(X_train, y_train)  # Train the model
    trained_models[model_name] = model  # Store the trained model

    # Save the trained model for future use
    joblib.dump(model, f"{model_name.lower().replace(' ', '_')}_model.pkl")
    print(f"Saved {model_name} model.")

# Evaluate the retrained models
evaluation_results = []
for model_name, model in trained_models.items():
    train_r2 = r2_score(y_train, model.predict(X_train))
    test_r2 = r2_score(y_test, model.predict(X_test))
    evaluation_results.append({"Model": model_name, "Train R²": train_r2, "Test R²": test_r2})

evaluation_results_df = pd.DataFrame(evaluation_results)
print(evaluation_results_df)


Training Linear Regression with best parameters: {}...
Saved Linear Regression model.
Training XGBoost with best parameters: {'subsample': 0.9, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}...
Saved XGBoost model.
Training Ridge with best parameters: {'alpha': 0.1}...
Saved Ridge model.
Training Gradient Boosting with best parameters: {'subsample': 0.9, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}...
Saved Gradient Boosting model.
Training AdaBoost with best parameters: {'n_estimators': 100, 'learning_rate': 0.1}...
Saved AdaBoost model.
               Model  Train R²   Test R²
0  Linear Regression  0.727077  0.736358
1            XGBoost  0.754794  0.744645
2              Ridge  0.726965  0.736347
3  Gradient Boosting  0.800216  0.737493
4           AdaBoost  0.650757  0.659912
