<a href="https://colab.research.google.com/github/vengie/Project42_ELOGISTIX/blob/main/OOPS%20ML%20WF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from numpy import median
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

class DataPreprocessor:
    def __init__(self, data):
        self.data = data
        self.cleaned_data = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None

    def clean_data(self):
        # Dropping rows with missing values in the target column
        self.cleaned_data = self.data.dropna(subset=['Reached.on.Time_Y.N'])

        # Convert categorical columns to numerical representations
        label_encoder = LabelEncoder()
        categorical_cols = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']

        for col in categorical_cols:
            self.cleaned_data[col] = label_encoder.fit_transform(self.cleaned_data[col])

        # Ensure 'Reached.on.Time_Y.N' is numeric (if it's not already)
        self.cleaned_data['Reached.on.Time_Y.N'] = pd.to_numeric(self.cleaned_data['Reached.on.Time_Y.N'], errors='coerce').fillna(0).astype(int)

        # Handle outliers
        self.cleaned_data['Weight_in_gms'] = self._handle_outliers(self.cleaned_data['Weight_in_gms'])

        # Handling missing values in other columns using SimpleImputer
        numerical_cols = ['Customer_care_calls', 'Customer_rating', 'Prior_purchases', 'Weight_in_gms']
        categorical_cols = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']

        imputer_num = SimpleImputer(strategy='median')
        self.cleaned_data[numerical_cols] = imputer_num.fit_transform(self.cleaned_data[numerical_cols])

        imputer_cat = SimpleImputer(strategy='most_frequent')
        self.cleaned_data[categorical_cols] = imputer_cat.fit_transform(self.cleaned_data[categorical_cols])

        # Check for any remaining missing values
        if self.cleaned_data.isnull().values.any():
            raise ValueError("There are still missing values in the dataset after preprocessing.")

    @staticmethod
    def _handle_outliers(series):
        # Find the median and interquartile range
        median_val = series.median()
        IQR = series.quantile(0.75) - series.quantile(0.25)

        # Define limits for outliers
        lower_limit = series.quantile(0.25) - 1.5 * IQR
        upper_limit = series.quantile(0.75) + 1.5 * IQR

        # Replace outliers with the closest limit
        series = series.apply(lambda x: upper_limit if x > upper_limit else (lower_limit if x < lower_limit else x))
        return series

    def feature_engineering(self):
        if self.cleaned_data is not None:
            try:
                initial_columns = self.cleaned_data.columns.tolist()

                # Mapping categorical variables to numerical values
                self.cleaned_data['Warehouse_block'] = self.cleaned_data['Warehouse_block'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'F': 4})
                self.cleaned_data['Mode_of_Shipment'] = self.cleaned_data['Mode_of_Shipment'].map({'Flight': 0, 'Ship': 1})

                # Creating a new feature based on 'Cost_of_the_Product' and 'Discount_offered'
                self.cleaned_data['Discounted_Cost'] = self.cleaned_data['Cost_of_the_Product'] - (self.cleaned_data['Cost_of_the_Product'] * self.cleaned_data['Discount_offered'] / 100)

                # Encoding categorical variables
                encoded_cols = pd.get_dummies(self.cleaned_data[['Product_importance', 'Gender']])
                self.cleaned_data = pd.concat([self.cleaned_data, encoded_cols], axis=1)  # Concatenate the encoded columns

                # Dropping irrelevant or transformed columns
                self.cleaned_data.drop(['Cost_of_the_Product', 'Discount_offered', 'Product_importance', 'Gender'], axis=1, inplace=True)

                final_columns = self.cleaned_data.columns.tolist()
                print("Initial columns:", initial_columns)
                print("Final columns:", final_columns)

                self.cleaned_data.reset_index(drop=True, inplace=True)  # Resetting indices

            except Exception as e:
                print(f"Error in feature engineering: {e}")
                raise ValueError("An error occurred during feature engineering.")
        else:
            raise ValueError("No data available. Please run the 'clean_data' method first.")


    def handle_missing_values(self):
        if self.cleaned_data is not None:
            try:
                imputer = SimpleImputer(strategy='mean')
                imputed_data = imputer.fit_transform(self.cleaned_data)

                # Ensure the columns are aligned with the original DataFrame
                imputed_df = pd.DataFrame(imputed_data, columns=self.cleaned_data.columns)

                self.cleaned_data = imputed_df  # Replace the entire cleaned_data with the imputed DataFrame

            except Exception as e:
                print(f"Error in handling missing values: {e}")
                raise ValueError("An error occurred while handling missing values.")
        else:
            raise ValueError("No data available. Please run the 'clean_data' method first.")


    def split_data(self, target_column, test_size=0.2, random_state=42):
        if self.cleaned_data is None:
            raise ValueError("Data hasn't been cleaned yet. Please run 'clean_data' method first.")

        features = self.cleaned_data.drop(columns=[target_column])
        target = self.cleaned_data[target_column]

        # Splitting the data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(features, target, test_size=test_size, random_state=random_state)



    def perform_preprocessing(self, target_column):
        self.clean_data()
        self.feature_engineering()
        # self.handle_missing_values()
        self.split_data(target_column)




In [2]:
# if __name__ == "__main__":
#     # Load your dataset into 'data'
#     data_source='https://raw.githubusercontent.com/vengie/Project42_ELOGISTIX/main/Data/DSMMProject42-CPL-5559-Ecom_Shipping_stride.csv'
#     data = pd.read_csv(data_source)

#     # Create an instance of DataPreprocessor
#     preprocessor = DataPreprocessor(data)

#     # Perform preprocessing (specify the target column name)
#     target_column = 'Reached.on.Time_Y.N'
#     preprocessor.perform_preprocessing(target_column)

#     # Access the processed data
#     X_train = preprocessor.X_train
#     X_test = preprocessor.X_test
#     y_train = preprocessor.y_train
#     y_test = preprocessor.y_test

# Now you can use X_train, X_test, y_train, y_test for training your models
# For example:
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression()
# model.fit(X_train, y_train)
# ... (continue with model training and evaluation)

Model Selection and Training

In [3]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier
# from sklearn.svm import SVC

# class ModelTrainer:
#     def __init__(self, data_preprocessor):
#         self.data_preprocessor = data_preprocessor
#         self.models = {
#             'Logistic Regression': LogisticRegression(),
#             'Decision Tree': DecisionTreeClassifier(),
#             'XGBoost': XGBClassifier(),
#             'SVM': SVC()
#         }
#         self.trained_models = {}

#     def train_models(self):
#         for model_name, model in self.models.items():
#             X_train = self.data_preprocessor.train_data.drop(columns=['Reached.on.Time_Y.N'])
#             y_train = self.data_preprocessor.train_data['Reached.on.Time_Y.N']
#             model.fit(X_train, y_train)
#             self.trained_models[model_name] = model


In [4]:
#model_trainer.py
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
# from model_evaluator import ModelEvaluator  # Importing ModelEvaluator from model_evaluator.py

class ModelTrainer:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.models = {
            'Logistic Regression': LogisticRegression(),
            'Decision Tree': DecisionTreeClassifier(),
            'XGBoost': XGBClassifier(),
            'SVM': SVC()
        }

    def train_models(self):
        # Impute missing values in X_train if any
        imputer = SimpleImputer(strategy='mean')  # Or use strategy='median' or 'most_frequent' as needed
        self.X_train = imputer.fit_transform(self.X_train)

        for name, model in self.models.items():
            print(f"Training {name}...")
            model.fit(self.X_train, self.y_train)
            self.models[name] = model

    def predict(self):
        predictions = {}
        for name, model in self.models.items():
            # Impute missing values in X_test if any
            imputer = SimpleImputer(strategy='mean')  # Or use strategy='median' or 'most_frequent' as needed
            X_test_imputed = imputer.fit_transform(self.X_test)

            print(f"Making predictions using {name}...")
            y_pred = model.predict(X_test_imputed)
            predictions[name] = y_pred

            # Evaluate model
            accuracy = accuracy_score(self.y_test, y_pred)
            report = classification_report(self.y_test, y_pred)

            print(f"{name} Accuracy: {accuracy}")
            print(f"{name} Report: \n{report}")

        return predictions

    def evaluate_models(self):
        # Impute missing values in X_test if any
        imputer = SimpleImputer(strategy='mean')  # Or use strategy='median' or 'most_frequent' as needed
        self.X_test = imputer.fit_transform(self.X_test)

        for name, model in self.models.items():
            print(f"Evaluating {name}...")
            y_pred = model.predict(self.X_test)

            # Evaluate model
            accuracy = accuracy_score(self.y_test, y_pred)
            report = classification_report(self.y_test, y_pred)

            print(f"{name} Accuracy: {accuracy}")
            print(f"{name} Report: \n{report}")





In [5]:

# # model_evaluation.py

# class ModelEvaluator:
#     def __init__(self, X_test, y_test):
#         self.X_test = X_test
#         self.y_test = y_test

#     def evaluate_model(self, model, name):
#         print(f"Evaluating {name}...")
#         y_pred = model.predict(self.X_test)

#         # Evaluate model
#         accuracy = accuracy_score(self.y_test, y_pred)
#         report = classification_report(self.y_test, y_pred)

#         print(f"{name} Accuracy: {accuracy}")
#         print(f"{name} Report: \n{report}")


In [6]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters for each model
param_grid = {
    'Logistic Regression': {'C': [0.1, 1.0, 10.0], 'solver': ['liblinear', 'lbfgs']},
    'Decision Tree': {'max_depth': [None, 5, 10, 15], 'min_samples_split': [2, 5, 10]},
    'XGBoost': {'max_depth': [3, 5, 7], 'n_estimators': [50, 100, 150]},
    'SVM': {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
}

class ModelTrainerWithHyperparamTuning:
    def __init__(self, X_train, X_test, y_train, y_test):
        # Same as before
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.models = {
            'Logistic Regression': LogisticRegression(max_iter=1000),
            'Decision Tree': DecisionTreeClassifier(),
            'XGBoost': XGBClassifier(),
            'SVM': SVC()
        }

    def handle_missing_values(self):
        # Create an imputer and fit it to the training data
        imputer = SimpleImputer(strategy='mean')
        imputer.fit(self.X_train)

        # Transform both training and test data
        self.X_train = imputer.transform(self.X_train)
        self.X_test = imputer.transform(self.X_test)

    def train_models_with_hyperparam_tuning(self):
        self.handle_missing_values()  # Handle missing values before training models
        for name, model in self.models.items():
            if name == 'Logistic Regression':
                model.set_params(max_iter=1000)  # Increase max_iter for Logistic Regression

            print(f"Tuning hyperparameters for {name}...")
            param_grid_model = param_grid[name]
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid_model, scoring='accuracy', cv=5)
            grid_search.fit(self.X_train, self.y_train)

            best_params = grid_search.best_params_
            print(f"Best hyperparameters for {name}: {best_params}")

            # Set the model with the best hyperparameters
            self.models[name] = grid_search.best_estimator_


    def perform_evaluation(self, X_test, y_test):
        evaluation_results = {}
        for name, model in self.models.items():
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            evaluation_results[name] = {
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1
            }

        return evaluation_results



# if __name__ == "__main__":
#     # Existing code to load data, preprocess, and get X_train, X_test, y_train, y_test

#     trainer_with_hyperparam_tuning = ModelTrainerWithHyperparamTuning(X_train, X_test, y_train, y_test)
#     trainer_with_hyperparam_tuning.train_models_with_hyperparam_tuning()
#     trainer_with_hyperparam_tuning.evaluate_models()


In [7]:
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# import time

# if __name__ == "__main__":
#     # data_source = 'https://raw.githubusercontent.com/vengie/Project42_ELOGISTIX/main/Data/DSMMProject42-CPL-5559-Ecom_Shipping_stride.csv'
#     # data = pd.read_csv(data_source)

#     # preprocessor = DataPreprocessor(data)
#     # target_column = 'Reached.on.Time_Y.N'
#     # preprocessor.perform_preprocessing(target_column)

#     # X_train = preprocessor.X_train
#     # X_test = preprocessor.X_test
#     # y_train = preprocessor.y_train
#     # y_test = preprocessor.y_test

#     # trainer = ModelTrainer(X_train, X_test, y_train, y_test)
#     # trainer.train_models()
#     # trainer.evaluate_models()

#     # # Instantiate ModelTrainerWithHyperparamTuning class and train models
#     # trainer_with_hyperparam_tuning = ModelTrainerWithHyperparamTuning(X_train, X_test, y_train, y_test)
#     # trainer_with_hyperparam_tuning.train_models_with_hyperparam_tuning()
#     # # Handle missing values in the test data
#     # imputer = SimpleImputer(strategy='mean')
#     # X_test_imputed = imputer.fit_transform(X_test)
#     # # Evaluate models using the cleaned test data
#     # evaluation_results = trainer_with_hyperparam_tuning.perform_evaluation(X_test_imputed, y_test)

#     start_total_time = time.time()

#     data_source = 'https://raw.githubusercontent.com/vengie/Project42_ELOGISTIX/main/Data/DSMMProject42-CPL-5559-Ecom_Shipping_stride.csv'
#     data = pd.read_csv(data_source)

#     start_preprocessing_time = time.time()
#     preprocessor = DataPreprocessor(data)
#     target_column = 'Reached.on.Time_Y.N'
#     preprocessor.perform_preprocessing(target_column)
#     end_preprocessing_time = time.time()

#     X_train = preprocessor.X_train
#     X_test = preprocessor.X_test
#     y_train = preprocessor.y_train
#     y_test = preprocessor.y_test

#     start_trainer_time = time.time()
#     trainer = ModelTrainer(X_train, X_test, y_train, y_test)
#     trainer.train_models()
#     end_trainer_time = time.time()
#     trainer.evaluate_models()

#     start_hyperparam_trainer_time = time.time()
#     trainer_with_hyperparam_tuning = ModelTrainerWithHyperparamTuning(X_train, X_test, y_train, y_test)
#     trainer_with_hyperparam_tuning.train_models_with_hyperparam_tuning()
#     end_hyperparam_trainer_time = time.time()

#     imputer = SimpleImputer(strategy='mean')
#     start_imputer_time = time.time()
#     X_test_imputed = imputer.fit_transform(X_test)
#     end_imputer_time = time.time()

#     start_evaluation_time = time.time()
#     evaluation_results = trainer_with_hyperparam_tuning.perform_evaluation(X_test_imputed, y_test)
#     end_evaluation_time = time.time()

#     total_time = end_total_time - start_total_time
#     preprocessing_time = end_preprocessing_time - start_preprocessing_time
#     trainer_time = end_trainer_time - start_trainer_time
#     hyperparam_trainer_time = end_hyperparam_trainer_time - start_hyperparam_trainer_time
#     imputer_time = end_imputer_time - start_imputer_time
#     evaluation_time = end_evaluation_time - start_evaluation_time

#     print(f"Total execution time: {total_time} seconds")
#     print(f"Preprocessing time: {preprocessing_time} seconds")
#     print(f"Trainer time: {trainer_time} seconds")
#     print(f"Hyperparameter trainer time: {hyperparam_trainer_time} seconds")
#     print(f"Imputer time: {imputer_time} seconds")
#     print(f"Evaluation time: {evaluation_time} seconds")


In [8]:
# import pandas as pd
# from sklearn.impute import SimpleImputer
# from ModelPreprocessor import DataPreprocessor
# from ModelTrainer import ModelTrainer, ModelTrainerWithHyperparamTuning
import time

if __name__ == "__main__":
    start_total_time = time.time()

    data_source = 'https://raw.githubusercontent.com/vengie/Project42_ELOGISTIX/main/Data/DSMMProject42-CPL-5559-Ecom_Shipping_stride.csv'
    data = pd.read_csv(data_source)

    start_preprocessing_time = time.time()
    preprocessor = DataPreprocessor(data)
    target_column = 'Reached.on.Time_Y.N'
    preprocessor.perform_preprocessing(target_column)
    end_preprocessing_time = time.time()

    X_train = preprocessor.X_train
    X_test = preprocessor.X_test
    y_train = preprocessor.y_train
    y_test = preprocessor.y_test

    start_trainer_time = time.time()
    trainer = ModelTrainer(X_train, X_test, y_train, y_test)
    trainer.train_models()
    end_trainer_time = time.time()
    trainer.evaluate_models()

    start_hyperparam_trainer_time = time.time()
    trainer_with_hyperparam_tuning = ModelTrainerWithHyperparamTuning(X_train, X_test, y_train, y_test)
    trainer_with_hyperparam_tuning.train_models_with_hyperparam_tuning()
    end_hyperparam_trainer_time = time.time()

    start_predict_time = time.time()
    predictions = trainer.predict()
    end_predict_time = time.time()

    total_time = end_predict_time - start_total_time
    preprocessing_time = end_preprocessing_time - start_preprocessing_time
    trainer_time = end_trainer_time - start_trainer_time
    hyperparam_trainer_time = end_hyperparam_trainer_time - start_hyperparam_trainer_time
    predict_time = end_predict_time - start_predict_time

    print(f"Total execution time: {total_time} seconds")
    print(f"Preprocessing time: {preprocessing_time} seconds")
    print(f"Trainer time: {trainer_time} seconds")
    print(f"Hyperparameter trainer time: {hyperparam_trainer_time} seconds")
    print(f"Predict time: {predict_time} seconds")

Initial columns: ['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms', 'Reached.on.Time_Y.N']
Final columns: ['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating', 'Prior_purchases', 'Weight_in_gms', 'Reached.on.Time_Y.N', 'Discounted_Cost']
Training Logistic Regression...
Training Decision Tree...
Training XGBoost...
Training SVM...
Evaluating Logistic Regression...
Logistic Regression Accuracy: 0.6504545454545455
Logistic Regression Report: 
              precision    recall  f1-score   support

           0       0.58      0.53      0.55       895
           1       0.70      0.73      0.71      1305

    accuracy                           0.65      2200
   macro avg       0.64      0.63      0.63      2200
weighted avg       0.65      0.65      0.65      2200

Evaluating Decision Tree...
Decision Tree Accur