# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [45]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from tqdm.notebook import tqdm
import multiprocessing
import warnings
import joblib
import os

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

## 1. Preprocessing pipeline

Create three custom transformers; the first two will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [46]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that replaces the 'timestamp' column with two
    derived columns: 'hour' and 'weekday' (as numbers).
    """
    def fit(self, X, y=None):
        # Nothing is learned from the data; present only for the Pipeline API.
        return self

    def transform(self, X):
        """
        Return a new dataframe with 'hour' and 'weekday' added and
        'timestamp' removed. The input dataframe is not modified.
        """
        stamps = pd.to_datetime(X['timestamp'])

        # `drop` hands back a fresh frame, so the caller's data stays untouched.
        features = X.drop(columns=['timestamp'])
        features['hour'] = stamps.dt.hour
        features['weekday'] = stamps.dt.weekday
        return features

class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """
    Performs One-Hot Encoding on categorical features.
    Excludes the specified target column from encoding if it's also categorical.

    Parameters:
        target_column: name of the column holding the target, or None. When
            given, the column is never encoded and is split off by
            `transform()` and returned separately.

    Note: `transform()` returns a tuple `(X_processed, y)` rather than a single
    array, so this transformer is meant to be the last step of a Pipeline.
    """
    def __init__(self, target_column=None):
        self.target_column = target_column
        self.encoder = self._make_encoder()
        self.categorical_features = []

    @staticmethod
    def _make_encoder():
        """Build a dense-output OneHotEncoder that works across scikit-learn versions."""
        # Newer scikit-learn releases renamed `sparse` to `sparse_output` and
        # later dropped the old keyword; older releases only know `sparse`.
        # An unknown keyword raises TypeError, which selects the fallback.
        try:
            return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            return OneHotEncoder(handle_unknown='ignore', sparse=False)

    def fit(self, X, y=None):
        """
        Record the object/category columns of X (minus the target column) and
        fit the underlying encoder on them. Returns self.
        """
        self.categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
        if self.target_column and self.target_column in self.categorical_features:
            self.categorical_features.remove(self.target_column)

        if self.categorical_features:
            self.encoder.fit(X[self.categorical_features])
        return self

    def transform(self, X):
        """
        Split off the target column (if configured) and one-hot encode the
        categorical columns recorded by `fit()`.

        Returns:
            (X_processed, y): the feature dataframe with the categorical columns
            replaced by their encoded columns, and the target series (None when
            no target column was configured).

        Raises:
            KeyError: if `target_column` is set but missing from X.
        """
        X_copy = X.copy()
        y = None
        if self.target_column:
            y = X_copy[self.target_column]
            X_copy = X_copy.drop(columns=[self.target_column])

        if not self.categorical_features:
            return X_copy, y

        encoded_features = self.encoder.transform(X_copy[self.categorical_features])

        # `get_feature_names` was replaced by `get_feature_names_out` in newer
        # scikit-learn; prefer the new name and fall back for old installations.
        name_getter = getattr(self.encoder, 'get_feature_names_out', None)
        if name_getter is None:
            name_getter = self.encoder.get_feature_names
        encoded_columns = name_getter(self.categorical_features)

        # Reuse X's index so that the concat below aligns row by row.
        encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns, index=X_copy.index)

        X_copy = X_copy.drop(columns=self.categorical_features)

        X_processed = pd.concat([X_copy, encoded_df], axis=1)

        return X_processed, y

class TrainValidationTest:
    """
    Splits data into training, validation, and test sets.
    Handles stratification dynamically for each split to avoid ValueError.

    Parameters:
        test_size: share of the data held out from training in the first split.
        random_state: seed passed to both `train_test_split` calls.
        test_valid_ratio: share of the held-out part that becomes the test set;
            the remainder of the held-out part becomes the validation set.
            Defaults to 0.2, the value that used to be hard-coded.
    """
    def __init__(self, test_size=0.2, random_state=21, test_valid_ratio=0.2):
        self.test_size = test_size
        self.random_state = random_state
        self.test_valid_ratio = test_valid_ratio

    def _get_stratify_param(self, y_data, split_name=""):
        """
        Checks the least populated class in y and returns the stratify parameter.

        Returns `y_data` itself when every class has at least two members,
        otherwise prints a warning and returns None (no stratification).
        """
        # Wrapping in a Series lets plain lists / arrays be counted as well.
        class_counts = pd.Series(y_data).value_counts()
        min_class_count = class_counts.min()
        if min_class_count < 2:
            print(f"Warning ({split_name} split): Least populated class has {min_class_count} member(s). Stratification set to FALSE.")
            return None
        return y_data

    def split(self, X, y):
        """
        Split X and y into train / validation / test parts.

        Returns:
            X_train, X_valid, X_test, y_train, y_valid, y_test
        """
        # First cut: training part vs. held-out part.
        stratify_y_1 = self._get_stratify_param(y, "Initial")
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=stratify_y_1
        )

        # Second cut: the held-out part is divided into validation and test.
        stratify_y_2 = self._get_stratify_param(y_temp, "Valid/Test")
        X_valid, X_test, y_valid, y_test = train_test_split(
            X_temp, y_temp, test_size=self.test_valid_ratio, random_state=self.random_state, stratify=stratify_y_2
        )
        return X_train, X_valid, X_test, y_train, y_valid, y_test

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict whose keys are the indexes into that list and whose values are the names of the models. An example is shown below, from the high-level call down to the low-level details:

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`; at the end of the loop, print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [47]:
class ModelSelection(BaseEstimator, TransformerMixin):
    """
    Class for selecting the best model from given GridSearchCV instances and
    returning the best results for each model class.

    Parameters:
        grids: list of GridSearchCV instances, one per model class.
        grid_dict: dict mapping an index into `grids` to a display name.

    Raises:
        TypeError: if `grids` is not a list of GridSearchCV instances or
            `grid_dict` is not a dict of int -> str.
        ValueError: if a key of `grid_dict` is not a usable index into `grids`.
    """
    def __init__(self, grids: list, grid_dict: dict):
        if not isinstance(grids, list) or not all(isinstance(g, GridSearchCV) for g in grids):
            raise TypeError("`grids` must be a list containing only GridSearchCV instances.")
        if not isinstance(grid_dict, dict) or not all(isinstance(k, int) and isinstance(v, str) for k, v in grid_dict.items()):
            raise TypeError("`grid_dict` must be a dictionary with integer keys and string values.")
        # Check both bounds: a key below -len(grids) would otherwise pass
        # validation here and only fail with IndexError inside `choose()`.
        if not all(-len(grids) <= i < len(grids) for i in grid_dict):
            raise ValueError("Indices in `grid_dict` must not exceed the length of the `grids` list.")

        self.grids = grids
        self.grid_dict = grid_dict
        self.best_classifiers = []
        self.best_overall_classifier_name = None
        self.best_overall_validation_score = -1.0
        self.best_overall_estimator = None

    def fit(self, X, y=None):
        # No-op; present only to satisfy the estimator interface.
        return self

    def transform(self, X):
        # Pass-through; the real work happens in `choose()`.
        return X

    def choose(self, X_train, y_train, X_valid, y_valid):
        """
        Fit every grid search on the training data, score each best estimator
        on the validation data and remember the overall winner.

        Returns:
            The name (from `grid_dict`) of the model with the highest
            validation accuracy, or None when `grid_dict` is empty.
        """
        # Reset state so that repeated calls do not accumulate old results.
        self.best_classifiers = []
        self.best_overall_validation_score = -1.0
        self.best_overall_classifier_name = None
        self.best_overall_estimator = None

        sorted_keys = sorted(self.grid_dict.keys())

        for idx in tqdm(sorted_keys, desc="Training and evaluating models"):
            model_name = self.grid_dict[idx]
            gs_estimator = self.grids[idx]

            print(f"\nEstimator: {model_name}")

            gs_estimator.fit(X_train, y_train)

            best_estimator = gs_estimator.best_estimator_
            best_params = gs_estimator.best_params_
            # Reported below as "training accuracy"; the value is whatever the
            # grid search stored in `best_score_`.
            best_train_score = gs_estimator.best_score_

            y_valid_pred = best_estimator.predict(X_valid)
            valid_score = accuracy_score(y_valid, y_valid_pred)

            print(f"Best params: {best_params}")
            print(f"Best training accuracy: {best_train_score:.3f}")
            print(f"Validation set accuracy score for best params: {valid_score:.3f}")

            self.best_classifiers.append({
                'model': model_name,
                'params': best_params,
                'valid_score': valid_score,
                'train_score': best_train_score
            })

            # Strict comparison: on a tie the earlier model keeps the lead.
            if valid_score > self.best_overall_validation_score:
                self.best_overall_validation_score = valid_score
                self.best_overall_classifier_name = model_name
                self.best_overall_estimator = best_estimator

        print(f"\nClassifier with best validation set accuracy: {self.best_overall_classifier_name}")
        return self.best_overall_classifier_name

    def best_results(self):
        """
        Return a dataframe with columns `model`, `params`, `valid_score`, one
        row per model class. Returns an empty dataframe with those columns
        (and prints a hint) when `choose()` has not produced any results yet.
        """
        if not self.best_classifiers:
            print("Please call `choose()` method first.")
            return pd.DataFrame(columns=['model', 'params', 'valid_score'])

        results_df = pd.DataFrame(self.best_classifiers)
        return results_df[['model', 'params', 'valid_score']]

    def get_best_estimator(self):
        """Return the best estimator found by `choose()`, or None before it ran."""
        return self.best_overall_estimator

## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [48]:
class Finalize(BaseEstimator, TransformerMixin):
    """
    Wraps the chosen estimator for its last training run, its evaluation on
    the test set and its persistence to disk.
    """
    def __init__(self, estimator):
        has_required_api = hasattr(estimator, 'fit') and hasattr(estimator, 'predict')
        if not has_required_api:
            raise ValueError("The provided estimator must have 'fit' and 'predict' methods.")
        self.estimator = estimator
        self.final_model_accuracy = None

    def fit(self, X, y=None):
        # Delegates to the wrapped estimator; returns this wrapper for chaining.
        self.estimator.fit(X, y)
        return self

    def transform(self, X):
        # "Transforming" here means predicting with the wrapped estimator.
        return self.estimator.predict(X)

    def final_score(self, X_train, y_train, X_test, y_test):
        """
        Fit the wrapped estimator on the training data, score it on the test
        data, store the accuracy in `final_model_accuracy` and return it.
        """
        model = self.estimator
        print(f"Final model ({type(model).__name__}) is being trained...")
        model.fit(X_train, y_train)

        print("Model is being evaluated on the test set...")
        accuracy = accuracy_score(y_test, model.predict(X_test))
        self.final_model_accuracy = accuracy
        print(f"Accuracy of the final model is {accuracy:.15f}")
        return accuracy

    def save_model(self, path):
        """
        Dump the wrapped estimator to `path` with joblib
        (e.g. 'models/final_model.joblib').
        A failure is reported on stdout instead of being raised.
        """
        try:
            joblib.dump(self.estimator, path)
        except Exception as e:
            print(f"Error saving the model: {e}")
        else:
            print(f"Model successfully saved to {path}.")

## 4. Main program

1. Load the data from the file `checker_submits.csv`.
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. Create an instance of `Finalize()` with your best model, use the method `final_score()`, and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [49]:
if __name__ == "__main__":
    # ---- 1. Load the data and look at the target distribution ----
    df = pd.read_csv("data/checker_submits.csv")
    target_col = 'labname'
    threshold_count_for_merge = 2
    value_counts = df[target_col].value_counts()

    print(f"Class distribution for '{target_col}' column (original):")
    print(value_counts)
    print("-" * 30)

    # ---- 2. Fold classes that are too small into one bucket ----
    rare_classes = [
        label for label, count in value_counts.items()
        if count < threshold_count_for_merge
    ]

    if not rare_classes:
        print(f"No classes with less than {threshold_count_for_merge} members found in '{target_col}'. No merging performed.")
    else:
        print(f"Merging the following rare classes in '{target_col}' into 'Other_Labs': {rare_classes}")
        df[target_col] = df[target_col].replace(rare_classes, 'Other_Labs')
        print(f"Class distribution for '{target_col}' column (updated):")
        print(df[target_col].value_counts())
    print("-" * 30)

    # ---- 3. Preprocess: time features, then one-hot encoding ----
    preprocessing = Pipeline(steps=[
        ('feature_extractor', FeatureExtractor()),
        ('onehot_encoder', MyOneHotEncoder(target_column=target_col)),
    ])

    print("Preprocessing pipeline is being applied...")
    X_processed, y_target = preprocessing.fit_transform(df)

    # Report whether the splitter will be able to stratify.
    min_class_count_after_merge = y_target.value_counts().min()
    if min_class_count_after_merge < 2:
        print(f"\nWarning: Even after merging, '{target_col}' has classes with {min_class_count_after_merge} member(s). Splitter will handle stratification.")
    else:
        print(f"\nLeast populated class in '{target_col}' after merging: {min_class_count_after_merge}. Splitter will enable stratification.")

    # ---- 4. Train / validation / test split ----
    print("\nSplitting data into training, validation, and test sets...")
    splitter = TrainValidationTest(test_size=0.2, random_state=21)
    X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.split(X_processed, y_target)

    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    print("-" * 30)

    # ---- 5. Grid searches for the candidate model classes ----
    # Leave one core free, but never go below a single worker.
    jobs = max(1, multiprocessing.cpu_count() - 1)
    print(f"Number of CPU cores to use (n_jobs): {jobs}")

    # Settings shared by all three grid searches.
    gs_options = dict(scoring='accuracy', cv=2, n_jobs=jobs, verbose=0)

    svm = SVC(random_state=21, probability=True)
    svm_params = [{
        'kernel': ('linear', 'rbf', 'sigmoid'),
        'C': [0.01, 0.1, 1, 1.5, 5, 10],
        'gamma': ['scale', 'auto'],
        'class_weight': ('balanced', None),
    }]
    gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, **gs_options)

    tree = DecisionTreeClassifier(random_state=21)
    tree_params = [{
        'criterion': ('gini', 'entropy'),
        'max_depth': list(range(5, 23, 2)),
        'class_weight': ('balanced', None),
    }]
    gs_tree = GridSearchCV(estimator=tree, param_grid=tree_params, **gs_options)

    rf = RandomForestClassifier(random_state=21)
    rf_params = [{
        'n_estimators': [10, 25, 50],
        'criterion': ('gini', 'entropy'),
        'max_depth': list(range(10, 24, 2)),
        'class_weight': ('balanced', None),
    }]
    gs_rf = GridSearchCV(estimator=rf, param_grid=rf_params, **gs_options)

    grids = [gs_svm, gs_tree, gs_rf]
    grid_dict = dict(enumerate(('SVM', 'Decision Tree', 'Random Forest')))

    # ---- 6. Pick the best model on the validation set ----
    model_selector = ModelSelection(grids=grids, grid_dict=grid_dict)
    best_classifier_name = model_selector.choose(X_train, y_train, X_valid, y_valid)
    print(f"\nBest classifier chosen: {best_classifier_name}")

    results_df = model_selector.best_results()
    print("\nBest results for each model class:")
    print(results_df)
    print("-" * 30)

    # ---- 7. Final evaluation on the test set and saving ----
    final_estimator = model_selector.get_best_estimator()
    finalizer = Finalize(estimator=final_estimator)
    final_accuracy = finalizer.final_score(X_train, y_train, X_test, y_test)

    save_directory = 'models'
    os.makedirs(save_directory, exist_ok=True)

    # File name pattern: <estimator class name, lower-cased>_<test accuracy>.sav
    model_name_for_save = type(final_estimator).__name__.lower()
    save_file_name = f"{model_name_for_save}_{final_accuracy:.15f}.sav"
    model_save_path = os.path.join(save_directory, save_file_name)
    finalizer.save_model(model_save_path)

Class distribution for 'labname' column (original):
labname
project1    951
laba05      222
laba04      178
laba04s     104
code_rvw     82
laba06s      61
laba06       48
lab05s       36
lab02         2
lab03         1
lab03s        1
Name: count, dtype: int64
------------------------------
Merging the following rare classes in 'labname' into 'Other_Labs': ['lab03', 'lab03s']
Class distribution for 'labname' column (updated):
labname
project1      951
laba05        222
laba04        178
laba04s       104
code_rvw       82
laba06s        61
laba06         48
lab05s         36
Other_Labs      2
lab02           2
Name: count, dtype: int64
------------------------------
Preprocessing pipeline is being applied...

Least populated class in 'labname' after merging: 2. Splitter will enable stratification.

Splitting data into training, validation, and test sets...
X_train shape: (1348, 33), y_train shape: (1348,)
X_valid shape: (270, 33), y_valid shape: (270,)
X_test shape: (68, 33), y_test s

HBox(children=(FloatProgress(value=0.0, description='Training and evaluating models', max=3.0, style=ProgressS…


Estimator: SVM
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
Best training accuracy: 0.777
Validation set accuracy score for best params: 0.837

Estimator: Decision Tree
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21}
Best training accuracy: 0.800
Validation set accuracy score for best params: 0.822

Estimator: Random Forest
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50}
Best training accuracy: 0.843
Validation set accuracy score for best params: 0.878


Classifier with best validation set accuracy: Random Forest

Best classifier chosen: Random Forest

Best results for each model class:
           model                                             params  \
0            SVM  {'C': 10, 'class_weight': None, 'gamma': 'auto...   
1  Decision Tree  {'class_weight': 'balanced', 'criterion': 'gin...   
2  Random Forest  {'class_weight': None, 'criterion': 'entropy',...   

   va