# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [1]:
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [2]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()
        df['hour'] = df['timestamp'].dt.hour
        df['dayofweek'] = df['timestamp'].dt.dayofweek
        df = df.drop(['timestamp'], axis=1)
        return df

In [11]:
class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, target_column):
        self.target_column = target_column
        self.encoder = OneHotEncoder(sparse=False)
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.encoder.fit(X[['uid', 'labname']])
        self.scaler.fit(X[['numTrials', 'hour']])
        return self

    def transform(self, X, y=None):
        encoded_data = self.encoder.transform(X[['uid', 'labname']])
        encoded_df = pd.DataFrame(encoded_data)

        scaled_data = self.scaler.transform(X[['numTrials', 'hour']])
        scaled_df = pd.DataFrame(scaled_data, columns=['numTrials', 'hour'])

        transform_df = scaled_df.join(X[self.target_column]).join(encoded_df)
        return transform_df

In [4]:
class TrainValidationTest(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y):
        return self

    def transform(self, X, y):
        X_train_val, X_test, y_train_val, y_test = train_test_split(
            X, y, test_size=0.2, random_state=21, stratify=y
        )

        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_val, y_train_val, test_size=0.2, random_state=21, stratify=y_train_val
        )

        return X_train, X_valid, X_test, y_train, y_valid, y_test

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [5]:
class ModelSelection:
    def __init__(self, grids, grids_dict):
        self.grids = grids
        self.grids_dict = grids_dict
        self.results = []

    def choose(self, X_train, X_valid, y_train, y_valid):
        for grid in self.grids:
            estimator_name = grid.estimator.__class__.__name__
            print(f'Estimator: {estimator_name}')
            
            grid.fit(X_train, y_train)
            valid_score = grid.score(X_valid, y_valid)
            
            # Сохранение результатов
            self.results.append({
                'model': estimator_name,
                'params': grid.best_params_,
                'valid_score': valid_score,
                'train_score': grid.best_score_
            })
            
            print(f'Best params: {grid.best_params_}')
            print(f'Best training accuracy: {grid.best_score_:.3f}')
            print(f'Validation set accuracy score for best params: {valid_score:.3f}')
            print()

    def best_results(self):
        # Создаем DataFrame с результатами
        results_df = pd.DataFrame(self.results)
        
        # Находим лучшую модель
        best_model = results_df.loc[results_df['valid_score'].idxmax()]
        print(f'Classifier with best validation set accuracy: {best_model["model"]}')
        
        return results_df

svm = SVC()
tree = DecisionTreeClassifier()
rf = RandomForestClassifier()

svm_params = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None],
    'random_state': [21],
    'probability': [True]
}

tree_params = {
    'max_depth': np.arange(1, 50),
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
    'random_state': [21]
}

rf_params =  {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': np.arange(1, 50),
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
    'random_state': [21]
}

jobs = -1
gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs)
gs_tree = GridSearchCV(estimator=tree, param_grid=tree_params, scoring='accuracy', cv=2, n_jobs=jobs)
gs_rf = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=2, n_jobs=jobs)

grids = [gs_svm, gs_tree, gs_rf]
grids_dict = {}

## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [6]:
class Finalize:
    def __init__(self, model):
        self.model = model
    
    def final_score(self, X_train, X_test, y_train, y_test):
        self.model.fit(X_train, y_train)
        score = self.model.score(X_test, y_test)
        print(f'Accuracy of the final model is: {score}')
        return score
    
    def save_model(self, path):
        joblib.dump(self.model, path)

## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [12]:
df = pd.read_csv('../data/checker_submits.csv', parse_dates=['timestamp'])

preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
data = preprocessing.fit_transform(df)
data.head()

Unnamed: 0,numTrials,hour,dayofweek,0,1,2,3,4,5,6,...,31,32,33,34,35,36,37,38,39,40
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
X = data.drop('dayofweek', axis=1)
y = data['dayofweek']
splitter = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.transform(X, y)

In [14]:
selection = ModelSelection(grids, grids_dict)
selection.choose(X_train, X_valid, y_train, y_valid)
model_df = selection.best_results()

Estimator: SVC
Best params: {'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.818
Validation set accuracy score for best params: 0.885

Estimator: DecisionTreeClassifier
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.803
Validation set accuracy score for best params: 0.867

Estimator: RandomForestClassifier
Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 21, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.854
Validation set accuracy score for best params: 0.893

Classifier with best validation set accuracy: RandomForestClassifier


In [15]:
model_df

Unnamed: 0,model,params,valid_score,train_score
0,SVC,"{'C': 10, 'class_weight': None, 'gamma': 'scal...",0.885185,0.818182
1,DecisionTreeClassifier,"{'class_weight': 'balanced', 'criterion': 'gin...",0.866667,0.80334
2,RandomForestClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.892593,0.85436


In [16]:
final_model = Finalize(RandomForestClassifier(class_weight=None, criterion='gini', max_depth=21, n_estimators=50, random_state=21))
final_model.final_score(X_train, X_test, y_train, y_test)
final_model.save_model('../data/final_model.pkl')

Accuracy of the final model is: 0.9171597633136095
