# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
import itertools
from tqdm.notebook import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from joblib import dump

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [18]:
class FeatureExtractor:
    def __init__(self):
        pass
    def fit(self, df, y=None):
        return self
    def transform(self, df):
        df['hour'] = pd.to_datetime(df['timestamp']).dt.hour
        df['weekday'] = pd.to_datetime(df['timestamp']).dt.day_of_week
        df = df.drop(columns=['timestamp'])
        return df
    
df = pd.read_csv("../data/checker_submits.csv")
f_e = FeatureExtractor()
df = f_e.transform(df)
df

Unnamed: 0,uid,labname,numTrials,hour,weekday
0,user_4,project1,1,5,4
1,user_4,project1,2,5,4
2,user_4,project1,3,5,4
3,user_4,project1,4,5,4
4,user_4,project1,5,5,4
...,...,...,...,...,...
1681,user_19,laba06s,9,20,3
1682,user_1,laba06s,6,20,3
1683,user_1,laba06s,7,20,3
1684,user_1,laba06s,8,20,3


In [None]:
class MyOneHotEncoder:
    def __init__(self, target_column):
        self.target_column = target_column
        self.encoder = None
        self.cat_cols = None
    def fit(self, df, y=None):
        self.cat_cols = list(df.select_dtypes(include='object').columns)
        if self.target_column in self.cat_cols:
            self.cat_cols.remove(self.target_column)
        self.encoder = OneHotEncoder()
        self.encoder.fit(df[self.cat_cols])
        return self
    def transform(self, df):
        encoded = self.encoder.transform(df[self.cat_cols]).toarray()
        new_columns = self.encoder.get_feature_names_out(self.cat_cols)
        encoded = pd.DataFrame(encoded, columns=new_columns, index=df.index)
        df = df.drop(columns=self.cat_cols)
        df = pd.concat([df, encoded], axis=1)
        return df
    
my_encoder = MyOneHotEncoder('weekday')
my_encoder.fit(df)
df = my_encoder.transform(df)
df

Unnamed: 0,numTrials,hour,weekday,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
class TrainValidationTest:
    def __init__(self):
        pass
    def split(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)
        return X_train, X_valid, X_test, y_train, y_valid, y_test
    
splitter = TrainValidationTest()
X = df.drop(columns=['weekday'])
y = df['weekday']

X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.split(X, y)
X_train.head(5)

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
862,5,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
812,3,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
830,19,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
482,5,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
651,12,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.877778
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.866667
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.907407
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [None]:
class ModelSelection:
    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
        self.results = []
    
    def choose(self, X_train, y_train, X_valid, y_valid):
        best_clf_all = ''
        best_score_all = 0
        for idx, grid in enumerate(self.grids):
            model_name = self.grid_dict[idx]
            print(f"Estimator: {model_name}")

            grid.fit(X_train, y_train)

            best_params = grid.best_params_
            best_score = grid.best_score_
            
            val_score = grid.score(X_valid, y_valid)

            if val_score > best_score_all:
                best_score_all = val_score
                best_clf_all = model_name

            self.results.append({
                'model' : idx,
                'params' : model_name + " " + str(best_params),
                'valid_score' : val_score

            })
            with tqdm(total=len(grid.cv_results_['params']), desc=model_name) as pbar:
                pbar.update(len(grid.cv_results_['params']))

            print(f"Best params: {best_params}")
            print(f"Best training accuracy: {best_score:.3f}")
            print(f"Validation set accuracy score for best params: {val_score:.3f}\n")
        print(f"Classifier with best validation set accuracy: {best_clf_all}")
        return best_clf_all
    
    def best_results(self):
        return pd.DataFrame(self.results)


In [32]:
svm_params = {
    'kernel': ['linear', 'rbf'],
    'random_state': [21],
    'probability': [True]
}
tree_params = {
    'max_depth': [None, 10, 20], 
    'class_weight': ['balanced', None]
}
rf_params = {
    'n_estimators': [10, 50], 
    'max_depth': [None, 10]
}

gs_svm = GridSearchCV(estimator=SVC(), param_grid=svm_params, scoring='accuracy', cv=2)
gs_tree = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=tree_params, scoring='accuracy', cv=2)
gs_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_params, scoring='accuracy', cv=2)

grids = [gs_svm, gs_tree, gs_rf]

grid_dict = {0: "SVM", 1: "Decision Tree", 2: "Random Forest"}

model_selector = ModelSelection(grids, grid_dict)

best_model_name = model_selector.choose(X_train, y_train, X_valid, y_valid)

results_df = model_selector.best_results()
results_df

Estimator: SVM


SVM:   0%|          | 0/2 [00:00<?, ?it/s]

Best params: {'kernel': 'linear', 'probability': True, 'random_state': 21}
Best training accuracy: 0.633
Validation set accuracy score for best params: 0.607

Estimator: Decision Tree


Decision Tree:   0%|          | 0/6 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'max_depth': None}
Best training accuracy: 0.803
Validation set accuracy score for best params: 0.859

Estimator: Random Forest


Random Forest:   0%|          | 0/4 [00:00<?, ?it/s]

Best params: {'max_depth': None, 'n_estimators': 50}
Best training accuracy: 0.852
Validation set accuracy score for best params: 0.904

Classifier with best validation set accuracy: Random Forest


Unnamed: 0,model,params,valid_score
0,0,"SVM {'kernel': 'linear', 'probability': True, ...",0.607407
1,1,"Decision Tree {'class_weight': 'balanced', 'ma...",0.859259
2,2,"Random Forest {'max_depth': None, 'n_estimator...",0.903704


## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [None]:
class Finalize():
    def __init__(self, estimator):
        self.estimator = estimator
        
    def final_score(self, X_train, y_train, X_test, y_test):
        self.estimator.fit(X_train, y_train)
        y_pred = self.estimator.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'Accuracy of the final model is {accuracy}')
    
    def save_model(self, path):
        dump(self.estimator, path, compress=9)
        print('The model was successfully saved')

## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [38]:
df = pd.read_csv("../data/checker_submits.csv")
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
data = preprocessing.fit_transform(df)
data.head()

Unnamed: 0,numTrials,hour,weekday,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
splitter = TrainValidationTest()
X = data.drop(columns=['weekday'])
y = data['weekday']

X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.split(X, y)

In [40]:
svm_params = {
    'kernel': ['linear', 'rbf'],
    'random_state': [21],
    'probability': [True]
}
tree_params = {
    'max_depth': [None, 10, 20], 
    'class_weight': ['balanced', None]
}
rf_params = {
    'n_estimators': [10, 50], 
    'max_depth': [None, 10]
}

gs_svm = GridSearchCV(estimator=SVC(), param_grid=svm_params, scoring='accuracy', cv=2)
gs_tree = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=tree_params, scoring='accuracy', cv=2)
gs_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_params, scoring='accuracy', cv=2)

In [41]:
grids = [gs_svm, gs_tree, gs_rf]

grid_dict = {0: "SVM", 1: "Decision Tree", 2: "Random Forest"}

model_selector = ModelSelection(grids, grid_dict)

best_model_name = model_selector.choose(X_train, y_train, X_valid, y_valid)

results_df = model_selector.best_results()

Estimator: SVM


SVM:   0%|          | 0/2 [00:00<?, ?it/s]

Best params: {'kernel': 'linear', 'probability': True, 'random_state': 21}
Best training accuracy: 0.633
Validation set accuracy score for best params: 0.607

Estimator: Decision Tree


Decision Tree:   0%|          | 0/6 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'max_depth': None}
Best training accuracy: 0.800
Validation set accuracy score for best params: 0.859

Estimator: Random Forest


Random Forest:   0%|          | 0/4 [00:00<?, ?it/s]

Best params: {'max_depth': None, 'n_estimators': 50}
Best training accuracy: 0.852
Validation set accuracy score for best params: 0.904

Classifier with best validation set accuracy: Random Forest


In [42]:
results_df

Unnamed: 0,model,params,valid_score
0,0,"SVM {'kernel': 'linear', 'probability': True, ...",0.607407
1,1,"Decision Tree {'class_weight': 'balanced', 'ma...",0.859259
2,2,"Random Forest {'max_depth': None, 'n_estimator...",0.903704


In [43]:
rf = RandomForestClassifier(max_depth=None, n_estimators = 50)
finalize = Finalize(rf)
finalize.final_score(X_train, y_train, X_test, y_test)

'Accuracy of the final model is 0.9023668639053254'

In [44]:
finalize.save_model("Random_Forest_Classifier_{0.90}.sav")

The model was successfully saved
