In [1]:
import tempfile
import urllib.request

import sys
sys.path.append("../")

import pandas as pd
import joblib
import transformers as tr #custom ad-hoc classes for pipeline

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

## Reading and preparing stratified sample
The stratified sample was prepared in the [get_sample notebook](https://github.com/woldemarg/ds_tests/blob/master/machine_learning/company_3/task_solution/scripts/notebooks/get_sample.ipynb).

In [2]:
# Location of the pre-built sample CSV in the project repository.
SAMPLE_URL = "https://raw.githubusercontent.com/woldemarg/ds_tests/master/machine_learning/company_3/task_solution/derived/sample.csv"

# Read the sample and discard the "id" column before any further processing.
sample_with_id = pd.read_csv(SAMPLE_URL)
sample = sample_with_id.drop(columns=["id"])

In [3]:
# Every column whose name starts with "cat" is cast to string.
cat_cols = sample.columns[sample.columns.str.startswith("cat")]

# Cast with DataFrame.astype and a per-column mapping instead of assigning
# through .loc: writing strings into existing (possibly numeric) columns via
# .loc is an in-place set that recent pandas versions warn about, whereas
# astype always returns the columns with the requested dtype. Re-running the
# cell is harmless because casting strings to str is a no-op.
# NOTE(review): astype(str) turns missing values into the literal string
# "nan" - confirm the categorical imputer downstream expects that.
sample = sample.astype({col: str for col in cat_cols})

In [4]:
# Feature frame: an independent deep copy of the whole sample.
# NOTE(review): the copy still contains the target column "gb"; presumably
# the custom pipeline transformers drop or use it - verify against
# transformers.py.
X_sample = sample.copy(deep=True)

# Target vector: the "gb" column of the sample.
y_sample = sample["gb"]

## Reading baseline model
The baseline model comes from the [sample_model notebook](https://github.com/woldemarg/ds_tests/blob/master/machine_learning/company_3/task_solution/scripts/notebooks/sample_model.ipynb).

In [5]:
# Download the serialized baseline model to a temporary file and load it.
BASE_MODEL_URL = "https://github.com/woldemarg/ds_tests/raw/master/machine_learning/company_3/task_solution/derived/base_model.sav"

# Reserve a temporary file path. The context manager closes the underlying
# handle before the download writes to the path (tempfile.mkstemp() hands back
# an open OS-level descriptor that would otherwise never be closed).
# delete=False keeps the file on disk after the handle is closed.
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    temp_model_path = temp_file.name

# urlretrieve returns a (local_filename, headers) tuple.
file_path = urllib.request.urlretrieve(BASE_MODEL_URL, temp_model_path)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts from a source you trust.
xgb_model = joblib.load(file_path[0])

## Making simple pipeline
* The pipeline steps were previously defined in the [sample_model notebook](https://github.com/woldemarg/ds_tests/blob/master/machine_learning/company_3/task_solution/scripts/notebooks/sample_model.ipynb).
* All custom ad-hoc transformers come from [transformers.py](https://github.com/woldemarg/ds_tests/blob/master/machine_learning/company_3/task_solution/scripts/transformers.py).

In [6]:
# Preprocessing / feature-selection stage: the custom transformers from the
# project-local `transformers` module, applied in the listed order.
selection_steps = [
    ("drop_initial", tr.DropColumnsTransformer()),
    ("impute_cats", tr.CustomImputer(strategy="mode")),
    ("impute_nums", tr.CustomImputer(strategy="mean")),
    ("encode_oh", tr.CustomOHEncoder()),
    ("drop_low_corr", tr.CorrelationTransformer()),
]
feature_selection = Pipeline(steps=selection_steps)

# Full pipeline: the preprocessing stage followed by the loaded XGBoost model.
model_steps = [
    ("f_selection", feature_selection),
    ("xgb_model", xgb_model),
]
model_pipe = Pipeline(steps=model_steps)

## Evaluating model on CV
By applying the automated pipeline to each fold separately, we prevent data leakage at the NaN-imputation step.

In [7]:
# Score the complete pipeline (preprocessing + model) with cross-validation,
# using ROC AUC as the metric and all available CPU cores.
roc_auc_sample = cross_val_score(
    estimator=model_pipe,
    X=X_sample,
    y=y_sample,
    scoring="roc_auc",
    n_jobs=-1,
)

# Summarize the per-fold scores as mean and standard deviation.
cv_mean = roc_auc_sample.mean()
cv_std = roc_auc_sample.std()
print("ROC_AUC on CV: {} ({})".format(cv_mean, cv_std))

ROC_AUC on CV: 0.88963869303581 (0.018753906806245037)


In [8]:
# Optional step, left disabled: persist the assembled pipeline object to disk.
# NOTE(review): the target path is relative to the current working
# directory - confirm it resolves correctly before enabling.
# joblib.dump(model_pipe, "machine_learning/company_3/task_solution/derived/model_pipe.sav")

Looking good! With an automated pipeline at our disposal, we can safely move on to tuning the model's hyper-parameters - see the [next notebook](https://github.com/woldemarg/ds_tests/blob/master/machine_learning/company_3/task_solution/scripts/notebooks/pipeline_tuning.ipynb).