# Libraries

In [19]:
import os
from datetime import datetime
from pprint import pprint

import joblib
import numpy as np
import pandas as pd
import scipy as sp
from IPython.display import display
from tqdm import tqdm_notebook

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, \
            GradientBoostingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, classification_report, \
            accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Single seed shared by the train/test splits, the CV splitter and every
# seedable estimator in this notebook.
SEED = 24
# Passed as `n_jobs` to the estimators that accept it.
N_JOBS = -1
# Number of folds used by the stratified splitter below.
cv_number = 3
cv = StratifiedKFold(n_splits=cv_number, shuffle=True, random_state=SEED)

# Name of the scoring metric; valid names are listed in the scikit-learn docs:
metric = 'roc_auc'
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# Pickled DataFrame (written with joblib) holding one row per logged model.
file_name_log = 'models/df_log.pkl'
# Directory into which fitted model pickles are saved.
path_for_tunning_model = 'models/tunning'

# List of models

In [2]:
# Registry of baseline classifiers keyed by a short model name.
# Every estimator is seeded with SEED; N_JOBS is forwarded to the estimators
# that are given an `n_jobs` argument. Disabled candidates stay as comments.
model_dict = dict(
    logreg=LogisticRegression(max_iter=100, random_state=SEED, n_jobs=N_JOBS),
    # knn=KNeighborsClassifier(n_jobs=N_JOBS),
    dt=DecisionTreeClassifier(random_state=SEED),
    rf=RandomForestClassifier(random_state=SEED, n_jobs=N_JOBS),
    ext=ExtraTreesClassifier(random_state=SEED, n_jobs=N_JOBS),
    # bag=BaggingClassifier(),  # duplicates RandomForest
    adb=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), random_state=SEED),
    gb=GradientBoostingClassifier(random_state=SEED),
    xgb=XGBClassifier(random_state=SEED, n_jobs=N_JOBS),
    # svc=SVC(probability=True, random_state=SEED),
    cat=CatBoostClassifier(random_state=SEED, verbose=False),
    lgb=LGBMClassifier(random_state=SEED, n_jobs=N_JOBS),
    mlp=MLPClassifier(hidden_layer_sizes=(128, 32, 2), random_state=SEED),
)

# Load data with all imputations applied

In [3]:
# Read the imputed dataset, order it by location and date, renumber the rows
# and discard the helper columns ('Unnamed: 0' and 'Date').
df = (
    pd.read_csv('data/df_after_all_imputations.csv')
    .sort_values(by=['Location_ID', 'Date'])
    .reset_index(drop=True)
    .drop(['Unnamed: 0', 'Date'], axis=1)
)

print(df.shape)
df.head(3).T
# df.info()


(142193, 24)


Unnamed: 0,0,1,2
Location_ID,0.0,0.0,0.0
Cloud9am,6.44443,6.487496,5.593826
Cloud3pm,6.73156,6.426017,5.762627
Humidity9am,92.0,75.0,81.0
Humidity3pm,67.0,52.0,56.0
Pressure9am,1017.4,1022.4,1027.8
Pressure3pm,1017.7,1022.6,1026.5
MinTemp,8.8,12.7,6.2
MaxTemp,15.7,15.8,15.1
Temp9am,13.5,13.7,9.3


# Hold out a test split for stacking

In [4]:
# Set aside 20% of the rows (X_test_stacking / y_test_stacking); the stacking
# cell further down fits and evaluates the meta-model on this part only.
X, X_test_stacking, y, y_test_stacking = train_test_split(
    df.drop(columns='RainTomorrow'),
    df['RainTomorrow'],
    shuffle=True,
    random_state=SEED,
    test_size=0.2,
)
print(*(part.shape for part in (X, X_test_stacking, y, y_test_stacking)))

(113754, 23) (28439, 23) (113754,) (28439,)


In [5]:
def create_log_file(file_name):
    """Create an empty log table and pickle it to ``file_name``.

    The table has one column per field of a log record and no rows; it is
    written with ``joblib.dump``.

    Parameters
    ----------
    file_name : str
        Destination path of the pickled DataFrame.
    """
    # Only the column names matter for an empty frame. They are listed
    # literally so the function no longer reads the notebook globals
    # `model_type`, `model` and `best_preprocessing_pipe`, which are not
    # guaranteed to exist and previously caused a NameError.
    log_columns = [
        'model_type',
        'model',
        'type_of_model',  # e.g. 'tunning' or 'base'
        'best_preprocessing_pipe',
        'time_save',
        'time_search',
        'best_roc_auc',
        'best_hypparam',
        'type_of_searchers',  # e.g. 'GridSearchCV'
        'start_params_for_searchers',
    ]
    df_log = pd.DataFrame(columns=log_columns)
    joblib.dump(df_log, file_name)

def write_log(data, file_name):
    """Append ``data`` to the pickled log table stored at ``file_name``.

    If ``file_name`` does not exist yet, an empty log is created first with
    ``create_log_file``. The log is loaded, extended and written back in full.

    Parameters
    ----------
    data : pandas.DataFrame
        Row(s) to append.
    file_name : str
        Path of the pickled log DataFrame.
    """
    if not os.path.isfile(file_name):
        create_log_file(file_name)
    # NOTE: joblib.load unpickles the file; only use it on log files produced
    # by this notebook.
    df_log = joblib.load(file_name)
    # sort=False keeps the stored column order when `data` lists its columns
    # differently, and avoids the pandas "Sorting because non-concatenation
    # axis is not aligned" warning, whose default changed between versions.
    df_log = pd.concat([df_log, data], ignore_index=True, sort=False)
    joblib.dump(df_log, file_name)


# LOG file display

In [22]:
# Load the saved log, rank the runs of each model type by ROC AUC (best first)
# and show the ranking, then collapse it to one row per model type.
df_log0 = joblib.load(file_name_log)
list_display = [
    'model_type', 'type_of_model', 'best_roc_auc', 'time_save',
    'time_search', 'best_hypparam', 'model',
]
df_log = df_log0.sort_values(
    by=['model_type', 'best_roc_auc'],
    ascending=[True, False],
).loc[:, list_display]
display(df_log)
# NOTE(review): `first` returns the first non-null value per column, so a
# group's row equals its top-ranked run only when that run has no nulls.
df_log = df_log.groupby(by=['model_type']).agg(['first'])
display(df_log)

Unnamed: 0,model_type,type_of_model,best_roc_auc,time_save,time_search,best_hypparam,model
5,adb,tunning,0.864118,"02-22-2020, 01-28-11",00:00:45.217540,{},models/tunning\adb-tunning_model-[GridSearchCV...
10,cat,tunning,0.929801,"02-22-2020, 20-51-52",00:03:59.031723,{},models/tunning\cat-tunning_model-[GridSearchCV...
4,ext,tunning,1.0,"02-22-2020, 01-26-47",00:01:34.252897,{},models/tunning\ext-tunning_model-[GridSearchCV...
6,gb,tunning,0.876805,"02-22-2020, 01-35-13",00:03:37.395600,{},models/tunning\gb-tunning_model-[GridSearchCV]...
12,knn,tunning,0.941013,"02-22-2020, 21-08-00",00:11:45.298764,{},models/tunning\knn-tunning_model-[GridSearchCV...
11,lgb,tunning,0.903246,"02-22-2020, 20-54-52",00:00:11.183694,{},models/tunning\lgb-tunning_model-[GridSearchCV...
3,logreg,tunning,0.869198,"02-21-2020, 17-39-18",00:02:49.210596,{'m__C': 0.4771764656134451},models/tunning\logreg-tunning_model-[OptunaSea...
0,logreg,tunning,0.869198,"02-21-2020, 16-40-33",00:02:18.733379,{},models/tunning\logreg-tunning_model-[GridSearc...
13,mlp,tunning,0.929429,"02-25-2020, 01-58-07",00:20:50.728495,{},models/tunning\mlp-tunning_model-[GridSearchCV...
1,rf,tunning,0.999998,"02-21-2020, 16-43-36",00:01:29.060461,{},models/tunning\rf-tunning_model-[GridSearchCV]...


Unnamed: 0_level_0,type_of_model,best_roc_auc,time_save,time_search,best_hypparam,model
Unnamed: 0_level_1,first,first,first,first,first,first
model_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
adb,tunning,0.864118,"02-22-2020, 01-28-11",00:00:45.217540,{},models/tunning\adb-tunning_model-[GridSearchCV...
cat,tunning,0.929801,"02-22-2020, 20-51-52",00:03:59.031723,{},models/tunning\cat-tunning_model-[GridSearchCV...
ext,tunning,1.0,"02-22-2020, 01-26-47",00:01:34.252897,{},models/tunning\ext-tunning_model-[GridSearchCV...
gb,tunning,0.876805,"02-22-2020, 01-35-13",00:03:37.395600,{},models/tunning\gb-tunning_model-[GridSearchCV]...
knn,tunning,0.941013,"02-22-2020, 21-08-00",00:11:45.298764,{},models/tunning\knn-tunning_model-[GridSearchCV...
lgb,tunning,0.903246,"02-22-2020, 20-54-52",00:00:11.183694,{},models/tunning\lgb-tunning_model-[GridSearchCV...
logreg,tunning,0.869198,"02-21-2020, 17-39-18",00:02:49.210596,{'m__C': 0.4771764656134451},models/tunning\logreg-tunning_model-[OptunaSea...
mlp,tunning,0.929429,"02-25-2020, 01-58-07",00:20:50.728495,{},models/tunning\mlp-tunning_model-[GridSearchCV...
rf,tunning,0.999998,"02-21-2020, 16-43-36",00:01:29.060461,{},models/tunning\rf-tunning_model-[GridSearchCV]...
stacking,base_light,0.906363,"02-27-2020, 01-59-21",00:07:16.892957,,models/tunning\stacking-base_light_model-[Hold...


In [23]:
# Build {model_type: path to the pickled model} from the per-type log table.
# The 'model' column holds the path of the file with the saved model.
dict_tunning_models = df_log[('model', 'first')].to_dict()
print(dict_tunning_models)
# The stacking model is not one of the tuned base models, so drop it.
dict_tunning_models.pop('stacking', None)
# dict_tunning_models.pop('ext', None)
# dict_tunning_models.pop('rf', None)
pprint(dict_tunning_models)

{'adb': 'models/tunning\\adb-tunning_model-[GridSearchCV][02-22-2020, 01-28-11].pkl', 'cat': 'models/tunning\\cat-tunning_model-[GridSearchCV][02-22-2020, 20-51-52].pkl', 'ext': 'models/tunning\\ext-tunning_model-[GridSearchCV][02-22-2020, 01-26-47].pkl', 'gb': 'models/tunning\\gb-tunning_model-[GridSearchCV][02-22-2020, 01-35-13].pkl', 'knn': 'models/tunning\\knn-tunning_model-[GridSearchCV][02-22-2020, 21-08-00].pkl', 'lgb': 'models/tunning\\lgb-tunning_model-[GridSearchCV][02-22-2020, 20-54-52].pkl', 'logreg': 'models/tunning\\logreg-tunning_model-[OptunaSearchCV][02-21-2020, 17-39-18].pkl', 'mlp': 'models/tunning\\mlp-tunning_model-[GridSearchCV][02-25-2020, 01-58-07].pkl', 'rf': 'models/tunning\\rf-tunning_model-[GridSearchCV][02-21-2020, 16-43-36].pkl', 'stacking': 'models/tunning\\stacking-base_light_model-[HoldOut][02-27-2020, 01-59-21].pkl', 'xgb': 'models/tunning\\xgb-tunning_model-[GridSearchCV][02-21-2020, 17-24-57].pkl'}
{'adb': 'models/tunning\\adb-tunning_model-[GridSear

# For the slide about preprocessing

In [30]:
# For every tuned model, load the saved pipeline and print the steps of its
# first stage (presumably the preprocessing sub-pipeline; verify against the
# notebook that saved these models).
for model_type, model_path in dict_tunning_models.items():
    load_model = joblib.load(model_path)
    print(model_type, load_model.steps[0][1].steps)

adb [('tf1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('tf2', None)]
cat [('tf1', None), ('tf2', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 StandardScaler(copy=True, with_mean=True,
                                                with_std=True),
                                 ['Humidity9am', 'Humidity3pm', 'Pressure9am',
                                  'Pressure3pm', 'MinTemp', 'MaxTemp',
                                  'Temp9am', 'Temp3pm', 'Rainfall',
                                  'WindGustSpeed', 'WindSpeed9am',
                                  'WindSpeed3pm', 'year', 'month', 'season',
                                  'Cloud9am', 'Cloud3pm', 'Evaporation',
                                  'Sunshine']),
                                ('cat',
                                 OneHotEncoder(categories='auto', dr

# FeatureUnion for Stacking

In [15]:
# Xnew_train: list the tuned models (name and pickle path) that will feed the
# stacking FeatureUnion.
for model_type, model_path in dict_tunning_models.items():
    print(f'{model_type} {model_path}')

adb models/tunning\adb-tunning_model-[GridSearchCV][02-22-2020, 01-28-11].pkl
cat models/tunning\cat-tunning_model-[GridSearchCV][02-22-2020, 20-51-52].pkl
ext models/tunning\ext-tunning_model-[GridSearchCV][02-22-2020, 01-26-47].pkl
gb models/tunning\gb-tunning_model-[GridSearchCV][02-22-2020, 01-35-13].pkl
knn models/tunning\knn-tunning_model-[GridSearchCV][02-22-2020, 21-08-00].pkl
lgb models/tunning\lgb-tunning_model-[GridSearchCV][02-22-2020, 20-54-52].pkl
logreg models/tunning\logreg-tunning_model-[OptunaSearchCV][02-21-2020, 17-39-18].pkl
mlp models/tunning\mlp-tunning_model-[GridSearchCV][02-25-2020, 01-58-07].pkl
rf models/tunning\rf-tunning_model-[GridSearchCV][02-21-2020, 16-43-36].pkl
xgb models/tunning\xgb-tunning_model-[GridSearchCV][02-21-2020, 17-24-57].pkl


In [16]:
class PredictProbaToTransform(BaseEstimator, TransformerMixin):
    """Expose a classifier's positive-class probability as a transformer.

    Wraps a classifier so that ``transform`` returns column 1 of its
    ``predict_proba`` output as a single-column 2-D array (binary classifiers
    only). This lets several classifiers be placed side by side in a
    ``FeatureUnion``.
    """

    def __init__(self, clf):
        # Classifier whose predict_proba is called in transform().
        self.clf = clf

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the wrapped classifier is not fitted here."""
        return self

    def transform(self, X):
        """Return ``clf.predict_proba(X)[:, 1]`` reshaped to ``(-1, 1)``."""
        return np.reshape(self.clf.predict_proba(X)[:, 1], (-1, 1))


In [17]:
# Build, evaluate, save and log the stacking model.
# Each tuned model found on disk contributes one probability column through
# PredictProbaToTransform; a logistic regression is fitted on those columns.
start_time = datetime.now()
union_list = []

for model_type, model_path in dict_tunning_models.items():
    # Skip (and report) models whose pickle is missing.
    if not os.path.exists(model_path):
        print(f'{model_path} - file not found')
        continue
    load_model = joblib.load(model_path)
    union_list.append((model_type, PredictProbaToTransform(load_model)))

union = FeatureUnion(union_list)

stacking_pipe = Pipeline([
    ('union', union),
    ('meta_m', LogisticRegression()),
])

# Hold out: split the stacking part once more to score the meta-model.
X_train, X_test, y_train, y_test = train_test_split(
    X_test_stacking,
    y_test_stacking,
    shuffle=True,
    random_state=SEED,
    test_size=0.2,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

# PredictProbaToTransform.fit is a no-op, so this only fits the meta-model.
stacking_pipe.fit(X_train, y_train)
y_pred_train = stacking_pipe.predict_proba(X_train)[:, 1]
y_pred_test = stacking_pipe.predict_proba(X_test)[:, 1]
print(f'roc_auc train - {roc_auc_score(y_train, y_pred_train):.5f}, roc_auc test - {roc_auc_score(y_test, y_pred_test):.5f}')

# Refit on the whole stacking part before saving; the logged score below is
# still the hold-out score computed above.
stacking_pipe.fit(X_test_stacking, y_test_stacking)

model_type = 'stacking'
type_of_model = 'base_light'
type_of_searchers = 'HoldOut'
cur_time = datetime.now().strftime("%m-%d-%Y, %H-%M-%S")
file_name = os.path.join(
    path_for_tunning_model,
    f'{model_type}-{type_of_model}_model-[{type_of_searchers}][{cur_time}].pkl',
)
joblib.dump(stacking_pipe, file_name)

# One log record, with the same fields as the columns of the log file.
new_log = {
    'model_type': model_type,
    'model': file_name,
    'type_of_model': type_of_model,
    'best_preprocessing_pipe': '?',
    'time_save': cur_time,
    'time_search': datetime.now() - start_time,
    'best_roc_auc': roc_auc_score(y_test, y_pred_test),
    'best_hypparam': '',
    'type_of_searchers': type_of_searchers,
    'start_params_for_searchers': '',
}
print(f'new_log[best_roc_auc] = {new_log["best_roc_auc"]}')
write_log(pd.DataFrame([new_log]), file_name_log)

(22751, 23) (5688, 23) (22751,) (5688,)
roc_auc train - 0.90504, roc_auc test - 0.90636
new_log[best_roc_auc] = 0.9063631911002366



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [18]:
# Meta-model coefficients, followed by the base-model names in the order in
# which they were added to the FeatureUnion.
print(stacking_pipe.named_steps['meta_m'].coef_[0])
print(np.array([name for name, _ in union.transformer_list]))
# Output recorded from an earlier run with four base models:
# [2.05081497 1.07581275 2.14593311 2.95698225]
# ['adb' 'gb' 'logreg' 'xgb']


[ 1.47690734  4.50000543  3.74516449 -3.64773419  0.27699953  0.95520988
  0.06311789  0.51863522  0.89785433 -0.48374183]
['adb' 'cat' 'ext' 'gb' 'knn' 'lgb' 'logreg' 'mlp' 'rf' 'xgb']


In [20]:
# The stacking cell already appends `new_log` right after saving the model, so
# running this cell as well used to add a duplicate row to the log. Write the
# record only when its model path is not in the log yet.
already_logged = (
    os.path.isfile(file_name_log)
    and new_log['model'] in set(joblib.load(file_name_log)['model'])
)
if not already_logged:
    write_log(pd.DataFrame([new_log]), file_name_log)

In [143]:
# Sanity check 1: one wrapped model yields a single probability column.
load_model = joblib.load(dict_tunning_models['logreg'])
ppp = PredictProbaToTransform(load_model)
display(ppp.fit_transform(X_test_stacking)[:5])

# Sanity check 2: a FeatureUnion of wrapped models yields one column per model.
union_list = []
for model_type, model_path in dict_tunning_models.items():
    print(model_type, model_path)
    # Skip (and report) models whose pickle is missing.
    if not os.path.exists(model_path):
        print(f'{model_path} - file not found')
        continue
    load_model = joblib.load(model_path)
    union_list.append((model_type, PredictProbaToTransform(load_model)))
# union_list
union = FeatureUnion(union_list)
output = union.fit_transform(X_test_stacking)
display(output[:5])

(28439, 1)


array([[0.1704409 ],
       [0.11660942],
       [0.69035864],
       [0.07295003],
       [0.96805561]])

logreg models/tunning\logreg-tunning_model-[OptunaSearchCV][02-21-2020, 17-39-18].pkl
rf models/tunning\rf-tunning_model-[GridSearchCV][02-21-2020, 16-43-36].pkl
xgb models/tunning\xgb-tunning_model-[GridSearchCV][02-21-2020, 17-24-57].pkl
(28439, 1)
(28439, 1)
(28439, 1)


array([[0.1704409 , 0.5       , 0.30050564],
       [0.11660942, 0.23      , 0.10351475],
       [0.69035864, 0.49      , 0.64695835],
       [0.07295003, 0.04      , 0.04351906],
       [0.96805561, 0.98      , 0.9610343 ]])