# Libraries

In [2]:
import os
import pandas as pd
import numpy as np
import scipy as sp
from tqdm import tqdm_notebook
from IPython.display import display
from datetime import datetime
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, \
            GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, classification_report, \
            accuracy_score, precision_score, recall_score
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import optuna  # for optuna need sklearn ver less then '0.22.1' (for example 0.22)
from optuna.samplers import RandomSampler
from optuna.integration import OptunaSearchCV
from optuna.distributions import CategoricalDistribution, IntUniformDistribution, \
    UniformDistribution, LogUniformDistribution
from optuna.visualization import plot_slice, plot_contour, plot_optimization_history, \
    plot_intermediate_values, plot_parallel_coordinate

# Seed and parallelism settings shared by the estimators and splitters below.
SEED, N_JOBS = 24, -1

# Shuffled, stratified K-fold splitter reused by every search in this notebook.
cv_number = 3
cv = StratifiedKFold(shuffle=True, random_state=SEED, n_splits=cv_number)

# Scoring name handed to the sklearn searchers; valid names are listed at
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
metric = 'roc_auc'



The sklearn.metrics.scorer module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.



# List of models

In [11]:
# Untuned candidate classifiers keyed by a short name; the same short name is used
# as the prefix of the saved model files. Insertion order is the training order.
model_dict = dict(
    logreg=LogisticRegression(max_iter=100, random_state=SEED, n_jobs=N_JOBS),
    knn=KNeighborsClassifier(n_jobs=N_JOBS),
    dt=DecisionTreeClassifier(random_state=SEED),
    rf=RandomForestClassifier(random_state=SEED, n_jobs=N_JOBS),
    ext=ExtraTreesClassifier(random_state=SEED, n_jobs=N_JOBS),
    # bag=BaggingClassifier(),  # left out: duplicates RandomForest
    adb=AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=1),
        random_state=SEED,
    ),
    gb=GradientBoostingClassifier(random_state=SEED),
    xgb=XGBClassifier(random_state=SEED, n_jobs=N_JOBS),
    # svc=SVC(random_state=SEED),  # left out
    cat=CatBoostClassifier(random_state=SEED, verbose=False),
    lgb=LGBMClassifier(random_state=SEED, n_jobs=N_JOBS),
    mlp=MLPClassifier(hidden_layer_sizes=(128, 32, 2), random_state=SEED),
)

# Create directories for saving models

In [4]:
# Directory the freshly trained base pipelines are dumped into.
path_for_base_model = 'models/base'
# Directory the base pipelines are loaded back from (not created by this notebook).
path_for_base_model1 = 'models/base1'
# Directory the tuned pipelines are dumped into.
path_for_tunning_model = 'models/tunning'
# joblib-pickled pandas DataFrame that collects one row per tuning run.
file_name_log = 'models/df_log.pkl'
def create_dir(path):
    """Create directory ``path`` if it does not exist yet and report what happened.

    Missing parent directories are created as well, so a nested path such as
    ``models/base`` works even when ``models`` does not exist yet (plain
    ``os.mkdir`` raises ``FileNotFoundError`` in that case).

    Args:
        path: directory path to create.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    if os.path.isdir(path):
        print(f'Check this path "{path}", now it is exist')
        return
    # exist_ok=True keeps the call safe if the directory appears between the check
    # above and this call.
    os.makedirs(path, exist_ok=True)
    if os.path.isdir(path):
        print(f'Create dir "{path}"')

# Make sure both output directories exist before any model is saved.
for target_dir in (path_for_base_model, path_for_tunning_model):
    create_dir(target_dir)

Check this path "models/base", now it is exist
Check this path "models/tunning", now it is exist


# I see two options for the next imputation step
* fill missing values for some Locations with an ML algorithm
* drop all Locations with a lot of missing values


then compare the results


* ### parameters for tuning each model (choose between gridCV, randomizedCV, scopeCV ...)


# Preprocessing pipeline for the data after imputation
* ### compare several preprocessing options (Location and WindDir as categorical (one-hot) or as numerical)

In [12]:
# Column groups. The three "num_*" lists below are all routed to the StandardScaler
# branch of the ColumnTransformer; the last two lists go to the one-hot branch.
num_col_for_scaling = [
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    'MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm',
    'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
]
num_col_raw = ['year', 'month', 'season']
num_question = ['Cloud9am', 'Cloud3pm', 'Evaporation', 'Sunshine']
num_col_categorical = ['Location_ID']
cat_col = ['WindGustDir', 'WindDir9am', 'WindDir3pm']

# Variant 1: scale every incoming column.
tf_only_scale = StandardScaler()

# Variant 2: scale the numeric groups and one-hot encode the categorical groups.
tf_cat_and_num_list = [
    (
        'num',
        StandardScaler(),
        num_col_for_scaling + num_col_raw + num_question,
    ),
    (
        'cat',
        OneHotEncoder(sparse=False, dtype=np.uint8, handle_unknown='ignore'),
        num_col_categorical + cat_col,
    ),
]
tf_cat_and_num = ColumnTransformer(transformers=tf_cat_and_num_list)

# Both variants are chained here; the grid search further below switches
# individual steps off by setting them to None.
pipe_preprocessing = Pipeline([('tf1', tf_only_scale), ('tf2', tf_cat_and_num)])


# Load data with all imputation

In [13]:
# Read the imputed data set, order it by location and date, then discard the helper
# columns (including the "index" column that reset_index() inserts).
df = (
    pd.read_csv('data/df_after_all_imputations.csv')
    .sort_values(by=['Location_ID', 'Date'])
    .reset_index()
    .drop(['Unnamed: 0', 'index', 'Date'], axis=1)
)

print(df.shape)

(142193, 24)


# Hold out a test split for stacking

In [14]:
# Keep 20% of the rows aside as the hold-out part used for stacking;
# 'RainTomorrow' is the target column, everything else is a feature.
X, X_test_stacking, y, y_test_stacking = train_test_split(
    df.drop(columns='RainTomorrow'),
    df['RainTomorrow'],
    shuffle=True,
    random_state=SEED,
    test_size=0.2,
)
print(*(part.shape for part in (X, X_test_stacking, y, y_test_stacking)))

(113754, 23) (28439, 23) (113754,) (28439,)


# Save base models with preprocessing

In [None]:
# Fit every model from model_dict with each preprocessing variant, keep the best
# pipeline per model and dump it (plus a summary dict) into path_for_base_model.
time_show = True  # print the elapsed time of every model
print(X.shape, y.shape)
score_dict = dict()

# Preprocessing variants to compare (identical for every model, so built once):
# a step set to None is skipped by the pipeline, i.e. "tf2 only", "tf1 only"
# and "no preprocessing at all".
param_gs = [
    {'pre__tf1': [None]},
    {'pre__tf2': [None]},
    {'pre__tf1': [None], 'pre__tf2': [None]},
]

for i, model in model_dict.items():
    print(f'Start {i}...')
    if time_show:
        start_time = datetime.now()
    ml_pipe = Pipeline([
        ('pre', pipe_preprocessing),
        ('model', model),
    ])

    gs = GridSearchCV(
        estimator=ml_pipe,
        param_grid=param_gs,
        cv=cv,
        scoring=metric,
        return_train_score=False,
        verbose=0,
        n_jobs=N_JOBS,
    )
    gs.fit(X, y)
    print(f'gs.best_params_ {gs.best_params_}')
    print(f'gs.best_score_ {gs.best_score_}')

    # GridSearchCV refits the best pipeline on the whole (X, y) by default
    # (refit=True), so best_estimator_ is already trained; fitting it again
    # only repeated the most expensive step.
    best_model = gs.best_estimator_
    # NOTE: these are predictions on the training data, so 'roc_auc_score' below is
    # a train score; 'gs.best_score_' is the cross-validated one.
    y_pred = best_model.predict_proba(X)[:, 1]
    result = {
        'gs.best_params_': gs.best_params_,
        'gs.best_estimator_': gs.best_estimator_,
        'gs.best_score_': gs.best_score_,
        'roc_auc_score': roc_auc_score(y, y_pred),
    }
    score_dict[i] = result

    file_name = os.path.join(path_for_base_model, f'{i}-base_model-[without_drop].pkl')
    joblib.dump(best_model, file_name)
    if os.path.isfile(file_name):
        print(f'Model saves as {file_name}')
    else:
        print(f'File doesn`t save {file_name}')

    if time_show:
        print(datetime.now() - start_time)
    print(f'! Finish {i}')

# Persist the per-model summary next to the models.
file_name = os.path.join(path_for_base_model, 'score_dict-base_model-[without_drop].pkl')
joblib.dump(score_dict, file_name)
if os.path.isfile(file_name):
    print(f'File saves as {file_name}')
else:
    print(f'File doesn`t save {file_name}')


# Load base model

In [15]:
# Load the saved base pipelines (and the score summary) from path_for_base_model1.
# File names are expected to look like "<model key>-...pkl" / "score_dict-...pkl".
model_base_dict = dict()
print(path_for_base_model1)
# os.listdir returns entries in arbitrary order; sorting makes it deterministic
# which file is kept when several files share the same model prefix.
for file in sorted(os.listdir(path_for_base_model1)):
    print(file)
    file_path = os.path.join(path_for_base_model1, file)
    # Skip sub-directories (e.g. .ipynb_checkpoints) and files that are not pickles.
    if not (os.path.isfile(file_path) and file.endswith('.pkl')):
        continue
    name = file.split('-')[0]
    if name in model_dict:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
        # only load files produced by this project.
        model = joblib.load(file_path)
        # setdefault keeps the first file found for a given model key.
        model_base_dict.setdefault(name, model)
    elif name == 'score_dict':
        score_dict = joblib.load(file_path)

models/base1
.ipynb_checkpoints
adb-base_model-[without_drop].pkl
cat-base_model-[without_drop].pkl
del
dt-base_model-[without_drop].pkl
ext-base_model-[without_drop].pkl
gb-base_model-[without_drop].pkl
knn-base_model2-[without_drop].pkl
lgb-base_model-[without_drop].pkl
logreg-base_model-[without_drop].pkl
mlp-base_model3-[without_drop].pkl
outcome - Save base models with preprocessing.txt
rf-base_model-[without_drop].pkl
score_dict-base_model-[without_drop].pkl
xgb-base_model-[without_drop].pkl


In [11]:
# Sanity check: the 'logreg' base pipeline was loaded (raises KeyError otherwise)
# and X / y still have the expected shapes.
temp = model_base_dict['logreg']
print(X.shape, y.shape)
# temp  # uncomment to display the loaded pipeline

(113754, 23) (113754,)


# Tune models and save them; the log is kept in a pd.DataFrame stored as a pickle
* ### we have the best preprocessing for each model: load it, fit the pipeline and tune the classifier's hyperparameters

In [17]:
def create_log_file(file_name):
    """Create an empty tuning log and dump it to ``file_name`` with joblib.

    The log is an empty pandas DataFrame whose columns are the fields of one
    tuning record. The column names are listed explicitly so the function does
    not depend on notebook globals (``model_type``, ``model``, ...) being defined
    when it is called.

    Args:
        file_name: path of the pickle file to create (overwritten if it exists).
    """
    log_columns = [
        'model_type',
        'model',
        'type_of_model',  # 'tunning' or 'base'
        'best_preprocessing_pipe',
        'time_save',
        'time_search',
        'best_roc_auc',
        'best_hypparam',
        'type_of_searchers',
        'start_params_for_searchers',
    ]
    df_log = pd.DataFrame(columns=log_columns)
    joblib.dump(df_log, file_name)

def write_log(data, file_name):
    """Append the rows of ``data`` to the pickled log DataFrame at ``file_name``.

    When ``file_name`` does not exist yet, an empty log is created first via
    ``create_log_file``. The combined frame gets a fresh 0..n-1 index and
    overwrites the file.

    Args:
        data: pandas DataFrame with the new log row(s).
        file_name: path of the joblib-pickled log DataFrame.
    """
    log_is_missing = not os.path.isfile(file_name)
    if log_is_missing:
        create_log_file(file_name)
    previous_rows = joblib.load(file_name)
    joblib.dump(pd.concat([previous_rows, data], ignore_index=True), file_name)


# GridSearchCV

In [None]:
# GridSearchCV
# Tune one classifier on top of the preprocessing taken from its saved base
# pipeline, dump the fitted pipeline and append one row to the log.
model_type = 'mlp'  # key of model_dict / model_base_dict to tune
type_of_searchers = 'GridSearchCV'
start_time = datetime.now()
base_pipeline = model_base_dict[model_type]
print(X.shape, y.shape)
# First step of the loaded pipeline; presumably the 'pre' step that won the
# base-model search (verify against how the file was produced).
best_preprocessing_pipe = base_pipeline[0]
model = model_dict[model_type]

ml_pipe = Pipeline([
    ('pre', best_preprocessing_pipe),
    ('m', model),
])
# Grid for the classifier step 'm'. With every entry commented out the grid is
# empty, i.e. the pipeline is only cross-validated with its current parameters.
param_gs = {
#     'm__solver' : ['lbfgs', 'liblinear'],

#     'm__penalty': ['l1', 'l2'],
#     'm__C': [0.1, 0.47, 1, 10], # [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}
gs = GridSearchCV(
    estimator=ml_pipe,
    param_grid=param_gs,
    cv=cv,
    scoring=metric,
    return_train_score=False,
    verbose=11,
    n_jobs=N_JOBS,
)
gs.fit(X, y)
print(f'gs.best_params_ {gs.best_params_}')
print(f'gs.best_score_ {gs.best_score_}')
# GridSearchCV refits the best pipeline on the whole (X, y) by default
# (refit=True), so best_estimator_ is already trained; a second fit only
# repeated the training.
best_model = gs.best_estimator_
# Predictions on the training data: the logged 'best_roc_auc' is a train score,
# while gs.best_score_ printed above is the cross-validated one.
y_pred = best_model.predict_proba(X)[:, 1]

cur_time = datetime.now().strftime("%m-%d-%Y, %H-%M-%S")
type_of_model = 'tunning'
file_name = os.path.join(
    path_for_tunning_model,
    f'{model_type}-{type_of_model}_model-[{type_of_searchers}][{cur_time}].pkl',
)
joblib.dump(best_model, file_name)
new_log = dict(
    model_type=model_type,
    model=file_name,  # path of the dumped pipeline, not the estimator itself
    type_of_model=type_of_model,  # or base
    best_preprocessing_pipe=best_preprocessing_pipe,
    time_save=cur_time,
    time_search=datetime.now() - start_time,
    best_roc_auc=roc_auc_score(y, y_pred),
    best_hypparam=gs.best_params_,
    type_of_searchers=type_of_searchers,
    start_params_for_searchers=param_gs,
)
print(f'new_log[best_roc_auc] = {new_log["best_roc_auc"]}')
write_log(pd.DataFrame([new_log]), file_name_log)
