# Libraries

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook
from IPython.display import display
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, classification_report, \
            accuracy_score, precision_score, recall_score
from sklearn.base import BaseEstimator, TransformerMixin

from pprint import pprint # хорошо словари принтит

file_name_log = 'models/df_log.pkl'
file_name_log2 = 'models/df_log2.pkl'
# path_for_tunning_model = 'models/tunning'
SEED = 24

In [23]:
class PredictProbaToTransform(BaseEstimator, TransformerMixin):
    """
    Give all X and classifier on input
    output - predict_proba by classifier (only binary classifier)
    This class needs to compare outputs from classifiers in FeatureUnion model
    """
    def __init__(self, clf):
        self.clf = clf
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        output = self.clf.predict_proba(X)[:, 1]
        output = np.reshape(output, (-1,1))
        return output


# Load data (step 1)

In [2]:
# load data
df = pd.read_csv('data/weather.csv')
df.sort_values(by=['Location_ID', 'Date'], inplace=True)
df.reset_index(inplace=True);

print(df.shape)

(142193, 29)


# Cleaning and preparing PANDAS (step 1)

In [3]:
# separate target
df_target = df['RainTomorrow'].map(lambda x: 1 if x == True else 0).astype('uint8')
# load data

print(df.shape, df.duplicated().sum(), df.columns)
df.head(2)

(142193, 29) 0 Index(['index', 'Unnamed: 0', 'Date', 'Location_ID', 'Location', 'Cloud9am',
       'Cloud3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
       'RISK_MM', 'RainTomorrow', 'MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm',
       'Rainfall', 'Evaporation', 'Sunshine', 'RainToday', 'WindGustDir',
       'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am',
       'WindSpeed3pm', 'Data2', 'Location_ID2'],
      dtype='object')


Unnamed: 0.1,index,Unnamed: 0,Date,Location_ID,Location,Cloud9am,Cloud3pm,Humidity9am,Humidity3pm,Pressure9am,...,Sunshine,RainToday,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Data2,Location_ID2
0,32041,32041,2008-07-01,0,Adelaide,,,92.0,67.0,1017.4,...,2.6,True,NW,48.0,SW,W,13.0,15.0,2008-07-01,0
1,32042,32042,2008-07-02,0,Adelaide,,,75.0,52.0,1022.4,...,7.8,False,SW,35.0,SSW,SW,13.0,15.0,2008-07-02,0


# Template csv

In [5]:
# load data
df = pd.read_csv('data/weather.csv')
df.sort_values(by=['Location_ID', 'Date'], inplace=True)
df.reset_index(inplace=True);
df.drop(['Unnamed: 0', 'Data2',  'Location', 'Location_ID2', 'RISK_MM', 'RainTomorrow', 'RainToday', 'index'], axis=1, inplace=True)
print(df.shape)

(142193, 21)


In [7]:
template_csv = df.dropna().head()

In [8]:
template_csv.to_csv('data/template1.csv')

In [9]:
df = pd.read_csv('data/template1.csv')
df

Unnamed: 0.1,Unnamed: 0,Date,Location_ID,Cloud9am,Cloud3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,MinTemp,...,Temp3pm,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm
0,9117,2008-12-01,3,0.0,3.0,17.0,16.0,1010.5,1005.8,19.6,...,37.2,0.0,11.2,9.8,WNW,87.0,NNE,NE,20.0,11.0
1,9118,2008-12-02,3,2.0,5.0,18.0,13.0,1009.3,1004.6,21.0,...,38.2,1.2,9.0,12.2,NNW,41.0,NNW,S,24.0,9.0
2,9119,2008-12-03,3,0.0,3.0,17.0,11.0,1006.3,1003.4,22.9,...,40.4,0.0,11.6,12.6,NNW,48.0,ENE,N,7.0,24.0
3,9120,2008-12-04,3,3.0,6.0,29.0,24.0,1008.7,1006.9,24.7,...,36.5,0.0,16.0,7.8,WNW,72.0,SSW,W,2.0,22.0
4,9121,2008-12-05,3,7.0,6.0,58.0,43.0,1014.0,1010.7,23.4,...,31.1,0.2,12.2,4.1,SSW,46.0,S,S,9.0,17.0


# def preprocessing_befor_load(df):

In [10]:
def preprocessing_befor_load(df):
    # convert dates
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
    df['year'] = df['Date'].dt.year.astype('uint16')
    df['month'] = df['Date'].dt.month.astype('uint8')
    df['season'] = df['month'].apply(lambda x: 1 if x in [1,11,12] else 
                                    2 if x in [3, 4, 5] else 
                                    3 if x in [6, 7, 8] else 4).astype('uint8')
    # replace wind directions with numeric
    wind_rose = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
    dict_replacer = dict(zip(wind_rose, range(len(wind_rose))))
    df.replace(dict_replacer, inplace=True)

    # drop
    df.drop(['Unnamed: 0', 'Date'], axis=1, inplace=True)
    return df

In [11]:
print(df.shape)
df = preprocessing_befor_load(df)
print(df.shape)
display(df.head().T)

(5, 22)
(5, 23)


Unnamed: 0,0,1,2,3,4
Location_ID,3.0,3.0,3.0,3.0,3.0
Cloud9am,0.0,2.0,0.0,3.0,7.0
Cloud3pm,3.0,5.0,3.0,6.0,6.0
Humidity9am,17.0,18.0,17.0,29.0,58.0
Humidity3pm,16.0,13.0,11.0,24.0,43.0
Pressure9am,1010.5,1009.3,1006.3,1008.7,1014.0
Pressure3pm,1005.8,1004.6,1003.4,1006.9,1010.7
MinTemp,19.6,21.0,22.9,24.7,23.4
MaxTemp,37.6,39.1,40.9,40.5,32.4
Temp9am,30.4,32.5,35.0,32.3,26.8


# Load data with all imputation

In [13]:
df = pd.read_csv('data/df_after_all_imputations.csv')
df.sort_values(by=['Location_ID', 'Date'], inplace=True)
df.reset_index(inplace=True);
df.drop(['Unnamed: 0', 'index', 'Date'], axis=1, inplace=True)

print(df.shape)

(142193, 24)


# Split data test for stacking 

In [14]:
X, X_test_stacking, y, y_test_stacking = train_test_split(
    df.drop('RainTomorrow', axis=1), 
    df['RainTomorrow'],
    test_size=0.2,
    random_state=SEED,
    shuffle=True,
)
print(X.shape, X_test_stacking.shape, y.shape, y_test_stacking.shape)

(113754, 23) (28439, 23) (113754,) (28439,)


# LOG file display (step 3)

In [86]:
df_log0 = joblib.load(file_name_log)
display(df_log.tail().T)

Unnamed: 0,model_type,logreg,mlp,rf,stacking,xgb
type_of_model,first,tunning,tunning,tunning,base_light,tunning
best_roc_auc,first,0.869198,0.929429,0.999998,0.906363,0.877168
time_save,first,"02-21-2020, 17-39-18","02-25-2020, 01-58-07","02-21-2020, 16-43-36","02-27-2020, 01-59-21","02-21-2020, 17-24-57"
time_search,first,0 days 00:02:49.210596,0 days 00:20:50.728495,0 days 00:01:29.060461,0 days 00:07:16.892957,0 days 00:00:52.400062
best_hypparam,first,{'m__C': 0.4771764656134451},{},{},,{}
model,first,models/tunning\logreg-tunning_model-[OptunaSea...,models/tunning\mlp-tunning_model-[GridSearchCV...,models/tunning\rf-tunning_model-[GridSearchCV]...,models/tunning\stacking-base_light_model-[Hold...,models/tunning\xgb-tunning_model-[GridSearchCV...


In [59]:
df_log0 = joblib.load(file_name_log)
# print(df_log.columns)
display(df_log.tail().T)
list_display = ['model_type', 'type_of_model', 'best_roc_auc','time_save', 'time_search', 'best_hypparam', 'model' ]
df_log = df_log0.sort_values(by=['model_type', 'best_roc_auc'], ascending=[True, False])[list_display]
display(df_log)
df_log = df_log.groupby(by=['model_type']).agg(['first'])
display(df_log)

Unnamed: 0,model_type,type_of_model,best_roc_auc,time_save,time_search,best_hypparam,model
5,adb,tunning,0.864118,"02-22-2020, 01-28-11",00:00:45.217540,{},models/tunning\adb-tunning_model-[GridSearchCV...
10,cat,tunning,0.929801,"02-22-2020, 20-51-52",00:03:59.031723,{},models/tunning\cat-tunning_model-[GridSearchCV...
4,ext,tunning,1.0,"02-22-2020, 01-26-47",00:01:34.252897,{},models/tunning\ext-tunning_model-[GridSearchCV...
6,gb,tunning,0.876805,"02-22-2020, 01-35-13",00:03:37.395600,{},models/tunning\gb-tunning_model-[GridSearchCV]...
12,knn,tunning,0.941013,"02-22-2020, 21-08-00",00:11:45.298764,{},models/tunning\knn-tunning_model-[GridSearchCV...
11,lgb,tunning,0.903246,"02-22-2020, 20-54-52",00:00:11.183694,{},models/tunning\lgb-tunning_model-[GridSearchCV...
3,logreg,tunning,0.869198,"02-21-2020, 17-39-18",00:02:49.210596,{'m__C': 0.4771764656134451},models/tunning\logreg-tunning_model-[OptunaSea...
0,logreg,tunning,0.869198,"02-21-2020, 16-40-33",00:02:18.733379,{},models/tunning\logreg-tunning_model-[GridSearc...
13,mlp,tunning,0.929429,"02-25-2020, 01-58-07",00:20:50.728495,{},models/tunning\mlp-tunning_model-[GridSearchCV...
1,rf,tunning,0.999998,"02-21-2020, 16-43-36",00:01:29.060461,{},models/tunning\rf-tunning_model-[GridSearchCV]...


Unnamed: 0_level_0,type_of_model,best_roc_auc,time_save,time_search,best_hypparam,model
Unnamed: 0_level_1,first,first,first,first,first,first
model_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
adb,tunning,0.864118,"02-22-2020, 01-28-11",00:00:45.217540,{},models/tunning\adb-tunning_model-[GridSearchCV...
cat,tunning,0.929801,"02-22-2020, 20-51-52",00:03:59.031723,{},models/tunning\cat-tunning_model-[GridSearchCV...
ext,tunning,1.0,"02-22-2020, 01-26-47",00:01:34.252897,{},models/tunning\ext-tunning_model-[GridSearchCV...
gb,tunning,0.876805,"02-22-2020, 01-35-13",00:03:37.395600,{},models/tunning\gb-tunning_model-[GridSearchCV]...
knn,tunning,0.941013,"02-22-2020, 21-08-00",00:11:45.298764,{},models/tunning\knn-tunning_model-[GridSearchCV...
lgb,tunning,0.903246,"02-22-2020, 20-54-52",00:00:11.183694,{},models/tunning\lgb-tunning_model-[GridSearchCV...
logreg,tunning,0.869198,"02-21-2020, 17-39-18",00:02:49.210596,{'m__C': 0.4771764656134451},models/tunning\logreg-tunning_model-[OptunaSea...
mlp,tunning,0.929429,"02-25-2020, 01-58-07",00:20:50.728495,{},models/tunning\mlp-tunning_model-[GridSearchCV...
rf,tunning,0.999998,"02-21-2020, 16-43-36",00:01:29.060461,{},models/tunning\rf-tunning_model-[GridSearchCV]...
stacking,base_light,0.906363,"02-27-2020, 01-59-21",00:07:16.892957,,models/tunning\stacking-base_light_model-[Hold...


In [16]:
print(df_log.columns)
df_log.columns = [x[0] for x in df_log.columns]
print(df_log.columns)

MultiIndex([('type_of_model', 'first'),
            ( 'best_roc_auc', 'first'),
            (    'time_save', 'first'),
            (  'time_search', 'first'),
            ('best_hypparam', 'first'),
            (        'model', 'first')],
           )
Index(['type_of_model', 'best_roc_auc', 'time_save', 'time_search',
       'best_hypparam', 'model'],
      dtype='object')


In [17]:
dict_tunning_models = dict(zip(df_log.index.to_list(), df_log['model']))
for m, m_path in dict_tunning_models.items():
    print(m, m_path)

adb models/tunning\adb-tunning_model-[GridSearchCV][02-22-2020, 01-28-11].pkl
cat models/tunning\cat-tunning_model-[GridSearchCV][02-22-2020, 20-51-52].pkl
ext models/tunning\ext-tunning_model-[GridSearchCV][02-22-2020, 01-26-47].pkl
gb models/tunning\gb-tunning_model-[GridSearchCV][02-22-2020, 01-35-13].pkl
knn models/tunning\knn-tunning_model-[GridSearchCV][02-22-2020, 21-08-00].pkl
lgb models/tunning\lgb-tunning_model-[GridSearchCV][02-22-2020, 20-54-52].pkl
logreg models/tunning\logreg-tunning_model-[OptunaSearchCV][02-21-2020, 17-39-18].pkl
mlp models/tunning\mlp-tunning_model-[GridSearchCV][02-25-2020, 01-58-07].pkl
rf models/tunning\rf-tunning_model-[GridSearchCV][02-21-2020, 16-43-36].pkl
stacking models/tunning\stacking-base_model-[HoldOut][02-22-2020, 15-18-41].pkl
xgb models/tunning\xgb-tunning_model-[GridSearchCV][02-21-2020, 17-24-57].pkl


In [19]:
def m_predict(model_path):
    model = joblib.load(model_path)
    y_pred_train = model.predict_proba(X)[:, 1]
    y_pred_test = model.predict_proba(X_test_stacking)[:, 1]
#     print(f'roc_auc train - {roc_auc_score(y, y_pred_train):.5f}, roc_auc test - {roc_auc_score(y_test_stacking, y_pred_test):.5f}')
    print(model_path)
    return roc_auc_score(y, y_pred_train), roc_auc_score(y_test_stacking, y_pred_test)

In [30]:
# df_log['roc_auc_test'] = df_log['model'].apply(lambda x: m_predict(x))

models/tunning\adb-tunning_model-[GridSearchCV][02-22-2020, 01-28-11].pkl
models/tunning\cat-tunning_model-[GridSearchCV][02-22-2020, 20-51-52].pkl
models/tunning\ext-tunning_model-[GridSearchCV][02-22-2020, 01-26-47].pkl
models/tunning\gb-tunning_model-[GridSearchCV][02-22-2020, 01-35-13].pkl
models/tunning\knn-tunning_model-[GridSearchCV][02-22-2020, 21-08-00].pkl
models/tunning\lgb-tunning_model-[GridSearchCV][02-22-2020, 20-54-52].pkl
models/tunning\logreg-tunning_model-[OptunaSearchCV][02-21-2020, 17-39-18].pkl
models/tunning\mlp-tunning_model-[GridSearchCV][02-25-2020, 01-58-07].pkl
models/tunning\rf-tunning_model-[GridSearchCV][02-21-2020, 16-43-36].pkl
models/tunning\stacking-base_model-[HoldOut][02-22-2020, 15-18-41].pkl
models/tunning\xgb-tunning_model-[GridSearchCV][02-21-2020, 17-24-57].pkl


# Test all model (exept stacking) on X_test_stacking Data

In [88]:
# file_name_log2 = 'models/df_log2.pkl'
df_log2 = joblib.load(file_name_log2)
display(df_log2.sort_values(by=['roc_auc_test'], ascending=False))
for_saving = df_log2.sort_values(by=['roc_auc_test'], ascending=False)
for_saving.to_csv('raiting_models')

Unnamed: 0_level_0,type_of_model,best_roc_auc,time_save,time_search,best_hypparam,model,roc_auc_test,roc_auc_train
model_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
stacking,base_light,0.906363,"02-27-2020, 01-59-21",00:07:16.892957,,models/tunning\stacking-base_light_model-[Hold...,0.905371,0.999997
cat,tunning,0.929801,"02-22-2020, 20-51-52",00:03:59.031723,{},models/tunning\cat-tunning_model-[GridSearchCV...,0.900574,0.929801
ext,tunning,1.0,"02-22-2020, 01-26-47",00:01:34.252897,{},models/tunning\ext-tunning_model-[GridSearchCV...,0.89155,1.0
lgb,tunning,0.903246,"02-22-2020, 20-54-52",00:00:11.183694,{},models/tunning\lgb-tunning_model-[GridSearchCV...,0.889887,0.903246
rf,tunning,0.999998,"02-21-2020, 16-43-36",00:01:29.060461,{},models/tunning\rf-tunning_model-[GridSearchCV]...,0.886741,0.999998
xgb,tunning,0.877168,"02-21-2020, 17-24-57",00:00:52.400062,{},models/tunning\xgb-tunning_model-[GridSearchCV...,0.875322,0.877168
gb,tunning,0.876805,"02-22-2020, 01-35-13",00:03:37.395600,{},models/tunning\gb-tunning_model-[GridSearchCV]...,0.87454,0.876805
mlp,tunning,0.929429,"02-25-2020, 01-58-07",00:20:50.728495,{},models/tunning\mlp-tunning_model-[GridSearchCV...,0.873276,0.929429
logreg,tunning,0.869198,"02-21-2020, 17-39-18",00:02:49.210596,{'m__C': 0.4771764656134451},models/tunning\logreg-tunning_model-[OptunaSea...,0.869513,0.869198
adb,tunning,0.864118,"02-22-2020, 01-28-11",00:00:45.217540,{},models/tunning\adb-tunning_model-[GridSearchCV...,0.864254,0.864118


# For SLIDES

In [None]:
tf_only_scale = StandardScaler()
tf_cat_and_num_list = [
    ('num', StandardScaler(), num_col),
    ('cat', OneHotEncoder(), cat_col),
]
tf_cat_and_num = ColumnTransformer(transformers=tf_cat_and_num_list)

pipe_preprocessing = Pipeline([
    ('tf1', tf_only_scale),
    ('tf2', tf_cat_and_num),
])

ml_pipe = Pipeline([
    ('pre', pipe_preprocessing),
    ('model', model),
])
param_gs = [{'pre__tf1': [None],},
            {'pre__tf2': [None],},
            {'pre__tf1': [None],'pre__tf2': [None],}]

gs = GridSearchCV(
    estimator=ml_pipe,
    param_grid=param_gs,
    cv=cv,
    scoring=metric,
)

In [None]:
model_dict = {
    'logreg': LogisticRegression(),
    'knn'   : KNeighborsClassifier(),
    'rf'    : RandomForestClassifier(),
    'ext'   : ExtraTreesClassifier(),
    'adb'   : AdaBoostClassifier(),
    'gb'    : GradientBoostingClassifier(),
    'xgb'   : XGBClassifier(),
    'cat'   : CatBoostClassifier(),
    'lgb'   : LGBMClassifier(),
    'mlp'   : MLPClassifier(hidden_layer_sizes=(128, 32, 2)),
}
logreg [('tf1', None), ('tf2', ColumnTransformer(...))]
knn    [('tf1', StandardScaler()), ('tf2', None)]
rf     [('tf1', None), ('tf2', None)]
ext    [('tf1', None), ('tf2', None)]
adb    [('tf1', StandardScaler()), ('tf2', None)]
gb     [('tf1', None), ('tf2', None)]
xgb    [('tf1', StandardScaler()), ('tf2', None)]
cat    [('tf1', None), ('tf2', ColumnTransformer(...))]
lgb    [('tf1', None), ('tf2', ColumnTransformer(...))]
mlp    [('tf1', StandardScaler()), ('tf2', None)]