In [None]:
import os
import pickle
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns

# sklearn API like
from ngboost import NGBClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import learning_curve

# Global seaborn styling for every figure below: font scale 1.5 and a default
# figure size of (12, 8).
sns.set(font_scale=1.5, rc={"figure.figsize": (12, 8)})

In [None]:
class LabelEncoder(LabelEncoder):
    """Label encoder that can be used as a step of a ColumnTransformer.

    The stock encoder works on a 1-D target and its methods take a single
    argument. This subclass accepts the extra positional/keyword arguments
    that pipeline-style callers pass (e.g. ``fit(X, y)``), flattens the input
    column before encoding and returns the codes as a 2-D column of shape
    ``(n_samples, 1)`` so they can be stacked with other transformed columns.
    """

    def fit(self, y, *args, **kwargs):
        """Learn the classes from a single column; extra arguments are ignored.

        Without this override, ``fit(X, y)`` would hit the parent signature,
        which takes only one argument, and raise ``TypeError``.
        """
        super().fit(np.array(y).ravel())
        return self

    def fit_transform(self, y, *args, **kwargs):
        """Fit on a single column and return its integer codes as ``(n, 1)``."""
        return super().fit_transform(np.array(y).ravel()).reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        """Encode a single column with the learnt classes, as ``(n, 1)``."""
        return super().transform(np.array(y).ravel()).reshape(-1, 1)

In [None]:
# Load the processed feature table (path is relative to the notebook's working directory).
features = pd.read_csv('../data/processed/features.csv')

In [None]:
# Use the SK_ID_CURR column as the row index.
# Re-assigning (instead of inplace=True) and checking the index name keeps the
# cell re-runnable: a second run would otherwise raise KeyError because the
# column has already been moved into the index.
if features.index.name != 'SK_ID_CURR':
    features = features.set_index('SK_ID_CURR', drop=True)

In [None]:
# (rows, columns) of the feature table as loaded.
features.shape

In [None]:
# Preview the first rows only; the full shape is shown by the previous cell.
features.head()

### Drop observations with missing AMT_ANNUITY

In [None]:
# Keep only the rows where AMT_ANNUITY is present and report the number of
# rows before and after the filter.
n_rows_before = features.shape[0]
features = features.dropna(subset=['AMT_ANNUITY'])
print('Original shape %s' % n_rows_before)
print('New shape %i' % features.shape[0])

### Missing values in categorical features

In [None]:
# Work on an explicit copy so the assignment below never targets a view of
# the filtered frame.
features = features.copy()
# Missing occupations become their own 'unknown' category. The column is
# assigned back explicitly: calling replace(..., inplace=True) on the selected
# column is chained assignment, which is deprecated in pandas 2.x and has no
# effect on the frame under Copy-on-Write.
features['OCCUPATION_TYPE'] = features['OCCUPATION_TYPE'].fillna('unknown')

### Missing values in integer features

In [None]:
# Count-like columns, selected by name: CREDIT_ACTIVE* prefixes, *_HC suffixes
# and REPORTED_DPD.
integer_features = [
    col for col in features.columns
    if col.startswith('CREDIT_ACTIVE') or col.endswith('_HC') or col == 'REPORTED_DPD'
]

# A missing value in these columns is filled with 0. Each column is assigned
# back explicitly; replace(..., inplace=True) on a selected column is chained
# assignment and does not reliably update the frame in recent pandas.
for col in integer_features:
    features[col] = features[col].fillna(0)

In [None]:
# Print the name of every column that still contains at least one missing value.
has_missing = features.isna().any()
for col in has_missing[has_missing].index:
    print(col)

In [None]:
# Columns rescaled to [0, 1]: the count-like columns plus four continuous ones.
# A new list is built with `+` on purpose: `to_scale = integer_features`
# followed by `+=` would extend `integer_features` in place, so re-running the
# cell would append the four names again and feed duplicated columns to the
# scaler.
to_scale = integer_features + ['DAYS_BIRTH',
                               'AMT_INCOME_TOTAL',
                               'AMT_CREDIT',
                               'AMT_ANNUITY']
# Columns whose missing values are replaced by the column median.
to_impute = ['EXT_SOURCE_1',
             'EXT_SOURCE_2',
             'EXT_SOURCE_3']

# Preprocessing pipeline. Columns not listed in any step are passed through
# unchanged (remainder='passthrough').
column_trans = ColumnTransformer(
    [('occupation', OneHotEncoder(dtype=int), ['OCCUPATION_TYPE']),
     ('contract_type', LabelEncoder(), ['NAME_CONTRACT_TYPE']),
     ('integer_features', MinMaxScaler(), to_scale),
     ('ext_source', SimpleImputer(strategy='median'), to_impute)
    ], remainder='passthrough'
)

In [None]:
# Sanity check of the transformed matrix shape. TARGET is dropped first:
# with remainder='passthrough' it would otherwise be carried through as an
# extra feature column and the reported width would be off by one.
column_trans.fit_transform(features.drop(columns='TARGET')).shape

In [None]:
# Fit the preprocessing pipeline on every row, with the TARGET column excluded.
column_trans.fit(features.drop(columns='TARGET'))

In [None]:
# (rows, columns) of the cleaned feature table (TARGET still included).
features.shape

In [None]:
# Persist the fitted preprocessing pipeline, then the cleaned feature table
# (the index is written as the first CSV column).
with open('../models/preprocessing_pipeline.pickle', 'wb') as pipeline_file:
    pickle.dump(column_trans, pipeline_file)
features.to_csv('../data/processed/features_final.csv')

In [None]:
# Apply the already-fitted pipeline to the feature columns.
# `transform` (not `fit_transform`) is used so the pipeline that was just
# pickled is not refitted, and TARGET is dropped so the label is not passed
# through as a feature.
x_transformed = column_trans.transform(features.drop(columns='TARGET'))

In [None]:
# Seed shared by every candidate so the benchmark gives the same scores on
# each Restart & Run All.
MODEL_SEED = 42

# Candidate classifiers, keyed by the display name used in the result tables.
# A linear-kernel SVC was also considered but is left out because it takes a
# long time to fit.
models = {
    'RandomForest': RandomForestClassifier(n_estimators=10, max_depth=75, max_features=10,
                                           max_leaf_nodes=50, random_state=MODEL_SEED),
    'AdaBoost': AdaBoostClassifier(random_state=MODEL_SEED),
    'XGBoost': XGBClassifier(random_state=MODEL_SEED),
    'LightGBM': LGBMClassifier(random_state=MODEL_SEED),
    'NGBM': NGBClassifier(random_state=MODEL_SEED),
}

In [None]:
# Only rows with a known TARGET can be used for supervised training.
labelled = features[features['TARGET'].notnull()]
# NOTE(review): this refits `column_trans` on the labelled rows only, so the
# in-memory pipeline no longer matches the one pickled earlier (fitted on all
# rows) - confirm which one should be served with the model.
X = column_trans.fit_transform(labelled.drop(columns='TARGET'))
y = labelled['TARGET'].astype('int8')

In [None]:
# Hold out a test set (default split proportions). The seed is fixed so the
# scores computed below are reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print('train shape %i' % X_train.shape[0])
print('test shape %i' % X_test.shape[0])

In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, cv=None, train_sizes=None):
    """Plot the ROC AUC learning curve of ``estimator`` and show the figure.

    Parameters
    ----------
    estimator : estimator passed to ``learning_curve``.
    title : str
        Title of the plot.
    X, y : training data and labels, forwarded to ``learning_curve``.
    axes : matplotlib Axes, optional
        Axes to draw on. A new figure is created when omitted.
    cv : optional
        Cross-validation strategy, forwarded to ``learning_curve``.
    train_sizes : optional
        Training-set sizes, forwarded to ``learning_curve``. When omitted,
        ``learning_curve`` uses its own default.

    Returns
    -------
    None. The mean train/validation scores are drawn with a band of one
    standard deviation around each curve.
    """
    # Previously `ax` was only bound when no axes were given, so passing
    # `axes` raised UnboundLocalError.
    if axes is None:
        _, ax = plt.subplots(1)
    else:
        ax = axes
    ax.set_title(title)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")

    # The built-in 'roc_auc' scorer is used instead of
    # make_scorer(..., needs_proba=True), whose `needs_proba` argument is no
    # longer accepted by recent scikit-learn releases.
    curve_kwargs = {'cv': cv, 'scoring': 'roc_auc'}
    if train_sizes is not None:
        # Honour the caller's sizes; this argument used to be silently ignored.
        curve_kwargs['train_sizes'] = train_sizes
    sizes, train_scores, valid_scores = learning_curve(estimator, X, y, **curve_kwargs)

    # One row per training size, one column per CV fold.
    train_scores_mean = np.mean(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    ax.plot(sizes, train_scores_mean, label='Train score')
    ax.plot(sizes, valid_scores_mean, label='Valid score')

    ax.fill_between(sizes, train_scores_mean + train_scores_std,
                    train_scores_mean - train_scores_std, alpha=0.2)
    ax.fill_between(sizes, valid_scores_mean + valid_scores_std,
                    valid_scores_mean - valid_scores_std, alpha=0.2)
    # Attach the legend to the axes actually drawn on, not to whatever axes
    # pyplot currently considers active.
    ax.legend()
    plt.show()

In [None]:
# Result containers filled by the benchmark loop, keyed by model name:
# test ROC AUC and fitting duration.
scores, fitting_time = {}, {}

In [None]:
# Fit every candidate on the training split, record how long the fit took and
# score it on the held-out split with ROC AUC.
for name, estimator in models.items():
    print("Fitting %s" % name)
    # perf_counter is a monotonic, high-resolution clock; time.time() can jump
    # if the system clock is adjusted while a model is fitting.
    start = time.perf_counter()
    estimator.fit(X_train, y_train)
    fitting_time[name] = time.perf_counter() - start
    # Column 1 of predict_proba is the predicted probability of the positive class.
    y_score = estimator.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_score)
    print('ROC_AUC score %f' % score)
    scores[name] = score

In [None]:
# Test ROC AUC per model.
scores

In [None]:
# Fitting duration per model, in seconds.
fitting_time

In [None]:
# Re-order the scores from lowest to highest ROC AUC.
scores = dict(sorted(scores.items(), key=lambda pair: pair[1]))

In [None]:
# One single-column frame per metric, both indexed by model name so they can
# be aligned afterwards.
data_plot = pd.DataFrame(
    {"Modèle": list(scores), "Score": list(scores.values())}
).set_index('Modèle')
data_plot_2 = pd.DataFrame(
    {"Modèle": list(fitting_time), "Fitting time": list(fitting_time.values())}
).set_index('Modèle')

In [None]:
# Put score and fitting time side by side, aligned on the model-name index.
data_plot = pd.concat([data_plot, data_plot_2], axis=1)

In [None]:
# Turn the model-name index back into a regular column for plotting.
data_plot = data_plot.reset_index()

In [None]:
# If the index name was lost, the restored column is called 'index': give it
# its intended name (no effect when the column is already 'Modèle').
data_plot = data_plot.rename(columns={'index': 'Modèle'})

In [None]:
# Summary table: one row per model with its score and fitting time.
data_plot

In [None]:
# Horizontal bar chart of the test ROC AUC of each model.
ax = sns.barplot(data=data_plot, 
                 y='Modèle', 
                 x='Score', 
                 color='cornflowerblue')
ax.set(xlabel='Score ROC AUC')

In [None]:
# Horizontal bar chart of the fitting duration of each model.
# NOTE(review): the axis label says CPU time, but the values are elapsed
# (wall-clock) seconds measured around fit() - confirm the intended wording.
ax = sns.barplot(data=data_plot, y='Modèle', x='Fitting time', color='cornflowerblue')
ax.set(xlabel='Temps de calcul CPU pour entrainement en seconde')

In [None]:
# Learning curve (5-fold cross-validation) of the LightGBM candidate on all
# labelled data.
model_name = 'LightGBM'
title = 'Learning curve %s' % model_name
plot_learning_curve(models[model_name], title, X, y, cv=5)

In [None]:
# Train the model to be saved: LightGBM with default hyper-parameters, fitted
# on all labelled rows (X, y) rather than on the training split only.
model = LGBMClassifier()
model.fit(X, y)

In [None]:
# Raw importance values, one per column of the transformed matrix X.
model.feature_importances_

In [None]:
# Bar chart of the feature importances of the fitted LightGBM model.
lgb.plot_importance(model)

In [None]:
# Every column label of the raw feature table except the last one.
# NOTE(review): presumably the last column is TARGET - confirm. These are the
# raw columns; the matrix produced by `column_trans` has a different column
# order and width.
features.columns[:-1]

In [None]:
# Importance values again, for comparison with the column labels above.
model.feature_importances_

In [None]:
def _transformed_feature_names(transformer, input_columns):
    """Return one name per output column of a fitted ColumnTransformer.

    Steps are visited in output order. A step exposing ``categories_``
    (one-hot encoding) contributes ``<column>_<category>`` for each category;
    every other step, including the pass-through remainder, contributes the
    names of its input columns unchanged.
    """
    input_columns = list(input_columns)
    names = []
    for _, step, cols in transformer.transformers_:
        if isinstance(step, str) and step == 'drop':
            continue
        if isinstance(cols, str):
            cols = [cols]
        # Column selections may be stored as positions rather than labels.
        cols = [input_columns[c] if isinstance(c, (int, np.integer)) else c for c in cols]
        categories = getattr(step, 'categories_', None)
        if categories is not None:
            for col, cats in zip(cols, categories):
                names.extend('%s_%s' % (col, cat) for cat in cats)
        else:
            names.extend(cols)
    return names


# The model was trained on the output of `column_trans`, whose columns are
# reordered and one-hot expanded. Zipping the importances with the raw
# `features.columns` attached them to the wrong names and silently truncated
# the list, so the names are rebuilt from the fitted pipeline instead.
feature_names = _transformed_feature_names(column_trans, features.drop(columns='TARGET').columns)
if len(feature_names) != len(model.feature_importances_):
    raise ValueError('Got %d feature names for %d importances'
                     % (len(feature_names), len(model.feature_importances_)))
feature_importance = dict(zip(feature_names, model.feature_importances_))

In [None]:
# Mapping of feature name to importance value.
feature_importance

In [None]:
# Persist the fitted LightGBM model next to the preprocessing pipeline.
with open('../models/model_v0.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)