In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.preprocessing import LabelEncoder
import optuna

# Notebook-wide plotting defaults: larger fonts and a 12x8 inch figure size.
sns.set(
    font_scale=1.5,
    rc={"figure.figsize": (12, 8)},
)

In [None]:
class LabelEncoder(LabelEncoder):
    """LabelEncoder variant that can be used as a pipeline step.

    The input is flattened to 1-D before it is handed to the parent encoder,
    and the encoded labels come back as a column vector of shape (n, 1).
    Extra positional and keyword arguments are accepted and ignored.
    """

    def fit_transform(self, y, *args, **kwargs):
        """Fit on the flattened ``y`` and return its codes as an (n, 1) array."""
        flat_labels = np.array(y).ravel()
        codes = super().fit_transform(flat_labels)
        return codes.reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        """Encode the flattened ``y`` and return the codes as an (n, 1) array."""
        flat_labels = np.array(y).ravel()
        codes = super().transform(flat_labels)
        return codes.reshape(-1, 1)

In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, cv=None, train_sizes=None):
    """Plot train vs. validation ROC AUC learning curves for an estimator.

    Parameters
    ----------
    estimator : object
        Unfitted scikit-learn classifier; handed to ``learning_curve``.
    title : str
        Title of the plot.
    X, y : array-like
        Features and binary target passed to ``learning_curve``.
    axes : matplotlib Axes, optional
        Axes to draw on. A new figure/axes pair is created when omitted.
    cv : int or cross-validation splitter, optional
        Forwarded unchanged to ``learning_curve``.
    train_sizes : array-like, optional
        Training-set sizes to evaluate. When omitted, the ``learning_curve``
        default is used.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Bind `ax` in both cases; it used to be undefined when `axes` was given.
    ax = axes
    if ax is None:
        _, ax = plt.subplots(1)
    ax.set_title(title)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")

    # Only forward train_sizes when the caller supplied it, so that the
    # learning_curve default still applies otherwise.
    curve_kwargs = {}
    if train_sizes is not None:
        curve_kwargs['train_sizes'] = train_sizes

    # The built-in 'roc_auc' scorer replaces
    # make_scorer(roc_auc_score, needs_proba=True), whose `needs_proba`
    # argument is deprecated in recent scikit-learn releases.
    train_sizes, train_scores, valid_scores = learning_curve(
        estimator, X, y, cv=cv, scoring='roc_auc', **curve_kwargs)

    # Aggregate over the CV folds (axis 1) for every training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    ax.plot(train_sizes, train_scores_mean, label='Train score')
    ax.plot(train_sizes, valid_scores_mean, label='Valid score')

    # Shade one standard deviation around each mean curve.
    ax.fill_between(train_sizes, train_scores_mean + train_scores_std,
                    train_scores_mean - train_scores_std, alpha=0.2)
    ax.fill_between(train_sizes, valid_scores_mean + valid_scores_std,
                    valid_scores_mean - valid_scores_std, alpha=0.2)
    # Attach the legend to the axes we drew on, not to whatever is "current".
    ax.legend()
    plt.show()

In [None]:
# Load the processed feature table from disk.
features = pd.read_csv(filepath_or_buffer='../data/processed/features_final.csv')

In [None]:
# Preview the first five rows of the loaded table.
features.head(n=5)

In [None]:
# Use SK_ID_CURR as the row index. The guard makes this cell safe to re-run:
# once the column has become the index, a second set_index would raise KeyError.
if 'SK_ID_CURR' in features.columns:
    features = features.set_index('SK_ID_CURR')

In [None]:
# Restore the preprocessing pipeline object from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project.
# NOTE(review): the pickle presumably references the LabelEncoder subclass
# defined in this notebook, so that cell must run first; confirm.
with open('../models/preprocessing_pipeline.pickle', 'rb') as pipeline_file:
    transformer = pickle.load(pipeline_file)

In [None]:
# Keep only the rows that carry a label.
features_train = features[features['TARGET'].notna()]

# NOTE(review): iloc[:, :-1] assumes TARGET is the last column; confirm
# against the CSV layout.
X = transformer.transform(features_train.iloc[:, :-1])
y = features_train['TARGET'].astype('int8')

# A fixed seed makes the split (and everything tuned on it) reproducible after
# a kernel restart; stratifying keeps the class ratio equal in both parts,
# which keeps the ROC AUC on the held-out part well defined.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y)

In [None]:
# Baseline: learning curve of a 10-tree forest with otherwise default settings.
plot_learning_curve(
    estimator=RandomForestClassifier(n_estimators=10),
    title='Learning curve (no optimization)',
    X=X,
    y=y,
    cv=2,
)

In [None]:
def objective(trial):
    """Optuna objective: ROC AUC of a 10-tree random forest on the held-out split.

    Samples ``max_depth``, ``max_features`` and ``max_leaf_nodes`` from the
    trial, fits the forest on the notebook-level ``X_train`` / ``y_train`` and
    scores it on ``X_test`` / ``y_test``. The train ROC AUC is printed so that
    over-fitting can be spotted per trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC on ``X_test``. The same split is used for every trial, so the
        best value is an optimistic estimate.
    """
    param = {
        'n_estimators': 10,
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'max_features': trial.suggest_int('max_features', 2, 15),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100),
        # Fixed seed: score differences between trials then come from the
        # hyperparameters rather than from the forest's own randomness.
        'random_state': 42,
    }

    rf = RandomForestClassifier(**param)
    rf.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    train_score = roc_auc_score(y_train, rf.predict_proba(X_train)[:, 1])
    score = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
    print('Train score %f' % train_score)
    return score
    

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters, and therefore
# the selected best_params, is the same after Restart & Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_trial.params

In [None]:
# study.best_params only holds the *sampled* hyperparameters; n_estimators was
# fixed at 10 inside the objective and is therefore missing from it. Re-apply
# it here so the plotted model is the one that was actually tuned (and is
# comparable with the 10-tree baseline) instead of falling back to the
# library's default number of trees. Values in best_params win if
# n_estimators is ever tuned as well.
optimized_params = {'n_estimators': 10, **study.best_params}
plot_learning_curve(RandomForestClassifier(**optimized_params),
                    'Learning curves Optimized RandomForest',
                    X, y, cv=5)