In [None]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lightgbm.sklearn import LGBMClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import RandomOverSampler, SMOTE


In [None]:
class LabelEncoder(LabelEncoder):
    """Label encoder variant whose outputs are single-column 2-D arrays.

    Both overridden methods accept (and ignore) extra positional and keyword
    arguments, flatten the input to 1-D before delegating to the parent
    encoder, and reshape the resulting codes to ``(-1, 1)`` so the encoder can
    be used as a pipeline step.

    NOTE(review): this class shadows the imported parent of the same name;
    presumably the pickled preprocessing pipeline refers to it by this name --
    confirm before renaming.
    """

    def fit_transform(self, y, *args, **kwargs):
        """Fit the encoder on the flattened ``y`` and return the codes as a column."""
        flat_labels = np.array(y).ravel()
        codes = super().fit_transform(flat_labels)
        return codes.reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        """Encode the flattened ``y`` with the fitted classes and return a column."""
        flat_labels = np.array(y).ravel()
        codes = super().transform(flat_labels)
        return codes.reshape(-1, 1)

In [None]:
# Load the final feature table, indexed by SK_ID_CURR.
data = pd.read_csv('../data/processed/features_final.csv', index_col='SK_ID_CURR')

In [None]:
# Spot-check the feature row stored under SK_ID_CURR == 100063.
# NOTE(review): the id is hard-coded and raises KeyError if it is absent from the file.
data.loc[100063]

In [None]:
# Restore the fitted preprocessing pipeline from disk.
# SECURITY: pickle.load can execute arbitrary code -- only load files you trust.
# NOTE(review): unpickling presumably needs the LabelEncoder subclass defined
# above to be in scope already -- confirm.
pipeline_path = '../models/preprocessing_pipeline.pickle'
with open(pipeline_path, 'rb') as pipeline_file:
    preprocessor = pickle.load(pipeline_file)

In [None]:
# Rows with a known TARGET are the labelled training set; rows where TARGET is
# null form the unlabelled set that is scored for the submission.
has_target = data['TARGET'].notnull()
train_data = data[has_target].copy()
test_data = data[~has_target].copy()

# Pull the label out of the training features (removes the column in place).
# test_data keeps its TARGET column; it is dropped at prediction time.
target = train_data.pop('TARGET').astype('int8')

In [None]:
# Apply the already-fitted preprocessing pipeline to the labelled features
# (transform only -- the pipeline is not refitted here).
X = preprocessor.transform(train_data)

In [None]:
# Hold-out split used to evaluate the models below.
# - stratify=target keeps the class ratio identical in both splits.
# - random_state pins the shuffle so the reported AUCs are reproducible across
#   kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, target, stratify=target, random_state=42
)

In [None]:
# Baseline: LightGBM with the binary objective, trained on the original
# (not resampled) training split.
baseline = LGBMClassifier(objective="binary").fit(X_train, y_train)

# Score the hold-out with the predicted probability of the positive class.
res = baseline.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, res)
print(baseline_auc)

### Over-sampling

In [None]:
# Random over-sampling experiment.
#
# Over-sample ONLY the training split. Resampling the whole dataset before
# splitting places copies of the same minority rows on both sides of the split,
# so the hold-out AUC would be measured on rows the model has already seen and
# would be inflated. The untouched baseline hold-out (X_test, y_test) is reused
# so the score is directly comparable with the baseline model.
X_train_, y_train_ = RandomOverSampler().fit_resample(X_train, y_train)
X_test_, y_test_ = X_test, y_test

clf = LGBMClassifier()
clf.fit(X_train_, y_train_)
res = clf.predict_proba(X_test_)[:, 1]
print(roc_auc_score(y_test_, res))

In [None]:
# SMOTE experiment.
#
# Generate synthetic samples from the training split ONLY. Running SMOTE on the
# whole dataset before splitting lets synthetic points interpolated from
# hold-out rows end up in the training set (and vice versa), which inflates the
# hold-out AUC. The untouched baseline hold-out (X_test, y_test) is reused so
# the score is directly comparable with the baseline model.
X_train_, y_train_ = SMOTE().fit_resample(X_train, y_train)
X_test_, y_test_ = X_test, y_test

clf = LGBMClassifier()
clf.fit(X_train_, y_train_)
res = clf.predict_proba(X_test_)[:, 1]
print(roc_auc_score(y_test_, res))

# Resample the full labelled set; used only for the final fit that produces
# the submission predictions (no evaluation happens on this data).
X_resampled, y_resampled = SMOTE().fit_resample(X, target)

In [None]:
# Refit the classifier on the full resampled labelled data.
clf.fit(X_resampled, y_resampled)

# Preprocess the unlabelled rows (their all-null TARGET column is dropped
# first) and keep the predicted probability of the positive class.
X_submission = preprocessor.transform(test_data.drop(columns='TARGET'))
res = clf.predict_proba(X_submission)[:, 1]

In [None]:
# Two-column submission frame: the SK_ID_CURR ids of the unlabelled rows next
# to their predicted probabilities.
submission = test_data.reset_index()[['SK_ID_CURR']].assign(TARGET=res)

In [None]:
# Display (rows, columns) of the submission frame.
# NOTE(review): the row count presumably has to match the number of ids the
# submission target expects -- verify before uploading.
submission.shape

In [None]:
# Write the submission to disk without the positional index column.
submission.to_csv('../reports/SMOTE.csv', index=False)

The submission failed because some observations have been dropped (missing amt_annuity).

### Under-sampling