In [76]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

import os
import warnings
warnings.filterwarnings('ignore')

In [77]:
# Load the training table from the notebook's working directory.
train = pd.read_csv('train_after_selection.csv')

In [78]:
# Keep a seeded 30% random subsample of the rows.
# Note: this rebinds `train`, so re-running the cell shrinks the data again.
train = train.sample(random_state=42, frac=0.3)

# Partition the subsample by label so each class can be resampled on its own.
train_1 = train.loc[train['TARGET'] == 1]
train_0 = train.loc[train['TARGET'] == 0]

## Build the classifier used for evaluation

In [79]:
# Evaluation model shared by every resampling experiment below:
# median imputation -> standardisation -> AdaBoost.
# The boosting stage is seeded so repeated cross-validation runs give the
# same scores; without it the results cannot be reproduced exactly.
# NOTE(review): `Imputer` is deprecated in newer scikit-learn releases in
# favour of `sklearn.impute.SimpleImputer` — confirm the installed version
# before upgrading.
clf = make_pipeline(
    Imputer(strategy='median'),
    StandardScaler(),
    AdaBoostClassifier(random_state=42),
)

# Metric names handed to cross_validate through its `scoring` argument.
scoring = ['accuracy', 'precision', 'recall', 'roc_auc']



## random undersample

In [80]:
# Draw len(train_1) rows from the TARGET == 0 subset without replacement so
# both classes end up the same size. The fixed seed makes the draw repeatable
# across kernel restarts (the original call was unseeded).
train_0_resample = train_0.sample(n=len(train_1), random_state=42)

In [81]:
# Stack the down-sampled TARGET == 0 rows on top of all TARGET == 1 rows.
balanced_parts = [train_0_resample, train_1]
train_resample = pd.concat(balanced_parts)

In [82]:
# Separate the label column from the feature columns.
# Note: `train_resample` is rebound without 'TARGET', so this cell cannot be
# re-run without first re-running the concat cell above it.
target = train_resample.loc[:, 'TARGET']
train_resample = train_resample.drop('TARGET', axis=1)

In [84]:
# 5-fold cross-validation of the pipeline on the undersampled data.
# Only test-fold metrics are kept, and all available cores are used.
cv_options = dict(
    scoring=scoring,
    cv=5,
    verbose=2,
    return_train_score=False,
    n_jobs=-1,
)
score = cross_validate(clf, train_resample, target, **cv_options)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.0s remaining:   13.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.1s finished


In [94]:
from collections import defaultdict

# Metric name -> list of fold-averaged values, one entry per experiment.
# Re-running this cell resets the accumulator.
mean_score = defaultdict(list)

# Record the mean over folds for every entry whose key does not contain
# 'time' (i.e. skip the timing arrays).
for metric, fold_values in score.items():
    if 'time' in metric:
        continue
    mean_score[metric].append(np.mean(fold_values))

## random oversample

In [96]:
# Bootstrap the TARGET == 1 subset (sampling with replacement) up to the size
# of the TARGET == 0 subset. The fixed seed makes the draw repeatable across
# kernel restarts (the original call was unseeded).
train_1_resample = train_1.sample(n=len(train_0), replace=True, random_state=42)

In [98]:
# Combine the up-sampled TARGET == 1 rows with every TARGET == 0 row,
# then split the result into labels and features.
oversampled = pd.concat([train_1_resample, train_0])

target = oversampled['TARGET']
train_resample = oversampled.drop('TARGET', axis=1)

In [99]:
# 5-fold cross-validation of the pipeline on the oversampled data.
# NOTE(review): the TARGET == 1 rows were duplicated (sampled with replacement)
# before the folds are drawn, so copies of one row can land in both the
# training and the validation part of a fold. The scores are therefore likely
# optimistic; consider resampling inside each fold instead.
score = cross_validate(
    clf,
    train_resample,
    target,
    scoring=scoring,
    cv=5,
    verbose=2,
    return_train_score=False,
    n_jobs=-1,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.8min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min finished


In [100]:
# Average each non-timing entry over the folds, then append it to the
# running per-metric lists.
fold_means = {
    metric: np.mean(fold_values)
    for metric, fold_values in score.items()
    if 'time' not in metric
}
for metric, average in fold_means.items():
    mean_score[metric].append(average)

In [101]:
# Show the accumulated means; within each list, entry 0 comes from the random
# undersample run and entry 1 from the random oversample run.
mean_score

defaultdict(list,
            {'test_accuracy': [0.6817819630568634, 0.6966855025267468],
             'test_precision': [0.682188646802196, 0.6999596982474541],
             'test_recall': [0.6806533653045415, 0.688487804217407],
             'test_roc_auc': [0.7447237095297746, 0.7627008764807365]})

## imblearn under-sample (ClusterCentroids)

In [105]:
# Pull the labels out of the 30% sample, then impute (median) and standardise
# the remaining feature columns.
# Note: `train` is rebound to the transformed output and no longer has a
# 'TARGET' column, so this cell cannot be re-run on its own.
target = train['TARGET']
features = train.drop(columns=['TARGET'])

imputer = Imputer(strategy='median')
scaler = StandardScaler()
train = scaler.fit_transform(imputer.fit_transform(features))



In [109]:
from imblearn.under_sampling import ClusterCentroids

In [114]:
# Cast the labels to an integer dtype before handing them to the sampler.
target = target.astype(int)

In [115]:
# Under-sample to a 1:1 class ratio with ClusterCentroids.
# `fit_resample` is the current imbalanced-learn entry point (`fit_sample` is
# its deprecated alias), and the fixed seed makes the result repeatable.
# NOTE(review): the recorded run of this cell was interrupted manually, so it
# is probably slow at this data size — consider trying a smaller sample first.
cluster_sampler = ClusterCentroids(sampling_strategy=1.0, random_state=42, n_jobs=-1)
train_resample, target_resample = cluster_sampler.fit_resample(train, target)

KeyboardInterrupt: 

In [None]:
tra