## Imbalanced Data
* weighting: punishing the errors on the minority class
* upsampling: randomly replicating instances in the minority class
* downsampling: randomly removing instances in the majority class
* SMOTE: synthetic minority oversampling technique(minority class instance)

### SMOTE(Synthetic Minority Over-sampling Technique)
SMOTE는 비율이 낮은 분류의 데이터를 만들어내는 방법이다.
Minority class에 속하는 데이터 $x_i$에 대해 K-NN을 사용하여 K개의 샘플을 얻어낸다.
샘플 중에서 랜덤하게 선택하고 아래와 같은 계산을 통해 새로운 데이터를 생성한다.
$$
x_{new} = x_i + (x_{ih} - x_i) * delta
$$
여기서 $x_i$ 는 minority class에 속하는 기준 샘플이고 $x_{ih}$는 $x_i$에 대한 K-NN의 하나이다. $x_{ih}$또한 minority class에 속한다. delta는 0과 1사이의 랜덤 수이다.

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

In [2]:
# read data
hr = pd.read_csv("data/processed_data.csv")
hr = hr.drop(["Unnamed: 0"], axis=1)
target = "Attrition_Yes"
hr.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EmployeeNumber,HourlyRate,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,...,RelationshipSatisfaction_1,RelationshipSatisfaction_2,RelationshipSatisfaction_3,RelationshipSatisfaction_4,OverTime_No,OverTime_Yes,WorkLifeBalance_1,WorkLifeBalance_2,WorkLifeBalance_3,WorkLifeBalance_4
0,0.547619,0.71582,0.0,0.0,0.914286,0.262454,0.698053,0.888889,0.0,0.2,...,1,0,0,0,0,1,1,0,0,0
1,0.738095,0.1267,0.25,0.000484,0.442857,0.217009,0.916001,0.111111,0.857143,0.25,...,0,0,0,1,1,0,0,0,1,0
2,0.452381,0.909807,0.035714,0.001451,0.885714,0.056925,0.012126,0.666667,0.285714,0.175,...,0,1,0,0,0,1,0,0,1,0
3,0.357143,0.923407,0.071429,0.001935,0.371429,0.100053,0.845814,0.111111,0.0,0.2,...,0,0,1,0,0,1,0,0,1,0
4,0.214286,0.350036,0.035714,0.002903,0.142857,0.129489,0.583738,1.0,0.071429,0.15,...,0,0,0,1,1,0,0,0,1,0


In [3]:
hr_target = hr[target]
hr_data = hr.drop([target], axis=1)

In [4]:
# split train test
X_train, X_test, y_train, y_test = train_test_split(hr_data, hr_target, test_size=0.3, random_state=12)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1029, 79)
(441, 79)
(1029,)
(441,)


In [5]:
# Balancing Data
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=2)
X_train, y_train = sm.fit_sample(X_train,y_train)

In [6]:
X_train = pd.DataFrame(X_train)
X_train.columns = X_test.columns
X_train.shape, y_train.shape

((1726, 79), (1726,))

In [7]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

logisticRegr = LogisticRegression(random_state=12)
logisticRegr.fit(X_train, np.array(y_train).ravel())
predictions = logisticRegr.predict(X_test)
score = logisticRegr.score(X_test, y_test)
print(score)

0.7891156462585034


In [8]:
from sklearn.ensemble import GradientBoostingClassifier  #GBM 
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.grid_search import GridSearchCV

y_train = np.array(y_train)
y_train = y_train.ravel()
# Gradient Boosting
# base
base_model_args = {'max_depth': 3, 'n_estimators': 500, 'subsample': 1, 'random_state': 5,
            'min_samples_split': 2, 'min_samples_leaf':1, 'max_features':'sqrt'}
base_model = GradientBoostingClassifier(learning_rate=0.1, **base_model_args)
base_model.fit(X_train,y_train)

# learning rate, estimators
model1_args = {'learning_rate':0.1,'max_depth': 3, 'n_estimators': 1500, 'subsample': 1, 'random_state': 5,
            'min_samples_split': 2, 'min_samples_leaf':1, 'max_features':'sqrt'}
model1 = GradientBoostingClassifier(**model1_args)
model1.fit(X_train,y_train)

# sample split, leaf
model2_args = {'learning_rate':0.1,'max_depth': 3, 'n_estimators': 1500, 'subsample': 1, 'random_state': 5,
            'min_samples_split': 2, 'min_samples_leaf':1, 'max_features':'sqrt'}
model2=GradientBoostingClassifier(learning_rate=0.01, n_estimators=1500,max_depth=4, min_samples_split=40, min_samples_leaf=7,max_features=4 , subsample=0.95, random_state=10)
model2.fit(X_train,y_train)

pred=base_model.predict(X_test)
print(classification_report(y_test, pred))

pred=model1.predict(X_test)
print(classification_report(y_test, pred))

pred=model2.predict(X_test)
print(classification_report(y_test, pred))



             precision    recall  f1-score   support

          0       0.89      0.96      0.93       370
          1       0.67      0.41      0.51        71

avg / total       0.86      0.87      0.86       441

             precision    recall  f1-score   support

          0       0.90      0.95      0.92       370
          1       0.62      0.42      0.50        71

avg / total       0.85      0.87      0.86       441

             precision    recall  f1-score   support

          0       0.89      0.96      0.93       370
          1       0.69      0.41      0.51        71

avg / total       0.86      0.88      0.86       441



In [9]:
from sklearn import svm, grid_search
from sklearn.model_selection import GridSearchCV

In [13]:
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
SVM_grid_search = GridSearchCV(svm.SVC(kernel='rbf', probability=True), param_grid, cv=10)
SVM_grid_search.fit(X_train, y_train)
SVM_grid_search.predict(X_test)
print(SVM_grid_search.best_estimator_)

print('Accuracy of the svm on test set: {:.3f}'.format(SVM_grid_search.score(X_test, y_test)))
pred = SVM_grid_search.predict(X_test)
print(classification_report(y_test, pred))

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Accuracy of the svm on test set: 0.868
             precision    recall  f1-score   support

          0       0.89      0.96      0.92       370
          1       0.64      0.41      0.50        71

avg / total       0.85      0.87      0.86       441



In [83]:
import xgboost as xgb

In [96]:
clf = xgb.XGBClassifier(n_estimators=10000)
eval_set  = [(X_train, y_train.ravel()), (X_test, y_test)]
clf.fit(X_train, y_train, eval_set=eval_set,
        eval_metric="auc", early_stopping_rounds=30)

[0]	validation_0-auc:0.879708	validation_1-auc:0.685801
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 30 rounds.
[1]	validation_0-auc:0.910877	validation_1-auc:0.720442
[2]	validation_0-auc:0.932029	validation_1-auc:0.75925
[3]	validation_0-auc:0.933093	validation_1-auc:0.762752
[4]	validation_0-auc:0.935328	validation_1-auc:0.763989
[5]	validation_0-auc:0.938645	validation_1-auc:0.762124
[6]	validation_0-auc:0.946767	validation_1-auc:0.766749
[7]	validation_0-auc:0.949218	validation_1-auc:0.761858
[8]	validation_0-auc:0.951635	validation_1-auc:0.771317
[9]	validation_0-auc:0.955109	validation_1-auc:0.773848
[10]	validation_0-auc:0.957136	validation_1-auc:0.769871
[11]	validation_0-auc:0.959326	validation_1-auc:0.774324
[12]	validation_0-auc:0.960118	validation_1-auc:0.775752
[13]	validation_0-auc:0.96173	validation_1-auc:0.778359
[14]	validation_0-auc:0.964864	validation_1-auc:0.778131


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=10000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [97]:
clf.predict(X_test)

  if diff:


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,