# 不平衡样本处理
#### By 王升
#### 2020.12.10

In [96]:
## 导入包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import imblearn
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import os

In [97]:
## 加载数据
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [98]:
data.shape

(284807, 31)

In [99]:
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

> - **标签Class非常不平衡**

In [100]:
## 对变量Amount, Time进行标准化处理
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
data[['Amount', 'Time']] = sc.fit_transform(data[['Amount', 'Time']])


## 切分数据为特征X和标签y
features = [col for col in data.columns if col not in ['Class']]
X = data[features]
y = data['Class']

In [101]:
## 忽略不平衡问题，直接LR建模
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_label = lr.predict(X_test)
y_test_value = lr.predict_proba(X_test)[:, 1]
print('>>>直接建模:')
print('测试集准确率:', accuracy_score(y_test, y_test_label))
print('测试集AUC:', roc_auc_score(y_test, y_test_value))
print('model report:\n',classification_report(y_test, y_test_label, digits=6))

>>>直接建模:
测试集准确率: 0.9992743700478681
测试集AUC: 0.9783592167803621
model report:
               precision    recall  f1-score   support

           0   0.999414  0.999859  0.999637     85307
           1   0.877551  0.632353  0.735043       136

    accuracy                       0.999274     85443
   macro avg   0.938483  0.816106  0.867340     85443
weighted avg   0.999220  0.999274  0.999216     85443



> - **从准确率和AUC来看，效果还不错。但因为样本不平衡，模型的评价指标应选择recall，以recall来衡量，模型效果并不好。**

In [102]:
## RandomUnderSampler欠采样
from imblearn.under_sampling import RandomUnderSampler
X_resampled, y_resampled = RandomUnderSampler().fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_pred_label = lr.predict(X_test)
y_test_pred_prob = lr.predict_proba(X_test)[:, 1]
print('>>>RandomUnderSampler欠采样:')
print('测试集准确率:', accuracy_score(y_test, y_test_pred_label))
print('测试集AUC:', roc_auc_score(y_test, y_test_pred_prob))
print('model report:\n', classification_report(y_test, y_test_pred_label, digits=6))

>>>RandomUnderSampler欠采样:
测试集准确率: 0.9358108108108109
测试集AUC: 0.9657077625570777
model report:
               precision    recall  f1-score   support

           0   0.917197  0.960000  0.938111       150
           1   0.956835  0.910959  0.933333       146

    accuracy                       0.935811       296
   macro avg   0.937016  0.935479  0.935722       296
weighted avg   0.936748  0.935811  0.935754       296



In [103]:
## RandomOverSampler过采样
from imblearn.over_sampling import RandomOverSampler
X_resampled, y_resampled = RandomOverSampler().fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_pred_label = lr.predict(X_test)
y_test_pred_prob = lr.predict_proba(X_test)[:, 1]
print('>>>RandomOverSampler过采样:')
print('测试集准确率:', accuracy_score(y_test, y_test_pred_label))
print('测试集AUC:', roc_auc_score(y_test, y_test_pred_prob))
print('model report:\n', classification_report(y_test, y_test_pred_label, digits=6))

>>>RandomOverSampler过采样:
测试集准确率: 0.9497974664251505
测试集AUC: 0.9870762850611989
model report:
               precision    recall  f1-score   support

           0   0.926377  0.977075  0.951051     85149
           1   0.975836  0.922612  0.948478     85440

    accuracy                       0.949797    170589
   macro avg   0.951106  0.949844  0.949765    170589
weighted avg   0.951149  0.949797  0.949762    170589



In [104]:
## SMOTE过采样
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE().fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_pred_label = lr.predict(X_test)
y_test_pred_prob = lr.predict_proba(X_test)[:, 1]
print('>>>SMOTE过采样:')
print('测试集准确率:', accuracy_score(y_test, y_test_pred_label))
print('测试集AUC:', roc_auc_score(y_test, y_test_pred_prob))
print('model report:\n', classification_report(y_test, y_test_pred_label, digits=6))

>>>SMOTE过采样:
测试集准确率: 0.9481033360884934
测试集AUC: 0.9893424847347345
model report:
               precision    recall  f1-score   support

           0   0.924716  0.975443  0.949402     85149
           1   0.974111  0.920857  0.946736     85440

    accuracy                       0.948103    170589
   macro avg   0.949414  0.948150  0.948069    170589
weighted avg   0.949456  0.948103  0.948067    170589



In [105]:
## ADASYN过采样
from imblearn.over_sampling import ADASYN
X_resampled, y_resampled = ADASYN().fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_pred_label = lr.predict(X_test)
y_test_pred_prob = lr.predict_proba(X_test)[:, 1]
print('>>>ADASYN过采样:')
print('测试集准确率:', accuracy_score(y_test, y_test_pred_label))
print('测试集AUC:', roc_auc_score(y_test, y_test_pred_prob))
print('model report:\n', classification_report(y_test, y_test_pred_label, digits=6))

>>>ADASYN过采样:
测试集准确率: 0.8877347042857897
测试集AUC: 0.9599110613849913
model report:
               precision    recall  f1-score   support

           0   0.874156  0.905547  0.889574     85185
           1   0.902287  0.869968  0.885833     85402

    accuracy                       0.887735    170587
   macro avg   0.888221  0.887757  0.887704    170587
weighted avg   0.888239  0.887735  0.887701    170587



In [106]:
## RandomOverSampler过采样 + LightGBM
from imblearn.over_sampling import RandomOverSampler
X_resampled, y_resampled = RandomOverSampler().fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
clf_lgb = lgb.LGBMClassifier(class_weight='balanced')
clf_lgb.fit(X_train, y_train)
y_test_pred_label = clf_lgb.predict(X_test)
y_test_pred_prob = clf_lgb.predict_proba(X_test)[:, 1]
print('>>>RandomOverSampler过采样 + LightGBM:')
print('测试集准确率:', accuracy_score(y_test, y_test_pred_label))
print('测试集AUC:', roc_auc_score(y_test, y_test_pred_prob))
print('model report:\n', classification_report(y_test, y_test_pred_label, digits=6))

>>>RandomOverSampler过采样 + LightGBM:
测试集准确率: 0.9998475868901278
测试集AUC: 0.9999708499389459
model report:
               precision    recall  f1-score   support

           0   1.000000  0.999695  0.999847     85149
           1   0.999696  1.000000  0.999848     85440

    accuracy                       0.999848    170589
   macro avg   0.999848  0.999847  0.999848    170589
weighted avg   0.999848  0.999848  0.999848    170589



In [107]:
## SMOTE过采样 + LightGBM
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE().fit_sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
clf_lgb = lgb.LGBMClassifier(class_weight='balanced')
clf_lgb.fit(X_train, y_train)
y_test_pred_label = clf_lgb.predict(X_test)
y_test_pred_prob = clf_lgb.predict_proba(X_test)[:, 1]
print('>>>SMOTE过采样 + LightGBM:')
print('测试集准确率:', accuracy_score(y_test, y_test_pred_label))
print('测试集AUC:', roc_auc_score(y_test, y_test_pred_prob))
print('model report:\n', classification_report(y_test, y_test_pred_label, digits=6))

>>>SMOTE过采样 + LightGBM:
测试集准确率: 0.9991969001518269
测试集AUC: 0.9999643125304957
model report:
               precision    recall  f1-score   support

           0   0.999988  0.998403  0.999195     85149
           1   0.998411  0.999988  0.999199     85440

    accuracy                       0.999197    170589
   macro avg   0.999199  0.999196  0.999197    170589
weighted avg   0.999198  0.999197  0.999197    170589



### 小结
- 使用欠采样或过采样方法后，指标recall会有明显的提升
- 过采样优于欠采样
- (过采样 + LGB模型)效果很好
- 实际应用时，可进一步调节采样方法的参数以及模型的参数，以使得结果更好