## imbalanced-learnで不均衡なデータのunder-sampling/over-samplingを行う

In [1]:
%load_ext lab_black

In [2]:
# ライブラリーのインポート
import pandas as pd
from sklearn.datasets import make_classification

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import f1_score

In [3]:
# 　Credit Card Fraud Detectionデータを読み込み
df = pd.read_csv(
    "https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv"
)

print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# 分類クラスのデータ数を確認
df["Class"].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [5]:
# 欠損値の確認
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
# 欠損値データの削除
df = df.dropna()

In [7]:
# 分類クラスのデータ数を確認
df["Class"].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [8]:
# データを学習用と検証用に分ける
x = df.iloc[:, 1:30]
y = df["Class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=40)

# 分類モデル作成
model = GradientBoostingClassifier()
model.fit(x_train, y_train)

# 作成したモデルで、テストデータを予測値
y_pred = model.predict(x_test)

# Accuracyと混同行列
print("Confusion matrix(test):\n{}".format(confusion_matrix(y_test, y_pred)))
print("Accuracy(test) : %.4f" % accuracy_score(y_test, y_pred))

Confusion matrix(test):
[[71071    10]
 [   43    78]]
Accuracy(test) : 0.9993


In [9]:
# PrecisionとRecall
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("precision : %.4f" % (tp / (tp + fp)))
print("recall : %.4f" % (tp / (tp + fn)))

# F値
f1_score(y_pred, y_test)

precision : 0.8864
recall : 0.6446


0.7464114832535885

## Under Sampling

In [10]:
# ライブラリ
from imblearn.under_sampling import RandomUnderSampler

# 正例の数を保存
positive_count_train = int(y_train.sum())
print("positive count:{}".format(positive_count_train))

# 正例が10％になるまで負例をダウンサンプリング
rus = RandomUnderSampler(
    sampling_strategy={0: positive_count_train * 9, 1: positive_count_train},
    random_state=40,
)

# 学習用データに反映
x_train_resampled, y_train_resampled = rus.fit_resample(x_train, y_train)
print("y_train_undersample:\n{}".format(pd.Series(y_train_resampled).value_counts()))

positive count:371
y_train_undersample:
0    3339
1     371
Name: Class, dtype: int64


In [11]:
# 分類モデル作成
mod = GradientBoostingClassifier()
mod.fit(x_train_resampled, y_train_resampled)

# 予測値算出
y_pred = mod.predict(x_test)

# Accuracyと混同行列
print("Confusion matrix(test):\n{}".format(confusion_matrix(y_test, y_pred)))
print("Accuracy(test) : %.4f" % accuracy_score(y_test, y_pred))

Confusion matrix(test):
[[70827   254]
 [   14   107]]
Accuracy(test) : 0.9962


In [12]:
# PrecisionとRecall
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("precision : %.4f" % (tp / (tp + fp)))
print("recall : %.4f" % (tp / (tp + fn)))

# F値
f1_score(y_pred, y_test)

precision : 0.2964
recall : 0.8843


0.4439834024896266

## Over Sampling

In [13]:
# ライブラリ
from imblearn.over_sampling import RandomOverSampler

# 正例を10％まであげる
ros = RandomOverSampler(
    sampling_strategy={0: x_train.shape[0], 1: x_train.shape[0] // 9}, random_state=40
)

# 学習用データに反映
x_train_resampled, y_train_resampled = ros.fit_resample(x_train, y_train)



In [14]:
# 分類モデル作成
mod = GradientBoostingClassifier()
mod.fit(x_train_resampled, y_train_resampled)

# 予測値算出
y_pred = mod.predict(x_test)

# Accuracyと混同行列
print("Confusion matrix(test):\n{}".format(confusion_matrix(y_test, y_pred)))
print("Accuracy(test) : %.4f" % accuracy_score(y_test, y_pred))

Confusion matrix(test):
[[71023    58]
 [   16   105]]
Accuracy(test) : 0.9990


In [15]:
# PrecisionとRecall
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("precision : %.4f" % (tp / (tp + fp)))
print("recall : %.4f" % (tp / (tp + fn)))

# F値
f1_score(y_pred, y_test)

precision : 0.6442
recall : 0.8678


0.7394366197183099

## SMOTE

In [16]:
# ライブラリ
from imblearn.over_sampling import SMOTE

# SMOTE
smote = SMOTE(
    sampling_strategy={0: x_train.shape[0], 1: x_train.shape[0] // 9}, random_state=40
)
x_train_resampled_smoth, y_train_resampled_smoth = smote.fit_resample(x_train, y_train)



In [17]:
# 分類モデル作成
mod = GradientBoostingClassifier()
mod.fit(x_train_resampled_smoth, y_train_resampled_smoth)

# 予測値算出
y_pred = mod.predict(x_test)

# Accuracyと混同行列
print("Confusion matrix(test):\n{}".format(confusion_matrix(y_test, y_pred)))
print("Accuracy(test) : %.4f" % accuracy_score(y_test, y_pred))

Confusion matrix(test):
[[70999    82]
 [   13   108]]
Accuracy(test) : 0.9987


In [18]:
# PrecisionとRecall
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("precision : %.4f" % (tp / (tp + fp)))
print("recall : %.4f" % (tp / (tp + fn)))

# F値
f1_score(y_pred, y_test)

precision : 0.5684
recall : 0.8926


0.6945337620578778