In [2]:
import pandas as pd

# Load the credit-card fraud dataset from the working directory.
df = pd.read_csv('creditcard.csv')

# Partition the rows by label: Class == 1 goes to Fraud, Class == 0 to Valid.
Fraud = df.loc[df['Class'].eq(1)]
Valid = df.loc[df['Class'].eq(0)]

# Preview the first three fraud rows.
Fraud.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
623,7610,0.73,2.3,-5.33,4.01,-1.73,-1.73,-3.97,1.06,-0.49,...,0.59,0.11,0.6,-0.36,-1.84,0.35,0.59,0.1,1.0,1
664,8808,-4.62,1.7,-3.11,4.33,-1.87,-0.99,-4.58,0.47,0.47,...,0.48,0.15,0.12,-0.22,-0.14,-0.42,-1.0,0.89,1.1,1
924,13323,-5.45,8.29,-12.75,8.59,-3.11,-3.18,-9.25,4.25,-6.33,...,1.85,-0.27,-0.31,-1.2,1.35,0.61,1.57,0.81,1.0,1


In [3]:
# Fraction of all rows whose "Class" column equals 1 (size of Fraud over size of df).
outlier_fraction = len(Fraud) / len(df)

print('Outlier ratio:', outlier_fraction)
# Report the row count of each partition using the same message templates.
for template, subset in (("Fraud Cases : {}", Fraud), ("Valid Cases : {}", Valid)):
    print(template.format(len(subset)))

Outlier ratio: 0.0017204452090867595
Fraud Cases : 49
Valid Cases : 28432


In [4]:
# Build the feature matrix and the label vector.
# Every column except 'Time' and 'Class' is used as a feature; 'Class' is the label.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' header. If that is a
# real column in the CSV (e.g. a saved row index) rather than the display index, it
# is kept as a feature here -- confirm and exclude it if so.
target = 'Class'
columns = [name for name in df.columns if name not in ('Time', target)]
X_train = df.loc[:, columns]
y_train = df.loc[:, target]

In [5]:
# Build the two unsupervised outlier detectors: Isolation Forest and Local Outlier Factor.
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Fixed seed so the Isolation Forest (which is randomized) gives the same
# predictions, and therefore the same error count, on every Restart & Run All.
RANDOM_STATE = 42

# max_samples=len(X_train): every tree draws as many samples as there are rows.
# contamination: expected share of outliers, taken from the observed fraud ratio.
clf1 = IsolationForest(n_estimators=100, max_samples=len(X_train),
                       contamination=outlier_fraction,
                       random_state=RANDOM_STATE)

# LOF is deterministic for a given input, so it takes no seed.
clf2 = LocalOutlierFactor(n_neighbors=20, algorithm='auto',
                          leaf_size=30,
                          p=2,
                          contamination=outlier_fraction)

In [7]:
# Fit each detector on the features and count how many rows it labels differently
# from the ground-truth 'Class' column.
lst_name = ['孤立森林', '局部異常因子']
lst_clf = [clf1, clf2]

print('=== 分類錯誤的樣本數 ===')
for idx, name in enumerate(lst_name):
    clf = lst_clf[idx]

    # fit_predict labels inliers as 1 and outliers as -1.
    raw_labels = clf.fit_predict(X_train)

    # Re-encode to the dataset's convention: 0 -> valid transaction, 1 -> fraud.
    y_pred = (raw_labels == -1).astype(int)

    # Number of rows where the prediction disagrees with the true label.
    n_errors = (y_pred != y_train).sum()

    print('%s：%d' % (name, n_errors))

=== 分類錯誤的樣本數 ===
孤立森林：70
局部異常因子：78
