# 离群点分析与异常检测
数据集:wine

## 导入算法分类器

In [3]:
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.hbos import HBOS
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF


clfs  =  []
clf_ABOD = ABOD()
clfs.append(["ABOD", clf_ABOD])

clf_KNN = KNN()
clfs.append(["KNN", clf_KNN])

clf_IForest = IForest()
clfs.append(["IForest", clf_IForest])

clf_HBOS = HBOS()
clfs.append(["HBOS", clf_HBOS])

clf_LOF = LOF()
clfs.append(["LOF", clf_LOF])

clf_CBLOF = CBLOF()
clfs.append(["CBLOF", clf_CBLOF])
print(len(clfs))

6


## 导入数据集

In [4]:
import numpy as np
import pandas as pd
import os

df = pd.read_csv('./data/wine/wine_benchmark_0001.csv')
df.head()

Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,wine_point_3594,wine,regression,7,0.050492,nominal,-1.245962,-0.362411,-0.265853,-0.261304,-0.343495,1.209882,0.747594,-0.899276,0.817846,-0.613338,0.17456
1,wine_point_5089,wine,regression,5,0.082237,anomaly,0.75954,0.973867,0.215849,-0.53454,0.598458,-0.536656,0.199134,0.968217,0.071518,0.596292,-0.915394
2,wine_point_1912,wine,regression,6,0.290201,nominal,-0.088942,-0.969809,-0.403482,-0.870829,-0.429127,-0.592996,-0.791633,-0.699187,-1.110168,1.402712,-0.496181
3,wine_point_4908,wine,regression,5,0.053559,anomaly,0.219597,0.973867,0.284664,0.138039,0.427194,-0.762016,-0.243173,1.034913,0.817846,1.805921,0.006874
4,wine_point_2246,wine,regression,7,0.4203,nominal,0.219597,-0.180191,-0.541112,0.34822,-0.714567,-0.142276,0.446826,-0.242318,-0.36384,-1.016548,0.090717


## 将离群转换为数字标签

In [5]:
label_mapping = {
           'nominal': 0,
           'anomaly': 1}
df['ground.truth'] = df['ground.truth'].map(label_mapping)

df.head()

Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,wine_point_3594,wine,regression,7,0.050492,0,-1.245962,-0.362411,-0.265853,-0.261304,-0.343495,1.209882,0.747594,-0.899276,0.817846,-0.613338,0.17456
1,wine_point_5089,wine,regression,5,0.082237,1,0.75954,0.973867,0.215849,-0.53454,0.598458,-0.536656,0.199134,0.968217,0.071518,0.596292,-0.915394
2,wine_point_1912,wine,regression,6,0.290201,0,-0.088942,-0.969809,-0.403482,-0.870829,-0.429127,-0.592996,-0.791633,-0.699187,-1.110168,1.402712,-0.496181
3,wine_point_4908,wine,regression,5,0.053559,1,0.219597,0.973867,0.284664,0.138039,0.427194,-0.762016,-0.243173,1.034913,0.817846,1.805921,0.006874
4,wine_point_2246,wine,regression,7,0.4203,0,0.219597,-0.180191,-0.541112,0.34822,-0.714567,-0.142276,0.446826,-0.242318,-0.36384,-1.016548,0.090717


## 提取训练属性和标签

In [6]:
col_n =  ['fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar', 'chlorides', 'free.sulfur.dioxide', 
         'total.sulfur.dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'ground.truth']

data = pd.DataFrame(df,columns = col_n)
x = data.iloc[:, :-1]
y = data.iloc[:, -1:]

## 数据集划分

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

## 使用分类器检测

In [8]:
import numpy as np

x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [9]:
clf_nameclf_name = 'KNN'
clf = KNN()
clf.fit(x_train)

y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(x_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(x_test)  # outlier scores

## 准确率评价

In [11]:
from sklearn.metrics import roc_auc_score
from pyod.utils.utility import precision_n_scores

train_roc = np.round(roc_auc_score(y_train, y_train_pred), decimals=4)
train_prn = np.round(precision_n_scores(y_train, y_train_pred), decimals=4)
test_roc = np.round(roc_auc_score(y_test, y_test_pred), decimals=4)
test_prn = np.round(precision_n_scores(y_test, y_test_pred), decimals=4)

print(train_roc)
print(train_prn)
print(test_roc)
print(test_prn)

0.5243
0.4714
0.5325
0.5303
