# 离群点检测和异常点分析 Part1

本部分实验基于wine_benchmarks数据集，结合pyod工具使用多种算法进行离群点的识别和分析

### 1.准备工作

In [1]:
# 导入必须的数据集和依赖包
from __future__ import division
from __future__ import print_function

import os
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
import pandas as pd
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
from sklearn.model_selection import train_test_split

### 2. 数据处理

In [2]:
# Read the features and ground-truth labels of one wine_benchmark file.
def get_data(no):
    """Load one wine benchmark CSV and split it into features and labels.

    Args:
        no: Benchmark number exactly as it appears in the file name,
            as a string (e.g. ``"0001"``).

    Returns:
        A ``(feature, label)`` tuple of DataFrames. ``feature`` holds CSV
        columns 6-16 (zero-based positions). ``label`` holds column 5,
        which must be named ``ground.truth``, with ``'nominal'`` mapped
        to 0 and ``'anomaly'`` mapped to 1.
    """
    filename = "./wine/benchmarks/wine_benchmark_" + no + ".csv"
    # Parse the file a single time; the label column (5) and the feature
    # columns (6-16) are then separated by position. read_csv keeps the
    # file's column order, so the label is the first selected column.
    data = pd.read_csv(filename, usecols=list(range(5, 17)))
    label = data.iloc[:, [0]].copy()
    feature = data.iloc[:, 1:].copy()
    label_mapping = {'nominal': 0, 'anomaly': 1}
    label['ground.truth'] = label['ground.truth'].map(label_mapping)
    return feature, label

> 本实验主要使用pyod工具包内提供的算法接口，评价指标为ROC-AUC分数和precision @ rank n

### 3.使用knn算法分析离群点

In [6]:
    # Run the kNN detector over every wine benchmark file (0001 .. 1680).
    for i in range(1, 1681):
        # Benchmark file names carry a four-digit, zero-padded number.
        no = "%04d" % i
        feature, label = get_data(no)

        # 80/20 train/test split with a fixed seed so runs are repeatable.
        X_train, X_test, y_train, y_test = train_test_split(
            feature, label, random_state=1, test_size=0.2)

        # Fit the kNN detector on the training features.
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)

        # Training data: raw outlier scores and binary labels
        # (0: inliers, 1: outliers) stored on the fitted detector.
        y_train_scores = clf.decision_scores_
        y_train_pred = clf.labels_

        # Test data: outlier scores and predicted labels (0 or 1).
        y_test_scores = clf.decision_function(X_test)
        y_test_pred = clf.predict(X_test)

        # For display purposes only the first 20 datasets are reported;
        # drop this guard to print the results of every dataset.
        if i > 20:
            continue

        print("wine_benchmark_" + no + ".csv:")
        print("On Training Data:  ", end="")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("On Test Data:  ", end="")
        evaluate_print(clf_name, y_test, y_test_scores)
        print("")

wine_benchmark_0001.csv:
On Training Data:  KNN ROC:0.5599, precision @ rank n:0.4098
On Test Data:  KNN ROC:0.4976, precision @ rank n:0.368

wine_benchmark_0002.csv:
On Training Data:  KNN ROC:0.5479, precision @ rank n:0.3977
On Test Data:  KNN ROC:0.5557, precision @ rank n:0.4038

wine_benchmark_0003.csv:
On Training Data:  KNN ROC:0.5578, precision @ rank n:0.4234
On Test Data:  KNN ROC:0.5527, precision @ rank n:0.4066

wine_benchmark_0004.csv:
On Training Data:  KNN ROC:0.5522, precision @ rank n:0.4135
On Test Data:  KNN ROC:0.5813, precision @ rank n:0.4444

wine_benchmark_0005.csv:
On Training Data:  KNN ROC:0.5634, precision @ rank n:0.4213
On Test Data:  KNN ROC:0.557, precision @ rank n:0.4176

wine_benchmark_0006.csv:
On Training Data:  KNN ROC:0.5522, precision @ rank n:0.426
On Test Data:  KNN ROC:0.531, precision @ rank n:0.4252

wine_benchmark_0007.csv:
On Training Data:  KNN ROC:0.551, precision @ rank n:0.4272
On Test Data:  KNN ROC:0.5501, precision @ rank n:0.424

### 4.使用LOF进行离群点分析

In [7]:
# Run the LOF detector over every wine benchmark file (0001 .. 1680).
for i in range(1, 1681):
    # Benchmark file names carry a four-digit, zero-padded number.
    no = "%04d" % i
    feature, label = get_data(no)

    # 80/20 train/test split with a fixed seed so runs are repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        feature, label, random_state=1, test_size=0.2)

    # Fit the LOF detector on the training features.
    clf_name = 'LOF'
    clf = LOF()
    clf.fit(X_train)

    # Training data: raw outlier scores and binary labels
    # (0: inliers, 1: outliers) stored on the fitted detector.
    y_train_scores = clf.decision_scores_
    y_train_pred = clf.labels_

    # Test data: outlier scores and predicted labels (0 or 1).
    y_test_scores = clf.decision_function(X_test)
    y_test_pred = clf.predict(X_test)

    # For display purposes only the first 20 datasets are reported;
    # drop this guard to print the results of every dataset.
    if i > 20:
        continue

    print("wine_benchmark_" + no + ".csv:")
    print("On Training Data:  ", end="")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("On Test Data:  ", end="")
    evaluate_print(clf_name, y_test, y_test_scores)
    print("")

wine_benchmark_0001.csv:
On Training Data:  LOF ROC:0.5588, precision @ rank n:0.416
On Test Data:  LOF ROC:0.4914, precision @ rank n:0.3643

wine_benchmark_0002.csv:
On Training Data:  LOF ROC:0.5647, precision @ rank n:0.4035
On Test Data:  LOF ROC:0.5614, precision @ rank n:0.4192

wine_benchmark_0003.csv:
On Training Data:  LOF ROC:0.5577, precision @ rank n:0.4169
On Test Data:  LOF ROC:0.5444, precision @ rank n:0.3919

wine_benchmark_0004.csv:
On Training Data:  LOF ROC:0.5508, precision @ rank n:0.4198
On Test Data:  LOF ROC:0.564, precision @ rank n:0.4148

wine_benchmark_0005.csv:
On Training Data:  LOF ROC:0.5648, precision @ rank n:0.4217
On Test Data:  LOF ROC:0.5777, precision @ rank n:0.4286

wine_benchmark_0006.csv:
On Training Data:  LOF ROC:0.5517, precision @ rank n:0.4097
On Test Data:  LOF ROC:0.547, precision @ rank n:0.4441

wine_benchmark_0007.csv:
On Training Data:  LOF ROC:0.5625, precision @ rank n:0.4213
On Test Data:  LOF ROC:0.5161, precision @ rank n:0.3

### 5.使用PCA算法进行离群点分析

In [8]:
    # Run the PCA detector over every wine benchmark file (0001 .. 1680).
    for i in range(1, 1681):
        # Benchmark file names carry a four-digit, zero-padded number.
        no = "%04d" % i
        feature, label = get_data(no)

        # 80/20 train/test split with a fixed seed so runs are repeatable.
        X_train, X_test, y_train, y_test = train_test_split(
            feature, label, random_state=1, test_size=0.2)

        # Fit the PCA detector (3 components) on the training features.
        clf_name = 'PCA'
        clf = PCA(n_components=3)
        clf.fit(X_train)

        # Training data: raw outlier scores and binary labels
        # (0: inliers, 1: outliers) stored on the fitted detector.
        y_train_scores = clf.decision_scores_
        y_train_pred = clf.labels_

        # Test data: outlier scores and predicted labels (0 or 1).
        y_test_scores = clf.decision_function(X_test)
        y_test_pred = clf.predict(X_test)

        # For display purposes only the first 20 datasets are reported;
        # drop this guard to print the results of every dataset.
        if i > 20:
            continue

        print("wine_benchmark_" + no + ".csv:")
        print("On Training Data:  ", end="")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("On Test Data:  ", end="")
        evaluate_print(clf_name, y_test, y_test_scores)
        print("")

wine_benchmark_0001.csv:
On Training Data:  PCA ROC:0.5372, precision @ rank n:0.4113
On Test Data:  PCA ROC:0.5445, precision @ rank n:0.3978

wine_benchmark_0002.csv:
On Training Data:  PCA ROC:0.5323, precision @ rank n:0.387
On Test Data:  PCA ROC:0.537, precision @ rank n:0.3808

wine_benchmark_0003.csv:
On Training Data:  PCA ROC:0.5387, precision @ rank n:0.4215
On Test Data:  PCA ROC:0.4813, precision @ rank n:0.3493

wine_benchmark_0004.csv:
On Training Data:  PCA ROC:0.531, precision @ rank n:0.4198
On Test Data:  PCA ROC:0.5434, precision @ rank n:0.4

wine_benchmark_0005.csv:
On Training Data:  PCA ROC:0.5467, precision @ rank n:0.4147
On Test Data:  PCA ROC:0.5164, precision @ rank n:0.3956

wine_benchmark_0006.csv:
On Training Data:  PCA ROC:0.5313, precision @ rank n:0.4206
On Test Data:  PCA ROC:0.5564, precision @ rank n:0.4407

wine_benchmark_0007.csv:
On Training Data:  PCA ROC:0.534, precision @ rank n:0.4236
On Test Data:  PCA ROC:0.5013, precision @ rank n:0.4064


### 6.使用MCD算法进行离群点分析

In [9]:
# Run the MCD detector over every wine benchmark file (0001 .. 1680).
for i in range(1, 1681):
    # Benchmark file names carry a four-digit, zero-padded number.
    no = "%04d" % i
    feature, label = get_data(no)

    # 80/20 train/test split with a fixed seed so runs are repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        feature, label, random_state=1, test_size=0.2)

    # Fit the MCD detector on the training features.
    clf_name = 'MCD'
    clf = MCD()
    clf.fit(X_train)

    # Training data: raw outlier scores and binary labels
    # (0: inliers, 1: outliers) stored on the fitted detector.
    y_train_scores = clf.decision_scores_
    y_train_pred = clf.labels_

    # Test data: outlier scores and predicted labels (0 or 1).
    y_test_scores = clf.decision_function(X_test)
    y_test_pred = clf.predict(X_test)

    # For display purposes only the first 20 datasets are reported;
    # drop this guard to print the results of every dataset.
    if i > 20:
        continue

    print("wine_benchmark_" + no + ".csv:")
    print("On Training Data:  ", end="")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("On Test Data:  ", end="")
    evaluate_print(clf_name, y_test, y_test_scores)
    print("")

wine_benchmark_0001.csv:
On Training Data:  MCD ROC:0.618, precision @ rank n:0.4779
On Test Data:  MCD ROC:0.6082, precision @ rank n:0.4721

wine_benchmark_0002.csv:
On Training Data:  MCD ROC:0.6184, precision @ rank n:0.4694
On Test Data:  MCD ROC:0.5951, precision @ rank n:0.45

wine_benchmark_0003.csv:
On Training Data:  MCD ROC:0.6196, precision @ rank n:0.4819
On Test Data:  MCD ROC:0.5818, precision @ rank n:0.4652

wine_benchmark_0004.csv:
On Training Data:  MCD ROC:0.62, precision @ rank n:0.4991
On Test Data:  MCD ROC:0.6291, precision @ rank n:0.4852

wine_benchmark_0005.csv:
On Training Data:  MCD ROC:0.6194, precision @ rank n:0.4816
On Test Data:  MCD ROC:0.6292, precision @ rank n:0.4945

wine_benchmark_0006.csv:
On Training Data:  MCD ROC:0.6102, precision @ rank n:0.4779
On Test Data:  MCD ROC:0.6131, precision @ rank n:0.5119

wine_benchmark_0007.csv:
On Training Data:  MCD ROC:0.6031, precision @ rank n:0.4888
On Test Data:  MCD ROC:0.6142, precision @ rank n:0.484