# 离群点检测和异常点分析 Part2

本部分实验基于wave_benchmarks数据集，结合pyod工具包使用多种算法进行离群点的识别和分析。

### 1.准备工作

In [3]:
# 导入必须的数据集和依赖包
from __future__ import division
from __future__ import print_function

import os
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
import pandas as pd
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
from sklearn.model_selection import train_test_split

### 2. 数据处理

In [1]:
def get_data(no):
    """Load one wave_benchmark CSV file and split it into features and labels.

    Args:
        no: Benchmark number. Either the zero-padded string used in the file
            name (e.g. "0001"), or an int, which is padded to four digits.

    Returns:
        A ``(feature, label)`` tuple of DataFrames. ``feature`` holds the CSV
        columns at zero-based positions 6..26. ``label`` holds the single
        'ground.truth' column (position 5) with 'nominal' mapped to 0 and
        'anomaly' mapped to 1.
    """
    if isinstance(no, int):
        no = str(no).zfill(4)
    filename = "./wave/benchmarks/wave_benchmark_" + no + ".csv"

    # Parse the file once (label column 5 plus feature columns 6..26) and
    # split by position, instead of reading the same CSV twice.
    data = pd.read_csv(filename, usecols=list(range(5, 27)))

    # Copies keep both results independent of the shared parsed frame, so the
    # column assignment below does not write into a slice of `data`.
    label = data.iloc[:, [0]].copy()
    feature = data.iloc[:, 1:].copy()

    label_mapping = {'nominal': 0, 'anomaly': 1}
    label['ground.truth'] = label['ground.truth'].map(label_mapping)
    return feature, label

> 本实验主要使用pyod工具包内提供的算法接口，评价指标为ROC-AUC分数（输出中同时给出 precision @ rank n）

### 3.使用knn算法分析离群点

In [5]:
    # Run the kNN detector over wave benchmark files 0001..1640.
    for i in range(1, 1641):
        no = "{:04d}".format(i)  # file numbers are zero-padded to 4 digits
        feature, label = get_data(no)

        # 80/20 train/test split with a fixed seed
        X_train, X_test, y_train, y_test = train_test_split(
            feature,
            label,
            test_size=0.2,
            random_state=1,
        )

        # Fit the kNN detector on the training features only (labels unused).
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)

        # Results on the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # Results on the held-out test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # Only the first 20 datasets are reported to keep the output short;
        # remove this guard to report every dataset.
        if i > 20:
            continue

        # NOTE(review): the printed prefix says "wine" although the files read
        # by get_data are wave_benchmark_*.csv -- confirm the intended name.
        print("wine_benchmark_" + no + ".csv:")
        for caption, truth, scores in (
            ("On Training Data:  ", y_train, y_train_scores),
            ("On Test Data:  ", y_test, y_test_scores),
        ):
            print(caption, end="")
            evaluate_print(clf_name, truth, scores)
        print("")

wine_benchmark_0001.csv:
On Training Data:  KNN ROC:0.5427, precision @ rank n:0.365
On Test Data:  KNN ROC:0.5571, precision @ rank n:0.3698

wine_benchmark_0002.csv:
On Training Data:  KNN ROC:0.5126, precision @ rank n:0.3421
On Test Data:  KNN ROC:0.5386, precision @ rank n:0.3529

wine_benchmark_0003.csv:
On Training Data:  KNN ROC:0.5275, precision @ rank n:0.3567
On Test Data:  KNN ROC:0.5339, precision @ rank n:0.3266

wine_benchmark_0004.csv:
On Training Data:  KNN ROC:0.5588, precision @ rank n:0.3706
On Test Data:  KNN ROC:0.5314, precision @ rank n:0.3415

wine_benchmark_0005.csv:
On Training Data:  KNN ROC:0.5039, precision @ rank n:0.33
On Test Data:  KNN ROC:0.53, precision @ rank n:0.3846

wine_benchmark_0006.csv:
On Training Data:  KNN ROC:0.5158, precision @ rank n:0.3309
On Test Data:  KNN ROC:0.5623, precision @ rank n:0.412

wine_benchmark_0007.csv:
On Training Data:  KNN ROC:0.5399, precision @ rank n:0.3664
On Test Data:  KNN ROC:0.515, precision @ rank n:0.3493


### 4.使用LOF进行离群点分析

In [6]:
# Run the LOF detector over wave benchmark files 0001..1640.
for i in range(1, 1641):
    no = "{:04d}".format(i)  # file numbers are zero-padded to 4 digits
    feature, label = get_data(no)

    # 80/20 train/test split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        feature,
        label,
        test_size=0.2,
        random_state=1,
    )

    # Fit the LOF detector on the training features only (labels unused).
    clf_name = 'LOF'
    clf = LOF()
    clf.fit(X_train)

    # Results on the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Results on the held-out test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Only the first 20 datasets are reported to keep the output short;
    # remove this guard to report every dataset.
    if i > 20:
        continue

    # NOTE(review): the printed prefix says "wine" although the files read
    # by get_data are wave_benchmark_*.csv -- confirm the intended name.
    print("wine_benchmark_" + no + ".csv:")
    for caption, truth, scores in (
        ("On Training Data:  ", y_train, y_train_scores),
        ("On Test Data:  ", y_test, y_test_scores),
    ):
        print(caption, end="")
        evaluate_print(clf_name, truth, scores)
    print("")

wine_benchmark_0001.csv:
On Training Data:  LOF ROC:0.52, precision @ rank n:0.3375
On Test Data:  LOF ROC:0.5376, precision @ rank n:0.3542

wine_benchmark_0002.csv:
On Training Data:  LOF ROC:0.5057, precision @ rank n:0.3337
On Test Data:  LOF ROC:0.5462, precision @ rank n:0.3627

wine_benchmark_0003.csv:
On Training Data:  LOF ROC:0.4898, precision @ rank n:0.3112
On Test Data:  LOF ROC:0.5159, precision @ rank n:0.3266

wine_benchmark_0004.csv:
On Training Data:  LOF ROC:0.5221, precision @ rank n:0.3167
On Test Data:  LOF ROC:0.5159, precision @ rank n:0.3366

wine_benchmark_0005.csv:
On Training Data:  LOF ROC:0.5138, precision @ rank n:0.33
On Test Data:  LOF ROC:0.5448, precision @ rank n:0.4118

wine_benchmark_0006.csv:
On Training Data:  LOF ROC:0.4973, precision @ rank n:0.3162
On Test Data:  LOF ROC:0.5255, precision @ rank n:0.3657

wine_benchmark_0007.csv:
On Training Data:  LOF ROC:0.5167, precision @ rank n:0.3482
On Test Data:  LOF ROC:0.4872, precision @ rank n:0.33

### 5.使用PCA算法进行离群点分析

In [7]:
    # Run the PCA detector over wave benchmark files 0001..1640.
    for i in range(1, 1641):
        no = "{:04d}".format(i)  # file numbers are zero-padded to 4 digits
        feature, label = get_data(no)

        # 80/20 train/test split with a fixed seed
        X_train, X_test, y_train, y_test = train_test_split(
            feature,
            label,
            test_size=0.2,
            random_state=1,
        )

        # Fit the PCA detector (3 components) on the training features only.
        clf_name = 'PCA'
        clf = PCA(n_components=3)
        clf.fit(X_train)

        # Results on the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # Results on the held-out test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # Only the first 20 datasets are reported to keep the output short;
        # remove this guard to report every dataset.
        if i > 20:
            continue

        # NOTE(review): the printed prefix says "wine" although the files read
        # by get_data are wave_benchmark_*.csv -- confirm the intended name.
        print("wine_benchmark_" + no + ".csv:")
        for caption, truth, scores in (
            ("On Training Data:  ", y_train, y_train_scores),
            ("On Test Data:  ", y_test, y_test_scores),
        ):
            print(caption, end="")
            evaluate_print(clf_name, truth, scores)
        print("")

wine_benchmark_0001.csv:
On Training Data:  PCA ROC:0.546, precision @ rank n:0.38
On Test Data:  PCA ROC:0.5594, precision @ rank n:0.3646

wine_benchmark_0002.csv:
On Training Data:  PCA ROC:0.5069, precision @ rank n:0.3601
On Test Data:  PCA ROC:0.5731, precision @ rank n:0.4314

wine_benchmark_0003.csv:
On Training Data:  PCA ROC:0.521, precision @ rank n:0.3752
On Test Data:  PCA ROC:0.5498, precision @ rank n:0.3719

wine_benchmark_0004.csv:
On Training Data:  PCA ROC:0.533, precision @ rank n:0.3495
On Test Data:  PCA ROC:0.5331, precision @ rank n:0.3902

wine_benchmark_0005.csv:
On Training Data:  PCA ROC:0.5151, precision @ rank n:0.3696
On Test Data:  PCA ROC:0.5248, precision @ rank n:0.4027

wine_benchmark_0006.csv:
On Training Data:  PCA ROC:0.5315, precision @ rank n:0.3799
On Test Data:  PCA ROC:0.5567, precision @ rank n:0.3889

wine_benchmark_0007.csv:
On Training Data:  PCA ROC:0.5489, precision @ rank n:0.3894
On Test Data:  PCA ROC:0.5228, precision @ rank n:0.392

### 6.使用MCD算法进行离群点分析

In [8]:
# Run the MCD detector over wave benchmark files 0001..1640.
for i in range(1, 1641):
    no = "{:04d}".format(i)  # file numbers are zero-padded to 4 digits
    feature, label = get_data(no)

    # 80/20 train/test split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        feature,
        label,
        test_size=0.2,
        random_state=1,
    )

    # Fit the MCD detector on the training features only (labels unused).
    clf_name = 'MCD'
    clf = MCD()
    clf.fit(X_train)

    # Results on the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Results on the held-out test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Only the first 20 datasets are reported to keep the output short;
    # remove this guard to report every dataset.
    if i > 20:
        continue

    # NOTE(review): the printed prefix says "wine" although the files read
    # by get_data are wave_benchmark_*.csv -- confirm the intended name.
    print("wine_benchmark_" + no + ".csv:")
    for caption, truth, scores in (
        ("On Training Data:  ", y_train, y_train_scores),
        ("On Test Data:  ", y_test, y_test_scores),
    ):
        print(caption, end="")
        evaluate_print(clf_name, truth, scores)
    print("")

wine_benchmark_0001.csv:
On Training Data:  MCD ROC:0.5043, precision @ rank n:0.3312
On Test Data:  MCD ROC:0.5176, precision @ rank n:0.3229

wine_benchmark_0002.csv:
On Training Data:  MCD ROC:0.4796, precision @ rank n:0.3217
On Test Data:  MCD ROC:0.5315, precision @ rank n:0.3284

wine_benchmark_0003.csv:
On Training Data:  MCD ROC:0.4848, precision @ rank n:0.3173
On Test Data:  MCD ROC:0.5033, precision @ rank n:0.3065

wine_benchmark_0004.csv:
On Training Data:  MCD ROC:0.4979, precision @ rank n:0.3049
On Test Data:  MCD ROC:0.4897, precision @ rank n:0.322

wine_benchmark_0005.csv:
On Training Data:  MCD ROC:0.4735, precision @ rank n:0.3041
On Test Data:  MCD ROC:0.5054, precision @ rank n:0.3846

wine_benchmark_0006.csv:
On Training Data:  MCD ROC:0.4811, precision @ rank n:0.3064
On Test Data:  MCD ROC:0.5311, precision @ rank n:0.3796

wine_benchmark_0007.csv:
On Training Data:  MCD ROC:0.5025, precision @ rank n:0.3398
On Test Data:  MCD ROC:0.4748, precision @ rank n:0