In [1]:
# -*- coding: utf-8 -*-
"""Example of using PCA for outlier detection
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import os
import sys
import numpy as np
import time
import pickle

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

from pyod.models.knn import KNN
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

def main():
    
    # 读取训练数据集
    train_x = []
    train_y = []
    for i in range(5):
        file = "./dataSet/data_batch_%d"%(i+1)
        dicts = unpickle(file)
        labels = dicts[b'labels']
        data = dicts[b'data'].reshape(-1, 3, 32, 32) # 3*32x32
        data = data.transpose(0,2,3,1)

    for j in range(len(labels)):
        img_data = data[j]
        label = labels[j]
        train_x.append(img_data)
        train_y.append(label)
    # 读取测试数据集
    test_x = []
    test_y = []
    for i in range(1):
        # file = "./data_batch_%d"%(i+1)
        file = "./dataSet/test_batch"
        dicts = unpickle(file)
        labels = dicts[b'labels']
        data = dicts[b'data'].reshape(-1, 3, 32, 32) # 3*32x32
        data = data.transpose(0,2,3,1)

        for j in range(len(labels)):
            img_data = data[j]
            label = labels[j]
            test_x.append(img_data)
            test_y.append(label)
    train_x = np.array(train_x)
    train_y = np.array(train_y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)
    
    nomal_label = 0
    norm_train_x = train_x[train_y == nomal_label]
    norm_train_x = norm_train_x.flatten()
    norm_train_x = norm_train_x.reshape(-1, 3072)
    
    test_x = test_x.flatten()
    test_x = test_x.reshape(-1, 3072)

    
    true_y = test_y.copy()
    true_y[test_y == nomal_label] = 1
    true_y[test_y != nomal_label] = 0
    
    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(norm_train_x)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    for i in range(10):
        start_time = time.time()
        y_test_pred = clf.predict(test_x)  # outlier labels (0 or 1)
        print("平均消耗的时间为", ((time.time()-start_time) / test_x.shape[0]*1000))

In [2]:
main()

平均消耗的时间为 8.73938982486725
平均消耗的时间为 8.117585301399231
平均消耗的时间为 8.06956024169922
平均消耗的时间为 8.112499094009399
平均消耗的时间为 8.14856948852539
平均消耗的时间为 8.130594182014466
平均消耗的时间为 8.046667742729188
平均消耗的时间为 8.044942736625671
平均消耗的时间为 8.02818913459778
平均消耗的时间为 7.989023613929748
