In [18]:
import math
import numpy as np
from pandas import DataFrame
import datetime
from itertools import repeat

class HBOS:
    """Histogram-based outlier scoring over the columns of a DataFrame.

    ``fit`` builds one histogram (a list of ``HistogramBin``) per column and
    converts every bin's density into a score; ``predict`` combines, for each
    row, the scores of the bins its values fall into.

    Parameters
    ----------
    log_scale : bool
        Forwarded to ``HistogramBin.normalize_score``. When true, ``predict``
        sums the per-column scores; otherwise it multiplies them (unless
        ``ranked`` is set). ``fit`` forces it to ``False`` when ``ranked`` is
        true.
    ranked : bool
        When true, ``fit`` switches ``log_scale`` off and ``predict`` sums the
        per-column scores.
    bin_info_array : sequence, optional
        Number of bins per column. ``-1`` is replaced in ``fit`` by
        ``round(sqrt(number of rows))``. Empty/``None`` means ``-1`` everywhere.
    mode_array : sequence, optional
        Histogram mode per column. ``'dynamic binwidth'`` selects the dynamic
        histogram; any other value selects the fixed-width one.
        Empty/``None`` means ``'dynamic binwidth'`` everywhere.
    nominal_array : sequence, optional
        Per-column truthy flag marking a column as nominal.
        Empty/``None`` means ``False`` everywhere.

    Notes
    -----
    ``fit`` stores the resolved per-column settings on the instance, so a
    later ``fit`` call reuses the values resolved by the first one.
    """

    def __init__(self, log_scale=True, ranked=False, bin_info_array=None, mode_array=None, nominal_array=None):
        self.log_scale = log_scale
        self.ranked = ranked
        # Copy the sequences: defaults must not be shared between instances and
        # fit() overwrites -1 entries, which must not leak into the caller's list.
        self.bin_info_array = [] if bin_info_array is None else list(bin_info_array)
        self.mode_array = [] if mode_array is None else list(mode_array)
        self.nominal_array = [] if nominal_array is None else list(nominal_array)

    def fit(self, data):
        """Build and score one histogram per column of ``data``.

        The histograms are stored in ``self.histogram_list`` (one list of bins
        per column, in column order). Nothing is returned.
        """
        attr_size = len(data.columns)
        total_data_size = len(data)

        # Fill in per-column defaults for settings that were not supplied.
        if len(self.bin_info_array) == 0:
            self.bin_info_array = [-1] * attr_size
        if len(self.mode_array) == 0:
            self.mode_array = ['dynamic binwidth'] * attr_size
        if len(self.nominal_array) == 0:
            self.nominal_array = [False] * attr_size

        if self.ranked:
            self.log_scale = False

        normal = 1.0

        # -1 stands for "use the default bin count".
        for i, bin_count in enumerate(self.bin_info_array):
            if bin_count == -1:
                self.bin_info_array[i] = round(math.sqrt(total_data_size))

        self.histogram_list = [[] for _ in range(attr_size)]

        # Per-column maximum, used by HistogramBin.calc_score to normalize the bin width.
        maximum_value_of_rows = data.apply(max).values

        # Sort every column independently. apply() keeps the input's index, so
        # reset it: every lookup below addresses rows by position 0..n-1.
        sorted_data = data.apply(sorted).reset_index(drop=True)
        n_rows = len(sorted_data)

        for attrIndex, attr in enumerate(sorted_data.columns):
            last = 0
            bin_count = self.bin_info_array[attrIndex]
            is_nominal = bool(self.nominal_array[attrIndex])

            if self.mode_array[attrIndex] == 'dynamic binwidth':
                # Nominal columns aim at one value per bin; otherwise the rows
                # are spread evenly over the requested number of bins.
                if is_nominal:
                    values_per_bin = 1
                else:
                    values_per_bin = math.floor(n_rows / bin_count)
                while last < n_rows - 1:
                    last = self.create_dynamic_histogram(
                        self.histogram_list, sorted_data, last, values_per_bin, attrIndex, is_nominal)
            else:
                bin_start = sorted_data[attr][0]
                binwidth = (sorted_data[attr][n_rows - 1] - bin_start) / bin_count
                if is_nominal or binwidth == 0:
                    binwidth = 1
                count_bins = 0
                while last < n_rows:
                    is_last_bin = count_bins == bin_count - 1
                    last = self.create_static_histogram(
                        self.histogram_list, sorted_data, last, binwidth, attrIndex, bin_start, is_last_bin)
                    bin_start = bin_start + binwidth
                    count_bins = count_bins + 1

        # Score every bin and remember the largest score per column; it is
        # needed to normalize the scores of that column afterwards.
        max_score = []
        for i, histogram in enumerate(self.histogram_list):
            best = 0
            for _bin in histogram:
                _bin.total_data_size = total_data_size
                _bin.calc_score(maximum_value_of_rows[i])
                if best < _bin.score:
                    best = _bin.score
            max_score.append(best)

        for histogram, best in zip(self.histogram_list, max_score):
            for _bin in histogram:
                _bin.normalize_score(normal, best, self.log_scale)

        # NOTE: no rank-based post-processing is performed here; ``ranked``
        # only switches log scaling off and makes predict() sum the scores.

    def predict(self, data):
        """Return a list with one combined score per row of ``data``.

        Per-column scores are summed when ``log_scale`` or ``ranked`` is set
        and multiplied otherwise. Requires a prior ``fit``.
        """
        additive = bool(self.log_scale or self.ranked)
        n_attrs = len(data.columns)
        rows = data.values  # taken once instead of once per row

        score_array = []
        for each_data in rows:
            value = 0 if additive else 1
            for attr in range(n_attrs):
                score = self.get_score(self.histogram_list[attr], each_data[attr])
                value = value + score if additive else value * score
            score_array.append(value)
        return score_array

    def fit_predict(self, data):
        """Fit on ``data`` and return the scores of the same rows."""
        self.fit(data)
        return self.predict(data)

    def get_score(self, histogram, value):
        """Return the score of the bin of ``histogram`` that contains ``value``.

        All bins but the last are treated as half-open ``[from, to)``; the last
        bin also includes its upper bound. Returns 0 when no bin matches.
        """
        for i in range(len(histogram) - 1):
            _bin = histogram[i]
            if _bin.range_from <= value and value < _bin.range_to:
                return _bin.score

        _bin = histogram[-1]
        if _bin.range_from <= value and value <= _bin.range_to:
            return _bin.score
        return 0

    @staticmethod
    def check_amount(sortedData, first_occurrence, values_per_bin, attr):
        """Tell whether the value at ``first_occurrence`` repeats more than ``values_per_bin`` times.

        Compares the entry at ``first_occurrence`` with the one
        ``values_per_bin`` rows further down column ``attr`` of the sorted
        data; returns ``False`` when that row lies past the end.
        """
        probe = first_occurrence + values_per_bin
        if probe >= len(sortedData):
            return False
        column = sortedData[attr]
        return bool(column[first_occurrence] == column[probe])

    @staticmethod
    def create_dynamic_histogram(histogram_list, sortedData, first_index, values_per_bin, attrIndex, isNominal):
        """Create the next dynamic-width bin starting at row ``first_index``.

        The bin is appended to ``histogram_list[attrIndex]`` or, if it would
        have zero width, merged into the previous bin. Rows are addressed via
        ``sortedData[attr][i]``, so the frame must be indexed ``0..n-1``.

        Returns the row index at which the next bin has to start.
        """
        attr = sortedData.columns[attrIndex]
        column = sortedData[attr]
        n_rows = len(sortedData)

        _bin = HistogramBin(column[first_index], 0, 0)

        # Do not run past the end of the data.
        last_index = min(first_index + values_per_bin, n_rows)

        # The first value always goes into the bin.
        _bin.add_quantitiy(1)

        # Every further value up to last_index joins the bin, unless it is a
        # new value that itself occurs more than values_per_bin times; such a
        # value opens a bin of its own.
        cursor = first_index
        for i in range(first_index + 1, last_index):
            is_new_value = not (column[i] == column[cursor])
            if is_new_value and HBOS.check_amount(sortedData, i, values_per_bin, attr):
                break
            _bin.add_quantitiy(1)
            cursor = cursor + 1

        # Keep absorbing repetitions of the last value so equal values never
        # end up in different bins.
        for i in range(cursor + 1, n_rows):
            if column[i] == column[cursor]:
                _bin.add_quantitiy(1)
                cursor = cursor + 1
            else:
                break

        # The bin ends where the next value starts; at the end of the data it
        # ends at the last value (plus one for nominal columns).
        if cursor + 1 < n_rows:
            _bin.range_to = column[cursor + 1]
        elif isNominal:
            _bin.range_to = column[n_rows - 1] + 1
        else:
            _bin.range_to = column[n_rows - 1]

        bins = histogram_list[attrIndex]
        if _bin.range_to - _bin.range_from > 0:
            bins.append(_bin)
        elif len(bins) == 0:
            # A zero-width first bin is widened so it can still be scored.
            _bin.range_to = _bin.range_to + 1
            bins.append(_bin)
        else:
            # A zero-width bin (possible at the end of the histogram) is
            # merged into the previous bin.
            lastBin = bins[-1]
            lastBin.add_quantitiy(_bin.quantity)
            lastBin.range_to = _bin.range_to

        return cursor + 1

    @staticmethod
    def create_static_histogram(histogram_list, sorted_data, first_index, binwidth, attrIndex, bin_start, last_bin):
        """Create the fixed-width bin ``[bin_start, bin_start + binwidth]`` and fill it.

        When ``last_bin`` is truthy the bin instead extends to the last (largest)
        value of the column. Rows from ``first_index`` on are counted while their
        value does not exceed the bin's upper bound. The bin is appended to
        ``histogram_list[attrIndex]``.

        Returns the index of the first row that was not put into the bin.
        """
        attr = sorted_data.columns[attrIndex]
        column = sorted_data[attr]
        n_rows = len(sorted_data)

        range_to = column[n_rows - 1] if last_bin else bin_start + binwidth
        _bin = HistogramBin(bin_start, range_to, 0)

        cursor = first_index
        while cursor < n_rows and not (column[cursor] > _bin.range_to):
            _bin.add_quantitiy(1)
            cursor = cursor + 1

        histogram_list[attrIndex].append(_bin)
        return cursor

In [19]:
class HistogramBin:
    """One histogram bin: a value range, the number of values in it and a score.

    Attributes set here: ``range_from``, ``range_to``, ``quantity``, plus
    ``score`` and ``total_data_size`` (both start at 0 and are filled in later).
    """

    def __init__(self, range_from, range_to, quantity):
        self.range_from = range_from
        self.range_to = range_to
        self.quantity = quantity
        self.score = 0
        self.total_data_size = 0

    def get_height(self):
        """Return the bin's density: ``quantity`` divided by the bin width."""
        width = self.range_to - self.range_from
        return self.quantity / width

    def add_quantitiy(self, anz):
        """Increase ``quantity`` by ``anz`` (historical, misspelled name kept for callers)."""
        self.quantity = self.quantity + anz

    def add_quantity(self, anz):
        """Correctly spelled equivalent of ``add_quantitiy``."""
        self.add_quantitiy(anz)

    def calc_score(self, max_score):
        """Set ``score`` to the bin's density with the width scaled by ``total_data_size / |max_score|``.

        A ``max_score`` of 0 is treated as 1. Empty bins (``quantity`` of 0 or
        less) keep their current score.
        """
        if max_score == 0:
            max_score = 1

        if self.quantity > 0:
            scaled_width = (self.range_to - self.range_from) * self.total_data_size / abs(max_score)
            self.score = self.quantity / scaled_width

    def normalize_score(self, normal, max_score, log_scale):
        """Rescale ``score`` by ``normal / max_score`` and invert it.

        A score of 0 is left untouched. With ``log_scale`` true the inverted
        score is replaced by its base-10 logarithm.
        """
        # Check for an unscored bin first: dividing by max_score would raise
        # ZeroDivisionError when every score, and hence max_score, is 0.
        if self.score == 0:
            return
        self.score = self.score * normal / max_score
        if self.score == 0:
            return
        self.score = 1 / self.score
        if log_scale:
            self.score = math.log10(self.score)

In [20]:
from sklearn.ensemble import IsolationForest

In [31]:
import csv  # CSV writer from the standard library

import generate_nnr_data

residual_data, aged_residual_data = generate_nnr_data.generate_nnr()

# One flattened row per sample: the first 50 residual samples followed by the
# first 2 aged-residual samples.
data = np.array(
    [residual_data[i].flatten() for i in range(50)]
    + [aged_residual_data[i].flatten() for i in range(2)]
)

# Pick a file name that does not exist yet: mode 'w' overwrites an existing file.
# newline='' is what the csv module requires for files handed to csv.writer,
# and the with-block closes the file even if writing fails.
with open('new.csv', 'w', newline='') as file:
    csv.writer(file).writerows(data)

In [32]:
# The notebook only imports `DataFrame` from pandas, so `pd` has to be imported
# before it can be used here.
import pandas as pd

# header=None: the file is read as data rows only, columns get integer labels.
dataset = pd.read_csv("./new.csv", header=None)
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4874,4875,4876,4877,4878,4879,4880,4881,4882,4883
0,1.38975,-1.309167,1.424967,-0.8881,1.6051,-1.6184,0.9402,36.783467,-36.5481,0.0,...,1.773133,0.4778,0.026467,-0.068067,35.350433,-35.4169,0.0,0.0,0.0,0.0
1,1.5723,-2.371967,1.2213,1.089233,0.7761,0.803967,-0.7076,36.4778,-36.1453,0.0,...,1.2373,0.287833,0.827667,-0.313267,35.884,-36.102267,0.0,0.0,0.0,0.0
2,2.48775,-2.279433,1.000967,1.4769,1.874133,-0.688033,-0.436433,39.998233,-38.753767,0.0,...,1.116833,0.281233,0.8139,-0.6626,38.373933,-38.3024,0.0,0.0,0.0,0.0
3,1.7678,-1.852233,0.907767,1.061767,1.8017,0.4512,-0.556533,41.515533,-40.9928,0.0,...,0.792267,1.103433,0.470733,-0.791733,40.1131,-39.741767,0.0,0.0,0.0,0.0
4,2.7298,-3.1781,1.0412,1.7057,1.934533,-0.177067,-0.194433,41.583867,-41.2845,0.0,...,0.695167,0.254133,0.8565,-0.9051,41.089233,-40.769633,0.0,0.0,0.0,0.0
5,2.05035,-2.0777,1.0933,1.207767,1.7112,-0.326367,0.855867,41.642467,-41.111167,0.0,...,0.851633,0.9651,-0.296233,-0.008233,40.1906,-40.1376,0.0,0.0,0.0,0.0
6,2.4741,-2.858067,1.2018,1.825367,0.946067,0.558767,-0.3139,40.102167,-39.803533,0.0,...,0.710367,0.497367,1.3157,-1.2069,38.6519,-38.844967,0.0,0.0,0.0,0.0
7,1.9491,-2.672767,0.540733,1.6046,1.0587,0.546367,0.1041,41.160467,-40.400567,0.0,...,0.113367,0.972233,0.340067,-0.315167,40.714967,-40.276667,0.0,0.0,0.0,0.0
8,2.05115,-2.647867,1.028033,1.819867,0.940633,-0.602833,1.095267,41.167667,-40.393567,0.0,...,0.980067,0.644433,0.588333,-0.6806,39.7294,-39.423933,0.0,0.0,0.0,0.0
9,1.92485,-3.477433,1.3499,1.650067,1.081167,-0.278867,0.710567,40.5503,-40.063567,0.0,...,2.073833,0.043633,1.409867,-1.5563,40.0436,-39.8195,0.0,0.0,0.0,0.0


In [33]:
# Build the histograms from the data set, then score those same rows.
hbos = HBOS()
hbos.fit(dataset)
hbos_result = hbos.predict(dataset)

In [34]:
# Display the list of scores returned by fit_predict (one entry per row of `dataset`).
hbos_result

[2071.7105844799466,
 1553.952907749677,
 1512.5488754722924,
 1444.6715192726872,
 1556.1522287182413,
 1467.3931742680475,
 1380.4785150334258,
 1359.428932990368,
 1439.8938330934527,
 1411.7887481292867,
 1610.637787459698,
 1523.8708983447873,
 1476.2021634559687,
 1484.631984759517,
 1514.5123203712078,
 1372.4084346265179,
 1598.255056211019,
 1525.1932327334594,
 1435.7017444230216,
 1671.0433449347522,
 1609.0555369692484,
 1548.0726175394336,
 1460.9233626602434,
 1436.513489522176,
 1619.3224279720673,
 1505.800314443531,
 1576.1617135172585,
 1707.1803732270055,
 1407.1135977337603,
 1504.2735124508943,
 1372.490181247685,
 1377.4753913173438,
 1494.794578199002,
 1414.4389174669163,
 1378.6816297582643,
 1545.9880628949263,
 1505.3181748816473,
 1792.1788891651388,
 1535.5696712480694,
 1423.063696675478,
 1326.9568959687567,
 1484.0067244188895,
 1549.6876487247482,
 1452.4375757441853,
 1347.846013962276,
 1542.137160928559,
 1454.699914519909,
 1485.9629271959122,
 1544