In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
from scipy.stats import ks_2samp
import random

## Data Visualization

In [2]:
def data_visualize(headers, dataset_f, dataset_l):
    """Draw one scatter figure per feature column.

    For every entry in ``headers`` a new matplotlib figure is opened (numbered
    by the column position), titled with the header, and filled with a scatter
    of row position against the column's values. ``dataset_l`` is passed as the
    point colours.

    Args:
        headers: Column titles, one per feature column to plot.
        dataset_f: Feature frame, indexed positionally with ``.iloc``.
        dataset_l: Values handed to ``plt.scatter`` as ``c``.
    """
    for column, title in enumerate(headers):
        plt.figure(num=column, figsize=(18, 10), dpi=80)
        plt.title(title)
        # x axis is simply the row position within the frame
        row_positions = range(len(dataset_f.iloc[:, 0]))
        values = dataset_f.iloc[:, column]
        plt.scatter(row_positions, values, c=dataset_l, s=4)

## Data Pre-processing

In [3]:
## For training data
def train_preprocess(dataset):
    """Split a training frame into feature columns and the label column.

    The ``DATETIME`` column is dropped first; of the remaining columns the last
    one is returned as the labels and all others as the features. The input
    frame itself is not modified.

    Args:
        dataset: Training DataFrame containing a ``DATETIME`` column.

    Returns:
        tuple: ``(headers, dataset_f, dataset_l)`` - the feature column names,
        the feature DataFrame, and the label Series.
    """
    without_time = dataset.drop(columns=['DATETIME'])

    labels = without_time.iloc[:, -1]
    features = without_time.iloc[:, :-1]

    return list(features.columns), features, labels

In [4]:
## For testing data
def test_preprocess(dataset):
    dataset = dataset.drop(['DATETIME'], axis=1)
    headers = list(dataset)
    
    return headers, dataset

## Main Algorithm

In [5]:
## Generate random window size
def generate_win_size(data_size):
    """Split ``data_size`` rows into consecutive windows of random length.

    Each window length is drawn with ``random.randint(125, 133)``. Windows are
    collected while the running total plus the new draw stays below
    ``data_size``; the first draw that would reach or pass it is replaced by
    the number of rows still uncovered, so the lengths always sum to
    ``data_size``.

    Args:
        data_size: Total number of rows to cover.

    Returns:
        list[int]: Window lengths in order. An empty list is returned when
        ``data_size`` is not positive (there is nothing to partition).
    """
    windows = []
    if data_size <= 0:
        return windows

    # Running total of rows already assigned; avoids re-summing the list on
    # every iteration.
    covered = 0
    while True:
        window_size = random.randint(125, 133)
        if covered + window_size < data_size:
            windows.append(window_size)
            covered += window_size
        else:
            # Final window: take exactly the remaining rows (always >= 1).
            windows.append(data_size - covered)
            return windows

In [6]:
## Generate index for windows
def generate_index(windows):
    """Return the starting row offset of every window.

    The first window starts at 0 and each following window starts where the
    previous one ended, i.e. at the cumulative sum of the preceding lengths.

    Args:
        windows: Sequence of window lengths.

    Returns:
        list: Start offsets, one per window. For an empty ``windows`` the
        result is ``[0]``.
    """
    starts = [0]
    offset = 0
    # The last length is not needed: it only determines where that window ends.
    for length in windows[:-1]:
        offset += length
        starts.append(offset)
    return starts

In [7]:
## Calculating p-values for each window
def calculate_p(headers, windows, indexs, train1_f, test):
    """Run a two-sample KS test of every test window against the training data.

    For each feature column, the whole training column is compared with each
    window of the test column using ``ks_2samp``. Every window is tested,
    including the last one: a window is sliced as ``start:start + length``
    rather than up to the next start offset, so the final window (which has no
    following offset) is not skipped.

    Args:
        headers: Feature names; only its length is used, as the column count.
        windows: Length of each window.
        indexs: Start row of each window, aligned with ``windows``.
        train1_f: Training feature DataFrame (reference distribution).
        test: Test feature DataFrame with the same column order.

    Returns:
        tuple: ``(all_pvals, all_sorted_index)``. For each feature,
        ``all_pvals[j]`` holds the windows' p-values in ascending order and
        ``all_sorted_index[j]`` holds the window positions in that same order.
    """
    all_pvals = []
    all_sorted_index = []
    for j in range(len(headers)):
        reference = train1_f.iloc[:, j]
        pvals = []
        for start, length in zip(indexs, windows):
            _, pval = ks_2samp(reference, test.iloc[start:start + length, j])
            pvals.append(pval)

        # Remember which window each p-value belongs to before sorting them.
        sorted_index = sorted(range(len(pvals)), key=pvals.__getitem__)
        all_sorted_index.append(sorted_index)
        pvals.sort()
        all_pvals.append(pvals)

    return all_pvals, all_sorted_index

In [8]:
## Identify attack index
def identify_attack(all_pvals, all_sorted_index, numeric_threshold, binary_threshold,
                    numeric_count=31, feature_count=43):
    """Collect the windows whose p-value falls below the feature's cut-off.

    Features at positions ``0 .. numeric_count - 1`` are compared against
    ``10 ** numeric_threshold``; features at positions
    ``numeric_count .. feature_count - 1`` are compared against
    ``10 ** binary_threshold``. A window is reported if any feature flags it.

    Args:
        all_pvals: Per-feature lists of p-values.
        all_sorted_index: Per-feature lists of window positions, aligned
            element-for-element with ``all_pvals``.
        numeric_threshold: Base-10 exponent of the cut-off for the first
            ``numeric_count`` features.
        binary_threshold: Base-10 exponent of the cut-off for the remaining
            features.
        numeric_count: Number of leading features using the numeric cut-off.
            Defaults to 31, the value previously hard-coded.
        feature_count: Total number of features examined. Defaults to 43, the
            value previously hard-coded.

    Returns:
        list: Unique flagged window positions, in ascending order.

    Raises:
        IndexError: If ``all_pvals`` has fewer than ``feature_count`` entries.
    """
    numeric_cutoff = 10 ** numeric_threshold
    binary_cutoff = 10 ** binary_threshold

    attack = set()
    for i in range(feature_count):
        cutoff = numeric_cutoff if i < numeric_count else binary_cutoff
        for pval, window in zip(all_pvals[i], all_sorted_index[i]):
            if pval < cutoff:
                attack.add(window)

    return sorted(attack)

In [9]:
## Generate False padding
def generate_false(size):
    """Return a boolean column of ``size`` rows, all ``False``.

    Allocated directly so the result is always a ``(size, 1)`` boolean array,
    including for ``size == 0`` (building it from an empty Python list would
    give a float array of shape ``(0,)`` that cannot be stacked with the other
    columns).

    Args:
        size: Number of rows.

    Returns:
        numpy.ndarray: Boolean array of shape ``(size, 1)`` filled with False.
    """
    return np.zeros((size, 1), dtype=bool)

In [10]:
## Generate True padding
def generate_true(size):
    """Return a boolean column of ``size`` rows, all ``True``.

    Allocated directly so the result is always a ``(size, 1)`` boolean array,
    including for ``size == 0`` (building it from an empty Python list would
    give a float array of shape ``(0,)`` that cannot be stacked with the other
    columns).

    Args:
        size: Number of rows.

    Returns:
        numpy.ndarray: Boolean array of shape ``(size, 1)`` filled with True.
    """
    return np.ones((size, 1), dtype=bool)

In [11]:
## Generate one batch result
def one_batch(train1_f, test, headers, numeric_threshold, binary_threshold):
    """Label every test row once, using one random window partition.

    The test rows are cut into random-length windows, each window is scored
    against the training data with ``calculate_p``, and the windows selected by
    ``identify_attack`` are marked True; all other windows are marked False.

    Args:
        train1_f: Training feature DataFrame.
        test: Test feature DataFrame.
        headers: Feature names, forwarded to ``calculate_p``.
        numeric_threshold: Forwarded to ``identify_attack``.
        binary_threshold: Forwarded to ``identify_attack``.

    Returns:
        numpy.ndarray: The per-window label blocks stacked vertically with
        ``np.vstack``, in window order.
    """
    # Random partition of the test rows and the start offset of each window
    n_rows = test.shape[0]
    windows = generate_win_size(n_rows)
    indexs = generate_index(windows)

    # Score the windows, then pick the ones that look anomalous
    all_pvals, all_sorted_index = calculate_p(headers, windows, indexs, train1_f, test)
    flagged = identify_attack(all_pvals, all_sorted_index, numeric_threshold, binary_threshold)

    # Start with every window labelled False, then overwrite the flagged ones
    blocks = [generate_false(width) for width in windows]
    for position in flagged:
        blocks[position] = generate_true(windows[position])

    return np.vstack(blocks)

In [12]:
## Run n batches to optimize result
def multiple_batches(batches, train1_f, test, headers, numeric_threshold, binary_threshold):
    """Run ``one_batch`` several times and combine the per-row labels.

    Every batch uses its own random window partition. The boolean results are
    summed into a per-row vote count, and a row is finally labelled True when
    at least one batch flagged it.

    Args:
        batches: Number of batches to run; must be at least 1.
        train1_f: Training feature DataFrame.
        test: Test feature DataFrame.
        headers: Feature names.
        numeric_threshold: Forwarded to ``one_batch``.
        binary_threshold: Forwarded to ``one_batch``.

    Returns:
        tuple: ``(add_on, final_result)`` - an integer array with the number of
        batches that flagged each row, and the boolean array ``add_on >= 1``.

    Raises:
        ValueError: If ``batches`` is less than 1.
    """
    if batches < 1:
        raise ValueError('batches must be at least 1')

    batch_list = []
    for i in range(batches):
        print('Start Batch No.', i+1)
        batch_result = one_batch(train1_f, test, headers, numeric_threshold, binary_threshold)
        batch_list.append(batch_result)
        print('Finish Batch No.', i+1)

    # Sum with an integer dtype: adding boolean arrays directly is a logical OR
    # in NumPy, which would never count votes and, done in place, would also
    # overwrite the first batch's result.
    add_on = np.sum(batch_list, axis=0, dtype=int)

    final_result = add_on >= 1
    return add_on, final_result

## Main Function

In [13]:
# Script entry point: load the CSV files, preprocess them, run the batched
# window detection on the test set and write the predictions to a CSV file.
# NOTE(review): the window sizes are drawn with `random` and no seed is set in
# the visible code, so the written predictions can differ between runs.
if __name__ == '__main__':
    ## Load Training data
    # Each frame is multiplied by 1 right after loading - presumably to turn
    # boolean columns into integers; confirm against the CSV contents.
    train1 = pd.read_csv('training/train_dataset01.csv')*1
    train2 = pd.read_csv('training/train_dataset02.csv')*1

    ## Load Testing data
    test = pd.read_csv('test_dataset.csv')*1
    
    ## Pre-Process
    # `headers` is overwritten by every call below; after the last call it
    # holds the test set's column names.
    headers, train1_f, train1_l = train_preprocess(train1)
    # The second training set is preprocessed here but is not passed to the
    # prediction call below.
    headers, train2_f, train2_l = train_preprocess(train2)
    
    headers, test = test_preprocess(test)
    
    ## Visualize the data (disabled)
#     data_visualize(headers, train1_f, train1_l)
#     data_visualize(headers, train2_f, train2_l)
    
    ## Prediction
    # NOTE(review): the disabled call below refers to `generate_output`, which
    # is not defined in the visible part of this notebook.
#     prediction = generate_output(train1_f, test, 129, headers, -43, -12)
    # 5 batches; -43 and -12 are the numeric and binary thresholds, used
    # downstream as base-10 exponents of the p-value cut-offs.
    add_on, prediction = multiple_batches(5, train1_f, test, headers, -43, -12)
    
    ## Save to CSV file
    # Written with pandas' default to_csv settings.
    csv_df = pd.DataFrame(prediction)
    csv_df.to_csv('yourname.csv')

Start Batch No. 1
Finish Batch No. 1
Start Batch No. 2
Finish Batch No. 2
Start Batch No. 3
Finish Batch No. 3
Start Batch No. 4
Finish Batch No. 4
Start Batch No. 5
Finish Batch No. 5
