In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
import joblib 
from copy import copy

def data2timeseries(data, n_timesteps):
    processed_data = copy(data)
    # translate into timeseries
    timeseries = []
    for i in range(processed_data.shape[0]):
        # pad 0 for the initial steps
        if i < n_timesteps-1:
            pad = np.zeros((1, n_timesteps, processed_data.shape[1]))
            for j in range(i+1):
                pad[0, n_timesteps-1-j, :] = processed_data[i-j, ...]
            timeseries.append(pad)
        else:
            timeseries.append(processed_data[np.newaxis, i-n_timesteps+1:i+1, ...])
    processed_data = np.concatenate(timeseries, axis=0)
    return processed_data

def preprocess_dataframe(scaler_path='scaler_ip.pkl', scaler=RobustScaler()):
    # 학습 데이터 읽기. 경로 설정에 주의 하세요!
    data = pd.read_csv('data/IP/DHCP.csv')
    
    for key in sorted(list(data.keys())):
        data[key] = data[key].fillna(method='ffill')
        data[key] = data[key].fillna(0)
    print(np.sum(data.isna()))
    # -----------------------------------
    # TODO: 데이터 분석을 통해 다양한 전처리를 시도 해보세요!
    # preprocessed_train_set = train_set
    # -----------------------------------
    data['server_abnorm'] = data['Svr_detect'] + data['Svr_connect'] + data['Ss_request']
    data['server_tot'] = data['Svr_detect'] + data['Svr_connect'] 
    data['client_tot'] = data['Ss_request'] + data['Ss_Established']
    data['Svr_detect+Ss_request'] = data['Ss_request'] + data['Svr_detect'] 
    data['Svr_connet+Ss_request'] = data['Ss_request'] + data['Svr_connect'] 
    data['tot'] = data['server_tot']+data['client_tot']

    processed_df = data.copy()
    # data frame의 key를 set으로 변환하고 다시 list로 만드는 과정에서 key의 순서가 정해지지 않습니다. 
    # 본 검증 자료에서는 sort를 함으로써 동일한 결과가 나오도록 하였지만, 대회 중에는 이를 인지하지 못해 따로 통제하지 못했습니다. 
    # 이로 인해 검증 자료의 결과가 대회 기간 중 제출한 것과 완전히 동일하진 않을 수는 있으나, 결과의 유의미한 차이는 없을 것이라 
    # 판단하여 자료를 제출하니 참고 부탁드립니다.
    for key in sorted(list(set(data.keys())-set(['Timestamp']))):
        processed_df[key+'_cum'] = data[key].cumsum().fillna(method='ffill').fillna(0)
        processed_df[key+'_meddiff'] = data[key].median() - data[key]
        processed_df[key+'_mindiff'] = data[key].min() - data[key]
        processed_df[key+'_maxdiff'] = data[key].max() - data[key]
        processed_df[key+'_meandiff'] = data[key].mean() - data[key]
        processed_df[key+'_stddiff'] = data[key].std() - processed_df[key+'_meandiff']
        for i in range(1, 7):
            processed_df[key+'_diff_'+str(i)] = data[key].diff(i).fillna(method='bfill').fillna(0)
            processed_df[key+'_diff_back_'+str(i)] = data[key].diff(-i).fillna(method='ffill').fillna(0)
    
    processed_data = scaler.fit_transform(processed_df.drop(['Timestamp'], axis=1))
    joblib.dump(scaler, scaler_path)
    idx_half = data.index[data['Timestamp'] == '20210630_2350-0000'].tolist()[0]
    test_out = processed_data[idx_half+1:]   # 7.1 - 12.31 분리

    return processed_data, test_out, idx_half

    
train_set, test_set, idx_half = preprocess_dataframe(scaler=RobustScaler())
train_ts = data2timeseries(train_set, 12)
test_ts = data2timeseries(test_set, 12)

Timestamp         0
Svr_detect        0
Svr_connect       0
Ss_request        0
Ss_Established    0
dtype: int64


  processed_df[key+'_diff_'+str(i)] = data[key].diff(i).fillna(method='bfill').fillna(0)
  processed_df[key+'_diff_back_'+str(i)] = data[key].diff(-i).fillna(method='ffill').fillna(0)
  processed_df[key+'_cum'] = data[key].cumsum().fillna(method='ffill').fillna(0)
  processed_df[key+'_meddiff'] = data[key].median() - data[key]
  processed_df[key+'_mindiff'] = data[key].min() - data[key]
  processed_df[key+'_maxdiff'] = data[key].max() - data[key]
  processed_df[key+'_meandiff'] = data[key].mean() - data[key]
  processed_df[key+'_stddiff'] = data[key].std() - processed_df[key+'_meandiff']


Make True-Positive train-valid set using Isolation Forest

In [4]:
from copy import copy
from sklearn.model_selection import train_test_split

def isolate_outliers(train_data_ts:np.ndarray, train_df:np.ndarray, idx_half:int, n_time_step=6, n_estimators=200, seed=415, contamination=0.01):
    assert train_data_ts.shape[0]==len(train_df)
    model = IsolationForest(n_estimators=n_estimators, random_state=seed, contamination=contamination)

    indices = list(np.arange(len(train_df))[model.fit_predict(train_df)==-1])
    new_indices = copy(indices)
    for idx in indices:
        for i in range(1, n_time_step):
            if idx+i >= len(train_df):  
                break
            new_indices.append(idx+i)
    new_indices = list(set(range(len(train_df)))-set(new_indices))
    train_data_ts = train_data_ts[new_indices]
    train, valid = train_test_split(train_data_ts, test_size=0.2, random_state=415)

    return train, valid

train_set, valid_set = isolate_outliers(train_ts, train_set, idx_half, contamination=0.01, n_time_step=12)
train_set.shape, valid_set.shape, test_ts.shape

((39318, 12, 190), (9830, 12, 190), (26496, 12, 190))

In [5]:
print(train_set.shape, valid_set.shape, test_ts.shape)
joblib.dump(train_set, 'data/IP/ip_train_ts_12_190f.pkl')
joblib.dump(valid_set, 'data/IP/ip_valid_ts_12_190f.pkl')
joblib.dump(test_ts, 'data/IP/ip_test_ts_12_190f.pkl')

(39318, 12, 190) (9830, 12, 190) (26496, 12, 190)


['data/IP/ip_test_ts_12_190f.pkl']