In [None]:
import os
import numpy as np   
import sklearn
import keras
import pandas as pd

In [None]:
# Read the labelled training set and the unlabelled test set from the shared input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../../input/train.csv', '../../input/test.csv')
)

In [None]:
# Row offset at which each KPI first appears in the training file. Rows of one KPI are
# assumed to be contiguous, so consecutive offsets delimit one series.
id_list, first_rows = np.unique(train_data['KPI ID'], return_index=True)
# Offsets in file order, with the total row count appended as the end of the last series.
id_indexes = np.append(np.sort(first_rows), len(train_data))

# One value array and one label array per KPI, in order of first appearance in the file.
timeseries_all = [
    np.asarray(train_data['value'][begin:end])
    for begin, end in zip(id_indexes[:-1], id_indexes[1:])
]
timeseries_label = [
    np.asarray(train_data['label'][begin:end])
    for begin, end in zip(id_indexes[:-1], id_indexes[1:])
]

In [None]:
# Same splitting as for the training file: locate the first row of every KPI in the test
# file (rows of one KPI are assumed contiguous) and cut the value column at those offsets.
test_id_list, test_first_rows = np.unique(test_data['KPI ID'], return_index=True)
test_id_indexes = np.append(np.sort(test_first_rows), len(test_data))

# One value array per KPI, in order of first appearance in the test file.
testseries_all = [
    np.asarray(test_data['value'][begin:end])
    for begin, end in zip(test_id_indexes[:-1], test_id_indexes[1:])
]

In [None]:
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
def get_feature_AddES_residuals(time_series):
    """Return the in-sample residuals of an additive-trend exponential smoothing fit.

    The model is fitted with ``smoothing_level`` fixed to 1 and the result is
    ``time_series`` minus the model's fitted values.
    """
    fitted_model = ExponentialSmoothing(time_series, trend='add').fit(smoothing_level=1)
    residuals = time_series - fitted_model.fittedvalues
    return residuals

def get_feature_SimpleES_residuals(time_series):
    """Return the in-sample residuals of a simple exponential smoothing fit.

    The model is fitted with ``smoothing_level`` fixed to 1 and the result is
    ``time_series`` minus the model's fitted values.
    """
    fitted_model = SimpleExpSmoothing(time_series).fit(smoothing_level=1)
    residuals = time_series - fitted_model.fittedvalues
    return residuals

def get_feature_Holt_residuals(time_series):
    """Return the in-sample residuals of a Holt (linear trend) smoothing fit.

    The model is fitted with ``smoothing_level`` fixed to 1 and the result is
    ``time_series`` minus the model's fitted values.
    """
    fitted_model = Holt(time_series).fit(smoothing_level=1)
    residuals = time_series - fitted_model.fittedvalues
    return residuals

In [None]:
def new_get_timeseries_features(time_series, time_series_label, Windows, delay):
    """Build per-point feature vectors and labels for one labelled time series.

    The first ``2*max(Windows) - 1`` points are skipped so that, for every window
    size ``k``, both the window ending at the current point and the window of the
    same size immediately before it lie inside the series.

    Each feature vector contains, in order:

    * the residuals at the point of the three exponential-smoothing fits
      (additive-trend, simple, Holt);
    * the raw value, the first difference, the first difference relative to the
      previous value, the second difference, and the deviation of the value from
      the running mean of the series up to and including the point;
    * for every ``k`` in ``Windows``, 12 statistics: mean and variance of the last
      ``k`` points, their change (absolute and relative) versus the window shifted
      back by one point, their change (absolute and relative) versus the preceding
      window of ``k`` points, and the deviation of the current value from the
      shifted-window mean and from the preceding-window mean.

    Parameters
    ----------
    time_series : array-like of numbers, indexable by integer position and slice.
    time_series_label : array-like of 0/1 labels aligned with ``time_series``.
    Windows : iterable of positive integer window sizes.
    delay : int
        A point labelled 1 is flagged "vital" when the ``delay + 1`` labels
        immediately preceding it are not all 1.

    Returns
    -------
    (data, data_label, data_label_vital) : three lists of equal length holding,
    per retained point, the feature vector (1-D ``ndarray``), its label and its
    vital flag (both 0-d ``ndarray``).
    """
    data = []
    data_label = []
    data_label_vital = []

    # first index for which the largest window and its predecessor both fit
    start_point = 2*max(Windows) - 1
    # running sum of the series; extended by one value per loop iteration
    start_accum = sum(time_series[0:start_point])

    # residual features from the smoothing models, computed once for the whole series
    time_series_AddES_residuals = get_feature_AddES_residuals(time_series)
    time_series_SimpleES_residuals = get_feature_SimpleES_residuals(time_series)
    time_Series_Holt_residuals = get_feature_Holt_residuals(time_series)

    for i in np.arange(start_point, len(time_series)):
        datum = []
        datum_label = time_series_label[i]

        diff_plain = time_series[i] - time_series[i-1]
        start_accum = start_accum + time_series[i]
        # mean of time_series[0..i]
        mean_accum = (start_accum)/(i+1)

        # smoothing residuals
        datum.append(time_series_AddES_residuals[i])
        datum.append(time_series_SimpleES_residuals[i])
        datum.append(time_Series_Holt_residuals[i])
        # raw value
        datum.append(time_series[i])
        # first difference
        datum.append(diff_plain)
        # relative first difference (1e-8 guards against division by zero)
        datum.append(diff_plain/(time_series[i-1] + 1e-8))
        # second difference
        datum.append(diff_plain - (time_series[i-1] - time_series[i-2]))
        # deviation from the running mean
        datum.append(time_series[i] - mean_accum)

        for k in Windows:
            # window of the last k points, ending at i
            mean_w = np.mean(time_series[i-k+1:i+1])
            var_w = np.mean((np.asarray(time_series[i-k+1:i+1]) - mean_w)**2)

            # same-size window shifted back by one point (ends at i-1); its mean is
            # derived from mean_w by swapping the newest value for the one that drops in
            mean_w_and_1 = mean_w + (time_series[i-k]-time_series[i])/k
            var_w_and_1 = np.mean((np.asarray(time_series[i-k:i]) - mean_w_and_1)**2)

            # the k points immediately preceding the current window
            mean_2w = np.mean(time_series[i-2*k+1:i-k+1])
            var_2w = np.mean((np.asarray(time_series[i-2*k+1:i-k+1]) - mean_2w)**2)

            # change versus the shifted (sliding) window
            diff_mean_1 = mean_w - mean_w_and_1
            diff_var_1 = var_w - var_w_and_1

            # change versus the preceding (jumping) window
            diff_mean_w = mean_w - mean_2w
            diff_var_w = var_w - var_2w

            datum.append(mean_w)
            datum.append(var_w)
            datum.append(diff_mean_1)
            datum.append(diff_mean_1/(mean_w_and_1 + 1e-8))
            datum.append(diff_var_1)
            datum.append(diff_var_1/(var_w_and_1 + 1e-8))
            datum.append(diff_mean_w)
            datum.append(diff_mean_w/(mean_2w + 1e-8))
            datum.append(diff_var_w)
            datum.append(diff_var_w/(var_2w + 1e-8))

            # deviation of the current value from the sliding / jumping window means
            datum.append(time_series[i] - mean_w_and_1)
            datum.append(time_series[i] - mean_2w)

        data.append(np.asarray(datum))
        data_label.append(np.asarray(datum_label))

        # The previous check summed only `delay` labels and compared the result with
        # `delay + 1`, which can never fail, so every anomaly was flagged vital.
        # Look at the delay+1 preceding labels instead: the flag is set only while
        # the anomaly run leading up to this point is at most `delay` points long.
        preceding_labels = time_series_label[max(i - delay - 1, 0):i]
        if datum_label == 1 and sum(preceding_labels) < delay + 1:
            data_label_vital.append(np.asarray(1))
        else:
            data_label_vital.append(np.asarray(0))

    return data, data_label, data_label_vital

In [None]:
from sklearn.preprocessing import StandardScaler
import random

In [None]:
# Window sizes used for the sliding/jumping window statistics.
W = np.asarray([2, 5, 10, 25, 50, 100, 200, 300, 400, 500])
# Passed to new_get_timeseries_features to decide which anomaly points are "vital".
delay = 7
scaler_list_new = []
timeseries_features_new = []
timeseries_features_label_new = []
timeseries_features_label_vital_new = []

# Scaled feature matrices are collected per series and concatenated once after the loop;
# concatenating inside the loop would re-copy all previously accumulated rows each time.
feature_chunks = []
n_feature_rows = 0

for i in range(len(timeseries_all)):
    # progress: series index, series length, and how much has been accumulated so far
    print(i,len(timeseries_all[i]),len(scaler_list_new),n_feature_rows,len(timeseries_features_label_new),
                len(timeseries_features_label_vital_new))
    features_temp,label_temp,label_vital_temp = new_get_timeseries_features(timeseries_all[i], timeseries_label[i], W, delay)
    assert(len(features_temp)==len(label_temp))
    assert(len(label_temp) == len(label_vital_temp))
    # every series gets its own scaler; it is kept so the matching test series can reuse it
    scaler_temp = StandardScaler()
    features_temp = scaler_temp.fit_transform(features_temp)
    scaler_list_new.append(scaler_temp)
    feature_chunks.append(features_temp)
    n_feature_rows += len(features_temp)

    timeseries_features_label_new.extend(label_temp)
    timeseries_features_label_vital_new.extend(label_vital_temp)

if feature_chunks:
    timeseries_features_new = np.concatenate(feature_chunks, axis = 0)





In [None]:
def new_get_test_features(time_series, Windows):
    """Build per-point feature vectors for one unlabelled time series.

    This used to be a line-for-line copy of the feature code in
    ``new_get_timeseries_features``. It now delegates to that function, so the
    training and test features are produced by one implementation and cannot
    drift apart.

    Parameters
    ----------
    time_series : array-like of numbers, indexable by integer position and slice.
    Windows : iterable of positive integer window sizes.

    Returns
    -------
    list of 1-D ``ndarray``: one feature vector per point from index
    ``2*max(Windows) - 1`` onwards, identical in layout to the training features.
    """
    # The shared extractor needs a label array; all-zero placeholders are passed and
    # the two label outputs are discarded. The feature values do not depend on them.
    placeholder_labels = np.zeros(len(time_series), dtype=int)
    data, _unused_labels, _unused_vital = new_get_timeseries_features(
        time_series, placeholder_labels, Windows, 0)
    return data

In [None]:
# Scalers were fitted per training series. Index them by the KPI ID of that series, so
# each test series is scaled with the scaler of the *same* KPI even if the KPIs appear
# in a different order in train.csv and test.csv. Positional lookup silently applied the
# wrong scaler in that case.
scaler_by_kpi = {
    train_data['KPI ID'].iloc[id_indexes[j]]: scaler_list_new[j]
    for j in range(len(scaler_list_new))
}

testseries_features_new = []
# Collect the scaled matrices and concatenate once after the loop (avoids quadratic copying).
test_feature_chunks = []
n_test_feature_rows = 0
for i in range(len(testseries_all)):
    # progress: series index, series length, rows accumulated so far
    print(i, len(testseries_all[i]), n_test_feature_rows)
    kpi_id = test_data['KPI ID'].iloc[test_id_indexes[i]]
    if kpi_id not in scaler_by_kpi:
        raise KeyError('no scaler fitted for test KPI ID %r (KPI absent from training data)' % (kpi_id,))
    features_temp = new_get_test_features(testseries_all[i], W)
    features_temp = scaler_by_kpi[kpi_id].transform(features_temp)
    test_feature_chunks.append(features_temp)
    n_test_feature_rows += len(features_temp)

if test_feature_chunks:
    testseries_features_new = np.concatenate(test_feature_chunks, axis = 0)


In [None]:
# Turn the accumulated label lists into NumPy arrays so they support element-wise arithmetic.
timeseries_features_label_new, timeseries_features_label_vital_new = (
    np.array(timeseries_features_label_new),
    np.array(timeseries_features_label_vital_new),
)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Seed NumPy's global random number generator.
# NOTE(review): this may not seed the Keras backend or XGBoost; confirm whether they need
# their own seeds for the training runs below to be reproducible.
np.random.seed(5)

In [None]:
# Test rows that have no feature vector (the warm-up points skipped at the start of each
# series), in total and averaged per series. The per-series figure is used later to pad
# the predictions back to full length.
data_features_diff = len(test_data) - len(testseries_features_new)
data_features_diff_avg = int(data_features_diff / len(testseries_all))
print(data_features_diff, data_features_diff_avg, sep='\n')

In [None]:
# `vital_label` was referenced here (and in the XGBoost weighting cell) without ever being
# defined, which raises NameError on a fresh kernel. Bind it to the per-point vital flags.
vital_label = np.asarray(timeseries_features_label_vital_new)

n_anomaly = np.sum(timeseries_features_label_new)
n_vital = np.sum(vital_label)
if n_vital == 0:
    raise ValueError('no vital anomaly points in the training labels; cannot derive sample weights')

# number of non-anomalous training points
sum_non = len(timeseries_features_label_new) - n_anomaly
# weight multiplier applied to vital points
vital_ratio = round((sum_non - n_anomaly + n_vital)/n_vital)
# per-sample weight: 1 for ordinary points, vital_ratio + 1 for vital points
sample_ratio_new = vital_ratio * vital_label + 1

In [None]:
# Input width is taken from the feature matrix rather than hard-coded, so changing the
# window list W (which changes the number of features) does not break the model.
n_input_features = timeseries_features_new.shape[1]

# Fully connected binary classifier: two hidden blocks (Dense -> BatchNorm -> ReLU ->
# Dropout) followed by a single sigmoid output unit.
m = Sequential()
m.add(Dense(128, input_dim = n_input_features))
m.add(BatchNormalization())
m.add(Activation('relu'))
m.add(Dropout(0.5))

m.add(Dense(64))
m.add(BatchNormalization())
m.add(Activation('relu'))
m.add(Dropout(0.5))

m.add(Dense(1))
m.add(BatchNormalization())
m.add(Activation('sigmoid'))


m.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# Train on all series at once, weighting each sample with sample_ratio_new.
h = m.fit(timeseries_features_new, timeseries_features_label_new, epochs=30, batch_size=5000, verbose=1,
               sample_weight=sample_ratio_new)

In [None]:
# Score the training features with the fitted network; p holds the sigmoid outputs.
p = m.predict(timeseries_features_new, batch_size=5000,verbose=1)

In [None]:
# Binarise the network outputs at 0.96, then report the fraction of points flagged and
# the precision / recall / F1 against the training labels.
train_data_check = (p > 0.96).ravel().astype(int)
print(train_data_check.sum() / len(train_data_check))
for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(timeseries_features_label_new, train_data_check))

In [None]:
# Re-insert the warm-up points that have no feature vector: each series contributes
# data_features_diff_avg zeros followed by its block of thresholded predictions.
warmup_zeros = np.zeros(data_features_diff_avg)
pieces = []
assembled_len = 0
last_index = 0
next_index = 0
for series in timeseries_all:
    next_index += len(series) - data_features_diff_avg
    series_flags = train_data_check[last_index : next_index]
    pieces.extend((warmup_zeros, series_flags))
    assembled_len += len(warmup_zeros) + len(series_flags)
    print(assembled_len,next_index)
    last_index = next_index
if pieces:
    evaluation_new = np.concatenate(pieces)
else:
    evaluation_new = np.zeros(data_features_diff_avg).astype(int)
print(len(evaluation_new))
assert(len(evaluation_new) == len(train_data))
evaluation_new = evaluation_new.astype(int)

# One prediction per training row, written in the layout read by evaluation.py.
evaluation_columns = {'KPI ID': train_data['KPI ID'],
                      'timestamp': train_data['timestamp'],
                      'predict': evaluation_new}
evaluation_df = pd.DataFrame(evaluation_columns)
evaluation_df.to_csv('evaluation.csv', index=False)

In [None]:
# Run the external evaluation script on the training file and the predictions written to
# evaluation.csv. The trailing 7 is passed as its third argument
# (presumably the allowed detection delay - confirm against evaluation.py).
!python evaluation.py "../../input/train.csv" "evaluation.csv" 7

In [None]:
# Score the test features with the fitted network; pm_t holds the sigmoid outputs.
pm_t = m.predict(testseries_features_new,batch_size=5000,verbose=1)

In [None]:
# Binarise the network's test outputs at the same 0.96 threshold used on the training
# set, then show the flags and the fraction of test points flagged.
predict_flagm = (pm_t.ravel() > 0.96).astype(int)
print(predict_flagm)
print(predict_flagm.sum() / len(predict_flagm))

In [None]:
# Re-insert the warm-up points that have no feature vector: each test series contributes
# data_features_diff_avg zeros followed by its block of DNN flags.
warmup_zeros = np.zeros(data_features_diff_avg)
pieces = []
last_index = 0
next_index = 0
for series in testseries_all:
    next_index += len(series) - data_features_diff_avg
    pieces.extend((warmup_zeros, predict_flagm[last_index : next_index]))
    print(next_index)
    last_index = next_index
if pieces:
    predict_new = np.concatenate(pieces)
else:
    predict_new = np.zeros(data_features_diff_avg).astype(int)
print(len(predict_new))
assert(len(predict_new) == len(test_data))
predict_new = predict_new.astype(int)

# One DNN prediction per test row.
predict_columns = {'KPI ID': test_data['KPI ID'],
                   'timestamp': test_data['timestamp'],
                   'predict': predict_new}
predict_df = pd.DataFrame(predict_columns)
predict_df.to_csv('predictDNN.csv', index=False)

In [None]:
import xgboost as xgb

In [None]:
# Baseline XGBoost classifier, trained below on the full feature set.
xgb_model = xgb.XGBClassifier(n_jobs=10, verbosity=2)

In [None]:
# Base weight for anomalous points: 5% of the normal-to-anomalous count ratio.
ratio = round((len(timeseries_features_label_new) - sum(timeseries_features_label_new)) * 0.05 / sum(timeseries_features_label_new))
print(ratio)
# 1 for normal points, 0 for anomalous ones
non_anomaly = np.ones(len(timeseries_features_label_new)) - timeseries_features_label_new
print(non_anomaly,non_anomaly.shape)
# This cell referenced an undefined name `vital_label` (NameError on a fresh kernel);
# the per-point vital flags are used directly instead.
# Normal points get weight 1; vital points get an extra 239*ratio.
xgb_sample_ratio = (239*ratio) * np.asarray(timeseries_features_label_vital_new) + non_anomaly
print(xgb_sample_ratio,sum(xgb_sample_ratio))
# Every anomalous point (vital or not) additionally gets `ratio`.
xgb_sample_ratio = xgb_sample_ratio + ratio * timeseries_features_label_new
print(xgb_sample_ratio,sum(xgb_sample_ratio))

In [None]:
# Train the baseline XGBoost classifier on every training feature, weighting samples with xgb_sample_ratio.
xgb_model.fit(timeseries_features_new, timeseries_features_label_new, sample_weight = xgb_sample_ratio, verbose = True)

In [None]:
# Per-feature importance scores reported by the fitted baseline XGBoost model.
importance = xgb_model.feature_importances_
print(importance)

In [None]:
# Walk the features from most to least important, keep those whose importance exceeds
# 0.001, and print the running total of the kept importances.
sum1 = 0
sort_index = np.argsort(importance)
importance_index = []
for feature_idx in sort_index[::-1]:
    feature_weight = importance[feature_idx]
    if feature_weight > 0.001:
        importance_index.append(feature_idx)
        sum1 += feature_weight
        print(feature_idx,feature_weight,sum1)
# Restore column order so the selected indexes can be used to slice the feature matrices.
importance_index.sort()
print(importance_index, len(importance_index))

In [None]:
# Class-probability predictions of the baseline XGBoost model for the test features.
p_t = xgb_model.predict_proba(testseries_features_new)

In [None]:
# Second XGBoost classifier (max_depth=5), trained below on the selected feature subset only.
xgb_model_test = xgb.XGBClassifier(n_jobs=10, verbosity=2, max_depth=5)

In [None]:
# Train the second classifier on only the columns listed in importance_index, weighted by
# sample_ratio_new (the same weights used for the Keras model).
xgb_model_test.fit(timeseries_features_new[:,importance_index], timeseries_features_label_new, sample_weight = sample_ratio_new, verbose = True)

In [None]:
# Class-probability predictions of the second classifier for the same feature subset of the test data.
p_test_test = xgb_model_test.predict_proba(testseries_features_new[:,importance_index])

In [None]:
# Threshold each model's output separately: baseline XGBoost, the DNN, and the
# feature-subset XGBoost.
predict_t1 = p_t[:, 1:].ravel() > 0.98
predict_t2 = pm_t.ravel() > 0.96
predict_t3 = p_test_test[:, 1:].ravel() > 0.9325

# A point is flagged when any of the three models flags it.
predict_xg = np.logical_or(predict_t1, predict_t3).astype(int)
predict_flag = np.logical_or(predict_t2, predict_xg).astype(int)

# Fraction flagged by the DNN and by the XGBoost pair, then the combined flags and their fraction.
print(predict_t2.sum() / len(predict_t2), predict_xg.sum() / len(predict_xg))
print(predict_flag)
print(predict_flag.sum() / len(predict_flag))

In [None]:
# Re-insert the warm-up points that have no feature vector: each test series contributes
# data_features_diff_avg zeros followed by its block of ensemble flags.
warmup_zeros = np.zeros(data_features_diff_avg)
pieces = []
last_index = 0
next_index = 0
for series in testseries_all:
    next_index += len(series) - data_features_diff_avg
    pieces.extend((warmup_zeros, predict_flag[last_index : next_index]))
    print(next_index)
    last_index = next_index
if pieces:
    predict_new = np.concatenate(pieces)
else:
    predict_new = np.zeros(data_features_diff_avg).astype(int)
print(len(predict_new))
assert(len(predict_new) == len(test_data))
predict_new = predict_new.astype(int)

# One ensemble prediction per test row.
predict_columns = {'KPI ID': test_data['KPI ID'],
                   'timestamp': test_data['timestamp'],
                   'predict': predict_new}
predict_df = pd.DataFrame(predict_columns)
predict_df.to_csv('predict.csv', index=False)