<a href="https://colab.research.google.com/github/xohyun/TimeSeries-Anomaly-Detection-Dataset/blob/master/Data_preprocessing%2Bdetail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Implementation of preprocessing.
References
    - https://github.com/imperial-qore/TranAD/blob/main/preprocess.py
"""

'\nImplementation of preprocessing.\nReferences\n    - https://github.com/imperial-qore/TranAD/blob/main/preprocess.py\n'

## Drive mount

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import

In [3]:
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import json
import datetime

In [4]:
def create_folder(directory):
    '''
    Create folder

    Parameters
    ----------
    directory : path and directory that you want to create
    
    Returns
    -------
    None
    '''
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory ' +  directory)

def prepare_data(dataset, *args, **kwargs):
    '''
    Preprocessing function + save dataset as numpy file

    Parameters
    ----------
    dataset : dataset that you want to preprocess (WADI, MSL, SMAP, NAB, SMD)
    
    Returns
    -------
    None
    '''
    if dataset == 'WADI':
        dataset_folder = kwargs['dataset_folder']
        output_folder = '/content/data/WADI'
        create_folder(output_folder)

        train = pd.read_csv(os.path.join(dataset_folder, 'WADI_14days_new.csv'))
        test = pd.read_csv(os.path.join(dataset_folder, 'WADI_attackdataLABLE.csv'), header=0, low_memory=False, index_col=0)
        test = test.rename(columns=test.iloc[0]).iloc[1:].reset_index() # replace first row with column name

        label = test[['Attack LABLE (1:No Attack, -1:Attack)']]
        test = test.drop(['Attack LABLE (1:No Attack, -1:Attack)'], axis='columns')

        row = test[test.isna().all(axis=1)].index.tolist() # find the index of the row in which all columns are NA
        label = label.drop(row, axis=0).replace(['-1', '1'], [1, 0]) # 1 : anomaly

        train = train.dropna(how='all', axis=0) #inplace=True
        test = test.dropna(how='all', axis=0) #inplace=True

        col = train.columns[train.isna().any()] # find the index of the columns in which rows are NA
        train = train.dropna(axis=1)
        test = test.drop(col, axis=1)

        print(f"train shape:{train.shape}, test shape :{test.shape}, label shape:{label.shape}")

        #---# To save train data #---#
        for i in set(train['Date']):
            if len(train[train['Date'] == i]) % 60 != 0:
                print(f"Exclude {i} data")
                continue
            df_train = train[train['Date'] == i]
            
            start = pd.to_datetime(df_train.iloc[0]['Date'] + ' ' + df_train.iloc[0]['Time'])    
            date_range = pd.date_range(start = start, periods=len(df_train), freq='s')
            df_train = df_train.drop(['Date', 'Time'], axis = 1)
            df_train.insert(1, 'Time', date_range)
            
            #---# MinMaxScaler #---#
            values = df_train.iloc[:, 2:] # Exclude the time and index columns
            scaler = MinMaxScaler()
            values = scaler.fit_transform(values)
            df_train.iloc[:, 2:] = values

            date = i.replace('/', '_')
            np.save(os.path.join(output_folder, f"{date}_train.npy"), df_train.to_numpy())

        #---# To save test data, label data #---#
        for i in set(test['Date ']):
            df_test = test[test['Date '] == i]
            df_label = label[test['Date '] == i]
            
            start = pd.to_datetime(df_test.iloc[0]['Date '] + ' ' + df_test.iloc[0]['Time'])
            # end = start + datetime.timedelta(hours=int(len(df_test) / (60*60)))
            # pd.date_range(start = start, end = end, freq='1s')
            
            date_range = pd.date_range(start = start, periods=len(df_test), freq='s')
            df_test = df_test.drop(['Date ', 'Time'], axis = 1)
            df_test.insert(1, 'Time', date_range)

            #---# MinMaxScaler #---#
            values = df_test.iloc[:, 2:] # Exclude the time and index columns
            scaler = MinMaxScaler()
            values = scaler.fit_transform(values)
            df_test.iloc[:, 2:] = values

            date = i.replace('/', '_')
            np.save(os.path.join(output_folder, f"{date}_test.npy"), df_test.to_numpy())
            np.save(os.path.join(output_folder, f"{date}_labels.npy"), df_label.to_numpy())

    elif dataset == 'MSL' or dataset == 'SMAP':
        # choose_data = ['A-4', 'C-2', 'T-1']

        dataset_folder = '/content/original_data/SMAP_MSL'
        output_folder = os.path.join('/content/data', dataset)
        create_folder(output_folder)

        file_ = os.path.join(dataset_folder, 'labeled_anomalies.csv')
        values = pd.read_csv(file_)

        values = values[values['spacecraft'] == dataset]
        filenames = values['chan_id'].values.tolist()    

        for fn in filenames:
            # if fn not in choose_data:
            #     continue
            train = np.load(f'{dataset_folder}/train/{fn}.npy')
            test = np.load(f'{dataset_folder}/test/{fn}.npy')

            #---# MinMaxScaler #---#
            scaler = MinMaxScaler()
            train = scaler.fit_transform(train)
            test = scaler.transform(test)
            
            #---# save train.npy and test.npy #---#
            np.save(f'{output_folder}/{fn}_train.npy', train)
            np.save(f'{output_folder}/{fn}_test.npy', test)

            #---# save labels.npy #---#
            labels = np.zeros(test.shape)
            indices = values[values['chan_id'] == fn]['anomaly_sequences'].values[0]
            indices = indices.replace(']', '').replace('[', '').split(', ')
            indices = [int(i) for i in indices]
            for i in range(0, len(indices), 2):
                labels[indices[i]:indices[i+1], :] = 1
            np.save(f'{output_folder}/{fn}_labels.npy', labels)
  
    elif dataset == 'NAB':
        dataset_folder = 'original_data/NAB/realKnownCause'
        label_folder = 'original_data/NAB/labels'
        output_folder = 'data/NAB'
        create_folder(output_folder)

        file_list = os.listdir(dataset_folder)

        with open(label_folder + '/combined_windows.json') as f:
            labeldict = json.load(f)

        for filename in file_list:
            if not filename.endswith('.csv'): continue
            df = pd.read_csv(dataset_folder+'/'+filename)
        
            print(f"{filename} shape {df.shape}")
            values = df.values[:,1]
            
            #---# MinMaxScaler #---#
            scaler = MinMaxScaler()
            values = scaler.fit_transform(values.reshape(-1,1))

            #---# Label #---#
            labels = np.zeros_like(values, dtype=np.float64)
            for timestamp in labeldict['realKnownCause/'+filename]:
                tstamp = timestamp[0].replace('.000000', '')
                start_index = np.where(((df['timestamp'] == tstamp).values + 0) == 1)[0][0]
                tstamp = timestamp[1].replace('.000000', '')
                end_index = np.where(((df['timestamp'] == tstamp).values + 0) == 1)[0][0]
                labels[start_index : end_index] = 1

            #---# Split train npy and test npy #---#
            train, test = values, values
            train, test, labels = train.reshape(-1, 1), test.reshape(-1, 1), labels.reshape(-1, 1)
            
            #---# Save file #---#
            fn = filename.replace('.csv', '')
            for file in ['train', 'test', 'labels']:
                np.save(os.path.join(output_folder, f'{fn}_{file}.npy'), eval(file))

    elif dataset == 'SMD':
        dataset_folder = '/content/original_data/SMD'
        output_folder = '/content/data/SMD'
        create_folder(output_folder)

        file_list = os.listdir(os.path.join(dataset_folder, "train"))
        for filename in file_list:
            if filename.endswith('.txt'):
                #---# train #---#
                values_train = np.genfromtxt(os.path.join(dataset_folder, 'train', filename), delimiter=',')
                scaler = MinMaxScaler()
                values_train_scale = scaler.fit_transform(values_train)
                np.save(os.path.join(output_folder, f"train_{filename}.npy"), values_train_scale)
                
                #---# test #---#
                values_test = np.genfromtxt(os.path.join(dataset_folder, 'test', filename), delimiter=',')
                scaler = MinMaxScaler()
                values_test_scale = scaler.fit_transform(values_train)
                np.save(os.path.join(output_folder, f"test_{filename}.npy"), values_test_scale)

                #---# label #---#
                values_label = np.genfromtxt(os.path.join(dataset_folder, 'test_label', filename), delimiter=',')
                np.save(os.path.join(output_folder, f"label_{filename}.npy"), values_label)

                #---# interpretation_label #---#
                temp = np.zeros(values_test.shape)
                with open(os.path.join(dataset_folder, 'interpretation_label', filename), "r") as f:
                    ls = f.readlines()
                    for line in ls:
                        pos, value = line.split(':')[0], line.split(':')[1].split(',')
                        start, end, inx = int(pos.split('-')[0]), int(pos.split('-')[1]), [int(i)-1 for i in value]
                        temp[start-1:end-1, inx] = 1
                        np.save(os.path.join(output_folder, f"label_{filename}_interpret.npy"), temp)

    else:
        print("Check the dataset!!")

## SMAP & MSL

In [5]:
!wget https://s3-us-west-2.amazonaws.com/telemanom/data.zip && unzip data.zip && rm data.zip
!cd data && wget https://raw.githubusercontent.com/khundman/telemanom/master/labeled_anomalies.csv
!mkdir original_data
os.rename('/content/data', '/content/SMAP_MSL')
shutil.move('/content/SMAP_MSL', '/content/original_data/SMAP_MSL')

dataset = 'SMAP'
prepare_data(dataset)

dataset = 'MSL'
prepare_data(dataset)

--2023-01-09 08:10:50--  https://s3-us-west-2.amazonaws.com/telemanom/data.zip
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.92.149.64, 52.218.243.24, 52.92.195.32, ...
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.92.149.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85899803 (82M) [application/zip]
Saving to: ‘data.zip’


2023-01-09 08:10:54 (27.3 MB/s) - ‘data.zip’ saved [85899803/85899803]

Archive:  data.zip
   creating: data/2018-05-19_15.00.10/
   creating: data/2018-05-19_15.00.10/models/
  inflating: data/2018-05-19_15.00.10/models/A-1.h5  
  inflating: data/2018-05-19_15.00.10/models/A-2.h5  
  inflating: data/2018-05-19_15.00.10/models/A-3.h5  
  inflating: data/2018-05-19_15.00.10/models/A-4.h5  
  inflating: data/2018-05-19_15.00.10/models/A-5.h5  
  inflating: data/2018-05-19_15.00.10/models/A-6.h5  
  inflating: data/2018-05-19_15.00.10/models/A-7.h5  
  inflating: data/2018-05-19_15.00.10

In [6]:
def counting_data(path, list_, data_detail):
    sum = 0
    for i in list_:
        np_file = np.load(path + i)
        sum += np_file.shape[0]
    print(f"{data_detail} : {sum}")
    print(f"Number of columns : {np_file.shape[1]}")
    return sum

def counting_label(path, label_list, data_detail):
    label_sum = 0
    for i in label_list:
        np_label = np.load(path + i)
        sums = np.sum(np_label, axis=0)
        label_sum += sums[0]
    print(f"{data_detail} : {label_sum}")
    return label_sum

def counting_smd(path, list_, data_detail):
    sum = 0
    for i in list_:
        np_file = np.load(path + i)
        sum += np.sum(np_file.shape[0])
    print(f"{data_detail} : {sum}")
    print(f"Number of columns : {np_file.shape[1]}")
    return sum

def counting_smd_label(path, list_, data_detail):
    sum = 0
    for i in list_:
        np_file = np.load(path + i)
        sum += np.sum(np_file)
    print(f"{data_detail} : {sum}")
    return sum

def counting_wadi(path, list_, data_detail):
    sum = 0 
    for i in list_:
        np_file = np.load(path + i, allow_pickle=True)
        sum += np.sum(np_file.shape[0])
    print(f"{data_detail} : {sum}")
    print(f"Number of columns : {np_file.shape[1]}")
    return sum

def counting_wadi_label(path, list_, data_detail):
    sum = 0 
    for i in list_:
        np_file = np.load(path + i, allow_pickle=True).squeeze()
        sum += np.sum(np_file)
    print(f"{data_detail} : {sum}")
    return sum

In [7]:
#-------------#
#---# MSL #---#
#-------------#
MSL_list = os.listdir("/content/data/MSL")
train_list = [i for i in MSL_list if "train" in i]
test_list = [i for i in MSL_list if "test" in i]
label_list = [i for i in MSL_list if "labels" in i]

train_num = counting_data("/content/data/MSL/", train_list, "Total number of MSL train data")
test_num = counting_data("/content/data/MSL/", test_list, "Total number of MSL test data")
label_num = counting_label("/content/data/MSL/", label_list, "Total number of MSL anomaly data")

print(f"Percentage of anomaly", label_num/test_num)

Total number of MSL train data : 58317
Number of columns : 55
Total number of MSL test data : 73729
Number of columns : 55
Total number of MSL anomaly data : 7730.0
Percentage of anomaly 0.10484341303964519


In [8]:
#--------------#
#---# SMAP #---#
#--------------#
smap_list = os.listdir("/content/data/SMAP")
train_list = [i for i in smap_list if "train" in i]
test_list = [i for i in smap_list if "test" in i]
label_list = [i for i in smap_list if "labels" in i]

train_num = counting_data("/content/data/SMAP/", train_list, "Total number of SMAP train data")
test_num = counting_data("/content/data/SMAP/", test_list, "Total number of SMAP test data")
label_num = counting_label("/content/data/SMAP/", label_list, "Total number of SMAP anomaly data")

print(f"Percentage of anomaly", label_num/test_num)

Total number of SMAP train data : 138004
Number of columns : 25
Total number of SMAP test data : 435826
Number of columns : 25
Total number of SMAP anomaly data : 55854.0
Percentage of anomaly 0.12815664967211685


In [9]:
label_csv = pd.read_csv("/content/original_data/SMAP_MSL/labeled_anomalies.csv")

label_csv[label_csv.chan_id == 'T-10'] # no data

Unnamed: 0,chan_id,spacecraft,anomaly_sequences,class,num_values


In [10]:
label_csv[label_csv.chan_id == 'P-2'] # Duplication of data

Unnamed: 0,chan_id,spacecraft,anomaly_sequences,class,num_values
17,P-2,SMAP,"[[5350, 6575]]",[point],8209
51,P-2,SMAP,"[[5300, 6420]]",[point],8209


## NAB

In [11]:
dataset = 'NAB'
!git clone https://github.com/numenta/NAB.git
shutil.move('/content/NAB/data', '/content/original_data/NAB')
shutil.move('/content/NAB/labels', '/content/original_data/NAB')
shutil.rmtree('/content/NAB')
prepare_data(dataset)

Cloning into 'NAB'...
remote: Enumerating objects: 7029, done.[K
remote: Counting objects: 100% (114/114), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 7029 (delta 48), reused 38 (delta 15), pack-reused 6915[K
Receiving objects: 100% (7029/7029), 86.75 MiB | 22.58 MiB/s, done.
Resolving deltas: 100% (4922/4922), done.
Checking out files: 100% (1186/1186), done.
ec2_request_latency_system_failure.csv shape (4032, 2)
nyc_taxi.csv shape (10320, 2)
cpu_utilization_asg_misconfiguration.csv shape (18050, 2)
rogue_agent_key_updown.csv shape (5315, 2)
rogue_agent_key_hold.csv shape (1882, 2)
ambient_temperature_system_failure.csv shape (7267, 2)
machine_temperature_system_failure.csv shape (22695, 2)


In [12]:
nab_list = os.listdir("/content/data/NAB")
train_list = [i for i in nab_list if 'train' in i]
test_list = [i for i in nab_list if 'test' in i]
label_list = [i for i in nab_list if 'labels' in i]

train_num = counting_data("/content/data/NAB/", train_list, "Total number of NAB train data")
test_num = counting_data("/content/data/NAB/", test_list, "Total number of NAB test data")
label_num = counting_label("/content/data/NAB/", label_list, "Total number of NAB anomaly data")

print(f"Percentage of anomaly", label_num/test_num)

Total number of NAB train data : 69561
Number of columns : 1
Total number of NAB test data : 69561
Number of columns : 1
Total number of NAB anomaly data : 6575.0
Percentage of anomaly 0.09452135535716853


## SMD

In [13]:
dataset = 'SMD'
!git clone https://github.com/NetManAIOps/OmniAnomaly.git
shutil.move('/content/OmniAnomaly/ServerMachineDataset', '/content/original_data/SMD')
shutil.rmtree('/content/OmniAnomaly')
prepare_data(dataset)

Cloning into 'OmniAnomaly'...
remote: Enumerating objects: 204, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 204 (delta 2), reused 0 (delta 0), pack-reused 198[K
Receiving objects: 100% (204/204), 107.11 MiB | 22.08 MiB/s, done.
Resolving deltas: 100% (80/80), done.
Checking out files: 100% (132/132), done.


In [14]:
smd_list = os.listdir("/content/data/SMD/")
train_list = [i for i in smd_list if 'train' in i]
label_list = [i for i in smd_list if ('label' in i) and ('interpret' not in i) ]
test_list = [i for i in smd_list if 'test' in i]


train_num = counting_smd("/content/data/SMD/", train_list, "Total number of SMD train data")
test_num = counting_smd("/content/data/SMD/", test_list, "Total number of SMD test data")
label_num = counting_smd_label("/content/data/SMD/", label_list, "Total number of SMD anomaly data")

print(f"Percentage of anomaly", label_num/test_num)

Total number of SMD train data : 708405
Number of columns : 38
Total number of SMD test data : 708405
Number of columns : 38
Total number of SMD anomaly data : 29444.0
Percentage of anomaly 0.04156379472194578


##WADI


In [15]:
dataset = 'WADI'
# you can download https://itrust.sutd.edu.sg/itrust-labs_datasets/.
# Download the data and upload it to Google Drive. You can put the path there here.
dataset_folder = '/content/drive/MyDrive/WADI/WADI.A2_19 Nov 2019' ## your path!
prepare_data(dataset, dataset_folder=dataset_folder)

train shape:(784571, 122), test shape :(172801, 122), label shape:(172801, 1)
Exclude 9/29/2017 data
Exclude 10/2/17 data
Exclude 10/7/17 data


In [16]:
pd.read_csv("/content/drive/MyDrive/WADI/WADI.A2_19 Nov 2019/WADI_14days_new.csv")

Unnamed: 0,Row,Date,Time,1_AIT_001_PV,1_AIT_002_PV,1_AIT_003_PV,1_AIT_004_PV,1_AIT_005_PV,1_FIT_001_PV,1_LS_001_AL,...,3_MV_001_STATUS,3_MV_002_STATUS,3_MV_003_STATUS,3_P_001_STATUS,3_P_002_STATUS,3_P_003_STATUS,3_P_004_STATUS,LEAK_DIFF_PRESSURE,PLANT_START_STOP_LOG,TOTAL_CONS_REQUIRED_FLOW
0,1,9/25/2017,00:00.0,171.155,0.619473,11.5759,504.645,0.318319,0.001157,0,...,1,1,1,1,1,1,1,67.9651,1,0.68
1,2,9/25/2017,00:01.0,171.155,0.619473,11.5759,504.645,0.318319,0.001157,0,...,1,1,1,1,1,1,1,67.9651,1,0.68
2,3,9/25/2017,00:02.0,171.155,0.619473,11.5759,504.645,0.318319,0.001157,0,...,1,1,1,1,1,1,1,67.9651,1,0.68
3,4,9/25/2017,00:03.0,171.155,0.607477,11.5725,504.673,0.318438,0.001207,0,...,1,1,1,1,1,1,1,67.1948,1,0.68
4,5,9/25/2017,00:04.0,171.155,0.607477,11.5725,504.673,0.318438,0.001207,0,...,1,1,1,1,1,1,1,67.1948,1,0.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784566,1048567,10/7/17,16:06.0,175.855,0.589478,11.8941,479.191,0.331571,0.001128,0,...,1,1,1,1,1,1,1,60.6305,1,0.25
784567,1048568,10/7/17,16:07.0,175.855,0.589478,11.8941,479.191,0.331571,0.001128,0,...,1,1,1,1,1,1,1,60.6305,1,0.25
784568,1048569,10/7/17,16:08.0,175.855,0.589478,11.8941,479.191,0.331571,0.001128,0,...,1,1,1,1,1,1,1,60.6305,1,0.25
784569,1048570,10/7/17,16:09.0,175.896,0.613476,11.8913,479.224,0.331622,0.001173,0,...,1,1,1,1,1,1,1,60.4477,1,0.25


In [17]:
smd_list = os.listdir("/content/data/WADI/")
train_list = [i for i in smd_list if 'train' in i]
label_list = [i for i in smd_list if 'labels' in i]
test_list = [i for i in smd_list if 'test' in i]

train_num = counting_wadi("/content/data/WADI/", train_list, "Total number of WADI train data")
test_num = counting_wadi("/content/data/WADI/", test_list, "Total number of WADI test data")
label_num = counting_wadi_label("/content/data/WADI/", label_list, "Total number of WADI anomaly data")

print(f"Percentage of anomaly", label_num/test_num)

Total number of WADI train data : 626400
Number of columns : 121
Total number of WADI test data : 172801
Number of columns : 121
Total number of WADI anomaly data : 9977
Percentage of anomaly 0.05773693439274078
