In [1]:
import numpy as np
import os
import pandas as pd
from glob import glob
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import pickle

In [2]:
import matplotlib.pyplot as plt

# Utility functions

In [3]:
def get_anom_pairs(y):
    """Return the inclusive ``(start, end)`` index pairs of consecutive runs of 1s.

    Parameters
    ----------
    y : array-like
        1-D label sequence; positions whose value equals 1 are anomalous.
        Plain Python lists are accepted as well as NumPy arrays.

    Returns
    -------
    list of tuple of int
        One ``(first_index, last_index)`` tuple per maximal run of
        consecutive anomalous positions, in increasing order. Empty when
        ``y`` contains no 1.
    """
    # np.asarray makes `== 1` element-wise even when a plain list is passed.
    anom_index = np.where(np.asarray(y) == 1)[0]
    if anom_index.size == 0:
        return []

    # A run ends wherever the next anomalous index is not the direct successor.
    # This replaces a per-element `in` scan of the array (quadratic) with one diff.
    run_ends = np.where(np.diff(anom_index) != 1)[0]
    starts = np.concatenate(([anom_index[0]], anom_index[run_ends + 1]))
    ends = np.concatenate((anom_index[run_ends], [anom_index[-1]]))
    return [(int(s), int(e)) for s, e in zip(starts, ends)]


In [4]:
def seqs_split(data, label, abnormal_class, seed=42, downsample_rate=1.0, pure=True):
    """Split labelled sequences into point-wise train/test frames for anomaly detection.

    Parameters
    ----------
    data : sequence of 2-D arrays
        Equally shaped sequences; after stacking, axis 1 is used as the
        sequence length and the last axis as the feature dimension.
    label : sequence
        One class label per sequence in ``data``.
    abnormal_class : container
        Labels treated as anomalous (membership is tested with ``in``).
    seed : int
        Seed for every shuffle; each shuffle uses a fresh
        ``np.random.RandomState(seed)``.
    downsample_rate : float
        Fraction of the (shuffled) abnormal sequences that is kept.
    pure : bool
        If True the training set holds only normal sequences (the first 60%
        of them) and all kept abnormal sequences go to the test set. If False
        both normal and abnormal sequences are split 60/40, so the training
        set is contaminated with anomalies.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        One row per time step with feature columns ``A0..A{dim-1}`` and a
        ``label`` column (0 = normal, 1 = abnormal) repeated over each sequence.
    """
    # split into normal sequences and abnormal sequences
    normal_data = []
    abnormal_data = []
    for d, l in zip(data, label):
        (abnormal_data if l in abnormal_class else normal_data).append(d)
    normal_data = np.array(normal_data)
    abnormal_data = np.array(abnormal_data)

    # An empty group comes out of np.array([]) with shape (0,), which cannot be
    # stacked with (n, seq_len, dim) arrays. Give it the other group's trailing
    # shape so a dataset without any abnormal (or normal) sequence still splits.
    if len(abnormal_data) == 0 and len(normal_data) > 0:
        abnormal_data = np.empty((0,) + normal_data.shape[1:], dtype=normal_data.dtype)
    elif len(normal_data) == 0 and len(abnormal_data) > 0:
        normal_data = np.empty((0,) + abnormal_data.shape[1:], dtype=abnormal_data.dtype)

    normal_data = normal_data[_seeded_order(len(normal_data), seed)]

    # downsample abnormal class
    abnormal_data = abnormal_data[_seeded_order(len(abnormal_data), seed)]
    abnormal_data = abnormal_data[:int(len(abnormal_data) * downsample_rate)]

    print(downsample_rate, normal_data.shape, abnormal_data.shape)

    # split train/test data
    # The "pure" split is the contaminated split with zero abnormal sequences
    # assigned to training, so both modes share one code path.
    split_normal = int(0.6 * len(normal_data))
    split_abnormal = 0 if pure else int(0.6 * len(abnormal_data))

    data_train, label_train = _stack_with_labels(
        normal_data[:split_normal], abnormal_data[:split_abnormal], seed)
    data_test, label_test = _stack_with_labels(
        normal_data[split_normal:], abnormal_data[split_abnormal:], seed)

    # reshape: flatten (n_seq, seq_len, dim) to (n_seq * seq_len, dim) and
    # repeat every sequence label once per time step
    dim = data_train.shape[-1]
    seq_len = data_train.shape[1]
    columns = ['A' + str(i) for i in range(dim)]

    df_train = pd.DataFrame(data_train.reshape(-1, dim), columns=columns)
    df_train['label'] = label_train.repeat(seq_len)

    df_test = pd.DataFrame(data_test.reshape(-1, dim), columns=columns)
    df_test['label'] = label_test.repeat(seq_len)

    print(Counter(df_train['label']), Counter(df_test['label']))
    return df_train, df_test


def _seeded_order(n, seed):
    """Return a permutation of ``range(n)`` drawn from a fresh ``RandomState(seed)``."""
    return np.random.RandomState(seed).permutation(n)


def _stack_with_labels(normal_part, abnormal_part, seed):
    """Stack normal (label 0) and abnormal (label 1) sequences and shuffle both alike."""
    stacked = np.vstack([normal_part, abnormal_part])
    labels = np.hstack([np.zeros(len(normal_part), dtype=int),
                        np.ones(len(abnormal_part), dtype=int)])
    order = _seeded_order(len(stacked), seed)
    return stacked[order], labels[order]

In [10]:
save_path = 'data_processed/'
os.makedirs(save_path, exist_ok=True)

# ASD

download link: https://github.com/zhhlee/InterFusion

In [52]:
def save(train_df, test_df, machine_idx, output_dir):
    """Write both frames as ``<machine_idx>_train.csv`` / ``<machine_idx>_test.csv``.

    ``output_dir`` is created first if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)
    for frame, suffix in ((train_df, '_train.csv'), (test_df, '_test.csv')):
        frame.to_csv(os.path.join(output_dir, machine_idx + suffix))
def create_df(train, test, test_label):
    """Wrap the train/test arrays in DataFrames with feature columns plus ``label``.

    Feature columns are named ``A0, A1, ...``; their count is taken from
    ``train.shape[1]``. Every training row is labelled 0, test rows receive
    ``test_label``.
    """
    feature_names = ['A' + str(k) for k in range(train.shape[1])]
    train_df = pd.DataFrame(train, columns=feature_names).assign(label=0)
    test_df = pd.DataFrame(test, columns=feature_names).assign(label=test_label)
    return train_df, test_df


In [55]:
dataset_folder = os.path.join('data/', 'ASD/processed/')
output_root_dir = 'data_processed/'
dataset = 'ASD'


def _load_pickle(path):
    """Unpickle ``path`` and close the file handle afterwards."""
    # NOTE(review): unpickling can execute arbitrary code -- only load files
    # obtained from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


full_lst = os.listdir(dataset_folder)
# Three files are read per machine (<id>_train.pkl, <id>_test.pkl,
# <id>_test_label.pkl). Deriving the ids from the train files only avoids
# converting every machine three times and ignores unrelated files.
train_suffix = '_train.pkl'
machine_idx_lst = sorted(a[:-len(train_suffix)] for a in full_lst if a.endswith(train_suffix))

for machine_idx in machine_idx_lst:
    train = _load_pickle(os.path.join(dataset_folder, machine_idx + '_train.pkl'))
    test = _load_pickle(os.path.join(dataset_folder, machine_idx + '_test.pkl'))
    test_label = _load_pickle(os.path.join(dataset_folder, machine_idx + '_test_label.pkl'))
    train_df, test_df = create_df(train, test, test_label)

    # One output folder per machine: data_processed/ASD/<machine_idx>/
    output_dir = os.path.join(output_root_dir, dataset, machine_idx)
    save(train_df, test_df, machine_idx, output_dir)

# SMD

In [59]:
# Raw SMD files are read from data/SMD/{train,test,test_label}/<machine>.txt
dataset_folder = os.path.join('data/', 'SMD/')
output_root_dir = 'data_processed/'
dataset = 'SMD'

# Only three machines are converted here. To convert every machine, use
# machine_lst = os.listdir(os.path.join(dataset_folder, 'train/'))
machine_lst = ['machine-3-1.txt', 'machine-3-11.txt', 'machine-3-9.txt']

for machine in sorted(machine_lst):
    print(machine)
    # The same file name is looked up in each of the three sub-folders.
    train, test, test_label = [
        np.genfromtxt(os.path.join(dataset_folder, subdir, machine),
                      dtype=np.float32, delimiter=',')
        for subdir in ('train', 'test', 'test_label')
    ]
    train_df, test_df = create_df(train, test, test_label)

    # Drop the '.txt' extension to get the id used in the output names.
    machine_idx = os.path.splitext(machine)[0]
    output_dir = os.path.join(output_root_dir, dataset, machine_idx)
    save(train_df, test_df, machine_idx, output_dir)

machine-3-1.txt
machine-3-11.txt
machine-3-9.txt


# SWaT

In [49]:
# the following code is adapted from the source code in [Zhihan Li et al. KDD21]
# preprocess for SWaT. SWaT.A2_Dec2015, version 0
dataset_folder = os.path.join('data/', 'SWaT')


def _load_swat(csv_name):
    """Read one raw SWaT csv; return it indexed by timestamp with a 0/1 ``label`` column."""
    df = pd.read_csv(os.path.join(dataset_folder, csv_name))
    df = df.set_index(' Timestamp')
    # Some raw column names carry stray spaces; stripping them here replaces
    # the manual clean-up of the written csv. The index name is left untouched.
    df.columns = df.columns.str.strip()
    # NOTE(review): some copies of the attack file reportedly spell the label
    # with an embedded space ("A ttack") -- confirm against your copy. Removing
    # spaces before comparing is a no-op when every value is spelled cleanly.
    status = df['Normal/Attack'].astype(str).str.replace(' ', '', regex=False)
    df['label'] = np.where(status == 'Attack', 1, 0)
    return df.drop('Normal/Attack', axis=1)


# The shape assertions below also catch exports that carry extra columns
# (e.g. 'Unnamed: ...' columns added by a spreadsheet-to-csv conversion).
test_df = _load_swat('SWaT_Dataset_Attack_v0.csv')
assert test_df.shape == (449919, 52)

train_df = _load_swat('SWaT_Dataset_Normal_v0.csv')

# following [Zhihan Li et al. KDD21] & [Dan Li. ICANN. 2019]
# for SWaT data, due to the cold start of the system, starting point is 21600
train_df = train_df.iloc[21600:]
assert train_df.shape == (475200, 52)


output_dir = 'data_processed/SWaT/'
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(os.path.join(output_dir, 'SWaT_train.csv'))
test_df.to_csv(os.path.join(output_dir, 'SWaT_test.csv'))

Note that some column names in SWaT_test.csv contain spaces; please remove these spaces manually.

# WaQ

download link:  https://www.spotseven.de/gecco/gecco-challenge/gecco-challenge-2018/

Use the original train/test split.  
The training set is contaminated (it contains labelled anomalies).

In [60]:
import pyreadr

# Load the GECCO water-quality splits. read_r returns a mapping of data
# frames; the frame of interest is stored under the key None.
train = pyreadr.read_r('data/GECCO/water.RDS') # also works for RData
df_train = train[None] # extract the pandas data frame 

test = pyreadr.read_r('data/GECCO/water_test.RDS')
df_test = test[None]

In [61]:
# Sanity check: raw shapes of both splits before any cleaning.
print(df_train.shape, df_test.shape)

(139566, 11) (139566, 11)


In [62]:
# Turn the EVENT flag into a numeric `label` column (`+ 0` casts it to a
# number), drop the columns that are not features, then discard every row
# that still contains a missing value.
df_train = (
    df_train
    .assign(label=df_train['EVENT'] + 0)
    .drop(columns=['EVENT', 'Time'])
    .dropna()
)

df_test = (
    df_test
    .assign(label=df_test['EVENT'] + 0)
    .drop(columns=['EVENT', 'Time'])
    .dropna()
)


In [63]:
# df_train[df_train.isna().sum(axis=1) !=0]

In [64]:
# Number of rows labelled as events in each split after cleaning.
df_train['label'].sum(), df_test['label'].sum()

(1726, 2329)

In [65]:
# Shapes after dropping EVENT/Time and the rows with missing values.
df_train.shape, df_test.shape

((138521, 10), (115086, 10))

In [66]:
# Preview the cleaned training frame.
df_train.head()

Unnamed: 0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,label
0,6.5,0.17,8.36,749.0,211.0,0.011,0.118,1677.0,695.0,0
1,6.5,0.17,8.36,749.0,211.0,0.011,0.118,1561.0,696.0,0
2,6.5,0.17,8.35,749.0,211.0,0.011,0.117,1581.0,696.0,0
3,6.5,0.17,8.35,749.0,211.0,0.011,0.118,1579.0,693.0,0
4,6.5,0.17,8.35,749.0,211.0,0.011,0.118,1567.0,689.0,0


In [67]:
# Write the cleaned WaQ splits to <save_path>/WaQ/.
# (The path strings have no placeholders, so they are plain strings, not f-strings.)
os.makedirs(os.path.join(save_path, 'WaQ/'), exist_ok=True)
df_train.to_csv(os.path.join(save_path, 'WaQ/WaQ_train.csv'))
df_test.to_csv(os.path.join(save_path, 'WaQ/WaQ_test.csv'))

# DSADS

this dataset can be downloaded from https://github.com/zhangyuxin621/AMSL

1,sitting,

2,standing,

3,lying on back,

4,lying on right side,

5,ascending stairs,

6,descending stairs,

7,standing in an elevator still,

8,moving around in an elevator,

9,walking in a parking lot,

10,walking on a treadmill with a speed of 4 kmh,

11,walking in flat and 15 deg inclined positions,

12,running on a treadmill with a speed of 8 kmh,

13,exercising on a stepper,

14,exercising on a cross trainer,

15,cycling on an exercise bike in horizontal positions,

16,cycling on an exercise bike in vertical positions,

17,rowing,

18,jumping,

19,playing basketball

Running (a12), rowing (a17), and jumping (a18) are used as the anomalous classes.

In [28]:
# Subject folders are named p1 .. p<n_person>; derive the list from n_person
# so the count is defined in one place only.
n_person = 8
persons = ['p' + str(i) for i in range(1, n_person + 1)]


In [31]:
# Activity folder names (a01, a02, ...). Keep directories only: the pickles this
# notebook writes into the same folder (all_data.pkl, all_label.pkl) also match
# 'a*' and would otherwise show up as classes when the notebook is re-run.
classes = sorted(os.path.basename(d) for d in glob('data/DASADS/a*') if os.path.isdir(d))

In [32]:
# Per-person arrays: all_data[p] stacks every sequence of person p,
# all_label[p] holds the matching activity folder name per sequence.
all_data = {}
all_label = {}

for p in persons:
    data = []
    label = []
    for c in classes:
        # glob returns paths in arbitrary order; sorting fixes the sequence
        # order so the seeded split downstream is reproducible across machines.
        f_lst = sorted(glob(f'data/DASADS/{c}/{p}/*'))

        seqs = [np.loadtxt(f, delimiter=',') for f in f_lst]
        data.extend(seqs)
        label.extend([c] * len(seqs))

    data = np.array(data)
    label = np.array(label)

    # No shuffling here: seqs_split permutes the sequences with a fixed seed.
    all_data[p] = data
    all_label[p] = label

    print(p, data.shape, Counter(label))

p1 (1140, 125, 45) Counter({'a01': 60, 'a02': 60, 'a03': 60, 'a04': 60, 'a05': 60, 'a06': 60, 'a07': 60, 'a08': 60, 'a09': 60, 'a10': 60, 'a11': 60, 'a12': 60, 'a13': 60, 'a14': 60, 'a15': 60, 'a16': 60, 'a17': 60, 'a18': 60, 'a19': 60})
p2 (1140, 125, 45) Counter({'a01': 60, 'a02': 60, 'a03': 60, 'a04': 60, 'a05': 60, 'a06': 60, 'a07': 60, 'a08': 60, 'a09': 60, 'a10': 60, 'a11': 60, 'a12': 60, 'a13': 60, 'a14': 60, 'a15': 60, 'a16': 60, 'a17': 60, 'a18': 60, 'a19': 60})
p3 (1140, 125, 45) Counter({'a01': 60, 'a02': 60, 'a03': 60, 'a04': 60, 'a05': 60, 'a06': 60, 'a07': 60, 'a08': 60, 'a09': 60, 'a10': 60, 'a11': 60, 'a12': 60, 'a13': 60, 'a14': 60, 'a15': 60, 'a16': 60, 'a17': 60, 'a18': 60, 'a19': 60})
p4 (1140, 125, 45) Counter({'a01': 60, 'a02': 60, 'a03': 60, 'a04': 60, 'a05': 60, 'a06': 60, 'a07': 60, 'a08': 60, 'a09': 60, 'a10': 60, 'a11': 60, 'a12': 60, 'a13': 60, 'a14': 60, 'a15': 60, 'a16': 60, 'a17': 60, 'a18': 60, 'a19': 60})
p5 (1140, 125, 45) Counter({'a01': 60, 'a02': 60

In [33]:
import pickle
# Cache the loaded arrays; the context manager makes sure the file is flushed
# and closed instead of leaving the handle to the garbage collector.
with open('data/DASADS/all_data.pkl', 'wb') as fh:
    pickle.dump(all_data, fh)

In [34]:
import pickle
# Cache the labels; the context manager makes sure the file is flushed and
# closed instead of leaving the handle to the garbage collector.
with open('data/DASADS/all_label.pkl', 'wb') as fh:
    pickle.dump(all_label, fh)

In [35]:
# Feature dimension (last axis) and sequence length (axis 1), read from person p1.
dim = all_data['p1'].shape[-1]
seq_len = all_data['p1'].shape[1]

In [36]:
# Split settings for DSADS.
downsample_rate = 1.0  # keep every abnormal sequence
# Alternative with the two stair activities added:
# abnormal_class = ['a05', 'a06', 'a12', 'a17', 'a18']
abnormal_class = ['a12', 'a17', 'a18']  # running, rowing, jumping (see the activity list)
seed=42

In [38]:
# Dry run of the pure split (pure=True: the training set holds normal sequences
# only). Nothing is written to disk here -- the save calls are commented out.
for p in persons:
    # print(p)
    data = all_data[p]
    label = all_label[p]
    df_train, df_test = seqs_split(data, label, abnormal_class=abnormal_class, seed=seed, downsample_rate=downsample_rate, pure=True)
    # print(df_train.iloc[13])
    
    # os.makedirs(os.path.join(save_path, f'DASADS/{p}/'), exist_ok=True)    
    # df_train.to_csv(os.path.join(save_path, f'DASADS/{p}/{p}_train.csv'))
    # df_test.to_csv(os.path.join(save_path, f'DASADS/{p}/{p}_test.csv'))

1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000}) Counter({0: 48000, 1: 22500})
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000}) Counter({0: 48000, 1: 22500})
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000}) Counter({0: 48000, 1: 22500})
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000}) Counter({0: 48000, 1: 22500})
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000}) Counter({0: 48000, 1: 22500})
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000}) Counter({0: 48000, 1: 22500})
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000}) Counter({0: 48000, 1: 22500})
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000}) Counter({0: 48000, 1: 22500})


In [40]:
# Contaminated split (pure=False: abnormal sequences appear in both train and
# test); this is the version written to <save_path>/DSADS/<person>/.
downsample_rate = 1.0  # keep every abnormal sequence
# abnormal_class = ['a05', 'a06', 'a12', 'a17', 'a18']
abnormal_class = ['a12', 'a17', 'a18']  # running, rowing, jumping
seed=42

for p in persons:
    print(p)
    data = all_data[p]
    label = all_label[p]
    df_train, df_test = seqs_split(data, label, abnormal_class=abnormal_class, seed=seed, downsample_rate=downsample_rate, pure=False)
    # print(df_train.iloc[13])
    
    # One folder per person holding <p>_train.csv and <p>_test.csv.
    os.makedirs(os.path.join(save_path, f'DSADS/{p}/'), exist_ok=True)    
    df_train.to_csv(os.path.join(save_path, f'DSADS/{p}/{p}_train.csv'))
    df_test.to_csv(os.path.join(save_path, f'DSADS/{p}/{p}_test.csv'))

p1
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000, 1: 13500}) Counter({0: 48000, 1: 9000})
p2
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000, 1: 13500}) Counter({0: 48000, 1: 9000})
p3
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000, 1: 13500}) Counter({0: 48000, 1: 9000})
p4
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000, 1: 13500}) Counter({0: 48000, 1: 9000})
p5
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000, 1: 13500}) Counter({0: 48000, 1: 9000})
p6
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000, 1: 13500}) Counter({0: 48000, 1: 9000})
p7
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000, 1: 13500}) Counter({0: 48000, 1: 9000})
p8
1.0 (960, 125, 45) (180, 125, 45)
Counter({0: 72000, 1: 13500}) Counter({0: 48000, 1: 9000})


# Epilepsy

This dataset was downloaded from the released NeuTraL-AD repository:  
https://github.com/boschresearch/NeuTraL-AD/tree/NTL_full/

In [42]:
# Load the four Epilepsy arrays (sequences and labels for both original splits).
path = 'data/'
name = 'epilepsy/'
train_x, train_y, test_x, test_y = [
    np.load(path + name + fname)
    for fname in ('train_array.npy', 'train_label.npy', 'test_array.npy', 'test_label.npy')
]
# Display the shapes of the training sequences and their labels.
train_x.shape, train_y.shape

((137, 206, 3), (137,))

In [43]:
# Merge the original train and test parts into one pool; the split is redone
# below with seqs_split.
data = np.concatenate([train_x, test_x])
label = np.hstack([train_y, test_y])
# Display the pooled shapes and the number of sequences per class.
data.shape, label.shape, Counter(label)

((275, 206, 3),
 (275,),
 Counter({'EPILEPSY': 68, 'WALKING': 74, 'RUNNING': 73, 'SAWING': 60}))

In [44]:
# # contaminated training set
# Contaminated split: with pure=False the 'EPILEPSY' sequences (the abnormal
# class) are split 60/40 between train and test, like the normal ones.
seed=42
abnormal_class = ['EPILEPSY']
df_train, df_test = seqs_split(data, label, abnormal_class, seed=seed, downsample_rate=1.0, pure=False)

1.0 (207, 206, 3) (68, 206, 3)
Counter({0: 25544, 1: 8240}) Counter({0: 17098, 1: 5768})


In [46]:
# Write the Epilepsy splits to <save_path>/Epilepsy/.
# (The path strings have no placeholders, so they are plain strings, not f-strings.)
os.makedirs(os.path.join(save_path, 'Epilepsy/'), exist_ok=True)
df_train.to_csv(os.path.join(save_path, 'Epilepsy/Epilepsy_train.csv'))
df_test.to_csv(os.path.join(save_path, 'Epilepsy/Epilepsy_test.csv'))