# データの前処理

## パラメータ

In [None]:
# URL of the JSON file with known anomaly labels; an empty string means
# no labels are attached to the output.
label_data_source = (
    "https://raw.githubusercontent.com/numenta/NAB/master"
    "/labels/raw/known_labels_v1.0.json"
)
# Path of the CSV file that is read as the input dataset.
dataset_path = "../working/test_dataset.csv"
# S3 URI the preprocessed CSV is uploaded to.
data_s3_path = (
    "s3://bucket-name/sagemaker/iot-analytics"
    "/machine-temperature/train.csv"
)
# Number of consecutive samples in one shingle (window).
# NOTE(review): presumably 12 samples/hour * 24 hours = one day -- confirm
# against the dataset's sampling interval.
shingle_size = 24 * 12


In [None]:
# Labels are attached only when a label source is configured. Truthiness is
# used so that None (an unset parameter) is treated like the empty string
# instead of triggering a download attempt from an invalid URL.
need_label_flg = bool(label_data_source)

## データの読み込み

In [None]:
import pandas as pd

# Load the input dataset from the configured CSV path into a DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset_path)

## データの前処理

タイムスタンプをインデックスに設定し、不要なカラムを削除して時刻順に並べ替える

In [None]:
# Index the rows by their parsed timestamps.
df.index = pd.to_datetime(df['timestamp'])
# The timestamp now lives in the index, so drop its column together with the
# '__dt' column, then order the rows chronologically.
df = df.drop(columns=['timestamp', '__dt'])
df = df.sort_index()

シングリング処理用関数

In [None]:
def shingle(data, shingle_size):
    """Build overlapping fixed-length windows ("shingles") from a 1-D sequence.

    Row ``n`` of the result holds ``data[n:n + shingle_size]``.

    Note:
        ``len(data) - shingle_size`` rows are produced, so the window that
        ends on the very last sample is not emitted. This row count is kept
        for backward compatibility with existing callers.

    Args:
        data: 1-D sequence of numeric samples (list, NumPy array, pandas
            Series, ...).
        shingle_size: Number of consecutive samples in each window.

    Returns:
        A float ``numpy.ndarray`` of shape
        ``(len(data) - shingle_size, shingle_size)``.

    Raises:
        ValueError: If ``shingle_size`` is negative or exceeds ``len(data)``.
    """
    import numpy as np

    # Convert once up front: slicing a plain ndarray is purely positional and
    # much cheaper than slicing a pandas Series on every iteration.
    values = np.asarray(data, dtype=float)
    num_data = len(values)

    if not 0 <= shingle_size <= num_data:
        raise ValueError(
            f"shingle_size must be between 0 and len(data)={num_data}, "
            f"got {shingle_size}"
        )

    num_windows = num_data - shingle_size
    shingled_data = np.zeros((num_windows, shingle_size))

    for n in range(num_windows):
        shingled_data[n] = values[n:n + shingle_size]
    return shingled_data

シングリング処理

In [None]:
# Turn the 'value' column into overlapping windows of shingle_size samples.
shingled_data = shingle(data=df['value'], shingle_size=shingle_size)

## ラベルデータを付与

In [None]:
if need_label_flg:
    import json
    from urllib import request

    import numpy as np

    # Download the label file and parse it as JSON.
    with request.urlopen(label_data_source) as f:
        label_data = json.loads(f.read().decode())

    # Timestamps of the known anomalies for the machine-temperature dataset.
    anomaly_dates = label_data['realKnownCause/machine_temperature_system_failure.csv']
    anomaly_datetimes = [pd.to_datetime(dt) for dt in anomaly_dates]

    # Vectorized membership test (1 = anomaly, 0 = normal) instead of scanning
    # the anomaly list once per row.
    is_anomaly = df.index.isin(anomaly_datetimes).astype(int)

    df['is_anomaly'] = is_anomaly

    # Shingling yields shingle_size fewer rows than the input, so trim the
    # data frame to the same number of rows.
    # NOTE(review): row i of shingled_data spans samples i .. i+shingle_size-1
    # but receives the label of sample i+shingle_size (the first sample after
    # the window). Confirm this is intended rather than labelling each window
    # by its last sample.
    shingled_df = df.iloc[shingle_size:]

    # Prepend the anomaly label to every shingle (anomaly: 1, normal: 0).
    # The labels are pulled into an array once rather than via .iloc per row.
    labels = shingled_df['is_anomaly'].to_numpy()
    labeled_data = [np.insert(row, 0, label) for row, label in zip(shingled_data, labels)]


## データをS3に保存

In [None]:
# Local file the CSV is written to before it is copied to S3.
local_path = "/tmp/data.csv"

In [None]:
import numpy as np

# Write the labelled rows when labelling is enabled, otherwise the bare
# shingles, as comma-separated values.
if need_label_flg:
    np.savetxt(local_path, labeled_data, delimiter=',')
else:
    np.savetxt(local_path, shingled_data, delimiter=',')

In [None]:
!aws s3 cp $local_path $data_s3_path