In [1]:
import pickle
import pandas as pd
import numpy as np

import boto3
import sagemaker 

In [2]:
# AWS handles used throughout the notebook: a boto3 S3 client and a SageMaker session.
s3 = boto3.client('s3')
sagemaker_session = sagemaker.Session()

# S3 bucket / key prefix used for this notebook's artifacts, and the execution
# role returned by SageMaker for the current environment.
bucket = 'hyundai-elevator-poc'
prefix = 'donghwa/sagemaker'
role = sagemaker.get_execution_role()

#s3.download_file(bucket,'/'.join([prefix,'754246db-d587-4f74-8bd6-a2bc7004de76.pkl']),'754246db-d587-4f74-8bd6-a2bc7004de76.pkl')
#sagemaker.Session().default_bucket()

In [3]:
# Upload the local pickle to s3://<bucket>/<prefix>/<filename>.
filename = '754246db-d587-4f74-8bd6-a2bc7004de76.pkl'
s3.upload_file(
    filename,
    bucket,
    # str.join accepts exactly one iterable, so the key parts are wrapped in a
    # list (passing them as two positional arguments raises TypeError).
    '/'.join([prefix, filename])
)

### Do not execute if this is not your first job.

## Download file to SageMaker storage

In [None]:
!wget https://s3.ap-northeast-2.amazonaws.com/kanto.public/anomaly/754246db-d587-4f74-8bd6-a2bc7004de76.pkl

In [4]:
# generate new dataset using resample
def dataset_freq(data, freq='1min'):
    """Return a time-indexed copy of ``data`` averaged over fixed intervals.

    Steps, in order:
      1. rename ``micro_data_ac_dt`` to ``timestamp`` and drop rows with NaN;
      2. parse ``timestamp`` (digits in ``YYYYmmddHHMMSS`` form) as datetimes;
      3. index by ``timestamp`` and sort;
      4. discard rows where ``tag_223m1_cont`` equals 0;
      5. resample to ``freq``, take the mean and drop empty intervals.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``micro_data_ac_dt`` and ``tag_223m1_cont``.
        The input frame is not modified.
    freq : str, default '1min'
        Resampling frequency passed to ``DataFrame.resample``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``timestamp`` with one row per non-empty interval.
    """
    # rename/dropna return new frames, so the caller's data is left untouched.
    poc_data = data.rename(columns={'micro_data_ac_dt': 'timestamp'}).dropna()

    # Cast through int64 explicitly: the values have 14 digits and overflow a
    # 32-bit integer, which is what plain 'int' maps to on some platforms.
    # The integer cast also removes any trailing ".0" before the value is
    # turned into the digit string that the strptime format expects.
    stamp_text = poc_data['timestamp'].astype('int64').astype(str)
    poc_data = poc_data.assign(
        timestamp=pd.to_datetime(stamp_text, format='%Y%m%d%H%M%S')
    )

    poc_data = poc_data.set_index('timestamp').sort_index()
    poc_data = poc_data[poc_data['tag_223m1_cont'] != 0]
    return poc_data.resample(freq).mean().dropna()

# shingle data
def shingle(data, shingle_size):
    """Stack consecutive overlapping windows of ``data`` into a 2-D array.

    Row ``k`` of the result holds ``data[k:k + shingle_size]``. Exactly
    ``len(data) - shingle_size`` rows are produced, so the window that would
    start at ``len(data) - shingle_size`` is not emitted.

    Parameters
    ----------
    data : sequence supporting ``len`` and slicing
    shingle_size : int
        Number of consecutive samples per row.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data) - shingle_size, shingle_size)``.
    """
    window_count = len(data) - shingle_size
    windows = np.zeros((window_count, shingle_size))

    for start in range(window_count):
        stop = start + shingle_size
        windows[start] = data[start:stop]
    return windows

In [6]:
%%time
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a source you trust.
with open('754246db-d587-4f74-8bd6-a2bc7004de76.pkl', 'rb') as f:
    data = pickle.load(f)

CPU times: user 0 ns, sys: 1.11 s, total: 1.11 s
Wall time: 1.11 s


In [7]:
# Average the raw frame into 1-minute intervals, then keep a single day for scoring.
train_data = dataset_freq(data, freq='1min')
# Select rows by partial date string through .loc: DataFrame[...] with a date
# string is treated as a column lookup in current pandas (the row-selection
# form was deprecated and then removed), while .loc works on old and new versions.
test = train_data.loc['2020-4-28']

In [9]:
# Window length in samples; with the 1-minute resampling used above each
# window therefore spans 30 minutes of the tag_dscrn_cont signal.
shingle_size = 30
shingled_data = shingle(test['tag_dscrn_cont'], shingle_size)

#prefix_shingled = 'sagemaker/randomcutforest_shingled'

In [11]:
#np.savetxt('data/donghwa-dscrn-shingle.csv',shingled_data,delimiter=',')

- just for data checking

## Dataset ready. Now Execute Training

In [10]:
from sagemaker.pytorch import PyTorch

In [11]:
# Upload the contents of the local data/ directory to S3 via the SageMaker
# session helper; the returned S3 URI is kept in `inputs`.
inputs = sagemaker_session.upload_data(
    path='data/',
    bucket=bucket,
    key_prefix='train/donghwa-dscrn-shingle.csv',
)

In [15]:
import pandas as pd

# Model artifacts are written to the root of the project bucket.
s3_output_location = f's3://{bucket}'

# PyTorch training job: one ml.m4.xlarge instance running train_donghwa.py.
# NOTE(review): train_instance_count / train_instance_type look like the
# SageMaker Python SDK v1 argument names - confirm the SDK version before upgrading.
estimator = PyTorch(
    entry_point='train_donghwa.py',
    role=role,
    framework_version='1.4.0',
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
    output_path=s3_output_location,
    hyperparameters={
        'epochs': 2,
        'backend': 'gloo',
        'batch-size': 64,
        'test-batch-size': 64,
    },
)

# The uploaded data is exposed to the training script as the 'training' channel.
estimator.fit({'training': inputs})

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-10-21 08:56:07 Starting - Starting the training job...
2020-10-21 08:56:09 Starting - Launching requested ML instances......
2020-10-21 08:57:10 Starting - Preparing the instances for training...
2020-10-21 08:57:59 Downloading - Downloading input data......
2020-10-21 08:59:07 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-10-21 08:59:09,318 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-10-21 08:59:09,321 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-10-21 08:59:09,333 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-10-21 08:59:12,356 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-10-21 08:59:12,708 sagemaker-containers INFO    

## End of Training : Stop here before next instruction

In [15]:
# Host the trained model on a single ml.m4.xlarge endpoint instance.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


---------------!

### Execute below if you created an endpoint outside Jupyter Notebook

In [1]:
import sagemaker
from sagemaker import predictor
from sagemaker.predictor import csv_serializer, json_deserializer

In [94]:
# Attach to the already-running 'auto-encoder' endpoint (CSV in, JSON out).
# The class is reached through the `sagemaker` package instead of the imported
# `predictor` module name: this assignment rebinds `predictor` to an instance,
# so looking the class up on `predictor` fails as soon as the cell is run a
# second time (or after `predictor` was bound by an earlier deploy cell).
predictor = sagemaker.predictor.RealTimePredictor(
    'auto-encoder',
    sagemaker_session=sagemaker.Session(),
    serializer=csv_serializer,
    deserializer=json_deserializer,
)

### Getting predicted values

In [95]:
# Send the whole shingled array to the endpoint, cast to float32, in one request.
res = predictor.predict(shingled_data.astype('float32'))

In [102]:
from sklearn.metrics import mean_squared_error
# Spot check: mean squared error between the first input window and the first
# row returned by the endpoint.
mean_squared_error(shingled_data[0], res[0])

13.480018505374764

In [103]:
# One mean-squared-error value per returned row, comparing each input window
# with the endpoint output at the same position.
plotted = [
    mean_squared_error(shingled_data[i], res[i])
    for i in range(len(res))
]

In [38]:
import matplotlib.pyplot as plt
%matplotlib inline

In [105]:
# One score per window, labelled with the timestamp at the window's starting row.
# The index is sliced by the number of scores rather than a hard-coded 1410,
# so the cell keeps working for a different day length or shingle size.
result_graphic = pd.DataFrame(
    plotted,
    index=test.index[:len(plotted)],
    columns=['Anomaly Score'],
)

In [108]:
# Attach the scores to the per-minute frame by joining on the timestamp index.
# NOTE(review): the scores are divided by 1000 - presumably to bring them into
# the 0-10 range of the secondary plot axis; confirm and name this constant.
predicted = test.join(result_graphic/1000)

In [109]:
# Use plot-friendly column names and turn the timestamp index into a regular column.
predicted = predicted.rename(
    columns={
        'tag_223m1_cont': 'vibration',
        'tag_dscrn_cont': 'current',
        'Anomaly Score': 'scores',
    }
).reset_index()

In [110]:
import bokeh
import bokeh.io
from bokeh.models import HoverTool
bokeh.io.output_notebook()
from bokeh.plotting import figure, show, output_file, ColumnDataSource
from bokeh.models.formatters import DatetimeTickFormatter
import bokeh.palettes
from bokeh.models import Range1d, LinearAxis

def prediction(data, threshold=None):
    """Plot vibration, current and anomaly scores, and mark score outliers.

    Writes the Bokeh figure to "donghwa_2020_break.html", shows it, and
    prints the cutoff together with the rows whose score exceeds it
    (sorted by score, highest first).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'timestamp', 'vibration', 'current' and
        'scores'. The frame is not modified.
    threshold : float, optional
        Score cutoff. When None, ``mean + 3 * std`` of the scores is used.
        An explicit 0 is honoured as a cutoff.

    Returns
    -------
    None
    """
    # Work on a copy so the timestamp conversion does not alter the caller's frame.
    prediction_data = data.copy()
    prediction_data['timestamp'] = pd.to_datetime(prediction_data['timestamp'])
    output_file("donghwa_2020_break.html")
    vibr = prediction_data['vibration']
    curr = prediction_data['current']
    scores = prediction_data['scores']

    hover = HoverTool(
        # These must equal the `name=` of the line renderers created below;
        # otherwise the hover tool is not attached to those lines.
        names=["vib", "cur", "scores"],
        tooltips=[
            ('date', '@event_date_formatted'),
            ('vibration', '@vibr_y'),
            ('current', '@cur_y'),
            ('scores', '@scores_y'),
        ],
    )
    source = ColumnDataSource(prediction_data)
    # Extra columns referenced by the tooltips above.
    source.add(
        prediction_data['timestamp'].apply(lambda d: d.strftime('%Y-%m-%d %H:%M:%S')),
        'event_date_formatted',
    )
    source.add(vibr, 'vibr_y')
    source.add(curr, 'cur_y')
    source.add(scores, 'scores_y')

    p = figure(x_axis_type='datetime', plot_width=1000, plot_height=350,
               tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'])
    p.line(name="vib", x='timestamp', y='vibration', source=source,
           line_width=2, color='navy', alpha=0.5, legend_label="Vibration")
    p.line(name="cur", x='timestamp', y='current', source=source,
           line_width=2, color='green', alpha=0.5, legend_label="Current")

    # Scores are drawn against their own right-hand axis fixed to 0..10.
    p.extra_y_ranges = {"Anomaly": Range1d(start=0, end=10)}
    p.add_layout(LinearAxis(y_range_name="Anomaly"), 'right')
    p.line(name="scores", x='timestamp', y='scores', source=source,
           line_width=2, color='red', alpha=0.5, y_range_name="Anomaly",
           legend_label="Score")
    p.legend.location = "top_left"
    p.legend.click_policy = "hide"
    p.title.text = "Anomaly Detection for Hammer mil"

    # Select the highest anomaly scores. `is None` (not truthiness) so that a
    # caller-supplied threshold of 0 is respected.
    if threshold is None:
        score_cutoff = scores.mean() + 3 * scores.std()
    else:
        score_cutoff = threshold
    print("The best threshold value is: " + str(score_cutoff))
    anomalies = prediction_data[prediction_data['scores'] > score_cutoff]
    sorted_anomalies = anomalies.sort_values(by=['scores'], ascending=False)
    print(sorted_anomalies)

    # Mark each anomalous minute on both the score curve and the current curve.
    p.circle(sorted_anomalies['timestamp'], sorted_anomalies['scores'],
             line_width=3, color='black', alpha=0.3, y_range_name="Anomaly")
    p.circle(sorted_anomalies['timestamp'], sorted_anomalies['current'],
             line_width=2, color='black')

    show(p)
    

# Render the plot; no threshold is passed, so the function's default cutoff is used.
prediction(predicted)

The best threshold value is: 0.12079090546250108
              timestamp  vibration    current    scores
959 2020-04-28 15:59:00  95.201333  95.093333  0.232332
960 2020-04-28 16:00:00  94.255000  91.433333  0.229612
958 2020-04-28 15:58:00  95.598667  91.936667  0.229569
961 2020-04-28 16:01:00  94.257333  95.966667  0.228495
957 2020-04-28 15:57:00  94.933000  91.586667  0.224892
956 2020-04-28 15:56:00  94.972667  92.846667  0.220917
962 2020-04-28 16:02:00  95.089333  89.066667  0.220021
963 2020-04-28 16:03:00  94.187667  95.083333  0.217135
955 2020-04-28 15:55:00  94.738000  94.106667  0.215973
954 2020-04-28 15:54:00  94.156333  95.500000  0.214776
689 2020-04-28 11:29:00  92.934667  95.556667  0.208049
964 2020-04-28 16:04:00  95.665667  91.180000  0.207160
688 2020-04-28 11:28:00  92.302667  91.523333  0.207039
691 2020-04-28 11:31:00  93.932000  89.773333  0.205806
690 2020-04-28 11:30:00  93.433333  88.730000  0.204650
692 2020-04-28 11:32:00  93.236333  95.053333  0.204075