In [1]:
import joblib

In [2]:
import pickle

In [3]:
import copy

In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
#from sklearn.externals import joblib
import seaborn as sns
sns.set(color_codes=True)
import matplotlib.pyplot as plt
%matplotlib inline

from numpy.random import seed
from sklearn.model_selection import train_test_split

In [8]:
import torch
from torch import nn, optim
import torch.nn.functional as F

In [9]:
RANDOM_SEED = 10
np.random.seed(RANDOM_SEED)

In [10]:
%%time
with open('754246db-d587-4f74-8bd6-a2bc7004de76.pkl', 'rb') as f:
    data = pickle.load(f)

CPU times: user 295 ms, sys: 839 ms, total: 1.13 s
Wall time: 1.13 s


In [11]:
def dataset_freq(data, freq='1min'):
    
    poc_data = data.copy()
    
    poc_data.rename(columns={'micro_data_ac_dt':'timestamp'},inplace=True)
    poc_data.dropna(inplace=True)
    poc_data['timestamp'] = pd.to_datetime(poc_data['timestamp'].astype('int'), format='%Y%m%d%H%M%S' )
    #poc_data['timestamp'] = pd.to_datetime(poc_data['timestamp'].astype('int'), format='%Y-%m-%d %H:%M:%S' )

    poc_data = poc_data.set_index('timestamp').sort_index()
    poc_data = poc_data[poc_data.tag_223m1_cont != 0]
    return poc_data.resample(freq).mean().dropna()

# shingle data
def shingle(data, shingle_size):
    num_data = len(data)
    shingled_data = np.zeros((num_data-shingle_size, shingle_size))
    
    for n in range(num_data - shingle_size):
        shingled_data[n] = data[n:(n+shingle_size)]
    return shingled_data

In [12]:
train_data = dataset_freq(data, freq='1min')

In [13]:
raw_data = train_data.copy()

- drop anomaly data from training_dataset

In [14]:
test = raw_data['2020-4-28']
test2= raw_data['2019-12-16']
test = pd.concat((test,test2))

In [15]:
#create new column based on a column
train_data.loc[test2.index,'anomaly'] = 1

In [16]:
train_data = train_data[train_data['anomaly'] != 1].drop(labels='anomaly',axis=1)

In [17]:
train_data.head()

Unnamed: 0_level_0,tag_223m1_cont,tag_dscrn_cont
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-11-06 08:05:00,0.374286,-3.214286
2019-11-06 08:06:00,0.989333,-3.23
2019-11-06 08:07:00,0.374333,-3.273333
2019-11-06 08:08:00,0.372667,-3.263333
2019-11-06 08:09:00,0.563,-3.246667


In [18]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train_data)
X_test = scaler.transform(test)
scaler_filename = "scaler_data"
joblib.dump(scaler, scaler_filename)

['scaler_data']

- create a 30*1 2D training data using shingling method 'you can now sample it since data is now captured with time'

In [19]:
train_data_seq = shingle(train_data['tag_dscrn_cont'],30)

In [20]:
train_data_seq_df = pd.DataFrame(train_data_seq)

In [21]:
train_df, val_df = train_test_split(
  train_data_seq_df,
  test_size=0.15,
  random_state=RANDOM_SEED
)

val_df, test_df = train_test_split(
  val_df,
  test_size=0.33, 
  random_state=RANDOM_SEED
)

In [22]:
anomaly_df = pd.DataFrame(shingle(test['tag_dscrn_cont'], 30))

In [23]:
def create_dataset(df):

    sequences = df.astype(np.float32).to_numpy().tolist()

    dataset = [torch.tensor(s).unsqueeze(1).float() for s in sequences]

    n_seq, seq_len, n_features = torch.stack(dataset).shape

    return dataset, seq_len, n_features

In [24]:
train_dataset, seq_len, n_features = create_dataset(train_df)
val_dataset, _, _ = create_dataset(val_df)
test_normal_dataset, _, _ = create_dataset(test_df)
test_anomaly_dataset, _, _ = create_dataset(anomaly_df)

In [25]:
print("seq len is: ",seq_len, "\nn_features is: ",n_features)

seq len is:  30 
n_features is:  1


In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
class Encoder(nn.Module):

    def __init__(self, seq_len, n_features, embedding_dim=64):
        super(Encoder, self).__init__()

        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim, self.hidden_dim = embedding_dim, 2 * embedding_dim

        self.rnn1 = nn.LSTM(
          input_size=n_features,
          hidden_size=self.hidden_dim,
          num_layers=1,
          batch_first=True
        )

        self.rnn2 = nn.LSTM(
          input_size=self.hidden_dim,
          hidden_size=embedding_dim,
          num_layers=1,
          batch_first=True
        )

    def forward(self, x):
        x = x.reshape((64, self.seq_len, self.n_features))

        x, (_, _) = self.rnn1(x)
        x, (hidden_n, _) = self.rnn2(x)
        print(hidden_n.shape)

        return hidden_n.reshape((self.n_features, self.embedding_dim))

In [28]:
class Decoder(nn.Module):

    def __init__(self, seq_len, input_dim=64, n_features=1):
        super(Decoder, self).__init__()

        self.seq_len, self.input_dim = seq_len, input_dim
        self.hidden_dim, self.n_features = 2 * input_dim, n_features

        self.rnn1 = nn.LSTM(
          input_size=input_dim,
          hidden_size=input_dim,
          num_layers=1,
          batch_first=True
        )

        self.rnn2 = nn.LSTM(
          input_size=input_dim,
          hidden_size=self.hidden_dim,
          num_layers=1,
          batch_first=True
        )

        self.output_layer = nn.Linear(self.hidden_dim, n_features)

    def forward(self, x):
        x = x.repeat(self.seq_len, self.n_features)
        x = x.reshape((self.n_features, self.seq_len, self.input_dim))

        x, (hidden_n, cell_n) = self.rnn1(x)
        x, (hidden_n, cell_n) = self.rnn2(x)
        x = x.reshape((self.seq_len, self.hidden_dim))

        return self.output_layer(x)

In [36]:
class RecurrentAutoencoder(nn.Module):

    def __init__(self, seq_len, n_features, embedding_dim=64):
        super(RecurrentAutoencoder, self).__init__()

        self.encoder = Encoder(seq_len, n_features, embedding_dim).to(device)
        self.decoder = Decoder(seq_len, embedding_dim, n_features).to(device)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)

        return x

In [37]:
model = RecurrentAutoencoder(seq_len, n_features, 64)
model = model.to(device)

In [38]:
def train_model(model, train_dataset, val_dataset, n_epochs):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.L1Loss(reduction='sum').to(device)
    history = dict(train=[], val=[])

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 10000.0

    for epoch in range(1, n_epochs + 1):
        model = model.train()

        train_losses = []
        for _, seq_true in enumerate(train_dataset):
            optimizer.zero_grad()

            seq_true = seq_true.to(device)
            seq_pred = model(seq_true)

            loss = criterion(seq_pred, seq_true)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            
            if _ % 1000 == 0:
                print(f'Process during training: {loss}')

        val_losses = []
        model = model.eval()
        with torch.no_grad():
            for seq_true in val_dataset:
                seq_true = seq_true.to(device)
                seq_pred = model(seq_true)

                loss = criterion(seq_pred, seq_true)
                val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)

        history['train'].append(train_loss)
        history['val'].append(val_loss)

        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())

        print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')

    model.load_state_dict(best_model_wts)
    return model.eval(), history

In [None]:
#batch_size is injected to data_loader, so got to know about data_loader

In [39]:
%%time
model, history = train_model(
  model, 
  train_dataset, 
  val_dataset, 
  n_epochs=1
)

RuntimeError: shape '[64, 30, 1]' is invalid for input of size 30

In [4]:
train_data

NameError: name 'train_data' is not defined

In [None]:
ax = plt.figure().gca()

ax.plot(history['train'])
ax.plot(history['val'])
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'test'])
plt.title('Loss over training epochs')
plt.show();

In [None]:
scored.plot(logy=True,  figsize=(16,9), ylim=[1e-2,1e0], color=['blue','red'])

In [None]:
# save all model information, including weights, in h5 format
model.save("Cloud_model.pt")
print("Model saved")
