In [None]:
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from scipy.stats import multivariate_normal
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from tqdm import trange
import time
from algorithm_utils import Algorithm, PyTorchUtils
from sklearn import metrics
from utils import LSTMEDModule

class LSTMED(Algorithm, PyTorchUtils):
    """LSTM encoder-decoder anomaly detector.

    ``fit`` trains an ``LSTMEDModule`` to reconstruct fixed-length,
    non-overlapping windows of the input, then estimates the mean and
    covariance of the absolute reconstruction errors on a held-out share of
    those windows. ``predict`` scores every time step with the negative
    log-density of its error vector under that Gaussian.

    NOTE(review): ``self.seed``, ``self.gpu``, ``self.details``,
    ``self.prediction_details``, ``to_device`` and ``to_var`` are used below
    but not defined here -- presumably provided by ``Algorithm`` and
    ``PyTorchUtils``; confirm against those base classes.
    """

    def __init__(self, name: str = 'LSTM-ED', num_epochs: int = 10, batch_size: int = 32, lr: float = 1e-3,
                 hidden_size: int = 5, sequence_length: int = 30, train_gaussian_percentage: float = 0.25,
                 n_layers: tuple = (1, 1), use_bias: tuple = (True, True), dropout: tuple = (0, 0),
                 seed: int = None, gpu: int = 0, details=True):
        """Store the hyper-parameters; the network itself is built in ``fit``.

        Args:
            name: Display name forwarded to ``Algorithm``.
            num_epochs: Number of passes over the training windows.
            batch_size: Windows per mini-batch.
            lr: Learning rate of the Adam optimizer.
            hidden_size: Forwarded to ``LSTMEDModule``.
            sequence_length: Number of rows per window.
            train_gaussian_percentage: Share of the windows held out to
                estimate the error mean and covariance.
            n_layers: Forwarded to ``LSTMEDModule``.
            use_bias: Forwarded to ``LSTMEDModule``.
            dropout: Forwarded to ``LSTMEDModule``.
            seed: Forwarded to ``Algorithm`` and ``PyTorchUtils``.
            gpu: Forwarded to ``PyTorchUtils``.
            details: Forwarded to ``Algorithm``; ``predict`` reads
                ``self.details`` to decide whether to keep reconstructions.
        """
        Algorithm.__init__(self, __name__, name, seed, details=details)
        PyTorchUtils.__init__(self, seed, gpu)
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.lr = lr

        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        self.train_gaussian_percentage = train_gaussian_percentage

        self.n_layers = n_layers
        self.use_bias = use_bias
        self.dropout = dropout

        # Populated by fit().
        self.lstmed = None
        self.mean, self.cov = None, None

    def _split_sequences(self, data):
        """Cut ``data`` into consecutive, non-overlapping windows.

        Each window has ``sequence_length`` rows; trailing rows that do not
        fill a complete window are dropped.
        """
        step = self.sequence_length
        return [data[start:start + step]
                for start in range(0, data.shape[0] - step + 1, step)]

    def fit(self, X):
        """Train the encoder-decoder and estimate the error distribution.

        Args:
            X: 2-D array whose rows are time steps and whose columns are
                features (it is sliced by row and ``X.shape[1]`` is used as
                the feature count).

        Raises:
            ValueError: If the held-out subset yields no full batch, so the
                error mean and covariance cannot be estimated.
        """
        data = X.copy()
        sequences = self._split_sequences(data)
        logging.debug('Number of sequences: %d.', len(sequences))

        indices = np.random.permutation(len(sequences))
        split_point = int(self.train_gaussian_percentage * len(sequences))
        split_point = min(max(split_point, 0), len(sequences))
        # Slice with an explicit boundary: indices[:-0] would be empty and
        # indices[-0:] would be everything when split_point is 0.
        n_train = len(sequences) - split_point
        train_loader = DataLoader(dataset=sequences, batch_size=self.batch_size, drop_last=True,
                                  sampler=SubsetRandomSampler(indices[:n_train]), pin_memory=True)
        train_gaussian_loader = DataLoader(dataset=sequences, batch_size=self.batch_size, drop_last=True,
                                           sampler=SubsetRandomSampler(indices[n_train:]), pin_memory=True)
        if len(train_loader) == 0:
            logging.warning('No full training batch available (%d sequences, batch_size=%d); '
                            'the model will not be trained.', n_train, self.batch_size)

        self.lstmed = LSTMEDModule(X.shape[1], self.hidden_size,
                                   self.n_layers, self.use_bias, self.dropout,
                                   seed=self.seed, gpu=self.gpu)
        self.to_device(self.lstmed)
        optimizer = torch.optim.Adam(self.lstmed.parameters(), lr=self.lr)

        self.lstmed.train()
        # reduction='sum' is the current spelling of size_average=False.
        criterion = nn.MSELoss(reduction='sum')
        for epoch in trange(self.num_epochs):
            logging.debug('Epoch %d/%d.', epoch + 1, self.num_epochs)
            for ts_batch in train_loader:
                output = self.lstmed(self.to_var(ts_batch))
                loss = criterion(output, self.to_var(ts_batch.float()))
                self.lstmed.zero_grad()
                loss.backward()
                optimizer.step()

        self.lstmed.eval()
        # reduction='none' is the current spelling of reduce=False.
        abs_error = nn.L1Loss(reduction='none')
        error_vectors = []
        with torch.no_grad():  # evaluation only; no autograd graph needed
            for ts_batch in train_gaussian_loader:
                output = self.lstmed(self.to_var(ts_batch))
                error = abs_error(output, self.to_var(ts_batch.float()))
                # One error vector (length = number of features) per time step.
                error_vectors += list(error.view(-1, X.shape[1]).data.cpu().numpy())

        if not error_vectors:
            raise ValueError(
                'No reconstruction errors were collected for the Gaussian fit: the held-out subset '
                f'has {split_point} sequence(s), fewer than one full batch of {self.batch_size}. '
                'Provide more data, lower batch_size or raise train_gaussian_percentage.')

        self.mean = np.mean(error_vectors, axis=0)
        self.cov = np.cov(error_vectors, rowvar=False)

    def predict(self, X):
        """Score every time step of ``X`` and return the reconstructions.

        Args:
            X: 2-D array with the same number of columns as the data passed
                to ``fit``.

        Returns:
            A tuple ``(scores, outputs)``. ``scores`` has shape
            ``(num_windows, sequence_length)`` and holds the negative
            log-density of each time step's error vector. ``outputs`` is the
            concatenation of the module outputs of all batches when
            ``self.details`` is truthy, otherwise an empty list.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If ``X`` has fewer rows than ``sequence_length``.
        """
        if self.lstmed is None or self.mean is None or self.cov is None:
            raise RuntimeError('predict() called before fit().')

        data = X.copy()
        sequences = self._split_sequences(data)
        if not sequences:
            raise ValueError(
                f'Input has {data.shape[0]} row(s), fewer than sequence_length={self.sequence_length}.')
        data_loader = DataLoader(dataset=sequences, batch_size=self.batch_size, shuffle=False, drop_last=False)

        self.lstmed.eval()
        mvnormal = multivariate_normal(self.mean, self.cov, allow_singular=True)
        abs_error = nn.L1Loss(reduction='none')
        scores = []
        outputs = []
        errors = []
        with torch.no_grad():  # evaluation only; no autograd graph needed
            for ts in data_loader:
                output = self.lstmed(self.to_var(ts))
                error = abs_error(output, self.to_var(ts.float()))
                score = -mvnormal.logpdf(error.view(-1, X.shape[1]).data.cpu().numpy())
                scores.append(score.reshape(ts.size(0), self.sequence_length))
                if self.details:
                    outputs.append(output.data.cpu().numpy())
                    errors.append(error.data.cpu().numpy())

        # Windows do not overlap, so each time step has exactly one score.
        scores = np.concatenate(scores)
        logging.debug('scores.shape %s, NaN count %d', scores.shape, np.sum(np.isnan(scores)))

        if self.details:
            outputs = np.concatenate(outputs)
            logging.debug('outputs.shape %s', outputs.shape)
            self.prediction_details.update({'reconstructions_mean': outputs})

            errors = np.concatenate(errors)
            self.prediction_details.update({'errors_mean': errors})

        return scores, outputs


# Data preprocessing: load the daily CSV files, blank out 'SJS13' outliers in
# the later recordings, interpolate the gaps and trim both ends of every file.
data_list = []
file_names = ['2006-05-13.csv', '2006-05-14.csv', '2006-05-15.csv', '2006-05-16.csv', '2006-05-17.csv',
              '2006-05-18.csv', '2006-05-19.csv', '2007-12-14.csv', '2009-05-05.csv', '2011-11-01.csv']
# Files whose 'SJS13' column is cleaned with the 1.5*IQR rule.
outlier_files = ('2007-12-14.csv', '2009-05-05.csv', '2011-11-01.csv')
# Files that make up the training set; the last two files are the test sets.
train_files = ('2006-05-13.csv', '2006-05-14.csv', '2006-05-15.csv', '2006-05-16.csv', '2006-05-17.csv',
               '2006-05-18.csv', '2006-05-19.csv', '2007-12-14.csv')
x_train = []
X = []
for f in file_names:
    data = pd.read_csv('data/raw_data/' + f)
    if f in outlier_files:
        q25, q75 = np.quantile(data['SJS13'], [0.25, 0.75])
        print(q25 - 1.5 * (q75 - q25), q75 + 1.5 * (q75 - q25))
        outliers = ((data['SJS13'] < q25 - 1.5 * (q75 - q25))
                    | (data['SJS13'] > q75 + 1.5 * (q75 - q25)))
        # Assign the whole column back instead of chained indexing
        # (data['SJS13'][mask] = ...), which may write to a temporary copy.
        data['SJS13'] = data['SJS13'].mask(outliers)
    # Fill the blanked outliers (and any other gaps) linearly.
    data = data.interpolate(method='linear')
    # Drop the first and last 90 rows of every file.
    data = data.iloc[90:-90]
    data_list.append(data)
    X.append(data.values)
    if f in train_files:
        x_train.append(data.values)
X = np.concatenate(X, axis=0)
print(len(x_train))

# Standardize the training set with its own per-column mean and std.
x_train = np.concatenate(x_train, axis=0)
m_train = np.mean(x_train, axis=0)
std_train = np.std(x_train, axis=0)
x_train = (x_train - m_train) / std_train
n_features = x_train.shape[1]
for i in range(n_features):
    print(i, np.min(x_train[:, i]), np.max(x_train[:, i]), np.mean(x_train[:, i]), np.std(x_train[:, i]))

# Scale the test sets with the *training* statistics.
x_test1 = data_list[-2].values
x_test1 = (x_test1 - m_train) / std_train

x_test2 = data_list[-1].values
x_test2 = (x_test2 - m_train) / std_train

for i in range(n_features):
    print(i, np.min(x_test1[:, i]), np.max(x_test1[:, i]), np.mean(x_test1[:, i]), np.std(x_test1[:, i]))
for i in range(n_features):
    # Report the std of x_test2 here (it previously printed x_test1's std).
    print(i, np.min(x_test2[:, i]), np.max(x_test2[:, i]), np.mean(x_test2[:, i]), np.std(x_test2[:, i]))
    
# Build the detector and fit it on the standardized training data. fit() also
# creates the network (lstmed.lstmed) that the checkpoint is loaded into.
lstmed = LSTMED(num_epochs=50, sequence_length=90)
lstmed.fit(x_train)

# NOTE(review): the checkpoint replaces the weights trained just above, while
# the error mean/covariance estimated inside fit() still come from the freshly
# trained weights -- confirm this combination is intended.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
state = torch.load('model2.pth')
lstmed.lstmed.load_state_dict(state['model_state_dict'])

# Disabled: manual windowing of the targets (window length 30).
# target0 = [x_train[i:i + 30] for i in range(0,x_train.shape[0] - 30 + 1,30)]
# target0=np.stack(target0,axis=0)
# target1 = [x_test1[i:i + 30] for i in range(0,x_test1.shape[0] - 30 + 1,30)]
# target1=np.stack(target1,axis=0)
# target2 = [x_test2[i:i + 30] for i in range(0,x_test2.shape[0] - 30 + 1,30)]
# target2=np.stack(target2,axis=0)

sc0, output0 = lstmed.predict(x_train)
sc1, output1 = lstmed.predict(x_test1)
sc2, output2 = lstmed.predict(x_test2)

# Flatten the window axis: one row per time step, one score per time step.
# The feature count is read from the output instead of being hard-coded.
output0 = output0.reshape((-1, output0.shape[-1]))
sc0 = sc0.reshape((-1))
output1 = output1.reshape((-1, output1.shape[-1]))
sc1 = sc1.reshape((-1))
output2 = output2.reshape((-1, output2.shape[-1]))
sc2 = sc2.reshape((-1))

# Mean + one standard deviation of the scores of each data set.
print(np.mean(sc0) + np.std(sc0), np.mean(sc1) + np.std(sc1), np.mean(sc2) + np.std(sc2))

# Disabled: count the scores above a fixed threshold.
# idx=np.where(sc1>2225029)[0]
# len(idx)
# idx=np.where(sc2>2225029)[0]
# len(idx)

# Anomaly scores of the training set and the first test set in one figure.
fig = plt.figure(figsize=(20, 3))
plt.plot(sc0, '.', label='sc0', alpha=0.1)
plt.plot(sc1, '.', label='sc1', alpha=0.1)
plt.grid()
# The series are labelled, so draw the legend (it was missing on this figure).
plt.legend()

# Anomaly scores of the second test set in a separate figure.
fig = plt.figure(figsize=(20, 3))
plt.plot(sc2, '.', label='sc2', alpha=0.7)
plt.grid()
plt.legend()
# plt.savefig('images/sjs13_2.png', bbox_inches='tight', pad_inches=0)
# A single show() displays both figures created above.
plt.show()

def _plot_reconstruction(target, output, out_dir, titled=False):
    """Plot each feature of ``target`` against ``output`` and save one PNG per feature.

    Args:
        target: 2-D array of scaled input values (rows = time steps).
        output: 2-D array of reconstructions; its column count decides how
            many figures are drawn.
        out_dir: Path prefix (ending in '/') the '<feature index>.png' files
            are written to.
        titled: When true, put the feature index in the title of the
            displayed figure.
    """
    for i in range(output.shape[1]):
        fig = plt.figure(figsize=(20, 3))
        plt.plot(target[:, i], label='target1', alpha=0.8)
        plt.plot(output[:, i], label='output1', alpha=0.8)
        plt.grid()
        plt.legend()
        # 'bbox_inches' was misspelled as 'bbox_inces', so the tight bounding
        # box was never applied (recent matplotlib rejects the unknown keyword).
        plt.savefig(out_dir + str(i) + '.png', bbox_inches='tight', pad_inches=0)
        if titled:
            # Set after savefig, as before: the title is shown but not saved.
            plt.title(str(i))
        plt.show()


# Training set, first test set, second test set.
_plot_reconstruction(x_train, output0, 'images/results/')
_plot_reconstruction(x_test1, output1, 'images/results/val/')
_plot_reconstruction(x_test2, output2, 'images/results/test/', titled=True)

# predict() only reconstructs complete, consecutive windows starting at row 0,
# so output2 can be shorter than x_test2; compare just the rows both cover.
n_eval = min(len(x_test2), len(output2))

# Undo the standardization to report errors in the original units.
y_t = (x_test2[:n_eval] * std_train) + m_train
y_p = (output2[:n_eval] * std_train) + m_train
for i in range(y_p.shape[1]):
    # Mean absolute percentage error of feature i (inf/nan where y_t is 0).
    mape = np.mean(np.abs((y_t[:, i] - y_p[:, i]) / y_t[:, i])) * 100
    print(i, mape)

# NOTE(review): this relative error is computed on the standardized values,
# which are centred near zero, so the ratios can become very large -- confirm
# this is the intended quantity.
fig = plt.figure(figsize=(20, 3))
for i in range(y_p.shape[1]):
    plt.plot(np.abs((x_test2[:n_eval, i] - output2[:n_eval, i]) / x_test2[:n_eval, i]),
             '.', label=str(i), alpha=0.8)
plt.grid()
plt.legend()
# plt.savefig('images/sjs13_2.png', bbox_inches='tight', pad_inches=0)
plt.show()