In [64]:
import numpy as np
# import optuna
import mlflow
import torch
from mlflow import pytorch
from argparse import Namespace
from torch.utils.data import Dataset, DataLoader
from pprint import pformat
from pathlib import Path
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn

import os

import logging
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

from trainer import Trainer
from encoder_decoder import EncoderDecoderWrapper

import random
import os
from config import exp_dict_all
import pandas as pd


import matplotlib.pyplot as plt
import pandas as pd

In [65]:
SEGMENT = "segment_1445_1455"

In [66]:
params = Namespace(batch_size=50,
   in_channels = 2*(len(exp_dict_all[SEGMENT]['inputs']['phases'])), out_channels = 1, sequence_len = 20,rnn_hid_size = 50, output_size=2, teacher_forcing=0.3,
    lr=1e-4,
    num_epochs=1000,
    patience=10,
TIME_SLICE_NAME = 'exemplarid',
store_path = '/blue/ranka/yashaswikarnati/interruption/leakage_modelling/train_data/',
                   processed_run_name = 'run_2022_09_test_set',  data_params = {'inp_agg_level':4,
                          'oup_agg_level':20,
                      'oup_window_use': (0,40)}, segment_name = SEGMENT
)


logging.info(f"{torch.cuda.is_available()}, {torch.cuda.get_device_name(0)}")
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')

EXP_NAME = SEGMENT
# mlflow.set_experiment(experiment_name=EXP_NAME)
LOAD_PRETRAINED = True
pretrained_epoch = 998

2022-11-08 12:50:31 INFO     True, NVIDIA A100-SXM4-80GB


In [67]:
# testing

model =  EncoderDecoderWrapper(in_channels = params.in_channels, out_channels = params.out_channels,
                                    sequence_len = params.sequence_len,rnn_hid_size = params.rnn_hid_size, device = device, spatial_inp_size= 4 + int(params.in_channels/2),output_size=params.output_size,
                                    teacher_forcing=params.teacher_forcing,learning_rate = params.lr)
if(LOAD_PRETRAINED):
    print(f"loading pretrained model {EXP_NAME}_epoch_{pretrained_epoch}.pth")
    model.load_state_dict(torch.load(f'pthfiles/{EXP_NAME}_epoch_{pretrained_epoch}.pth'))

trainer_obj =  Trainer(
    model  = model,
    device = device,
    exp_name = EXP_NAME,
    exp = 'segment_leak',
    loss_fn = torch.nn.MSELoss()
)

loading pretrained model segment_1445_1455_epoch_998.pth


In [68]:
import torch
import os
from torch.utils.data import Dataset, DataLoader
from numpy.random import Generator, PCG64
import uuid
from functools import partial
import pickle
import numpy as np

import random


def loop_inplace_sum(arrlist):
    # assumes len(arrlist) > 0
    sum = arrlist[0].copy()
    for a in arrlist[1:]:
        sum += a
    return sum

def return_aggregated_x(X_batch, j):
    return X_batch.reshape(-1,j).sum(axis=1).reshape(X_batch.shape[0],-1)


class LeakageDataset(Dataset):
    def __init__(self, store_path,processed_run_name,segment_name,  data_params):
        
        self.store_path = store_path
        self.processed_data_path = os.path.join(self.store_path, processed_run_name,segment_name)
        self.samples_name = 'exemplar_'
        self.all_files = os.listdir(self.processed_data_path)
        
        
#         all processing constants
        self.inp_agg_level = data_params['inp_agg_level']
        self.oup_agg_level = data_params['oup_agg_level']
        self.oup_window_use = data_params['oup_window_use']
        
        
        
        
    def __len__(self):

        return len(os.listdir(self.processed_data_path))
    
    
    @property
    def raw_file_names(self):
        return os.listdir(self.processed_data_path)

    @property
    def processed_file_names(self):
        return os.listdir(self.processed_data_path)
    
    
    def __getitem__(self, idx):
        random_choice = random.choice(self.all_files)
        data = torch.load(os.path.join(self.processed_data_path,random_choice ))        
        x,sig,y,params, timestamp = self.return_isc_x_y(data)
        
        time_split_arr = [timestamp.year, timestamp.month, timestamp.day,timestamp.hour, timestamp.minute, timestamp.second ]
        return {'x':x.transpose(1,0),'sig':sig.transpose(1,0),'y':y.reshape(-1,1) ,'params':params, 'timestamp':time_split_arr}
    
    
    
    def return_isc_x_y(self, data):
        inp_arr = [return_aggregated_x(x.reshape(1,-1),self.inp_agg_level ) for x in data['inp'] ]
        oup_inp =return_aggregated_x(data['oup'][self.oup_window_use[0]:self.oup_window_use[1]].reshape(1,-1),self.oup_agg_level)
        oup_arr = return_aggregated_x(data['oup'][self.oup_window_use[1]:].reshape(1,-1),self.oup_agg_level)
        sig_timing = [return_aggregated_x(np.array(data['sig'][k]).reshape(1,-1),self.inp_agg_level ) for k in data['sig']]
        tod_dow= np.array([data['hour'], data['day_of_week']])
        
        timestamp = data['timestamp']
        
        x = np.concatenate(inp_arr,axis=0)
        sig = np.concatenate(sig_timing,axis=0)
        y  = oup_arr.reshape(-1)
        params = np.concatenate([oup_inp.reshape(-1),tod_dow ],axis=0)
        
        
        return x,sig,y,params, timestamp
    
    
    
    def get_each_batch(self, batch_size):
        no_files = len(self.all_files)
        
        batch_count = 0
        
        while(batch_count<no_files):
            batch_x, batch_sig, batch_y, batch_params, batch_ts, batch_time_split = [],[],[],[],[],[]
            for jj in range(batch_size):
                if(batch_count>=no_files):
                    break
                curr_data =  torch.load(os.path.join(self.processed_data_path,self.all_files[batch_count] ))
                x,sig,y,params, timestamp = self.return_isc_x_y(curr_data)
                ts = np.array([timestamp])
                time_split_arr = np.array([timestamp.year, timestamp.month, timestamp.day,timestamp.hour, timestamp.minute, timestamp.second ])
                batch_count +=1
                batch_x.append(x.transpose(1,0)[np.newaxis])
                batch_sig.append(sig.transpose(1,0)[np.newaxis])
                batch_y.append(y.reshape(-1,1)[np.newaxis])
                batch_params.append(params[np.newaxis])
                batch_ts.append(ts[np.newaxis])
                batch_time_split.append(time_split_arr[np.newaxis])
                
                
            batch_obj = {}
            batch_obj['x'] = torch.Tensor(np.concatenate(batch_x,axis=0))
            batch_obj['sig'] = torch.Tensor(np.concatenate(batch_sig,axis=0))
            batch_obj['y'] = torch.Tensor(np.concatenate(batch_y,axis=0))
            batch_obj['params'] = torch.Tensor(np.concatenate(batch_params,axis=0))
            batch_obj['timestamp'] = torch.Tensor(np.concatenate(batch_time_split,axis=0))
            
#             break
            
            yield batch_obj

            
    


        
    
    
    
    
            
            
            

In [69]:
dataset = LeakageDataset(params.store_path,params.processed_run_name, params.segment_name, params.data_params)

In [70]:
data_gen_obj = dataset.get_each_batch(50)

In [71]:
trainer_obj.model.eval()
all_percent_errors = []
all_actual = []
all_pred = []

all_inputs_major = []
all_inputs_all = []

timestamps_full =[]


with torch.no_grad():
    for data in data_gen_obj:
        output = model(data)
        actual =  data['y'].detach().numpy()[:,:,0] # 200 sec window (100 sec agg)
        pred = output.cpu().detach().numpy()[:,:,0]
        
        inp_major  = np.sum((data['x'][:,:,0].detach().numpy()),  axis = 1) # 400 sec total (20 sec agg)
        
        inp_all  = np.sum((data['x'][:,:,:].detach().numpy()),  axis = 1)
        
        
        ts_full = data['timestamp'].detach().numpy()
        ts_full= np.array([f"{int(x[0])}-{int(x[1])}-{int(x[2])} {int(x[3])}:{int(x[4])}:{int(x[5])}" for x in ts_full])
        
        ts_full = np.array([pd.to_datetime(str(x)) +  pd.Timedelta(seconds=220) for x in ts_full])
        
        
        all_actual.extend(list(actual[:,0]))
        all_pred.extend(list(pred[:,0]))
        
        
        all_inputs_major.extend(list(inp_major))
        all_inputs_all.extend(list(inp_all))
        timestamps_full.extend(ts_full)
        
        curr_loss = trainer_obj.criterion(output, data['y'].float().to(trainer_obj.device)).item()
        print(f"curr_loss {curr_loss}")
        
        
        
        percent_error = ( (pred[:,0]-actual[:,0])/(pred[:,0]))*100
        print(f"percent error {np.mean(percent_error)}")
        all_percent_errors.extend(list(percent_error.flatten()))

curr_loss 26.71368408203125
percent error -0.14415332674980164
curr_loss 70.19075012207031
percent error -1.2265887260437012
curr_loss 19.567424774169922
percent error -0.8521702289581299
curr_loss 58.34610366821289
percent error -1.114267110824585
curr_loss 14.929593086242676
percent error -1.9251381158828735
curr_loss 63.4195671081543
percent error -4.047396183013916
curr_loss 154.33099365234375
percent error 1.2969485521316528
curr_loss 12.841854095458984
percent error -1.5857937335968018
curr_loss 22.21442413330078
percent error -0.022514337673783302
curr_loss 16.35689353942871
percent error -0.9571629166603088
curr_loss 16.057907104492188
percent error -1.0417178869247437
curr_loss 27.89375114440918
percent error -0.7651650905609131
curr_loss 40.37902069091797
percent error -4.919216156005859
curr_loss 25.409669876098633
percent error 0.4578074514865875
curr_loss 56.7137336730957
percent error -12.645630836486816
curr_loss 38.37605667114258
percent error -0.28398597240448
curr_los

curr_loss 16.283004760742188
percent error -2.7584381103515625
curr_loss 32.74290466308594
percent error 0.7679675221443176
curr_loss 23.41133689880371
percent error -5.8149333000183105
curr_loss 24.949499130249023
percent error 1.272316336631775
curr_loss 21.12275505065918
percent error -1.3426191806793213
curr_loss 28.02214241027832
percent error 0.6050848960876465
curr_loss 25.09117317199707
percent error -1.45805025100708
curr_loss 19.177404403686523
percent error 0.19856154918670654
curr_loss 23.6358642578125
percent error -0.5651656985282898
curr_loss 16.939342498779297
percent error 1.567000150680542
curr_loss 27.572084426879883
percent error 1.7647842168807983
curr_loss 17.468969345092773
percent error 2.3235576152801514
curr_loss 13.162748336791992
percent error 1.6382559537887573
curr_loss 47.01606369018555
percent error -4.591358661651611
curr_loss 45.58131790161133
percent error -2.914287805557251
curr_loss 59.76837158203125
percent error -0.5901241302490234
curr_loss 29.35

In [72]:
error_df = pd.DataFrame()
error_df['actual'] = all_actual
error_df['predicted'] = all_pred
error_df['input_major'] = all_inputs_major
error_df['inputs_all'] = all_inputs_all
error_df['percent_error'] = all_percent_errors

error_df['diff_major'] =  error_df['input_major'] - error_df['actual']
error_df['diff_all_inp'] =  error_df['inputs_all'] - error_df['actual']

error_df['segment'] = SEGMENT
error_df['timestamp'] = timestamps_full
error_df.sort_values(by =['timestamp'], inplace = True)
error_df.to_csv(f'res_store/error_test_set_2022_09_{SEGMENT}.csv')

In [61]:
error_df

Unnamed: 0,actual,predicted,input_major,inputs_all,percent_error,diff_major,diff_all_inp,segment,timestamp
4658,29.0,35.219791,234.0,"[234.0, 4.0, 24.0]",17.659933,205.0,"[205.0, -25.0, -5.0]",segment_1455_1460,2022-09-01 07:03:40
1250,94.0,100.505524,196.0,"[196.0, 5.0, 23.0]",6.472802,102.0,"[102.0, -89.0, -71.0]",segment_1455_1460,2022-09-01 07:05:20
4473,24.0,25.719507,203.0,"[203.0, 6.0, 20.0]",6.685615,179.0,"[179.0, -18.0, -4.0]",segment_1455_1460,2022-09-01 07:07:00
8945,68.0,66.243141,235.0,"[235.0, 3.0, 17.0]",-2.652137,167.0,"[167.0, -65.0, -51.0]",segment_1455_1460,2022-09-01 07:08:40
7774,74.0,75.982353,168.0,"[168.0, 4.0, 29.0]",2.608965,94.0,"[94.0, -70.0, -45.0]",segment_1455_1460,2022-09-01 07:10:20
...,...,...,...,...,...,...,...,...,...
6502,38.0,41.355629,86.0,"[86.0, 0.0, 6.0]",8.114080,48.0,"[48.0, -38.0, -32.0]",segment_1455_1460,2022-09-30 20:49:30
2378,11.0,10.472858,100.0,"[100.0, 1.0, 5.0]",-5.033407,89.0,"[89.0, -10.0, -6.0]",segment_1455_1460,2022-09-30 20:51:10
3358,26.0,27.967808,106.0,"[106.0, 2.0, 5.0]",7.035975,80.0,"[80.0, -24.0, -21.0]",segment_1455_1460,2022-09-30 20:52:50
4121,31.0,33.217133,80.0,"[80.0, 3.0, 6.0]",6.674666,49.0,"[49.0, -28.0, -25.0]",segment_1455_1460,2022-09-30 20:54:30


In [41]:
# trainer_obj.model.eval()
# # val_set_size = len(val_loader.dataset)
# val_loss = 0
# correct = 0
# all_percent_errors = []
# all_actual = []
# all_pred = []
# all_times_of_day = []
# all_days_of_week =[]
# all_inputs_major = []
# all_inputs_all = []

# timestamps_full =[]

# with torch.no_grad():
#     for data in data_gen_obj:
#         output = model(data)
#         actual =  data['y'].detach().numpy()[:,:,0].flatten()
#         pred = output.cpu().detach().numpy()[:,:,0].flatten()
        
#         time_of_day = data['params'][:,2].detach().numpy()
#         time_of_day = np.concatenate([time_of_day[:,np.newaxis], time_of_day[:,np.newaxis]], axis =1).flatten()
#         day_of_week = data['params'][:,3].detach().numpy()
#         day_of_week = np.concatenate([day_of_week[:,np.newaxis], day_of_week[:,np.newaxis]], axis =1).flatten()
# #         day_of_week = np.concatenate([day_of_week[:,np.newaxis], day_of_week[:,np.newaxis]], axis =1).flatten()
        
#         inp_major_1,inp_major_2  = np.sum((data['x'][:,10:15,0].detach().numpy()),  axis = 1), np.sum((data['x'][:,15:,0].detach().numpy()),  axis = 1)
#         inp_major = np.concatenate([inp_major_1[:,np.newaxis], inp_major_2[:,np.newaxis]], axis = 1).flatten()
#         inp_all_1,inp_all_2  = np.sum((data['x'][:,10:15,:].detach().numpy()),  axis = (1,2)), np.sum((data['x'][:,15:,:].detach().numpy()),  axis = (1,2))
#         inp_all = np.concatenate([inp_all_1[:,np.newaxis], inp_all_2[:,np.newaxis]], axis = 1).flatten()
        
#         ts_full = data['timestamp'].detach().numpy()
#         ts_full= np.array([f"{int(x[0])}-{int(x[1])}-{int(x[2])} {int(x[3])}:{int(x[4])}:{int(x[5])}" for x in ts_full])
#         ts_full = np.concatenate([ts_full[:,np.newaxis], ts_full[:,np.newaxis]], axis =1).flatten()
        
        
#         non_zero_indices = np.where(actual>0)[0]
#         actual = actual[non_zero_indices]
#         pred = pred[non_zero_indices]
#         time_of_day = time_of_day[non_zero_indices]
#         day_of_week = day_of_week[non_zero_indices]
#         inp_major = inp_major[non_zero_indices]
#         inp_all = inp_all[non_zero_indices]
        
#         ts_full  = ts_full[non_zero_indices]
        
        
        
        
        
#         all_actual.extend(list(actual))
#         all_pred.extend(list(pred))
#         all_times_of_day.extend(list(time_of_day))
#         all_days_of_week.extend(list(day_of_week))
#         all_inputs_major.extend(list(inp_major))
#         all_inputs_all.extend(list(inp_all))
#         timestamps_full.extend(ts_full)
        
#         curr_loss = trainer_obj.criterion(output, data['y'].float().to(trainer_obj.device)).item()
#         print(f"curr_loss {curr_loss}")
#         percent_error = ( np.abs(actual-pred)/(actual))*100
#         print(f"percent error {np.mean(percent_error)}")
#         all_percent_errors.extend(list(percent_error.flatten()))
#         val_loss += curr_loss # sum up batch loss         




In [11]:
error_df = pd.DataFrame()
error_df['actual'] = all_actual
error_df['predicted'] = all_pred
error_df['input_major'] = all_inputs_major
error_df['inputs_all'] = all_inputs_all
error_df['percent_error'] = all_percent_errors
error_df['time_of_day'] = all_times_of_day
error_df['day_of_week'] = all_days_of_week
error_df['diff_major'] =  error_df['input_major'] - error_df['actual']
error_df['diff_all_inp'] =  error_df['inputs_all'] - error_df['actual']
dayOfWeek={0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
error_df['weekday'] = error_df['day_of_week'].map(dayOfWeek)
error_df['segment'] = SEGMENT
error_df['timestamp'] = timestamps_full
# error_df.to_csv(f'res_store/error_all_05_08_{SEGMENT}.csv')

In [15]:
error_df['timestamp'] = error_df['timestamp'].apply(lambda x: pd.to_datetime(str(x)))

In [17]:
error_df.sort_values(by =['timestamp'], inplace = True)

In [18]:
error_df

Unnamed: 0,actual,predicted,input_major,inputs_all,percent_error,time_of_day,day_of_week,diff_major,diff_all_inp,weekday,segment,timestamp
9303,29.0,35.129330,26.0,42.0,21.135620,7.0,3.0,-3.0,13.0,Thursday,segment_1455_1460,2022-09-01 07:00:00
9304,94.0,98.696800,94.0,95.0,4.996596,7.0,3.0,0.0,1.0,Thursday,segment_1455_1460,2022-09-01 07:00:00
2494,94.0,100.730125,94.0,95.0,7.159708,7.0,3.0,0.0,1.0,Thursday,segment_1455_1460,2022-09-01 07:01:40
2495,24.0,23.369627,13.0,21.0,2.626554,7.0,3.0,-11.0,-3.0,Thursday,segment_1455_1460,2022-09-01 07:01:40
8933,24.0,25.719507,13.0,21.0,7.164613,7.0,3.0,-11.0,-3.0,Thursday,segment_1455_1460,2022-09-01 07:03:20
...,...,...,...,...,...,...,...,...,...,...,...,...
6704,26.0,27.967808,26.0,27.0,7.568491,20.0,4.0,0.0,1.0,Friday,segment_1455_1460,2022-09-30 20:49:10
8229,31.0,33.217133,30.0,33.0,7.152040,20.0,4.0,-1.0,2.0,Friday,segment_1455_1460,2022-09-30 20:50:50
8230,24.0,20.041443,16.0,19.0,16.493988,20.0,4.0,-8.0,-5.0,Friday,segment_1455_1460,2022-09-30 20:50:50
10240,24.0,27.031683,31.0,32.0,12.632012,20.0,4.0,7.0,8.0,Friday,segment_1455_1460,2022-09-30 20:52:30
