In [1]:
import numpy as np
# import optuna
import mlflow
import torch
from mlflow import pytorch
from argparse import Namespace
from torch.utils.data import Dataset, DataLoader
from pprint import pformat
from pathlib import Path
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn

import os

import logging
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

from trainer import Trainer
from encoder_decoder import EncoderDecoderWrapper

import random
import os
from config import exp_dict_all
import pandas as pd


import matplotlib.pyplot as plt
import pandas as pdp

In [2]:
# Name of the segment this notebook works on. It is used below as the key into
# `exp_dict_all`, as the `segment_name` entry of `params`, and as `EXP_NAME`.
SEGMENT = "segment_1425_1430"

In [24]:
# Hyper-parameters and data-location settings read by the cells below.
params = Namespace(
    batch_size=50,
    # twice the number of phases listed for this segment in `exp_dict_all`
    in_channels=2 * len(exp_dict_all[SEGMENT]['inputs']['phases']),
    out_channels=1,
    sequence_len=20,
    rnn_hid_size=50,
    output_size=2,
    teacher_forcing=0.3,
    lr=1e-4,
    num_epochs=1000,
    patience=10,
    TIME_SLICE_NAME='exemplarid',
    # NOTE(review): absolute, user-specific path; adjust when running elsewhere.
    store_path='/blue/ranka/yashaswikarnati/interruption/leakage_modelling/train_data/',
    processed_run_name='run_2022_05_to_08_ts_yash',
    data_params={
        'inp_agg_level': 4,
        'oup_agg_level': 20,
        'oup_window_use': (0, 40),
    },
    segment_name=SEGMENT,
)


# Only ask for the GPU name when CUDA is actually available:
# torch.cuda.get_device_name(0) fails on a CPU-only machine, which would stop
# this cell before the CPU fallback below is ever reached.
use_cuda = torch.cuda.is_available()
logging.info(f"{use_cuda}, {torch.cuda.get_device_name(0) if use_cuda else 'cpu'}")
device = torch.device('cuda:0' if use_cuda else 'cpu')

EXP_NAME = SEGMENT
# mlflow.set_experiment(experiment_name=EXP_NAME)
LOAD_PRETRAINED = True
pretrained_epoch = 998

2022-11-07 15:41:49 INFO     True, NVIDIA A100-SXM4-80GB


In [6]:
# testing

# Build the encoder-decoder model from the hyper-parameters stored in `params`.
model = EncoderDecoderWrapper(
    in_channels=params.in_channels,
    out_channels=params.out_channels,
    sequence_len=params.sequence_len,
    rnn_hid_size=params.rnn_hid_size,
    device=device,
    spatial_inp_size=4 + int(params.in_channels / 2),
    output_size=params.output_size,
    teacher_forcing=params.teacher_forcing,
    learning_rate=params.lr,
)
if LOAD_PRETRAINED:
    print(f"loading pretrained model {EXP_NAME}_epoch_{pretrained_epoch}.pth")
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a GPU can still be loaded when `device` is the CPU.
    model.load_state_dict(
        torch.load(f'pthfiles/{EXP_NAME}_epoch_{pretrained_epoch}.pth', map_location=device)
    )

# Wrap the model in the project's Trainer, using mean-squared error as the loss.
trainer_obj = Trainer(
    model=model,
    device=device,
    exp_name=EXP_NAME,
    exp='segment_leak',
    loss_fn=torch.nn.MSELoss(),
)

loading pretrained model segment_1425_1430_epoch_998.pth


In [7]:
# Ask the trainer for the training and validation loaders, passing the batch
# size and the data-location settings held in `params`.
train_loader, val_loader = trainer_obj.get_dataloaders(
    params.batch_size,
    params.store_path,
    params.processed_run_name,
    params.data_params,
    params.segment_name,
)

2022-11-07 14:48:28 INFO     TRAIN SET SIZE 8992 VALIDATION SET SIZE 2247


In [8]:
# Pull the first batch out of the validation loader for inspection.
temp_batch = next(iter(val_loader))

In [33]:
import torch
import os
from torch.utils.data import Dataset, DataLoader
from numpy.random import Generator, PCG64
import uuid
from functools import partial
import pickle
import numpy as np

import random


def loop_inplace_sum(arrlist):
    """Add up all arrays in ``arrlist`` element-wise.

    The first array is copied and every following array is accumulated into
    that copy in place, so none of the inputs is modified.

    Args:
        arrlist: non-empty sequence of arrays supporting ``.copy()`` and ``+=``.

    Returns:
        A new array holding the element-wise total.
    """
    # assumes len(arrlist) > 0
    running_total = arrlist[0].copy()
    for term in arrlist[1:]:
        running_total += term
    return running_total

def return_aggregated_x(X_batch, j):
    """Sum every ``j`` consecutive values of each row of ``X_batch``.

    Args:
        X_batch: 2-D array whose total number of elements is divisible by ``j``.
        j: number of consecutive values folded into one output value.

    Returns:
        Array with ``X_batch.shape[0]`` rows holding the group sums.
    """
    row_count = X_batch.shape[0]
    # One row per group of j consecutive values, in flattened order.
    groups = X_batch.reshape(-1, j)
    group_totals = groups.sum(axis=1)
    return group_totals.reshape(row_count, -1)


class LeakageDataset(Dataset):
    """Map-style dataset over the sample files stored for one segment.

    Each file inside ``<store_path>/<processed_run_name>/<segment_name>`` is
    read with ``torch.load`` and must provide the keys ``'inp'``, ``'oup'``,
    ``'sig'``, ``'hour'``, ``'day_of_week'`` and ``'timestamp'``, which are the
    keys read by :meth:`return_isc_x_y`.

    Args:
        store_path: root directory of the stored data.
        processed_run_name: sub-directory of ``store_path`` for the run.
        segment_name: sub-directory of the run holding this segment's files.
        data_params: dict with the keys ``'inp_agg_level'``,
            ``'oup_agg_level'`` and ``'oup_window_use'``.
    """

    def __init__(self, store_path, processed_run_name, segment_name, data_params):
        self.store_path = store_path
        self.processed_data_path = os.path.join(self.store_path, processed_run_name, segment_name)
        self.samples_name = 'exemplar_'
        # Snapshot the listing once and sort it: os.listdir returns names in
        # arbitrary order, and a stable index -> file mapping is what makes
        # __getitem__ reproducible between runs.
        self.all_files = sorted(os.listdir(self.processed_data_path))

        # all processing constants
        self.inp_agg_level = data_params['inp_agg_level']
        self.oup_agg_level = data_params['oup_agg_level']
        self.oup_window_use = data_params['oup_window_use']

    def __len__(self):
        """Number of sample files captured when the dataset was created."""
        # Use the snapshot so the length always agrees with what __getitem__
        # can index, even if the directory changes afterwards.
        return len(self.all_files)

    @property
    def raw_file_names(self):
        """Current listing of the data directory (re-read on every access)."""
        return os.listdir(self.processed_data_path)

    @property
    def processed_file_names(self):
        """Current listing of the data directory (re-read on every access)."""
        return os.listdir(self.processed_data_path)

    def _load_sample(self, file_name):
        """Load one file from the data directory and run it through return_isc_x_y."""
        # NOTE: torch.load relies on pickle; only load files from a trusted source.
        data = torch.load(os.path.join(self.processed_data_path, file_name))
        return self.return_isc_x_y(data)

    def __getitem__(self, idx):
        """Return the sample stored in ``self.all_files[idx]``.

        Returns:
            dict with ``'x'`` and ``'sig'`` (transposed), ``'y'`` as a column,
            ``'params'``, and ``'timestamp'`` as
            ``[year, month, day, hour, minute, second]``.

        Raises:
            IndexError: if ``idx`` is outside the range of known files.
        """
        # Honour the requested index: a map-style dataset has to return the
        # same sample for the same idx, otherwise samplers cannot control
        # which samples are drawn.
        x, sig, y, params, timestamp = self._load_sample(self.all_files[idx])

        time_split_arr = [timestamp.year, timestamp.month, timestamp.day,
                          timestamp.hour, timestamp.minute, timestamp.second]
        return {
            'x': x.transpose(1, 0),
            'sig': sig.transpose(1, 0),
            'y': y.reshape(-1, 1),
            'params': params,
            'timestamp': time_split_arr,
        }

    def return_isc_x_y(self, data):
        """Turn one loaded sample into the arrays used by the model.

        Args:
            data: mapping with the keys listed in the class docstring.

        Returns:
            Tuple ``(x, sig, y, params, timestamp)`` where
            ``x`` stacks every entry of ``data['inp']`` aggregated by
            ``inp_agg_level``; ``sig`` stacks every value of ``data['sig']``
            aggregated the same way; ``y`` is the part of ``data['oup']`` after
            the window end, aggregated by ``oup_agg_level`` and flattened;
            ``params`` is the aggregated part of ``data['oup']`` inside the
            window followed by ``hour`` and ``day_of_week``; ``timestamp`` is
            ``data['timestamp']`` unchanged.
        """
        inp_arr = [return_aggregated_x(x.reshape(1, -1), self.inp_agg_level) for x in data['inp']]
        oup_inp = return_aggregated_x(
            data['oup'][self.oup_window_use[0]:self.oup_window_use[1]].reshape(1, -1),
            self.oup_agg_level,
        )
        oup_arr = return_aggregated_x(
            data['oup'][self.oup_window_use[1]:].reshape(1, -1),
            self.oup_agg_level,
        )
        sig_timing = [
            return_aggregated_x(np.array(data['sig'][k]).reshape(1, -1), self.inp_agg_level)
            for k in data['sig']
        ]
        tod_dow = np.array([data['hour'], data['day_of_week']])

        timestamp = data['timestamp']

        x = np.concatenate(inp_arr, axis=0)
        sig = np.concatenate(sig_timing, axis=0)
        y = oup_arr.reshape(-1)
        params = np.concatenate([oup_inp.reshape(-1), tod_dow], axis=0)

        return x, sig, y, params, timestamp

    def get_each_batch(self, batch_size):
        """Build one batch from the first ``batch_size`` files.

        Despite its name this returns a single batch dict, not an iterator.
        When fewer than ``batch_size`` files exist, the batch holds all of them
        instead of failing with an IndexError.

        Args:
            batch_size: maximum number of samples in the batch (must be > 0).

        Returns:
            dict of ``torch.Tensor`` with the keys ``'x'``, ``'sig'``, ``'y'``,
            ``'params'`` and ``'timestamp'``; the first dimension of every
            tensor indexes the samples.

        Raises:
            ValueError: if ``batch_size`` is not positive or no files exist.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        batch_files = self.all_files[:batch_size]
        if not batch_files:
            raise ValueError(f"no sample files found in {self.processed_data_path}")

        batch_x, batch_sig, batch_y, batch_params, batch_time_split = [], [], [], [], []
        for file_name in batch_files:
            x, sig, y, params, timestamp = self._load_sample(file_name)
            batch_x.append(x)
            batch_sig.append(sig)
            batch_y.append(y)
            batch_params.append(params)
            batch_time_split.append(np.array([
                timestamp.year, timestamp.month, timestamp.day,
                timestamp.hour, timestamp.minute, timestamp.second,
            ]))

        # np.stack adds the leading sample axis.
        return {
            'x': torch.Tensor(np.stack(batch_x)),
            'sig': torch.Tensor(np.stack(batch_sig)),
            'y': torch.Tensor(np.stack(batch_y)),
            'params': torch.Tensor(np.stack(batch_params)),
            'timestamp': torch.Tensor(np.stack(batch_time_split)),
        }

            
    


        
    
    
    
    
            
            
            

In [34]:
# Create the dataset directly from the settings stored in `params`.
dataset = LeakageDataset(
    store_path=params.store_path,
    processed_run_name=params.processed_run_name,
    segment_name=params.segment_name,
    data_params=params.data_params,
)

In [35]:
# Assemble one batch of 50 samples straight from the dataset.
temp_b = dataset.get_each_batch(batch_size=50)

In [36]:
# Show which fields the batch dictionary contains.
temp_b.keys()

dict_keys(['x', 'sig', 'y', 'params', 'timestamp'])

In [45]:
# Show the shape of the batch's 'timestamp' tensor.
temp_b['timestamp'].shape


torch.Size([50, 6])

In [23]:
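# NOTE(review): if this cell is re-enabled, the tuple unpacking below rebinds the
# global name `params`, replacing the Namespace of settings that other cells read.
# curr_data =  torch.load(os.path.join(dataset.processed_data_path,dataset.all_files[0] ))
# x,sig,y,params, timestamp = dataset.return_isc_x_y(curr_data)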


In [19]:
# Load one sample here so the cell does not rely on an `x` left behind by an
# earlier kernel session (on a fresh kernel `x` would be undefined). The
# throwaway names keep the global `params` Namespace from being overwritten.
_sample = torch.load(os.path.join(dataset.processed_data_path, dataset.all_files[0]))
x, _sig, _y, _sample_params, _timestamp = dataset.return_isc_x_y(_sample)
x[np.newaxis].shape

(1, 2, 20)