# Required Imports

In [1]:
import sys, random, math, pickle
from time import time
import numpy as np
import gc
import matplotlib
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import MSELoss
import torch.nn.functional as F
from datetime import timedelta
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import pytorch_lightning as pl
import torchmetrics.functional as FM
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.plugins import DDPPlugin
from pytorch_lightning.callbacks import ModelCheckpoint
sys.path.append('../DG/gan')

# Loading Real Train Data

In [2]:
def get_one_class(X,Y_labels,flag,class_label):
    indices_class_label = np.where(Y_labels==class_label)
    return X[indices_class_label], Y_labels[indices_class_label], flag[indices_class_label] 
    
def get_n_samples(X,Y_labels,flag,n_samples):
    randomList = random.sample(range(0, Y_labels.shape[0]), n_samples)
    return X[randomList], Y_labels[randomList], flag[randomList]

# In real data, if flag sum is 1 --> Then no timestep at all. 
            # So we do remove those ones by converting them to zeros, then return only non-zero flags indices
# In real data, there is no flag of length ZERO
def remove_zero_datapoints(X,Y_labels,flag):
    indices_non_zero = torch.nonzero(torch.sum(flag,1)-1).squeeze()
    return X[indices_non_zero], Y_labels[indices_non_zero], flag[indices_non_zero]


In [3]:
training_real = np.load('../data/google/data_train_reduced.npz')

real_train_X = torch.from_numpy(training_real['data_feature']).float() #[50000, 2500, 9]
real_train_Y = torch.from_numpy(training_real['data_attribute']) #[50000,4]
real_train_Y_labels = torch.argmax(real_train_Y,1) #[50000,]  returns a list of the class label, no one hot encoding any more
real_train_flags = torch.from_numpy(training_real['data_gen_flag'])   # (50000, 2500)

real_train_X,real_train_Y_labels,real_train_flags = remove_zero_datapoints(real_train_X,real_train_Y_labels,real_train_flags)

real_train_lengths = torch.sum(real_train_flags,1).long()

real_train_masks = real_train_flags == 0

In [4]:
val_real = np.load('../data/google/data_train_val.npz')

real_val_X = torch.from_numpy(val_real['data_feature']).float() #[50000, 2500, 9]
real_val_Y = torch.from_numpy(val_real['data_attribute']) #[50000,4]
real_val_Y_labels = torch.argmax(real_val_Y,1) #[50000,]  returns a list of the class label, no one hot encoding any more
real_val_flags = torch.from_numpy(val_real['data_gen_flag'])   # (50000, 2500)

real_val_X,real_val_Y_labels,real_val_flags = remove_zero_datapoints(real_val_X,real_val_Y_labels,real_val_flags)

real_val_lengths = torch.sum(real_val_flags,1).long()

real_val_masks = real_val_flags == 0

In [5]:
test_real = np.load('../data/google/data_test_reduced.npz')

real_test_X = torch.from_numpy(test_real['data_feature']).float() #[50000, 2500, 9]
real_test_Y = torch.from_numpy(test_real['data_attribute']) #[50000,4]
real_test_Y_labels = torch.argmax(real_test_Y,1) #[50000,]  returns a list of the class label, no one hot encoding any more
real_test_flags = torch.from_numpy(test_real['data_gen_flag'])   # (50000, 2500)

real_test_X,real_test_Y_labels,real_test_flags = remove_zero_datapoints(real_test_X,real_test_Y_labels,real_test_flags)

real_test_lengths = torch.sum(real_test_flags,1).long()

real_test_masks = real_test_flags == 0

# Classification

In [6]:
real_train_timesteps_X = [] # Contains many timesteps each of 9 features [# of samples is 285,352]
real_train_timesteps_Binary = [] # Classify each timestep as end or real data [0: real, 1: end]
for sample, length in zip(real_train_X,real_train_lengths):
    for i in range(length):
        if i==length -1 :
            real_train_timesteps_X.append(sample[i])
            real_train_timesteps_Binary.append(1)
        elif i==length -2:
            real_train_timesteps_X.append(sample[i])
            real_train_timesteps_Binary.append(0)
            
real_train_timesteps_X = torch.stack(real_train_timesteps_X)
real_train_timesteps_Binary = torch.Tensor(real_train_timesteps_Binary).float()

In [7]:
real_train_timesteps_X.shape

torch.Size([48924, 9])

In [20]:
real_val_timesteps_X = [] # Contains many timesteps each of 9 features [# of samples is 285,352]
real_val_timesteps_Binary = [] # Classify each timestep as end or real data [0: real, 1: end]
for sample, length in zip(real_val_X,real_val_lengths):
    for i in range(length):
        if i==length -1 :
            real_val_timesteps_X.append(sample[i])
            real_val_timesteps_Binary.append(1)
        elif i==length -2:
            real_val_timesteps_X.append(sample[i])
            real_val_timesteps_Binary.append(0)
            
real_val_timesteps_X = torch.stack(real_val_timesteps_X)
real_val_timesteps_Binary = torch.Tensor(real_val_timesteps_Binary).float()

In [21]:
real_test_timesteps_X = [] # Contains many timesteps each of 9 features [# of samples is 285,352]
real_test_timesteps_Binary = [] # Classify each timestep as end or real data [0: real, 1: end]
for sample, length in zip(real_test_X,real_test_lengths):
    for i in range(length):
        if i==length -1 :
            real_test_timesteps_X.append(sample[i])
            real_test_timesteps_Binary.append(1)
        elif i==length -2:
            real_test_timesteps_X.append(sample[i])
            real_test_timesteps_Binary.append(0)
            
real_test_timesteps_X = torch.stack(real_test_timesteps_X)
real_test_timesteps_Binary = torch.Tensor(real_test_timesteps_Binary).float()

In [29]:
class MLPModel(pl.LightningModule):
    def __init__(self,n_features=9,n_hidden=128,n_output=1):
        super().__init__()
        self.model = nn.Sequential(
                  nn.Linear(n_features,n_hidden),
                  nn.ReLU(),
                  nn.Linear(n_hidden,n_hidden),
                  nn.ReLU(),
                  nn.Linear(n_hidden,n_output),
                  nn.Sigmoid()
                )
    def forward(self,x):
        return self.model(x)
    
    def training_step(self, batch, batch_idx):
        x, class_label = batch
        Y_predicted = self(x)
        loss = nn.BCELoss()(Y_predicted.squeeze(),class_label) #BCELoss doesn't have Softmax by default
        acc = FM.accuracy(Y_predicted.squeeze(),class_label.int())
        self.log('accuracy',acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss,'acc':acc}
    
    def test_step(self,batch,batch_idx):
        x, class_label = batch
        Y_predicted = self(x)
        loss = nn.BCELoss()(Y_predicted.squeeze(),class_label) #BCELoss doesn't have Softmax by default
        acc = FM.accuracy(Y_predicted.squeeze(),class_label.int())
        self.log('accuracy',acc, on_step=True , prog_bar=True, logger=True)
        return {'loss': loss}
    
    def validation_step(self,batch,batch_idx):
        x, class_label = batch
        Y_predicted = self(x)
        loss = nn.BCELoss()(Y_predicted.squeeze(),class_label) #BCELoss doesn't have Softmax by default
        acc = FM.accuracy(Y_predicted.squeeze(),class_label.int())
        self.log('accuracy',acc, on_step=True , prog_bar=True, logger=True)
        self.log('val_loss',loss)
        return {'val_loss': loss}
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

In [30]:
if __name__ == '__main__':
    params_dataloader = {'shuffle': True,'num_workers':8 ,'batch_size':256} # No need to shuffle rn, they are all the same class
    dataset = torch.utils.data.TensorDataset(real_train_timesteps_X, real_train_timesteps_Binary)
    train_dataloader  = torch.utils.data.DataLoader(dataset, **params_dataloader)

    params_dataloader = {'shuffle': False,'num_workers':8,'batch_size':256} # No need to shuffle rn, they are all the same class
    dataset = torch.utils.data.TensorDataset(real_val_timesteps_X, real_val_timesteps_Binary)
    val_dataloader  = torch.utils.data.DataLoader(dataset, **params_dataloader)
    
    params_dataloader = {'shuffle': False,'num_workers':8,'batch_size':256} # No need to shuffle rn, they are all the same class
    dataset = torch.utils.data.TensorDataset(real_test_timesteps_X, real_test_timesteps_Binary)
    test_dataloader  = torch.utils.data.DataLoader(dataset, **params_dataloader)
    mlp_model = MLPModel()

early_stop_callback = EarlyStopping(monitor='val_loss', mode='min')

# ck = torch.load('lightning_logs/version_56/checkpoints/epoch=52-step=59094.ckpt')['state_dict']
# mlp_model.load_state_dict(ck)

trainer = pl.Trainer(gpus=1,max_epochs=100,progress_bar_refresh_rate=1)
trainer.fit(mlp_model,train_dataloader)
trainer.test(test_dataloaders=test_dataloader) # No need to repass (model), It will by itself work from test_step
print("DONE")

# The path for it is Token\lightning_logs\version_56\checkpoints\epoch=52-step=59094.ckpt --> Test Accuracy is  0.8119

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 17.9 K
-------------------------------------
17.9 K    Trainable params
0         Non-trainable params
17.9 K    Total params
0.072     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'accuracy': 0.8571428656578064, 'accuracy_epoch': 0.94134122133255}
--------------------------------------------------------------------------------
DONE


# Dataset and DataLoader

In [None]:
B = real_train_X.size(0)
S = real_train_X.size(1)
E = real_train_X.size(2)

# 1- Shift the targets
Input_shifted = real_train_X[:,1:]
Zero_at_the_end = torch.zeros((B,1,E))
targets = torch.cat((Input_shifted,Zero_at_the_end),1) # real_train_X shifted to the left one timestep

targets=  targets[:,:400]
real_train_masks = real_train_masks[:,:400]
real_train_X = real_train_X[:,:400]

S = real_train_X.size(1)

params_dataloader = {'shuffle': True,'num_workers':8 ,'batch_size':128} # No need to shuffle rn, they are all the same class
# "num_workers" is how many subprocesses to use for data loading.
dataset = torch.utils.data.TensorDataset(real_train_X, targets, real_train_auxiliary.long(), real_train_masks)
train_dataloader  = torch.utils.data.DataLoader(dataset, **params_dataloader)

In [None]:
# Validation Dataset and DataLoader 

B = real_val_X.size(0)
S = real_val_X.size(1)
E = real_val_X.size(2)

Input_shifted = real_val_X[:,1:]
Zero_at_the_end = torch.zeros((B,1,E))
targets = torch.cat((Input_shifted,Zero_at_the_end),1) # real_train_X shifted to the left one timestep

targets=  targets[:,:400]
real_val_masks = real_val_masks[:,:400]
real_val_X = real_val_X[:,:400]

S = real_val_X.size(1)

params_dataloader = {'shuffle': False,'num_workers':8 ,'batch_size':128} # No need to shuffle rn, they are all the same class
dataset = torch.utils.data.TensorDataset(real_val_X, targets, real_val_auxiliary, real_val_masks)
val_dataloader  = torch.utils.data.DataLoader(dataset, **params_dataloader)

# TST

In [None]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
class TimeSeriesTransformer(pl.LightningModule):

    def __init__(self, n_features=9, n_auxiliary=3, d_model=256, n_heads=8, n_hidden=256, n_layers=8, dropout=0.0, S=400):
        super().__init__()
        self.model_type = 'Time Series Transformer Model'
        self.InputLinear = nn.Linear(n_features, d_model)
        
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, n_heads, n_hidden, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers)
        
        self.d_model = d_model
        self.n_features = n_features
        
        self.OutputLinear = nn.Linear(d_model, n_features) # The output of the encoder is similar to the input of the encoder, both are (B,S,d_model)
        self.AuxiliaryLinear = nn.Linear(n_features, n_auxiliary) 
        self.init_weights()
        self.activation = nn.Sigmoid()
        
    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float(-1e6)).masked_fill(mask == 1, float(0.0))
        return mask 

    def init_weights(self):
        initrange = 0.1
        self.InputLinear.weight.data.uniform_(-initrange, initrange)
        self.OutputLinear.bias.data.zero_()
        self.OutputLinear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask,padding_mask):
        src = self.InputLinear(src) * math.sqrt(self.d_model)
        src = self.positional_encoding(src)
        output = self.transformer_encoder(src, src_mask,padding_mask)
        output = self.OutputLinear(output)
        output1 = self.activation(output) # output[...,:9] --> Actual 9 values
        output2 = self.AuxiliaryLinear(output) #aux
        return output1,output2
    
    def training_step(self, batch, batch_idx):

        X,target,aux_real,padding_mask = batch
        src_mask = self.generate_square_subsequent_mask(S).cuda()
        X = X.permute(1,0,2)
        Y_predicted, aux_predicted = self(X,src_mask,padding_mask)
        Y_predicted = Y_predicted.permute(1,0,2)
        aux_predicted =  aux_predicted.permute(1,0,2)
        
        mse = nn.MSELoss(reduction='none')
        diff2 = mse(Y_predicted, target).flatten()
        flags = (~padding_mask).unsqueeze(2).expand(-1,-1,9).float().flatten()
        mse_loss = diff2*flags
        mse_loss = torch.sum(diff2) / torch.sum(flags)
        # In PyTorch Transformer, Calculated the CrossEntropy using torch.Size([35, 20, 28782]), View/Reshape -1: torch.Size([700, 28782]), Targets: torch.Size([700])
        aux_loss = nn.CrossEntropyLoss()(aux_predicted.reshape(-1, 3),aux_real.flatten())
        
        loss = mse_loss + aux_loss
        return {'loss': loss,'aux_loss':aux_loss} # will call loss.backward() on what we return exactly. 
    
    def training_epoch_end(self, outputs):
        if((self.current_epoch+1)%100==0):
            torch.save(self.state_dict(), 'W_transformer_aux')
        print("Epoch Loss:",torch.stack([x["loss"] for x in outputs]).mean().item(), "Aux:", torch.stack([x["aux_loss"] for x in outputs]).mean().item())
        
    # Lightning disables gradients, puts model in eval mode, and does everything needed for validation.
#     def validation_step(self, batch, batch_idx):
#         X,target,aux,padding_mask = batch
#         src_mask = self.generate_square_subsequent_mask(S).cuda()
#         X = X.permute(1,0,2)
#         Y_predicted = self(X,src_mask,padding_mask)
#         Y_predicted = Y_predicted.permute(1,0,2)
            
#         mse_loss = nn.MSELoss(reduction='none')
#         diff2 = mse_loss(Y_predicted, target).flatten()
#         flags = (~padding_mask).unsqueeze(2).expand(-1,-1,9).float().flatten()
#         loss = diff2*flags
#         loss = torch.sum(diff2) / torch.sum(flags)
#         self.log('val_loss', loss)
#         return {'val_loss': loss,} # We may return the predictions themselves
    
#     def validation_epoch_end(self, outputs):
#         print("Validation Loss:",torch.stack([x["val_loss"] for x in outputs]).mean().item())
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.0001)
    


# Training

**Notes on EarlyStopping:**
- The EarlyStopping callback runs at the **end of every validation epoch**, which, under the default configuration, happen after **every training epoch**.
-  However, the frequency of validation can be modified by setting various parameters in the Trainer, for example **check_val_every_n_epoch and val_check_interval**.
- Note that the **patience** parameter counts the number of **validation epochs with no improvement**, and **not the number of training epochs**. 
    - Therefore, with parameters **check_val_every_n_epoch=10 and patience=3**, the trainer will perform at least **40 training epochs before being stopped**. 

In [None]:

def main():
    # pl.seed_everything(42, workers=True) --> sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
    time_all = time()

    model = TimeSeriesTransformer()
    early_stop_callback = EarlyStopping(monitor='val_loss',patience=5, verbose=False, mode='min')
    checkpoint_callback = ModelCheckpoint()
#     trainer = pl.Trainer(gpus=2,max_epochs=400, progress_bar_refresh_rate=50,accelerator ='ddp',
#                         callbacks=[early_stop_callback,checkpoint_callback]
#                          ,plugins=DDPPlugin(find_unused_parameters=False,check_val_every_n_epoch=2))
    
    
    trainer = pl.Trainer(gpus=1,max_epochs=400, progress_bar_refresh_rate=50,check_val_every_n_epoch=3,
                        callbacks=[checkpoint_callback],)
    trainer.fit(model,train_dataloader)
    print("Total Time (in minutes) is {}".format( timedelta(seconds=(time()-time_all))))
    print(checkpoint_callback.best_model_path)

if __name__ == '__main__':
    main()

In [79]:
gc.collect()

Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7f39d9b19438>>
Traceback (most recent call last):
  File "/rhome/yelnady/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "/rhome/yelnady/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1297, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/rdata/yelnady/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/process.py", line 124, in join
    res = self._popen.wait(timeout)
  File "/rdata/yelnady/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/popen_fork.py", line 47, in wait
    if not wait([self.sentinel], timeout):
  File "/rdata/yelnady/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/rdata/yelnady/anaconda3/envs/pyt

5057