In [1]:
import math
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import io
import torchtext
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import pytorch_lightning as pl
from pytorch_lightning.trainer.trainer import Trainer
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks.early_stopping import EarlyStopping # The EarlyStopping callback can be used to monitor a validation metric and stop the training when no improvement is observed.
"""
    To enable it:

    Import EarlyStopping callback.

    Log the metric you want to monitor using log() method.

    Init the callback, and set monitor to the logged metric of your choice.

    Pass the EarlyStopping callback to the Trainer callbacks flag.
"""

from pytorch_lightning import seed_everything
# seed_everything(42)

In [2]:
from sklearn.model_selection import train_test_split

class AlarmDataset(Dataset):
    """Non-overlapping next-token windows over one flat token sequence.

    Item ``i`` is the pair ``(x, y)``: ``x`` is the ``i``-th window of
    ``seq_len`` consecutive tokens and ``y`` is the same window shifted one
    position to the right, i.e. the next-token targets for ``x``.

    Args:
        data: Token sequence supporting ``len()`` and slicing.
        seq_len: Number of tokens in every window.
        batch_size: Stored on the instance; the dataset itself does not use it.
    """

    def __init__(self,data,seq_len,batch_size):
        # Each window needs one extra trailing token for its shifted target,
        # hence ``len(data) - 1``. Counting ``len(data) // seq_len`` windows
        # lets the last target come out one token short whenever the data
        # length is an exact multiple of ``seq_len``.
        self.length = max((len(data) - 1) // seq_len, 0)
        self.data = data
        self.seq_len = seq_len
        self.batch_size = batch_size

    def __getitem__(self, index: int):
        """Return the ``(input, target)`` windows for ``index``.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``.
        """
        if not 0 <= index < self.length:
            raise IndexError(f"index {index} is out of range for {self.length} windows")
        # Use the instance's window size, not a module-level ``seq_len``.
        start = index * self.seq_len
        stop = start + self.seq_len
        x = self.data[start:stop]
        y = self.data[start + 1:stop + 1]
        return x,y

    def __len__(self) -> int:
        """Number of complete ``(input, target)`` windows."""
        return self.length

class MyDataModule(pl.LightningDataModule):
    """Data module that turns a file of token sequences into train/val/test loaders.

    All work happens in ``__init__``: the vocabulary is built from the whole
    file, lines longer than 350 whitespace-separated tokens are dropped, the
    remaining lines are split chronologically (no shuffling) into train /
    validation / test, the splits are written to ``./.data/*.tokens`` and each
    split is tokenised into one flat ``torch.long`` tensor wrapped in an
    ``AlarmDataset``.

    Args:
        data_path: Path to a UTF-8 text file with one sequence per line.
        batch_size: Batch size used by every dataloader.
        seq_len: Window length passed to ``AlarmDataset``.
        num_workers: Worker processes per dataloader (defaults to the
            previously hard-coded value of 8).
    """

    def __init__(self, data_path:str, batch_size:int, seq_len:int, num_workers:int=8):
        import os  # local import: only needed to create the output directory

        super().__init__()
        self.batch_size = batch_size
        self.data_path = data_path
        self.seq_len = seq_len
        self.num_workers = num_workers

        self.tokenizer = get_tokenizer('basic_english')
        # Context managers make sure the file handles are closed; the vocabulary
        # iterator is fully consumed inside the ``with`` block.
        with io.open(self.data_path, encoding="utf8") as f:
            self.vocab = build_vocab_from_iterator(map(self.tokenizer, f))

        # Read with the same encoding that was used to build the vocabulary.
        with io.open(self.data_path, encoding="utf8") as f:
            seqs = f.readlines()
        seqs = [seq for seq in seqs if len(seq.split())<=350]

        print(f"total seqs= {len(seqs)}")
        print(seqs[:4])
        # 70 % train; the remaining 30 % is split 70/30 into validation/test.
        train, valid = train_test_split(seqs,test_size=0.30,shuffle=False)
        valid, test = train_test_split(valid,test_size=0.30, shuffle=False)

        # Persist the splits (same files as before) for inspection and reuse.
        os.makedirs("./.data", exist_ok=True)
        self._write_lines("./.data/train.tokens", train)
        self._write_lines("./.data/val.tokens", valid)
        self._write_lines("./.data/test.tokens", test)

        # Tokenise straight from memory: these are exactly the lines just
        # written, so re-opening the files is unnecessary.
        train_data = self.data_process(train)
        val_data = self.data_process(valid)
        test_data = self.data_process(test)

        self.train_dataset = AlarmDataset(train_data, seq_len,self.batch_size)
        self.valid_dataset = AlarmDataset(val_data,seq_len,self.batch_size)
        self.test_dataset = AlarmDataset(test_data, seq_len,self.batch_size)

    @staticmethod
    def _write_lines(path, lines):
        """Write ``lines`` verbatim to ``path`` as UTF-8 text."""
        with io.open(path, "w", encoding="utf8") as f:
            f.writelines(lines)

    def data_process(self, raw_text_iter):
        """Tokenise every line and concatenate the vocabulary ids.

        Args:
            raw_text_iter: Iterable of text lines.

        Returns:
            A 1-D ``torch.long`` tensor holding the ids of all tokens of all
            non-empty lines, in order. Empty when no line yields a token.
        """
        pieces = [
            torch.tensor([self.vocab[token] for token in self.tokenizer(item)],dtype=torch.long)
            for item in raw_text_iter
        ]
        pieces = [t for t in pieces if t.numel() > 0]
        if not pieces:
            # ``torch.cat`` rejects an empty sequence of tensors.
            return torch.empty(0, dtype=torch.long)
        return torch.cat(pieces)

    def prepare_data(self):
        """
            Use this method to do things that might write to disk or that need to be done only from a single GPU in distributed settings.
            e.g., download,tokenize,etc…
        """
        return None

    def setup(self, stage=None):
        """
            There are also data operations you might want to perform on every GPU. Use setup to do things like:
            count number of classes,build vocabulary,perform train/val/test splits,apply transforms (defined explicitly in your datamodule or assigned in init),etc…
        """
        return None

    def _make_loader(self, dataset) -> DataLoader:
        """Build a dataloader with the settings shared by all three splits."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            drop_last=True,
            pin_memory=True,
        )

    def train_dataloader(self) -> DataLoader:
        """Dataloader over the training split (order preserved, last partial batch dropped)."""
        return self._make_loader(self.train_dataset)

    def val_dataloader(self) -> DataLoader:
        """Dataloader over the validation split."""
        return self._make_loader(self.valid_dataset)

    def test_dataloader(self):
        """Dataloader over the test split."""
        return self._make_loader(self.test_dataset)



In [3]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position information to a sequence, then apply dropout.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is precomputed and
    registered as a buffer. Even feature columns hold sines and odd feature
    columns hold cosines of ``position * div_term``.

    Args:
        d_model: Feature dimension of the inputs (even or odd).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd ``d_model`` there is one fewer odd column than even columns,
        # so trim the angles; for even ``d_model`` the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over dim 1.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for the first ``x.size(0)`` positions and apply dropout.

        ``x`` is indexed by position along dim 0 and must have ``d_model`` as
        its last dimension; dim 1 is broadcast.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [4]:
class TransformerModel(pl.LightningModule):
    """Causally-masked Transformer encoder trained for next-token prediction.

    Token ids are embedded, scaled by ``sqrt(ninp)``, given positional
    encodings, passed through a ``torch.nn.TransformerEncoder`` under a
    square subsequent mask, and projected back to vocabulary logits.

    Args:
        ntoken: Vocabulary size (embedding rows and output logits).
        ninp: Embedding / model dimension.
        nhead: Number of attention heads.
        nhid: Feed-forward dimension inside each encoder layer.
        nlayers: Number of encoder layers.
        dropout: Dropout probability for the positional encoder and encoder layers.
        seq_len: Expected sequence length; when given, the matching causal mask
            is built once and reused. ``None`` builds masks on demand.
        lr: Learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5, seq_len=None, lr=0.0013,weight_decay=0.0):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.lr = lr
        self.weight_decay = weight_decay
        self.ntoken = ntoken
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = torch.nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = torch.nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = torch.nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = torch.nn.Linear(ninp, ntoken)
        # Only pre-build the mask when the length is known: the default
        # ``seq_len=None`` cannot be passed to ``torch.ones``.
        self.src_mask = None if seq_len is None else self.generate_square_subsequent_mask(seq_len)
        self.seq_len = seq_len
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return a ``(sz, sz)`` float mask: 0.0 on and below the diagonal, ``-inf`` above."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def _mask_for(self, sz):
        """Return the causal mask for length ``sz`` without touching ``self.src_mask``.

        The cached mask is reused when its size matches. Otherwise a fresh mask
        is built for this call only, so one odd-sized batch cannot leave a
        wrong-sized mask behind for later batches.
        """
        cached = self.src_mask
        if cached is not None and cached.size(0) == sz:
            return cached
        return self.generate_square_subsequent_mask(sz)

    def init_weights(self):
        """Uniformly initialise embedding and decoder weights in [-0.1, 0.1]; zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask=None):
        """Compute vocabulary logits for every position.

        Args:
            src: Token ids with positions along dim 0 (the step methods pass
                the transposed batch, i.e. ``(seq_len, batch)``).
            src_mask: Attention mask; when ``None`` the causal mask for
                ``src.size(0)`` is used.

        Returns:
            Logits with a trailing dimension of ``ntoken``.
        """
        if src_mask is None:
            src_mask = self._mask_for(src.size(0))
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        # The mask is created on CPU; move it next to the activations it is applied to.
        src_mask = src_mask.to(src.device)

        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)

        return output

   # The ReduceLROnPlateau scheduler requires a monitor
    def configure_optimizers(self):
        """Return AdamW plus a ReduceLROnPlateau scheduler monitoring ``val_epoch_loss``."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr,weight_decay=self.weight_decay)
        # NOTE(review): the accepted layout of this dict depends on the
        # installed pytorch_lightning version; left exactly as before.
        d = {
       'optimizer': optimizer,
       'lr_scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.5, patience=3, verbose=True),
       'monitor': 'val_epoch_loss',
        'interval': 'epoch'
        }
        return d

    def _shared_step(self, batch):
        """Cross-entropy loss of next-token predictions for one ``(x, y)`` batch."""
        x,y = batch
        # Transpose so positions run along dim 0; flatten targets in the same
        # order as the flattened logits.
        x = x.T
        y = y.T.reshape(-1)
        y_hat = self(x, self._mask_for(x.size(0)))
        return F.cross_entropy(y_hat.view(-1, self.ntoken),y)

    def training_step(self,batch,batch_idx):
        """Return the training loss for one batch."""
        return self._shared_step(batch)

    def validation_step(self,batch, batch_idx):
        """Return ``{'val_loss': loss}`` for one validation batch."""
        return {'val_loss': self._shared_step(batch)}

    def test_step(self,batch, batch_idx):
        """Return ``{'test_loss': loss}`` for one test batch."""
        return {'test_loss': self._shared_step(batch)}

    def training_epoch_end(self, outputs):
        """Print the mean of the ``'loss'`` entries collected over the epoch."""
        avg_loss = torch.stack([d['loss']  for d in outputs]).mean()
        print(f">Epoch ={self.current_epoch}, Avg Training loss = {avg_loss}")

    def validation_epoch_end(self, outputs):
        """Print the mean validation loss and log it as ``val_epoch_loss``."""
        avg_loss = torch.stack([d['val_loss'] for d in outputs]).mean()
        print(f"> Average Valid Loss = {avg_loss}")
        # This is the metric the LR scheduler and early stopping monitor.
        self.log("val_epoch_loss",avg_loss)

    def test_epoch_end(self, outputs):
        """Print the mean test loss."""
        avg_loss = torch.stack([d['test_loss'] for d in outputs]).mean()
        print(f"> Average Test Loss = {avg_loss}")
    

In [5]:
# Input file and TensorBoard output directory.
file_path = './.data/seqs.tokens'
tb_logger = pl_loggers.TensorBoardLogger('logs/')

# Data settings.
bsize = 1048 * (10 * 4)
seq_len = 4

# Model hyper-parameters.
emsize = 256   # embedding dimension
nhid = 256     # feed-forward dimension inside each encoder layer
nlayers = 8    # number of encoder layers
nhead = 16     # number of attention heads

# Optimisation hyper-parameters.
dropout = 0.0
weight_decay = 0.0
lr = 0.001

# Building the data module reads the file and builds the vocabulary.
dm = MyDataModule(data_path=file_path, batch_size=bsize, seq_len=seq_len)
ntokens = len(dm.vocab.stoi)  # vocabulary size
print(f"> Vocab Size (Number of Unique Alarms): {ntokens}")

model = TransformerModel(
    ntoken=ntokens,
    ninp=emsize,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
    seq_len=seq_len,
    lr=lr,
    weight_decay=weight_decay,
)
print(f"> Customised lr = {model.lr}")

30825lines [00:04, 6687.88lines/s]
total seqs= 27975
['A17 A75 A17 A57 A17 A99 A98 A56\n', 'A245 A246 A50 A243\n', 'A243 A9 A559 A1025\n', 'A50 A59 A60 A64 A392 A726 A9 A725 A726 A725 A243 A725\n']
> Vocab Size (Number of Unique Alarms): 960
> Customised lr = 0.001


# Trainer

**Note: When monitoring a metric that is logged at the end of the validation epoch, pass `check_val_every_n_epoch=1` to the `Trainer`, not any other value. This is very important.**

In [6]:
# Stop when `val_epoch_loss` has not improved for 10 consecutive validation checks.
early_stop_callback = EarlyStopping(monitor='val_epoch_loss', min_delta=0.00, patience=10, verbose=True, mode='min')

trainer = Trainer(
    precision=16,
    gpus=-1,
    num_nodes=1,
    accelerator='dp',
    max_epochs=1200,
    check_val_every_n_epoch=1,
    deterministic=True,
    gradient_clip_val=0.5,
    enable_pl_optimizer=True,
    callbacks=[early_stop_callback],
    logger=tb_logger,
    progress_bar_refresh_rate=0,
)
trainer.fit(model, dm)  # training and validation


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Using native 16bit precision.
Set SLURM handle signals.

  | Name                | Type               | Params
-----------------------------------------------------------
0 | pos_encoder         | PositionalEncoding | 0     
1 | transformer_encoder | TransformerEncoder | 6.3 M 
2 | encoder             | Embedding          | 245 K 
3 | decoder             | Linear             | 246 K 
-----------------------------------------------------------
6.8 M     Trainable params
0         Non-trainable params
6.8 M     Total params
> Average Valid Loss = 7.070106506347656
> Average Valid Loss = 1.8159294128417969
>Epoch =0, Avg Training loss = 2.164904832839966
> Average Valid Loss = 1.7070680856704712
>Epoch =1, Avg Training loss = 1.5566819906234741


1

In [None]:
# trainer = Trainer(precision=16,gpus=1,max_epochs=1200,check_val_every_n_epoch=4,deterministic=True, gradient_clip_val=0.5,logger=tb_logger)
# trainer.fit(model,dm) # training and validation

### Learning Rate Finder

In [None]:
# model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout,seq_len=seq_len)
# trainer = Trainer(precision=16,gpus=1,max_epochs=400,check_val_every_n_epoch=4,deterministic=True, gradient_clip_val=0.5,logger=tb_logger,progress_bar_refresh_rate=50,auto_lr_find=0.002)
# trainer.tune(model,dm) # finding the lr : first way
# 2nd way
# lr_finder = trainer.tuner.lr_find(model)
# print(lr_finder.results)
# fig = lr_finder.plot(suggest=True) # Plot with
# fig.show()
# new_lr = lr_finder.suggestion() # Pick point based on plot, or get suggestion



# Main trainer

In [None]:
# model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout,seq_len=seq_len)
# trainer = Trainer(precision=16,gpus=1,max_epochs=1,check_val_every_n_epoch=4,deterministic=True, gradient_clip_val=0.5,logger=tb_logger,progress_bar_refresh_rate=10)
# trainer.fit(model,dm) # training and validation

# Testing

In [None]:
# trainer.test(datamodule=dm) # testing
# # %%