In [1]:
# # for logging 
import os
from comet_ml import OfflineExperiment

from pytorch_lightning import metrics

import math
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import io
import torchtext
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import pytorch_lightning as pl
from pytorch_lightning.trainer.trainer import Trainer




from pytorch_lightning.callbacks.early_stopping import EarlyStopping # The EarlyStopping callback can be used to monitor a validation metric and stop the training when no improvement is observed.
"""
    To enable it:

    Import EarlyStopping callback.

    Log the metric you want to monitor using log() method.

    Init the callback, and set monitor to the logged metric of your choice.

    Pass the EarlyStopping callback to the Trainer callbacks flag.
"""

from pytorch_lightning import seed_everything
# seed_everything(42)

In [2]:
from sklearn.model_selection import train_test_split

class AlarmDataset(Dataset):
    """Next-token dataset over a single flat token sequence.

    The sequence is cut into consecutive, non-overlapping windows of
    ``seq_len`` tokens. Item ``i`` is the pair ``(x, y)`` where ``x`` is the
    i-th window and ``y`` is the same window shifted one token to the right.

    Args:
        data: Sliceable 1-D token sequence supporting ``len()``.
        seq_len: Number of tokens in each window.
        batch_size: Stored on the instance; the dataset itself does not use it.
    """

    def __init__(self,data,seq_len,batch_size):
        self.data = data
        self.seq_len = seq_len
        self.batch_size = batch_size
        # Every window needs one extra token after it to form its shifted
        # target, so only (len(data) - 1) tokens are available as inputs.
        # Without the "- 1" the last target would be one token short whenever
        # len(data) is an exact multiple of seq_len.
        self.length = max(0, (len(data) - 1) // seq_len)

    def __getitem__(self, index: int):
        """Return ``(x, y)`` for window ``index``; raises ``IndexError`` when out of range."""
        if not 0 <= index < self.length:
            raise IndexError(f"index {index} is out of range for {self.length} windows")
        start = index * self.seq_len
        # Use the instance's window size (not a module-level ``seq_len``).
        stop = start + self.seq_len
        x = self.data[start:stop]
        y = self.data[start + 1:stop + 1]
        return x,y

    def __len__(self) -> int:
        """Number of complete (input, target) windows."""
        return self.length

class MyDataModule(pl.LightningDataModule):
    """Lightning data module for a line-oriented token file.

    On construction it:

    1. builds a vocabulary from every line of ``dir_path + file_name``;
    2. drops lines with more than ``filter_seq`` whitespace-separated tokens;
    3. splits the remaining lines, in file order, 70/30 into train/rest and
       then splits the rest 70/30 into validation/test;
    4. writes the splits to ``train.tokens``, ``val.tokens`` and
       ``test.tokens`` under ``dir_path``;
    5. numericalises each split into one flat tensor and wraps it in an
       ``AlarmDataset``.

    Args:
        dir_path: Directory prefix; it is concatenated directly with file
            names, so it must end with a path separator.
        file_name: Name of the source token file inside ``dir_path``.
        batch_size: Batch size used by every dataloader.
        seq_len: Window length handed to ``AlarmDataset``.
        filter_seq: Maximum number of tokens a line may have to be kept.
        num_workers: Worker processes used by every dataloader.
    """

    def __init__(self, dir_path:str, file_name:str, batch_size:int=64, seq_len:int=8, filter_seq=350, num_workers:int=8):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.tokenizer = get_tokenizer('basic_english')
        source_path = dir_path + file_name

        # The vocabulary covers the whole source file, including lines that
        # the length filter below removes. The file is closed once consumed.
        with io.open(source_path, encoding="utf8") as source_file:
            self.vocab = build_vocab_from_iterator(map(self.tokenizer, source_file))

        # Read with the same encoding that is used for vocabulary building.
        with io.open(source_path, encoding="utf8") as source_file:
            seqs = source_file.readlines()
        seqs = [seq for seq in seqs if len(seq.split())<=filter_seq]

        print(f"total seqs= {len(seqs)}")
        print(seqs[:4])
        # shuffle=False keeps the file order: the splits are contiguous runs.
        train, valid = train_test_split(seqs,test_size=0.30,shuffle=False)
        valid, test = train_test_split(valid,test_size=0.30, shuffle=False)

        split_files = (
            ("train.tokens", train),
            ("val.tokens", valid),
            ("test.tokens", test),
        )
        for split_name, split_seqs in split_files:
            # Written as utf8 because the files are read back as utf8 below.
            with io.open(dir_path + split_name, "w", encoding="utf8") as split_file:
                split_file.writelines(split_seqs)

        train_data = self._load_split(dir_path + "train.tokens")
        val_data = self._load_split(dir_path + "val.tokens")
        test_data = self._load_split(dir_path + "test.tokens")

        self.train_dataset = AlarmDataset(train_data, seq_len,self.batch_size)
        self.valid_dataset = AlarmDataset(val_data,seq_len,self.batch_size)
        self.test_dataset = AlarmDataset(test_data, seq_len,self.batch_size)

    def _load_split(self, path):
        """Numericalise the utf8 token file at ``path`` and close it afterwards."""
        with io.open(path, encoding="utf8") as split_file:
            return self.data_process(split_file)

    def data_process(self, raw_text_iter):
        """Turn an iterable of text lines into one flat ``torch.long`` tensor of vocabulary ids.

        Lines that tokenise to nothing are skipped before concatenation.
        """
        data = [torch.tensor([self.vocab[token] for token in self.tokenizer(item)],dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))


    def prepare_data(self):
        """
            Use this method to do things that might write to disk or that need to be done only from a single GPU in distributed settings.
            e.g., download,tokenize,etc…
        """ 
        return None


    def setup(self, stage=None):
        """
            There are also data operations you might want to perform on every GPU. Use setup to do things like:
            count number of classes,build vocabulary,perform train/val/test splits,apply transforms (defined explicitly in your datamodule or assigned in init),etc…

            ``stage`` is accepted and ignored; it defaults to ``None`` so the hook can also be called without arguments.
        """
        return None

    def _make_loader(self, dataset) -> DataLoader:
        """Build the loader configuration shared by all three splits (ordered, incomplete last batch dropped)."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=False,num_workers=self.num_workers,drop_last=True, pin_memory=True)

    def train_dataloader(self) -> DataLoader:
        """Dataloader over the training split."""
        return self._make_loader(self.train_dataset)
    
    def val_dataloader(self) -> DataLoader:
        """Dataloader over the validation split."""
        return self._make_loader(self.valid_dataset)
    
    def test_dataloader(self):
        """Dataloader over the test split."""
        return self._make_loader(self.test_dataset)



In [3]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position information to a sequence-first input, then apply dropout.

    A table of shape ``(max_len, 1, d_model)`` is precomputed and registered
    as the non-trainable buffer ``pe``: even feature columns hold sines, odd
    columns hold cosines.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions precomputed; inputs longer than this
            along dimension 0 are not supported.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the angle table to match instead of failing on the assignment.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over dimension 1 of the input.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``; dimension 0 of ``x`` is the position axis."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [4]:
class TransformerModel(pl.LightningModule):
    """Transformer-encoder language model trained for next-token prediction.

    Token ids are embedded, scaled by ``sqrt(ninp)``, given positional
    encodings, passed through a stack of ``torch.nn.TransformerEncoderLayer``
    with a causal mask, and projected back to vocabulary logits.

    Args:
        ntoken: Vocabulary size (number of output classes).
        ninp: Embedding / model dimension.
        nhead: Number of attention heads.
        nhid: Hidden size of the feed-forward sub-layer.
        nlayers: Number of encoder layers.
        dropout: Dropout probability for the encoder layers and positional encoding.
        seq_len: Expected sequence length; when given, the causal mask for
            that length is built up front. ``None`` defers mask creation to
            the first batch.
        lr: Learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0, seq_len=None, lr=0.0013,weight_decay=0.0):
        super(TransformerModel, self).__init__()
        # Record the constructor arguments (seq_len, lr, ntoken, weight_decay, ...)
        # as hyperparameters. ``self.log`` cannot be used for this: it only
        # works inside the training/validation/test hooks.
        self.save_hyperparameters()

        self.model_type = 'Transformer'
        self.lr = lr
        self.weight_decay = weight_decay
        self.ntoken = ntoken
        self.seq_len = seq_len 
        self.ninp = ninp

        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = torch.nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = torch.nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = torch.nn.Embedding(ntoken, ninp)
        self.decoder = torch.nn.Linear(ninp, ntoken)
        # Cached causal mask; rebuilt on demand by ``_causal_mask`` whenever a
        # batch arrives with a different sequence length.
        self.src_mask = None if seq_len is None else self.generate_square_subsequent_mask(seq_len)
        self.init_weights()

        self.train_F1 = metrics.classification.F1(num_classes=self.ntoken)
        self.val_F1 = metrics.classification.F1(num_classes=self.ntoken)
        self.test_F1 = metrics.classification.F1(num_classes=self.ntoken)

    def generate_square_subsequent_mask(self, sz):
        """Return a ``(sz, sz)`` float mask: 0.0 on and below the diagonal, ``-inf`` above it."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def _causal_mask(self, length):
        """Return the cached causal mask, rebuilding it when its size differs from ``length``."""
        if self.src_mask is None or self.src_mask.size(0) != length:
            self.src_mask = self.generate_square_subsequent_mask(length)
        return self.src_mask

    def init_weights(self):
        """Initialise embedding and output weights uniformly in [-0.1, 0.1] and zero the output bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        """Return vocabulary logits for token ids ``src`` under attention mask ``src_mask``.

        ``src`` is sequence-first: the positional encoding is indexed by its
        dimension 0.
        """
        src_mask = src_mask.to(self.device)
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)

        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        
        return output

    def configure_optimizers(self):
        """AdamW plus a ReduceLROnPlateau scheduler driven by ``val_epoch_loss``."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr,weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.5, patience=8, verbose=True)
        # The scheduler options ('monitor', 'interval') belong inside the
        # 'lr_scheduler' dict; a top-level 'interval' key is not read.
        # ReduceLROnPlateau requires a monitored metric.
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_epoch_loss',
                'interval': 'epoch',
            },
            # Kept at the top level as well, as in the original configuration.
            'monitor': 'val_epoch_loss',
        }

    def _shared_step(self, batch, metric):
        """Run one batch through the model; return ``(loss, metric value for this batch)``.

        ``metric`` is updated exactly once per call.
        """
        x, y = batch
        # Transpose so that dimension 0 is the sequence axis expected by ``forward``.
        x = x.T
        y = y.T.reshape(-1)

        logits = self(x, self._causal_mask(x.size(0)))
        logits = logits.view(-1, self.ntoken)
        loss = F.cross_entropy(logits, y)  # cross entropy applies log-softmax itself
        # Explicit class dimension: relying on the implicit ``dim`` is deprecated.
        score = metric(F.softmax(logits, dim=-1), y)
        return loss, score
    
    def training_step(self,batch,batch_idx):
        """Compute and log the training loss and per-step F1; returns the loss."""
        loss, f1 = self._shared_step(batch, self.train_F1)
        self.log('train_loss',loss,logger=True)
        self.log('train_F1',f1,logger=True, on_step=True,prog_bar=True)
        return loss

    def validation_step(self,batch, batch_idx):
        """Compute and log the validation loss and per-step F1; returns ``{'val_loss': loss}``."""
        loss, f1 = self._shared_step(batch, self.val_F1)
        self.log('val_loss',loss,logger=True)
        self.log('val_F1',f1,logger=True, on_step=True,prog_bar=True)
        return {'val_loss':loss}
    
    def test_step(self,batch, batch_idx):
        """Compute and log the test loss and F1; returns ``{'test_loss': loss}``.

        The metric is updated once per batch (a second call would count every
        batch twice in the epoch-level F1).
        """
        loss, f1 = self._shared_step(batch, self.test_F1)
        self.log('test_loss',loss,logger=True)
        self.log('test_F1', f1,logger=True)
        return {'test_loss':loss}
    
    def training_epoch_end(self, outputs):
        """Print and log the mean training loss and the accumulated training F1 for the epoch."""
        avg_loss = torch.stack([d['loss']  for d in outputs]).mean()
        f1 = self.train_F1.compute()
        print(f">Epoch ={self.current_epoch}, Avg Training loss = {avg_loss}, F1 = {f1}")
        self.log("train_epoch_loss",avg_loss,logger=True,prog_bar=True)
        self.log("train_epoch_F1", f1, logger=True,prog_bar=True)
  
    def validation_epoch_end(self, outputs):
        """Print and log the mean validation loss (``val_epoch_loss``) and accumulated validation F1."""
        avg_loss = torch.stack([d['val_loss'] for d in outputs]).mean()
        f1 = self.val_F1.compute()
        print(f">== Average Valid Loss = {avg_loss}, F1 = {f1}")
        self.log("val_epoch_loss",avg_loss,logger=True)
        self.log("val_epoch_F1",f1,logger=True,prog_bar=True)
    
    def test_epoch_end(self, outputs):
        """Print and log the mean test loss and the accumulated test F1."""
        avg_loss = torch.stack([d['test_loss'] for d in outputs]).mean()
        f1 = self.test_F1.compute()
        print(f">Average Test Loss = {avg_loss}, f1= {f1}")
        self.log("test_epoch_loss",avg_loss, logger = True)
        self.log("test_epoch_F1",f1, logger=True)

In [5]:
# --- Data location -----------------------------------------------------------
dir_name = "../.data/"
fname = 'seqs.tokens'
# tb_logger = pl_loggers.TensorBoardLogger('logs/')

# --- Data hyperparameters ----------------------------------------------------
bsize = 64         # batch size
seq_len = 128      # tokens per training window
filter_seq = 350   # lines with more tokens than this are dropped

# --- Model hyperparameters ---------------------------------------------------
emsize = 256   # embedding dimension
nhid = 256     # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2    # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2      # the number of heads in the multiheadattention models

# --- Optimisation hyperparameters --------------------------------------------
dropout = 0.0  # the dropout value
weight_decay = 0.000
lr = 0.0001

# Build the data module first: the vocabulary size it discovers sizes the model.
dm = MyDataModule(
    dir_path=dir_name,
    file_name=fname,
    batch_size=bsize,
    seq_len=seq_len,
    filter_seq=filter_seq,
)
ntokens = len(dm.vocab.stoi)  # the size of vocabulary
print(f"> Vocab Size (Number of Unique Alarms): {ntokens}")

model = TransformerModel(
    ntokens,
    emsize,
    nhead,
    nhid,
    nlayers,
    dropout,
    seq_len=seq_len,
    lr=lr,
    weight_decay=weight_decay,
)
print(f"> Customised lr = {model.lr}")

30825lines [00:01, 15493.72lines/s]
total seqs= 27975
['A17 A75 A17 A57 A17 A99 A98 A56\n', 'A245 A246 A50 A243\n', 'A243 A9 A559 A1025\n', 'A50 A59 A60 A64 A392 A726 A9 A725 A726 A725 A243 A725\n']
> Vocab Size (Number of Unique Alarms): 960
> Customised lr = 0.0001


# Trainer

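**Note: When monitoring a metric that is logged at the end of the validation epoch (e.g. `val_epoch_loss`), pass `check_val_every_n_epoch=1` to the Trainer and not any other value, so that the monitored metric is produced every epoch. This is very important.**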

In [6]:
import mlflow.pytorch
from mlflow.tracking import MlflowClient

# Stop once 'val_epoch_loss' has failed to improve for 20 consecutive checks.
early_stop_callback = EarlyStopping(monitor='val_epoch_loss', min_delta=0.00, patience=20, verbose=True, mode='min')

# Trainer configuration, spelled out one option per line.
trainer_options = dict(
    precision=16,
    gpus=-1,
    num_nodes=1,
    max_epochs=1200,
    check_val_every_n_epoch=1,  # the monitored metric is logged at validation epoch end
    deterministic=True,
    gradient_clip_val=0.5,
    enable_pl_optimizer=True,
    callbacks=[early_stop_callback],
)
trainer = Trainer(**trainer_options)
# accelerator='dp'
# NOTE(review): this is a plain module-level variable; it is not passed to the Trainer above.
progress_bar_refresh_rate=0 # set to zero to disable it


def print_auto_logged_info(r):
    """Print the run id, "model" artifacts, params, metrics and user tags of an MLflow run.

    Args:
        r: Run object exposing ``info.run_id`` and ``data.tags``,
            ``data.params`` and ``data.metrics``.
    """
    run_id = r.info.run_id

    # Tags whose key starts with "mlflow." are left out of the report.
    tags = {}
    for key, value in r.data.tags.items():
        if key.startswith("mlflow."):
            continue
        tags[key] = value

    # Paths of everything stored under the run's "model" artifact directory.
    artifacts = []
    for entry in MlflowClient().list_artifacts(run_id, "model"):
        artifacts.append(entry.path)

    report = (
        ("run_id: {}", run_id),
        ("artifacts: {}", artifacts),
        ("params: {}", r.data.params),
        ("metrics: {}", r.data.metrics),
        ("tags: {}", tags),
    )
    for template, value in report:
        print(template.format(value))

# Turn on MLflow autologging for PyTorch before the run starts.
mlflow.pytorch.autolog()

# Training and validation happen inside one MLflow run so that everything
# autologged by `fit` is attached to it.
with mlflow.start_run() as run:
    trainer.fit(model, dm)  # training and validation

# Look the finished run up again by id and report what was autologged.
finished_run = mlflow.get_run(run_id=run.info.run_id)
print_auto_logged_info(finished_run)



GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.
COMET INFO: No Comet API Key was found, creating an OfflineExperiment. Set up your API Key to get the full Comet experience https://www.comet.ml/docs/python-sdk/advanced/#python-configuration
2021/01/16 06:26:39 INFO mlflow.utils.autologging_utils: pytorch autologging will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pytorch workflow to the MLflow run with ID 'fdb13fd9cda94cc78af9a9c831c1d1be'

  | Name                | Type               | Params
-----------------------------------------------------------
0 | pos_encoder         | PositionalEncoding | 0     
1 | transformer_encoder | TransformerEncoder | 791 K 
2 | encoder             | Embedding          | 245 K 
3 | decoder             | Linear             | 246 K 
4 | train_F1            | F1                 | 0     
5 | val_F1        

HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

>== Average Valid Loss = 7.471125602722168, F1 = 6.103515625e-05


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.9445329308509827, F1 = 0.815323531627655
>Epoch =0, Avg Training loss = 1.00809645652771, F1 = 0.8267511129379272


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.9541664123535156, F1 = 0.8113608956336975
>Epoch =1, Avg Training loss = 0.6413501501083374, F1 = 0.852878212928772


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.9257034659385681, F1 = 0.8092100620269775
>Epoch =2, Avg Training loss = 0.5997321009635925, F1 = 0.854438304901123


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.9060231447219849, F1 = 0.8113204836845398
>Epoch =3, Avg Training loss = 0.5760810375213623, F1 = 0.8577659130096436


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.8975859880447388, F1 = 0.8075355887413025
>Epoch =4, Avg Training loss = 0.5647791624069214, F1 = 0.8590824007987976


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.9236698150634766, F1 = 0.7823100686073303
>Epoch =5, Avg Training loss = 0.5564301609992981, F1 = 0.860651433467865


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.8929443359375, F1 = 0.7981109619140625
>Epoch =6, Avg Training loss = 0.5451700091362, F1 = 0.8629342317581177


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.9004809260368347, F1 = 0.7930722236633301
>Epoch =7, Avg Training loss = 0.5399174094200134, F1 = 0.8641495704650879


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.8831403255462646, F1 = 0.8035324811935425
>Epoch =8, Avg Training loss = 0.5312461853027344, F1 = 0.8664150834083557


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.8726280927658081, F1 = 0.8082341551780701
>Epoch =9, Avg Training loss = 0.527281641960144, F1 = 0.8678411841392517


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.863749623298645, F1 = 0.8139999508857727
>Epoch =10, Avg Training loss = 0.5215423703193665, F1 = 0.8686859011650085


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.8608298301696777, F1 = 0.8181198835372925
>Epoch =11, Avg Training loss = 0.517568826675415, F1 = 0.8698300123214722


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.8736092448234558, F1 = 0.806412398815155
>Epoch =12, Avg Training loss = 0.5128885507583618, F1 = 0.8701900243759155


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.8557664752006531, F1 = 0.8194440603256226
>Epoch =13, Avg Training loss = 0.5097476840019226, F1 = 0.8712642788887024


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.9176872968673706, F1 = 0.7712515592575073
>Epoch =14, Avg Training loss = 0.5052396655082703, F1 = 0.8720402717590332


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

>== Average Valid Loss = 0.9014582633972168, F1 = 0.7768309116363525
>Epoch =15, Avg Training loss = 0.503762423992157, F1 = 0.8728628158569336
COMET INFO: ----------------------------------
COMET INFO: Comet.ml OfflineExperiment Summary
COMET INFO: ----------------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : [OfflineExperiment will get URL after upload]
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [761] : (0.15430860221385956, 7.46098518371582)
COMET INFO:   Others:
COMET INFO:     Created from       : MLFlow auto-logger
COMET INFO:     Mode               : training
COMET INFO:     offline_experiment : True
COMET INFO:   Parameters:
COMET INFO:     amsgrad        : 1
COMET INFO:     betas          : (0.9, 0.999)
COMET INFO:     epochs         : 20
COMET INFO:     eps            : 1e-08
COMET INFO:     lr             : 0.0001
COMET INFO:     min_delta      : 1
COMET INFO:     mode           : 

In [7]:
# trainer = Trainer(precision=16,gpus=1,max_epochs=1200,check_val_every_n_epoch=4,deterministic=True, gradient_clip_val=0.5,logger=tb_logger)
# trainer.fit(model,dm) # training and validation

### Learning Rate Finder

In [8]:
# model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout,seq_len=seq_len)
# trainer = Trainer(precision=16,gpus=1,max_epochs=400,check_val_every_n_epoch=4,deterministic=True, gradient_clip_val=0.5,logger=tb_logger,progress_bar_refresh_rate=50,auto_lr_find=0.002)
# trainer.tune(model,dm) # finding the lr : first way
# 2nd way
# lr_finder = trainer.tuner.lr_find(model)
# print(lr_finder.results)
# fig = lr_finder.plot(suggest=True) # Plot with
# fig.show()
# new_lr = lr_finder.suggestion() # Pick point based on plot, or get suggestion



# Main trainer

In [9]:
# model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout,seq_len=seq_len)
# trainer = Trainer(precision=16,gpus=1,max_epochs=1,check_val_every_n_epoch=4,deterministic=True, gradient_clip_val=0.5,logger=tb_logger,progress_bar_refresh_rate=10)
# trainer.fit(model,dm) # training and validation

# Testing

In [10]:
# Run the test loop on the datamodule's test dataloader with the trainer used for fitting.
# NOTE(review): no model is passed here, so which weights get evaluated depends on the
# installed Lightning version's default for `trainer.test` — confirm before comparing scores.
trainer.test(datamodule=dm) # testing
# %%

2021/01/16 06:22:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '99b0d1fa7fc3453ba397207d640591de', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pytorch workflow


HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…

COMET INFO: No Comet API Key was found, creating an OfflineExperiment. Set up your API Key to get the full Comet experience https://www.comet.ml/docs/python-sdk/advanced/#python-configuration
COMET INFO: ----------------------------------
COMET INFO: Comet.ml OfflineExperiment Summary
COMET INFO: ----------------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : [OfflineExperiment will get URL after upload]
COMET INFO:   Metrics [count]:
COMET INFO:     test_F1 [2]          : 0.8368929624557495
COMET INFO:     test_epoch_F1 [2]    : 0.8368929028511047
COMET INFO:     test_epoch_loss [2]  : 0.7977280020713806
COMET INFO:     test_loss [2]        : 0.7977280020713806
COMET INFO:     train_F1 [2]         : 0.8369140625
COMET INFO:     train_epoch_F1 [2]   : 0.8753800392150879
COMET INFO:     train_epoch_loss [2] : 0.4936947524547577
COMET INFO:     train_loss [2]       : 0.6517497897148132
COMET INFO:     val_F1 [2]    

[{'test_loss': 0.7977280020713806,
  'test_F1': 0.8368929624557495,
  'test_epoch_loss': 0.7977280020713806,
  'test_epoch_F1': 0.8368929028511047}]