In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchmetrics import MeanSquaredError
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from tqdm.auto import tqdm
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning import Callback
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

In [5]:
class CFG:
    model_name="bert-base-uncased"
    val_size=0.25
    max_len=192
    batch_size=32
    epochs=6
    lr=2e-5
    max_lr=1e-3
    steps_per_epoch=None
    pct_start=0.3
    div_factor=1e+2
    final_div_factor=1e+4
    accumulate=1
    patience=3
    monitor="val_loss"
    seed=42
    debug=True

In [6]:
CFG.__dict__

mappingproxy({'__module__': '__main__',
              'model_name': 'bert-base-uncased',
              'val_size': 0.25,
              'max_len': 192,
              'batch_size': 32,
              'epochs': 6,
              'lr': 2e-05,
              'max_lr': 0.001,
              'steps_per_epoch': None,
              'pct_start': 0.3,
              'div_factor': 100.0,
              'final_div_factor': 10000.0,
              'accumulate': 1,
              'patience': 3,
              'monitor': 'val_loss',
              'seed': 42,
              'debug': True,
              '__dict__': <attribute '__dict__' of 'CFG' objects>,
              '__weakref__': <attribute '__weakref__' of 'CFG' objects>,
              '__doc__': None})

In [7]:
class PhraseSimilarityDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        self.tokenizer_params = {
            "max_length": CFG.max_len,
            "padding": "max_length",
            "truncation": True
        }
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        anchor = self.df.anchor.iloc[index].lower()
        target = self.df.target.iloc[index].lower()        
        
        tokens = self.tokenizer(anchor + '[SEP]' + target, **self.tokenizer_params)
        score = torch.tensor(self.df.score.iloc[index], dtype=torch.float32)
        
        return (
            np.array(tokens["input_ids"]),
            np.array(tokens["attention_mask"]),
            score
        )

In [8]:
class PhraseSimilarityTestset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        self.tokenizer_params = {
            "max_length": CFG.max_len,
            "padding": "max_length",
            "truncation": True
        }
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        anchor = self.df.anchor.iloc[index].lower()
        target = self.df.target.iloc[index].lower()        
        
        tokens = self.tokenizer(anchor + '[SEP]' + target, **self.tokenizer_params)
        
        return (
            np.array(tokens["input_ids"]),
            np.array(tokens["attention_mask"]),
        )

In [9]:
class PhraseSimilarityModelImpl(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.head = nn.Linear(768, 1, bias=True)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, text, mask):
        # calculate bert output
        feats = self.bert(text, mask)
        # calculate sum of all tokens then divide by the number of tokens
        feats = torch.sum(feats[0], 1) / feats[0].shape[1]
        feats = self.dropout(feats)
        output = self.head(feats)
        return output

In [10]:
class PhraseSimilarityModel(pl.LightningModule):
    def __init__(self, model, criterion, metric):
        super(PhraseSimilarityModel, self).__init__()
        self.model = model
        self.criterion = criterion
        self.metric = metric
    
    def forward(self, text, mask):
        return self.model(text, mask)
    
    def configure_optimizers(self):
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=CFG.lr)
        return self.optimizer

    def training_step(self, batch, batch_idx):
        ids, mask = batch[0], batch[1]
        preds = self.model(ids, mask)
        loss = self.criterion(preds.squeeze(1), batch[2])
        rmse = self.metric(preds.squeeze(1), batch[2])
        logs = {"train_loss": loss, "train_error": rmse, "lr": self.optimizer.param_groups[0]['lr']}
        
        self.log_dict(logs, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        ids, mask = batch[0], batch[1]
        preds = self.model(ids, mask)
        loss = self.criterion(preds.squeeze(1), batch[2])
        rmse = self.metric(preds.squeeze(1), batch[2])
        logs = {"val_loss": loss, "val_error": rmse}
        self.log_dict(logs, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def predict_step(self, batch, batch_idx):
        ids, mask = batch[0], batch[1]
        preds = self.model(ids, mask)
        return preds

In [11]:
if CFG.debug == True:
    train_data = train_data.iloc[:200]

scores = train_data.score.values
train_data.drop("score", inplace=True, axis=1)
train_data, val_data, train_labels, val_labels = train_test_split(train_data, scores,
                                                                 stratify=scores,
                                                                 test_size=CFG.val_size,
                                                                 random_state=CFG.seed)

train_data["score"] = train_labels
val_data["score"] =  val_labels

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
train_dataset = PhraseSimilarityDataset(train_data, tokenizer)

val_dataset = PhraseSimilarityDataset(val_data, tokenizer)
test_dataset = PhraseSimilarityTestset(test_data, tokenizer)

In [12]:
train_dataloader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [14]:
for i in train_dataloader:
    break

In [16]:
len(i)

3

In [60]:
CFG.steps_per_epoch = len(train_dataloader)
CFG.steps_per_epoch

5

In [61]:
logger = CSVLogger(save_dir='./', name=CFG.model_name.split('/')[-1]+'_log')
logger.log_hyperparams(CFG.__dict__)

In [62]:
checkpoint_callback = ModelCheckpoint(monitor=CFG.monitor,
                                     save_top_k=1,
                                     save_last=True,
                                     save_weights_only=True,
                                     filename="{epoch:02d}-{valid_loss:.4f}-{valid_acc:.4f}",
                                     verbose=False,
                                     mode="min")

In [63]:
early_stop_callback = EarlyStopping(monitor=CFG.monitor,
                                   patience=CFG.patience,
                                   verbose=False,
                                   mode="min")

In [64]:
model = PhraseSimilarityModelImpl(CFG.model_name)
criterion = nn.HuberLoss(reduction='mean', delta=1.0)
metric = MeanSquaredError()
driver = PhraseSimilarityModel(model, criterion, metric)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [65]:
trainer = Trainer(
    max_epochs=CFG.epochs,
    accumulate_grad_batches=CFG.accumulate,
    callbacks=[checkpoint_callback, early_stop_callback], 
    logger=logger,
    weights_summary='top',
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [66]:
trainer.fit(driver, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)


  | Name      | Type                      | Params
--------------------------------------------------------
0 | model     | PhraseSimilarityModelImpl | 109 M 
1 | criterion | HuberLoss                 | 0     
2 | metric    | MeanSquaredError          | 0     
--------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)
  warn(f"Skipping '{k}' parameter because it is not possible to safely dump to YAML.")
  warn(f"Skipping '{k}' parameter because it is not possible to safely dump to YAML.")


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]