In [1]:
%%capture
!wget -o dataset https://www.dropbox.com/s/vx57bs5ca4s9cuh/mednli-a-natural-language-inference-dataset-for-the-clinical-domain-1.0.0.zip
!pip install transformers
!pip install git+https://github.com/PyTorchLightning/pytorch-lightning

In [2]:
# Print the GPU(s) visible to this runtime (model, driver/CUDA version, memory use).
!nvidia-smi

Thu Apr 15 03:42:49 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Load the TensorBoard notebook extension so the %tensorboard magic can be used later.
%load_ext tensorboard

In [4]:
import pandas as pd
import torch
import torch.nn as nn
import transformers
import torchmetrics
import pytorch_lightning as pl

import zipfile

In [5]:
# Single source of truth for everything a reader might want to tune:
# dataset location and column names, tokenisation, optimisation and trainer
# settings. Built with keyword arguments; keys and values are unchanged.
CONFIG = dict(
    # --- data -------------------------------------------------------------
    ZIP_PATH='./mednli-a-natural-language-inference-dataset-for-the-clinical-domain-1.0.0.zip',  # PATH TO ZIP FILE
    DATA_PATH='./mednli-a-natural-language-inference-dataset-for-the-clinical-domain-1.0.0/',  # PATH TO UNZIP DATASET
    sentence1='sentence1',
    sentence2='sentence2',
    labels='gold_label',
    # --- reproducibility / tokenisation / model ----------------------------
    SEED=13,
    MAX_LEN=256,
    MODEL_NAME_OR_PATH='dmis-lab/biobert-v1.1',
    # --- optimisation -------------------------------------------------------
    LEARNING_RATE=2e-5,
    ADAM_EPSILON=1e-8,
    WEIGHT_DECAY=0.0,
    NUM_CLASSES=3,
    TRAIN_BS=32,
    VAL_BS=32,
    WARMUP_STEPS=0,
    MAX_EPOCHS=5,
    # --- trainer / checkpointing -------------------------------------------
    CHECKPOINT_DIR='./checkpoints',
    NUM_WORKERS=2,
    PRECISION=16,
    MODEL_SAVE_NAME='biobert_v1',
)

In [6]:
# Seed the run through Lightning's seed_everything with CONFIG['SEED'];
# the return value is bound to `_` so it is not echoed as the cell result.
_=pl.seed_everything(CONFIG['SEED'])

Global seed set to 13


In [7]:
class NLIDataset(torch.utils.data.Dataset):
  """Sentence-pair dataset that tokenizes each pair when it is indexed.

  Args:
    max_len: length every encoded pair is padded / truncated to.
    tokenizer: object exposing ``encode_plus`` with the keyword arguments
      used in ``__getitem__``.
    sentence1: indexable collection holding the first sentence of each pair.
    sentence2: indexable collection holding the second sentence of each pair.
    labels: indexable collection holding one label per pair.
  """

  # Encoder outputs copied into every sample, in this order, after the label.
  _ENCODED_KEYS=('input_ids','attention_mask','token_type_ids')

  def __init__(self,max_len:int,tokenizer,sentence1,sentence2,labels):
    super().__init__()
    self.sentence1,self.sentence2,self.labels=sentence1,sentence2,labels
    self.tokenizer=tokenizer
    self.max_len=max_len

  def __len__(self):
    """Number of pairs, taken from the first-sentence collection."""
    return len(self.sentence1)

  def __getitem__(self,idx):
    """Return a dict with the label tensor and the flattened encoder tensors."""
    first,second=self.sentence1[idx],self.sentence2[idx]
    encoding=self.tokenizer.encode_plus(
        text=first,
        text_pair=second,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=self.max_len,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    sample={'labels':torch.tensor(self.labels[idx])}
    for key in self._ENCODED_KEYS:
      # view(-1) flattens whatever shape the tokenizer returned into 1-D.
      sample[key]=encoding[key].view(-1)
    return sample

In [8]:
def mnli_df(stage):
  """Load one dataset split as a DataFrame with integer-encoded labels.

  Reads ``{CONFIG['DATA_PATH']}/mli_{stage}_v1.jsonl`` (JSON lines) and keeps
  only the two sentence columns and the label column named in ``CONFIG``.

  Args:
    stage: split name inserted into the file name (the notebook uses
      'train', 'dev' and 'test').

  Returns:
    A new DataFrame with the columns CONFIG['sentence1'], CONFIG['sentence2']
    and CONFIG['labels'], where labels are encoded as
    entailment -> 0, contradiction -> 1, neutral -> 2.

  Raises:
    ValueError: if the file contains a label outside that mapping. Such
      labels would otherwise become NaN and turn the column into floats.
  """
  label_map={'entailment':0,'contradiction':1,'neutral':2}
  label_col=CONFIG['labels']
  raw=pd.read_json(f"{CONFIG['DATA_PATH']}/mli_{stage}_v1.jsonl",lines=True,)
  # .copy() detaches the column subset from `raw`, so the assignment below
  # writes to an independent frame and cannot raise SettingWithCopyWarning.
  df=raw[[CONFIG['sentence1'],CONFIG['sentence2'],label_col]].copy()
  encoded=df[label_col].map(label_map)
  unmapped=encoded.isna()
  if unmapped.any():
    unknown=sorted(set(df.loc[unmapped,label_col].astype(str)))
    raise ValueError(f"Unexpected label(s) in '{stage}' split: {unknown}; expected one of {sorted(label_map)}")
  df[label_col]=encoded.astype('int64')
  return df

In [9]:
class NLIDataModel(pl.LightningDataModule):
    """Lightning data module serving the train / dev / test splits.

    Args:
        get_split_def: callable taking a split name ('train', 'dev', 'test')
            and returning a DataFrame that has the sentence and label columns
            named in ``CONFIG``.
    """

    def __init__(self,get_split_def):
        super().__init__()
        self.get_split_def=get_split_def
        # Created in setup() (or lazily by get_dataset) rather than in
        # prepare_data(); see the note there.
        self.tokenizer=None

    def _load_tokenizer(self):
        """Create the tokenizer once and reuse it afterwards."""
        if self.tokenizer is None:
            self.tokenizer=transformers.AutoTokenizer.from_pretrained(CONFIG['MODEL_NAME_OR_PATH'])
        return self.tokenizer

    def prepare_data(self):
        """Unpack the dataset archive and fetch the tokenizer files.

        Lightning documents this hook as running on a single process, so no
        state is assigned to ``self`` here; attributes set here would be
        missing on the other processes of a multi-device run.
        """
        # The context manager closes the archive handle even if extraction fails.
        with zipfile.ZipFile(CONFIG['ZIP_PATH']) as archive:
            archive.extractall()
        # Result intentionally discarded: the call is made for the download
        # only, and setup() builds the instance that is actually used.
        transformers.AutoTokenizer.from_pretrained(CONFIG['MODEL_NAME_OR_PATH'])

    def setup(self, stage=None):
        """Load the DataFrames needed for ``stage``.

        Args:
            stage: 'fit' loads train and dev, 'validate' loads dev, 'test'
                loads test; ``None`` loads all three splits.
        """
        self._load_tokenizer()

        if stage in (None,'fit'):
            self.train_df,self.val_df=self.get_split_def('train'),self.get_split_def('dev')
        elif stage=='validate':
            self.val_df=self.get_split_def('dev')

        if stage in (None,'test'):
            self.test_df=self.get_split_def('test')

    def get_dataset(self,df):
        """Wrap the sentence / label columns of ``df`` in an ``NLIDataset``."""
        dataset = NLIDataset(max_len=CONFIG['MAX_LEN'],
                             tokenizer=self._load_tokenizer(),
                             sentence1=df[CONFIG['sentence1']].values,
                             sentence2=df[CONFIG['sentence2']].values,
                             labels=df[CONFIG['labels']].values)
        return dataset

    def _make_loader(self,df,batch_size,shuffle):
        """Build a DataLoader over ``df`` with the configured worker count."""
        return torch.utils.data.DataLoader(self.get_dataset(df),
                                           batch_size=batch_size,
                                           shuffle=shuffle,
                                           num_workers=CONFIG['NUM_WORKERS'])

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train_df,CONFIG['TRAIN_BS'],shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the dev split."""
        return self._make_loader(self.val_df,CONFIG['VAL_BS'],shuffle=False)

    def test_dataloader(self):
        """Ordered loader over the test split (uses the validation batch size)."""
        return self._make_loader(self.test_df,CONFIG['VAL_BS'],shuffle=False)

In [10]:
class NLIFineTuningModel(pl.LightningModule):
  """Sequence-classification fine-tuning module for sentence-pair NLI.

  Wraps ``transformers.AutoModelForSequenceClassification`` and tracks
  accuracy and macro F1 separately for the train, validation and test loops.

  Args:
    model_name_or_path: checkpoint name or path passed to ``from_pretrained``.
    num_labels: number of target classes.
    learning_rate: AdamW learning rate.
    adam_epsilon: AdamW epsilon.
    weight_decay: decay applied to all parameters except biases and
      LayerNorm parameters.
    max_len: maximum sequence length (stored as a hyperparameter only).
    warmup_steps: warm-up steps of the linear learning-rate schedule.
    gpus: number of GPUs used by the trainer (0 / None are treated as one
      device when estimating the number of optimizer steps).
    max_epochs: number of training epochs.
    accumulate_grad_batches: batches accumulated per optimizer step.
  """

  def __init__(self,model_name_or_path:str,
               num_labels:int,
               learning_rate:float,
               adam_epsilon:float,
               weight_decay:float,
               max_len:int,
               warmup_steps:int,
               gpus:int,max_epochs:int,accumulate_grad_batches:int):
    super().__init__()
    self.model_name_or_path=model_name_or_path
    self.num_labels=num_labels

    # model_name_or_path and num_labels are saved as well, so the saved
    # hyperparameters cover every constructor argument and a checkpoint
    # carries all it needs to rebuild the module.
    self.save_hyperparameters('model_name_or_path','num_labels','learning_rate','adam_epsilon','weight_decay','max_len','gpus','accumulate_grad_batches','max_epochs','warmup_steps')

    self.config = transformers.AutoConfig.from_pretrained(model_name_or_path, num_labels=self.num_labels)
    self.model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)

    metrics = torchmetrics.MetricCollection([
        torchmetrics.Accuracy(),
        # Follows num_labels instead of a hard-coded 3.
        torchmetrics.F1(num_classes=self.num_labels,average='macro')
      ]
    )
    # One independent collection per loop so their running state never mixes.
    self.train_metrics=metrics.clone()
    self.val_metrics=metrics.clone()
    self.test_metrics=metrics.clone()

  def forward(self,inputs):
    """Run the wrapped model on a batch dict (labels included)."""
    return self.model(**inputs)

  def _shared_step(self,batch,metrics,prefix):
    """Forward one batch, update ``metrics`` and log them as ``{prefix}_*``."""
    loss,logits=self(batch)[:2]
    predictions=torch.argmax(logits,dim=1)
    metrics(predictions,batch['labels'])
    self.log_dict({f'{prefix}_accuracy':metrics['Accuracy'],f'{prefix}_f1':metrics['F1']}, on_step=False, on_epoch=True)
    return {
        'loss':loss,
        'predictions':predictions,
        'labels':batch['labels']
    }

  def training_step(self,batch,batch_idx):
    return self._shared_step(batch,self.train_metrics,'train')

  def validation_step(self,batch,batch_idx):
    return self._shared_step(batch,self.val_metrics,'val')

  def test_step(self,batch,batch_idx):
    # Uses its own collection; previously test batches were accumulated into
    # the validation metrics.
    return self._shared_step(batch,self.test_metrics,'test')

  @staticmethod
  def _mean_loss(outputs):
    """Unweighted mean of the per-batch losses collected over an epoch."""
    if not outputs:
      return torch.tensor(float('nan'))
    # stack keeps the values on their device instead of converting every
    # tensor to a Python number first.
    return torch.stack([x['loss'].detach() for x in outputs]).mean()

  def validation_epoch_end(self,outputs):
    self.log('val_loss', self._mean_loss(outputs), prog_bar=True,on_step=False, on_epoch=True )

  def training_epoch_end(self,outputs):
    self.log('train_loss', self._mean_loss(outputs), prog_bar=True,on_step=False, on_epoch=True )

  def test_epoch_end(self,outputs):
    self.log('test_loss', self._mean_loss(outputs), prog_bar=True,on_step=False, on_epoch=True )

  @staticmethod
  def _ceil_div(numerator,denominator):
    """Integer division rounded up."""
    return -(-numerator // denominator)

  def setup(self, stage=None):
    """Estimate the number of optimizer steps for the LR schedule."""
    if stage == 'fit':
      train_loader = self.train_dataloader()
      devices = max(1, self.hparams.gpus or 0)
      samples_per_step = train_loader.batch_size * devices
      if train_loader.drop_last:
        batches_per_epoch = len(train_loader.dataset) // samples_per_step
      else:
        # The final, smaller batch is still trained on, so it must be counted.
        batches_per_epoch = self._ceil_div(len(train_loader.dataset), samples_per_step)
      # Rounded up so a trailing, incomplete accumulation window counts as a
      # step; flooring here makes the schedule reach zero before training ends.
      steps_per_epoch = self._ceil_div(batches_per_epoch, self.hparams.accumulate_grad_batches)
      self.total_steps = steps_per_epoch * self.hparams.max_epochs

  def configure_optimizers(self):
    """AdamW with a linear warm-up / decay schedule stepped every batch."""
    model = self.model
    # Parameters whose name contains one of these are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight","LayerNorm.bias"]
    optimizer_grouped_parameters = [
          {
              "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
              "weight_decay": self.hparams.weight_decay,
          },
          {
              "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
              "weight_decay": 0.0,
          },
    ]
    optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps
    )
    scheduler = {
        'scheduler': scheduler,
        'interval': 'step',
        'frequency': 1
    }
    return [optimizer] ,[scheduler]

In [11]:
# Keep only the single checkpoint with the lowest monitored 'val_loss'.
# The file name is the configured model name followed by a template that
# Lightning fills with the epoch number and the validation loss.
checkpoint_filename = f"{CONFIG['MODEL_SAVE_NAME']}"+'-{epoch:02d}-{val_loss:.2f}'
model_save_checkpoint = pl.callbacks.ModelCheckpoint(
    dirpath=CONFIG['CHECKPOINT_DIR'],
    filename=checkpoint_filename,
    monitor='val_loss',
    mode='min',
    save_top_k=1,
)

In [12]:
# Trainer settings gathered in one place before construction:
#   gpus                 - number of CUDA devices torch reports
#   callbacks            - the checkpoint callback defined above
#   num_sanity_val_steps - 0, i.e. no validation batches are run before training
trainer_options = dict(
    gpus=torch.cuda.device_count(),
    max_epochs=CONFIG['MAX_EPOCHS'],
    callbacks=[model_save_checkpoint],
    precision=CONFIG['PRECISION'],
    num_sanity_val_steps=0,
)
trainer = pl.Trainer(**trainer_options)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.


In [13]:
# Optimisation settings come from CONFIG; the values the model needs to size
# its learning-rate schedule (epochs, GPUs, gradient accumulation) are read
# back from the trainer so both always agree.
model_kwargs = dict(
    model_name_or_path=CONFIG['MODEL_NAME_OR_PATH'],
    num_labels=CONFIG['NUM_CLASSES'],
    learning_rate=CONFIG['LEARNING_RATE'],
    adam_epsilon=CONFIG['ADAM_EPSILON'],
    weight_decay=CONFIG['WEIGHT_DECAY'],
    max_len=CONFIG['MAX_LEN'],
    warmup_steps=CONFIG['WARMUP_STEPS'],
    max_epochs=trainer.max_epochs,
    gpus=trainer.gpus,
    accumulate_grad_batches=trainer.accumulate_grad_batches,
)
model = NLIFineTuningModel(**model_kwargs)

# The data module receives the split loader (mnli_df) as a callable.
mnli_dm = NLIDataModel(get_split_def=mnli_df)
trainer.fit(model, mnli_dm)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                          | Params
----------------------------------------------------------------
0 | model         | BertForSequenceClassification | 108 M 
1 | train_metrics | MetricCollection              | 0     
2 | val_metrics   | MetricCollection              | 0     
----------------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.250   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [14]:
# Display the metrics the trainer has logged most recently (train / val accuracy, F1, loss).
trainer.logged_metrics

{'train_accuracy': tensor(0.9573, device='cuda:0'),
 'train_f1': tensor(0.9573, device='cuda:0'),
 'epoch': tensor(4.),
 'train_loss': tensor(0.1442, device='cuda:0'),
 'val_accuracy': tensor(0.8495),
 'val_f1': tensor(0.8491),
 'val_loss': tensor(0.5132)}

In [15]:
# Run the test loop. No model or datamodule is passed, so the trainer reuses
# what it was fitted with.
# NOTE(review): whether the last or the best-checkpoint weights are evaluated
# depends on the installed Lightning version - confirm.
trainer.test()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': 0.8255977630615234,
 'test_f1': 0.8256306648254395,
 'test_loss': 0.47156623005867004}
--------------------------------------------------------------------------------


[{'test_accuracy': 0.8255977630615234,
  'test_f1': 0.8256306648254395,
  'test_loss': 0.47156623005867004}]

In [16]:
# Open TensorBoard inline on the ./lightning_logs/ directory.
%tensorboard --logdir ./lightning_logs/