In [None]:
# Show the GPU attached to this runtime (model, memory, driver) before doing any work.
!nvidia-smi

In [None]:
# Install the libraries used below for tokenization/modeling and dataset loading.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve different releases.
!pip install transformers jsonlines datasets pyarrow  

In [None]:
# install nvidia apex to use mixed precision

try:
  import apex
except:
  !git clone https://github.com/NVIDIA/apex
  %cd apex
  !pip install -v --no-cache-dir ./

# Load Data as HF dataset


In [None]:
# Mount Google Drive at /content/gdrive so the data files stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive')

In [2]:
# load data
# DIR is the project data folder on the mounted Drive; TRAIN / DEV / TEST are the
# per-split file names. Full paths are built later as DIR + <file name>, so DIR
# must keep its trailing slash.

import datasets

DIR = "/content/gdrive/MyDrive/CUNY_Comp_Ling/advanced_nlp/term_project/data/"
TRAIN = 'train.jsonl'
DEV = 'dev.jsonl'
TEST = 'test.jsonl'



In [None]:
# Read the three JSON-lines files into a single dataset object keyed by split name.
split_files = {
    'train': DIR + TRAIN,
    'validation': DIR + DEV,
    'test': DIR + TEST,
}
dataset = datasets.load_dataset('json', data_files=split_files)

print(dataset)

In [None]:
# Collect every distinct label value that occurs in any split, then show them sorted.
# (This lists the values that occur; it does not count how often each one appears.)

targets = []
for split in dataset.values():
  for example in split:
    label = example['label']
    if label in targets:
      continue
    targets.append(label)

sorted(targets)

# Tokenize

In [9]:
from transformers import LongformerTokenizer
# LongformerTokenizer is identical to RobertaTokenizer (byte-level BPE, not SentencePiece)
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [10]:
import torch
import datasets
from typing import List, Dict

def prep_data(data):
    """Tokenize one example and attach the extra tensors used for regression training.

    Args:
        data: a single dataset row; the keys 'abstract', 'text' and 'label'
            are read.

    Returns:
        The encoding returned by the module-level `tokenizer` for the
        (abstract, text) pair, padded/truncated to 4096 tokens and extended with

        * 'global_attention_mask': long tensor with a 1 at position 0 and 0
          everywhere else;
        * 'labels': float tensor holding the label, reshaped to (-1, 1).
    """
    encodings = tokenizer.encode_plus(
        data['abstract'],
        data['text'],
        max_length=4096,
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        # `padding='max_length'` already requests fixed-length padding, so the
        # deprecated, redundant `pad_to_max_length=True` flag is not passed.
        padding='max_length',
        truncation=True,
        )
    # add the 1st CLS as a global token
    global_attention_mask = torch.zeros(len(encodings.input_ids), dtype=torch.long)
    global_attention_mask[0] = 1
    encodings.update({'global_attention_mask': global_attention_mask})

    # convert label to float tensor for regression training;
    # view(-1, 1) gives a scalar label the shape (1, 1).
    # (The former `targets.contiguous()` call discarded its result and was a no-op.)
    targets = torch.tensor(data['label'], dtype=torch.float).view(-1, 1)
    encodings.update({'labels': targets})

    return encodings

In [None]:
# Run prep_data over every example of each split, then expose only the model
# inputs (as torch tensors) when the datasets are indexed.

columns = ['input_ids', 'attention_mask', 'global_attention_mask', 'labels']

train_set = dataset['train'].map(prep_data)
validation_set = dataset['validation'].map(prep_data)
test_set = dataset['test'].map(prep_data)

for split in (train_set, validation_set, test_set):
    split.set_format(type='torch', columns=columns)

In [None]:
# Display the shape of each split as a quick check after the mapping step.
train_set.shape, validation_set.shape, test_set.shape

In [None]:
# Print the first training example as it is returned after set_format (the selected tensor columns).
print(train_set[0])

# Create Dataloader

In [13]:
from torch.utils.data import DataLoader

batch_size = 1 # batch size of 1 with gradient accumulation to 32

# Only the training data is shuffled. Validation and test data are iterated in
# dataset order so that evaluation runs and the saved predictions are
# reproducible and line up with the source files.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
valid_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False, num_workers=2)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=2)

# Train


In [4]:
import apex
import datasets
import numpy as np
import os
import pandas as pd
from pathlib import Path
import random
import tqdm
from typing import List, Dict, Optional, Union, Tuple

import torch
from torch import nn
from torch import functional as F
from torch.utils.data import (
    TensorDataset,
    random_split,
    RandomSampler,
    DataLoader,
)

from transformers import (
    LongformerForSequenceClassification,
    LongformerModel,
    LongformerConfig, 
    Trainer, 
    TrainingArguments,
    AdamW,
)

from transformers.file_utils import ModelOutput

In [5]:
# Copied from transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput

from dataclasses import dataclass


@dataclass
class LongformerSequenceClassifierOutput(ModelOutput):
    """Container for the values returned by the sequence-level regression model.

    The upstream definition is decorated with `@dataclass`; ModelOutput relies
    on the dataclass machinery to register the fields below, so the decorator
    is required here as well.
    """

    # loss value; left as None when no labels are supplied
    loss: Optional[torch.FloatTensor] = None
    # model predictions
    logits: torch.FloatTensor = None
    # optional extra outputs forwarded from the encoder
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    global_attentions: Optional[Tuple[torch.FloatTensor]] = None

In [6]:
class LongformerCNN(LongformerModel):
    """Pretrained Longformer encoder followed by parallel 1-D convolutions,
    max-pooling, a linear layer and a sigmoid, trained with an MSE loss.

    Args:
        config: LongformerConfig handed to the parent constructor.
        channels: `in_channels` of each convolution.
        filter_sizes: kernel size of each convolution.
        num_filters: `out_channels` of each convolution.
        dropout: dropout probability applied before the linear layer.
        num_labels: output size of the linear layer.
    """
    def __init__(self, config,
                 channels=(4096,4096),
                 filter_sizes=(3,3),
                 num_filters=(100,100), 
                 dropout=0.1,
                 num_labels=1,
                 ):
        super().__init__(config)
        self.num_labels = num_labels

        # embedding model
        # NOTE(review): the parent constructor has already built a Longformer
        # from `config`; forward() only uses the pretrained copy below, so
        # settings in `config` do not reach the encoder that actually runs and
        # the parent's own layers look unused. Changing this would alter the
        # checkpoint layout, so it is only flagged here.
        self.longformer = LongformerModel.from_pretrained(
            'allenai/longformer-base-4096',
            add_pooling_layer=False,
            )

        # cnn
        # NOTE(review): the convolutions are applied in parallel to the raw
        # encoder output, with `channels` matching the 4096-token padded
        # length. This presumably means the sequence axis is consumed as the
        # Conv1d channel axis; confirm this is intended and not a missing
        # transpose.
        self.filter_sizes = filter_sizes
        self.num_filters = num_filters
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=channels[i],
                      out_channels=self.num_filters[i],
                      kernel_size=self.filter_sizes[i],
                      padding=1,
                      stride=1) # from He et al 2016 PWIM paper
            for i in range(len(filter_sizes))
        ])

        # dense + sigmoid
        # built-in sum + int() hands nn.Linear a plain Python int, not a numpy integer
        self.dense = nn.Linear(int(sum(self.num_filters)), self.num_labels)
        self.dropout = nn.Dropout(p=dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids,
                attention_mask=None,
                global_attention_mask=None,
                labels=None,
                return_dict=True):
        """Run the encoder and CNN head; compute the MSE loss when labels are given.

        Args:
            input_ids, attention_mask, global_attention_mask: forwarded
                unchanged to the Longformer encoder.
            labels: optional regression targets. When None, no loss is computed.
            return_dict: when False a plain tuple is returned instead of a
                LongformerSequenceClassifierOutput.

        Returns:
            LongformerSequenceClassifierOutput holding loss, logits (after the
            sigmoid) and the encoder's optional extra outputs, or the
            equivalent tuple when `return_dict` is False.
        """
        outputs = self.longformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask
            )
        seq_output = outputs[0]

        # one feature map per convolution, each reduced to its maximum over the last axis
        pooled_output = []
        for conv in self.convs:
            feature_map = nn.functional.relu(conv(seq_output))
            # functional pooling avoids building a new MaxPool1d module on every call
            pooled = nn.functional.max_pool1d(feature_map, kernel_size=feature_map.shape[2])
            pooled_output.append(pooled.squeeze(dim=2))
        concat_output = torch.cat(pooled_output, dim=1)

        logits = self.dense(self.dropout(concat_output))
        logits = self.sigmoid(logits)

        # from  transformers.models.longformer.modeling_longformer.LongformerForSequenceClassification
        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            if self.num_labels == 1:
                # flatten both sides so predictions and targets line up one-to-one
                loss = loss_fct(logits.reshape(-1), labels.reshape(-1))
            else:
                loss = loss_fct(logits, labels)

        if not return_dict:
            # The encoder is called in its default dict mode, where fields that
            # are None are dropped from positional indexing. A fixed slice such
            # as outputs[2:] would therefore skip the first optional output, so
            # the optional outputs are collected by name instead.
            extras = tuple(
                item
                for item in (outputs.hidden_states, outputs.attentions, outputs.global_attentions)
                if item is not None
            )
            output = (logits,) + extras
            return ((loss,) + output) if loss is not None else output

        return LongformerSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            global_attentions=outputs.global_attentions,
        )

In [15]:
# define the training arguments

SAVE_PATH = DIR + 'longformer_cnn_4096/'

NUM_TRAIN_EPOCHS = 1
PER_DEVICE_TRAIN_BATCH_SIZE = 1 # as in the paper
GRADIENT_ACCUMULATION_STEPS = 32 # as in the paper

# `warmup_steps` is measured in optimizer updates, not in examples. With
# gradient accumulation one update consumes batch_size * accumulation examples,
# so len(train_set) // 10 would exceed the number of updates in the whole run
# and the learning rate would never reach its peak. Warm up over 10% of the
# actual number of updates instead.
# NOTE(review): assumes a single device (one Colab GPU).
updates_per_epoch = max(
    len(train_set) // (PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS), 1
)
WARMUP_STEPS = (updates_per_epoch * NUM_TRAIN_EPOCHS) // 10

training_args = TrainingArguments(
    output_dir = SAVE_PATH,
    do_train = True,
    do_eval = True,
    num_train_epochs = NUM_TRAIN_EPOCHS,
    per_device_train_batch_size = PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
    per_device_eval_batch_size= 8,
    evaluation_strategy = "steps",
    eval_steps = 100,
    disable_tqdm = False, 
    load_best_model_at_end=True,
    learning_rate = 3e-5, # from paper (default = 5e-5)
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    logging_steps = 500, # =default
    fp16 = True,
    fp16_opt_level = 'O1', # default for apex mixed precision
    logging_dir= DIR + '/logs/',
    dataloader_num_workers = 2,
    run_name = 'longformer-cnn',
)

In [None]:
# Build the model configuration and the model, then display the resulting config.

config = LongformerConfig(
    vocab_size=tokenizer.vocab_size,
    gradient_checkpointing=True,
    num_labels=1,  # single output -> regression
)
lf = LongformerCNN(config)
lf.config

In [None]:
# resize token embeddings
# (sized from len(tokenizer), i.e. the number of tokens the tokenizer reports)
lf.resize_token_embeddings(len(tokenizer))

# train
# The Trainer receives the dataset objects directly; the DataLoaders created
# earlier in the notebook are not passed to it.
trainer = Trainer(
    model = lf,
    args = training_args,
    train_dataset = train_set,
    eval_dataset = validation_set,
)

# set device to cuda
# NOTE(review): `device` is not referenced again in this cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

trainer.train()

# save best model
# NOTE(review): this saves `lf` in whatever state training leaves it; that this
# is the best checkpoint presumably relies on load_best_model_at_end=True -- confirm.
lf.save_pretrained(SAVE_PATH)


In [None]:
import pprint
# Evaluate on the validation set. A failure here is reported instead of being
# silently swallowed, so the reason is visible while the notebook still
# continues to the test section.
try:
  eval_metrics = trainer.evaluate()
  pprint.pprint(eval_metrics)
except Exception as err:
  print("no eval")
  print(f"evaluation failed: {err!r}")

# Test

In [None]:
# save predictions
# Reload the saved model, run it over the test loader one batch at a time and
# append one (prediction, gold label) row per batch to a CSV file.

import numpy as np
import pandas as pd
import copy
import gc

SAVE_PATH = DIR + 'longformer_cnn_4096/'
lf = LongformerCNN.from_pretrained(SAVE_PATH)

PREDS_SAVE_PATH = DIR + "longformer_cnn_predictions.csv"
RESULT_COLUMNS = ['predictions', 'gold_labels']

# start a fresh file that contains only the header row
pd.DataFrame(columns=RESULT_COLUMNS).to_csv(PREDS_SAVE_PATH)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
lf.to(device)
lf.eval()  # inference mode: dropout disabled

# no_grad: no autograd graph is kept, so activations are released after each
# batch and no manual `del` / gc.collect() is needed to keep memory bounded.
with torch.no_grad():
    for row_idx, batch in enumerate(test_loader):
        outputs = lf(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            global_attention_mask=batch['global_attention_mask'].to(device),
            labels=None,
        )

        preds = outputs['logits'].cpu().numpy()
        targets = batch['labels'][0].numpy()
        # only the first element is read from each side, i.e. one example per batch
        results = {
            'predictions': preds[0][0],
            'gold_labels': targets[0][0],
            }
        # Rows are appended one at a time so partial results survive an
        # interrupted run. DataFrame.append no longer exists in pandas 2.x, so
        # a one-row frame is built directly; the running index gives every CSV
        # row a distinct id.
        row = pd.DataFrame([results], columns=RESULT_COLUMNS, index=[row_idx])
        row.to_csv(PREDS_SAVE_PATH, mode='a', header=False)