In [None]:
# Run `nvidia-smi` in a shell (IPython `!` escape) to check which GPU, if any,
# is attached to this runtime before starting the heavy cells below.
!nvidia-smi

In [None]:
!pip install transformers jsonlines datasets pyarrow

In [None]:
# install nvidia apex to use mixed precision

try:
  import apex
except:
  !git clone https://github.com/NVIDIA/apex
  %cd apex
  !pip install -v --no-cache-dir ./

# Load Data as a Hugging Face Dataset


In [None]:
# Mount Google Drive into the Colab filesystem so the data files can be read.
# The mount point '/content/gdrive' is the prefix of the DIR path used to load the data.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# load data

import datasets

# Directory on the mounted Drive that holds the JSON Lines splits.
# It must end with '/' because file names are appended by plain string concatenation.
DIR = "/content/gdrive/MyDrive/CUNY_Comp_Ling/advanced_nlp/term_project/data/"
# File names of the train / dev / test splits inside DIR.
TRAIN = 'train.jsonl'
DEV = 'dev.jsonl'
TEST = 'test.jsonl'



In [None]:
# Read the three JSON Lines files into one DatasetDict keyed by split name.
# if doesn't work, try: data_files={'train': [DIR+TRAIN, DIR+DEV]}
dataset = datasets.load_dataset(
    'json',
    data_files={
        'train': DIR + TRAIN,
        'validation': DIR + DEV,
        'test': DIR + TEST,
    },
)

# Print the loaded splits as a quick sanity check.
print(dataset)

# Tokenize

In [None]:
from transformers import LongformerTokenizer

# Load the tokenizer of the 'allenai/longformer-base-4096' checkpoint (the same
# checkpoint name the model is loaded from later); prep_data reads this global.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [None]:

import torch
import datasets
from typing import List, Dict

def prep_data(data):
    """Tokenize one example and attach its regression target.

    Uses the module-level ``tokenizer`` to encode ``data['abstract']`` and
    ``data['text']`` as a sequence pair, padded/truncated to 4096 tokens.

    Args:
        data: A single record providing the keys 'abstract', 'text' and 'label'.

    Returns:
        The tokenizer output, extended with a 'labels' entry that holds the
        label as a float tensor reshaped to (-1, 1), i.e. (1, 1) for a scalar
        label.
    """
    encodings = tokenizer.encode_plus(
        data['abstract'],
        data['text'],
        max_length=4096,
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        # `padding='max_length'` supersedes the deprecated `pad_to_max_length`
        # flag, so only the current argument is passed.
        padding='max_length',
        truncation=True,
    )
    # convert label to float tensor for regression training and give it a
    # trailing axis of size 1 (shape (1, 1) for a scalar label)
    targets = torch.tensor(data['label'], dtype=torch.float).view(-1, 1)
    encodings.update({'labels': targets})
    return encodings


In [None]:
# Encode every split with prep_data. `load_from_cache_file=False` is passed so
# a previously cached mapping result is not reused.
train_set, validation_set, test_set = (
    dataset[split_name].map(prep_data, load_from_cache_file=False)
    for split_name in ('train', 'validation', 'test')
)

# Expose only the model inputs and the target, as torch tensors.
columns = ['input_ids', 'attention_mask', 'labels']
for encoded_split in (train_set, validation_set, test_set):
    encoded_split.set_format(type='torch', columns=columns)


In [None]:
# Sanity check: inspect the first training example after encoding and formatting.
print(train_set[0])

# Create Dataloader

In [None]:
from torch.utils.data import RandomSampler, DataLoader

batch_size = 1 # batch size of 1 but gradient accumulation to 32

# Only the training data is shuffled. Validation and test loaders keep the
# dataset order so evaluation is repeatable and outputs line up with the rows.
# NOTE(review): the Trainer cells below are given the datasets directly, not
# these loaders; confirm whether the loaders are still needed.
train_loader = DataLoader(train_set, batch_size, shuffle=True, num_workers=2)
valid_loader = DataLoader(validation_set, batch_size, shuffle=False, num_workers=2)
test_loader = DataLoader(test_set, batch_size, shuffle=False, num_workers=2)

# Train


In [None]:
import apex
import datasets
import numpy as np
import os
import pandas as pd
from pathlib import Path
import random
import tqdm

import torch
from torch import nn
from torch import functional as F
from torch.utils.data import (
    TensorDataset,
    random_split,
    RandomSampler,
    DataLoader
)

from transformers import (
    LongformerForSequenceClassification,
    LongformerModel,
    LongformerConfig,
    Trainer, 
    TrainingArguments,
    AdamW,
)



In [None]:
# instantiate model

# Load the pretrained 'allenai/longformer-base-4096' weights into a
# sequence-classification model with a single output (used as a regressor).
# NOTE(review): whether `gradient_checkpointing` is accepted as a
# `from_pretrained` keyword depends on the installed transformers version; confirm.
lf = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    gradient_checkpointing=True, # default False, change to True to use in tandem with mixed precision and gradient accumulation
    num_labels=1 # regression 
    )
# Last expression of the cell: displays the resulting model configuration.
lf.config

In [None]:
# define the training arguments

# Checkpoints and the final model are written below the data directory.
SAVE_PATH = DIR + 'longformer_seq_4096/'

# Schedule hyper-parameters, named so the warmup length can be derived from them.
NUM_TRAIN_EPOCHS = 1
TRAIN_BATCH_SIZE = 1    # as in the paper
GRAD_ACCUM_STEPS = 32   # as in the paper

# `warmup_steps` counts optimizer updates, not training examples. With gradient
# accumulation there is one update per TRAIN_BATCH_SIZE * GRAD_ACCUM_STEPS
# examples, so len(train_set)//10 would exceed the total number of updates and
# the learning rate would never leave the warmup phase. Warm up over 10% of
# the actual updates instead.
# NOTE(review): assumes a single device; with several GPUs the effective batch
# size (and hence the number of updates) changes.
updates_per_epoch = max(1, len(train_set) // (TRAIN_BATCH_SIZE * GRAD_ACCUM_STEPS))
WARMUP_STEPS = (updates_per_epoch * NUM_TRAIN_EPOCHS) // 10

training_args = TrainingArguments(
    output_dir = SAVE_PATH,
    do_train = True,
    do_eval = True,
    num_train_epochs = NUM_TRAIN_EPOCHS,
    per_device_train_batch_size = TRAIN_BATCH_SIZE, # as in the paper
    gradient_accumulation_steps = GRAD_ACCUM_STEPS, # as in the paper
    per_device_eval_batch_size= 8,
    evaluation_strategy = "steps",
    eval_steps = 100,
    disable_tqdm = False,
    load_best_model_at_end=True,
    learning_rate = 3e-5, # from paper (default = 5e-5)
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    logging_steps=500, # =default
    fp16 = True, # mixed precision
    fp16_opt_level = 'O1', # default for apex mixed precision
    # DIR already ends with '/', so no leading slash is added here.
    logging_dir= DIR + 'logs/',
    dataloader_num_workers = 2,
    run_name = 'longformer-seq-tuned'
)

In [None]:
# Resize the model's token embeddings to the tokenizer's vocabulary size.
lf.resize_token_embeddings(len(tokenizer))

# Record which device is available; the Trainer call below is not passed this value.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fine-tune on the training split, evaluating on the validation split.
trainer = Trainer(
    model=lf,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=validation_set,
)
trainer.train()

# Persist the model weights and config next to the checkpoints.
lf.save_pretrained(SAVE_PATH)

In [None]:
# Best-effort evaluation on the validation split: a failure here must not stop
# the notebook, but the reason is reported instead of being silently dropped.
try:
  eval_metrics = trainer.evaluate()
  print(eval_metrics)
except Exception as err:
  # `Exception` (unlike a bare `except`) still lets KeyboardInterrupt and
  # SystemExit propagate, so the cell can be interrupted.
  print("no eval")
  print(f"evaluation failed: {err!r}")

# Test

In [None]:
import datasets
import numpy as np

# Accuracy metric object loaded through the `datasets` library; compute_metrics
# below calls its `compute` method.
metric = datasets.load_metric("accuracy")

def compute_metrics(eval_pred):
    """Turn raw model outputs and gold labels into a metrics dictionary.

    Args:
        eval_pred: A pair ``(logits, labels)``.

    Returns:
        For outputs with a single value per example (the regression setup of
        this notebook, ``num_labels=1``): ``{'mse': ..., 'mae': ...}``.
        Otherwise: the result of the module-level accuracy ``metric`` on the
        argmax predictions, as before.
    """
    logits, labels = eval_pred
    # With one output per example, argmax over the last axis is always 0, so
    # accuracy would say nothing about the model. Report regression errors instead.
    if np.shape(logits)[-1:] == (1,):
        flat_preds = np.asarray(logits, dtype=np.float64).reshape(-1)
        flat_gold = np.asarray(labels, dtype=np.float64).reshape(-1)
        errors = flat_preds - flat_gold
        return {
            "mse": float(np.mean(errors ** 2)),
            "mae": float(np.mean(np.abs(errors))),
        }
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# save model's predictions

import jsonlines
import json
import pandas as pd

# get predictions
# A fresh Trainer with default arguments is enough for inference on the test split.
tester = Trainer(model=lf)
predictions = tester.predict(test_set)

# save
PRED_SAVE_PATH = DIR + "longformer_seq_4096_predictions.csv"

# The model has one output per example, so both arrays are flattened to one
# scalar per row. Flattening (rather than indexing `[0]`) also copes with an
# extra singleton axis on the labels, which prep_data stores with shape (1, 1)
# per example, so the CSV holds plain numbers instead of nested lists.
preds = predictions.predictions.reshape(-1).tolist()
label_ids = predictions.label_ids.reshape(-1).tolist()
dictionary = {
    'gold_labels': label_ids,
    'predictions': preds,
}
df = pd.DataFrame.from_dict(dictionary)
df.to_csv(PRED_SAVE_PATH)

In [None]:
# Display the metrics attached to the result of the predict call on the test set.
predictions.metrics