### Use in Colab to resolve environment (otherwise ignore)

In [None]:
%%capture
!pip install pytorch-lightning
!pip install transformers
!pip install adapter-transformers
!pip install scikit-learn 
!pip install datasets

In [None]:
# Show the GPU attached to this runtime; the Trainer cells below request gpus=1.
!nvidia-smi

In [None]:
# Module imports
import json
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset
from transformers import RobertaTokenizer
from datasets import load_dataset
import pytorch_lightning as pl
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
from transformers import RobertaForSequenceClassification, AutoAdapterModel
from torchmetrics.functional import f1_score, accuracy
import torch.nn as nn
import math

# Mount Google Drive: the Trainer cells write their logs/checkpoints under
# /content/drive/MyDrive and the ACL-ARC files are read from there.
from google.colab import drive
drive.mount('/content/drive')


### HELPFULNESS Dataset

### IMDb Dataset

In [None]:
# Load the "imdb" dataset through `datasets.load_dataset`.
imdb_dataset = load_dataset("imdb")

In [None]:
# Display the loaded object (splits / columns) as the cell output.
imdb_dataset

In [None]:
from transformers import RobertaTokenizer

# Tokenizer shared by the IMDb preprocessing below.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)


def encode(examples):
    """Tokenize the 'text' field of a batch, truncated/padded to 256 tokens."""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=256,
    )


# Tokenize every split, then expose the 'label' column under the name 'labels'
# (the keyword the classifiers' forward() expects).
imdb_dataset = (
    imdb_dataset
    .map(encode, batched=True)
    .map(lambda batch: {'labels': batch['label']}, batched=True)
)
# Only these three columns are returned, as torch tensors, when indexing.
imdb_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
imdb_dataset['validation'] = imdb_dataset['train'][20000:]
imdb_dataset['train'] = imdb_dataset['train'][:20000]


In [None]:
train_imdb = torch.utils.data.Subset(imdb_dataset['train'], range(0, 20000))
validation_imdb = torch.utils.data.Subset(imdb_dataset['train'], range(0, 20000))

In [None]:
imdb_dataset['train'] = train_imdb
imdb_dataset['validation'] = validation_imdb

In [None]:
class IMDB_Classifier(pl.LightningModule):
    """RoBERTa adapter model ("pfeiffer" config) with a classification head for IMDb.

    Args:
        config: Hyper-parameter dictionary. Reads ``model_name``, ``n_labels``,
            ``lr``, ``weight_decay``, ``train_size`` (number of training
            samples), ``batch_size``, ``warmup`` (fraction of the total
            optimizer steps) and, optionally, ``n_epochs`` (defaults to 1).
    """

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        # 1. Static classification head is used for finetuning
        # self.pre_trained_model = RobertaForSequenceClassification.from_pretrained(config['model_name'],
        #                                                                           problem_type="multi_label_classification",
        #                                                                           num_labels=self.config['n_labels'],
        #                                                                           return_dict = True)

        # 2. AutoAdapterModel with adapters
        self.pre_trained_model = AutoAdapterModel.from_pretrained(config['model_name'])
        self.pre_trained_model.add_adapter("imdb", config="pfeiffer")
        self.pre_trained_model.add_classification_head("imdb",
                                                       num_labels=self.config['n_labels'],
                                                       id2label={0: 'negative',
                                                                 1: 'positive'})
        # Enable adapter training
        self.pre_trained_model.set_active_adapters(["imdb"])
        self.pre_trained_model.train_adapter("imdb")
        # Explicit class dimension (logits are (batch, n_labels)); avoids the
        # implicit-dim deprecation warning.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_ids, attention_mask, labels):
        """Run the model once and return ``(loss, class_probabilities)``.

        The loss and the logits come from the same forward pass, and that pass
        receives the attention mask, so padding tokens are not attended to
        when the loss is computed.
        """
        # use this line when fine-tuning -- slight difference in input format
        # targets = torch.nn.functional.one_hot(labels, num_classes=3).double()
        outputs = self.pre_trained_model(input_ids=input_ids,
                                         attention_mask=attention_mask,
                                         labels=labels)
        return outputs.loss, self.softmax(outputs.logits)

    def training_step(self, batch, batch_index):
        """Compute loss and accuracy for one training batch and log both."""
        loss, output = self.forward(**batch)
        acc = accuracy(output, batch["labels"])
        self.log("train_loss", loss)
        self.log("train_acc_step", acc)
        return {"loss": loss, "accuracy": acc, "predictions": output, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        """Compute loss and F1 for one validation batch and log both."""
        loss, output = self.forward(**batch)
        f1 = f1_score(output, batch["labels"])
        self.log("val_loss", loss)
        self.log("val_f1_score", f1)
        return {"val_loss": loss, "f1_score": f1, "predictions": output, "labels": batch["labels"]}

    def test_step(self, batch, batch_index):
        """Compute F1 for one test batch; logged under its own ``test_`` key."""
        _, output = self.forward(**batch)
        f1 = f1_score(output, batch["labels"])
        # Separate key so test results are not mixed into the validation curve.
        self.log("test_f1_score", f1)
        return output

    def configure_optimizers(self):
        """Build AdamW plus a cosine schedule with linear warm-up.

        The schedule length is expressed in optimizer steps over the whole run
        and the scheduler is stepped after every batch. Returned as a bare
        list, Lightning would step it once per epoch, which leaves the learning
        rate at 0 for the first epoch and inside the warm-up ramp afterwards.
        """
        optimizer = AdamW(self.parameters(), lr=self.config['lr'],
                          weight_decay=self.config['weight_decay'],
                          no_deprecation_warning=True, correct_bias=False)
        steps_per_epoch = math.ceil(self.config['train_size'] / self.config['batch_size'])
        total_steps = steps_per_epoch * self.config.get('n_epochs', 1)
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [None]:

# Some randomly typed-in initial configs for the IMDb run.
config = dict(
    model_name='roberta-base',
    batch_size=32,
    lr=1e-4,
    warmup=0.06,                                # fraction of the schedule spent warming up
    train_size=len(imdb_dataset['train']),      # number of training samples
    weight_decay=0.000001,
    n_epochs=15,
    n_labels=2,
)

In [None]:
# Training batches are reshuffled every epoch so a batch never depends on the
# storage order of the split; validation keeps a fixed order.
imdb_train_dl = torch.utils.data.DataLoader(imdb_dataset['train'], batch_size=config['batch_size'], shuffle=True)
imdb_val_dl = torch.utils.data.DataLoader(imdb_dataset['validation'], batch_size=config['batch_size'])

# model
imdb_model = IMDB_Classifier(config=config)
# Logs and checkpoints are written under the mounted Drive folder.
trainer = pl.Trainer(max_epochs=config['n_epochs'], gpus=1, num_sanity_val_steps=1, default_root_dir='/content/drive/MyDrive/imdb')
trainer.fit(imdb_model, imdb_train_dl, imdb_val_dl)

In [None]:
# Open TensorBoard on the IMDb run. The logdir is relative to the current
# working directory while the Trainer above writes to the absolute path
# /content/drive/MyDrive/imdb, so this only finds the logs when the notebook
# runs from /content.
%reload_ext tensorboard 
%tensorboard --logdir ./drive/MyDrive/imdb/lightning_logs

### SciCite Dataset

In [None]:
# Load the "scicite" dataset through `datasets.load_dataset`.
scicite_dataset = load_dataset("scicite")

In [None]:
from transformers import RobertaTokenizer

# Tokenizer shared by the SciCite preprocessing below.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)


def encode(examples):
    """Tokenize the 'string' field of a batch, truncated/padded to 80 tokens.

    Note: this rebinds the notebook-level name `encode` that the IMDb section
    also defines.
    """
    return tokenizer(
        examples['string'],
        truncation=True,
        padding='max_length',
        max_length=80,
    )


# Tokenize every split, then expose the 'label' column under the name 'labels'
# (the keyword the classifiers' forward() expects).
scicite_dataset = (
    scicite_dataset
    .map(encode, batched=True)
    .map(lambda batch: {'labels': batch['label']}, batched=True)
)
# Only these three columns are returned, as torch tensors, when indexing.
scicite_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Number of training examples whose label id is 2.
(scicite_dataset['train']['labels'] == 2).sum()

### SciCite Model & Training

In [None]:
class Scicite_Classifier(pl.LightningModule):
    """RoBERTa adapter model ("parallel" config) with a 3-way head for SciCite.

    Args:
        config: Hyper-parameter dictionary. Reads ``model_name``, ``n_labels``,
            ``lr``, ``weight_decay``, ``train_size`` (number of training
            samples), ``batch_size``, ``warmup`` (fraction of the total
            optimizer steps) and, optionally, ``n_epochs`` (defaults to 1).
    """

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        # 1. Static classification head is used for finetuning
        # self.pre_trained_model = RobertaForSequenceClassification.from_pretrained(config['model_name'],
        #                                                                           problem_type="multi_label_classification",
        #                                                                           num_labels=self.config['n_labels'],
        #                                                                           return_dict = True)

        # 2. AutoAdapterModel with adapters
        self.pre_trained_model = AutoAdapterModel.from_pretrained(config['model_name'])
        self.pre_trained_model.add_adapter("citation-intent", config="parallel")
        self.pre_trained_model.add_classification_head("citation-intent",
                                                       num_labels=self.config['n_labels'],
                                                       id2label={0: 'method',
                                                                 1: 'background',
                                                                 2: 'result'})
        # Enable adapter training
        self.pre_trained_model.set_active_adapters(["citation-intent"])
        self.pre_trained_model.train_adapter("citation-intent")
        # Explicit class dimension (logits are (batch, n_labels)); avoids the
        # implicit-dim deprecation warning.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_ids, attention_mask, labels):
        """Run the model once and return ``(loss, class_probabilities)``.

        The loss and the logits come from the same forward pass, and that pass
        receives the attention mask, so padding tokens are not attended to
        when the loss is computed.
        """
        # use this line when fine-tuning -- slight difference in input format
        # targets = torch.nn.functional.one_hot(labels, num_classes=3).double()
        outputs = self.pre_trained_model(input_ids=input_ids,
                                         attention_mask=attention_mask,
                                         labels=labels)
        return outputs.loss, self.softmax(outputs.logits)

    def training_step(self, batch, batch_index):
        """Compute loss and accuracy for one training batch and log both."""
        loss, output = self.forward(**batch)
        acc = accuracy(output, batch["labels"])
        self.log("train_loss", loss)
        self.log("train_acc_step", acc)
        return {"loss": loss, "accuracy": acc, "predictions": output, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        """Compute loss and F1 for one validation batch and log both."""
        loss, output = self.forward(**batch)
        f1 = f1_score(output, batch["labels"])
        self.log("val_loss", loss)
        self.log("val_f1_score", f1)
        return {"val_loss": loss, "f1_score": f1, "predictions": output, "labels": batch["labels"]}

    def test_step(self, batch, batch_index):
        """Compute F1 for one test batch; logged under its own ``test_`` key."""
        _, output = self.forward(**batch)
        f1 = f1_score(output, batch["labels"])
        # Separate key so test results are not mixed into the validation curve.
        self.log("test_f1_score", f1)
        return output

    def configure_optimizers(self):
        """Build AdamW plus a cosine schedule with linear warm-up.

        The schedule length is expressed in optimizer steps over the whole run
        and the scheduler is stepped after every batch. Returned as a bare
        list, Lightning would step it once per epoch, which leaves the learning
        rate at 0 for the first epoch and inside the warm-up ramp afterwards.
        """
        optimizer = AdamW(self.parameters(), lr=self.config['lr'],
                          weight_decay=self.config['weight_decay'],
                          no_deprecation_warning=True, correct_bias=False)
        steps_per_epoch = math.ceil(self.config['train_size'] / self.config['batch_size'])
        total_steps = steps_per_epoch * self.config.get('n_epochs', 1)
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [None]:

# Some randomly typed-in initial configs for the SciCite run.
# Rebinds the notebook-level `config` used by the IMDb section.
config = dict(
    model_name='roberta-base',
    batch_size=8,
    lr=5e-4,
    warmup=0.06,                                  # fraction of the schedule spent warming up
    train_size=len(scicite_dataset['train']),     # number of training samples
    weight_decay=0.00001,
    n_epochs=15,
    n_labels=3,
)

In [None]:
# Display the SciCite training split as the cell output.
scicite_dataset['train']

In [None]:
# Training batches are reshuffled every epoch so a batch never depends on the
# storage order of the split; validation keeps a fixed order.
scicite_train_dl = torch.utils.data.DataLoader(scicite_dataset['train'], batch_size=config['batch_size'], shuffle=True)
scicite_val_dl = torch.utils.data.DataLoader(scicite_dataset['validation'], batch_size=config['batch_size'])

# model
scicite_model = Scicite_Classifier(config=config)
# Logs and checkpoints are written under the mounted Drive folder.
trainer = pl.Trainer(max_epochs=config['n_epochs'], gpus=1, num_sanity_val_steps=1, default_root_dir='/content/drive/MyDrive/scicite')
trainer.fit(scicite_model, scicite_train_dl, scicite_val_dl)

In [None]:
# Terminates the process with PID 433. The PID is hard-coded, so on a fresh
# runtime it may not exist or may belong to a different process.
# NOTE(review): presumably a stale TensorBoard server from an earlier session -
# confirm, and drop this cell from the shared notebook if so.
!kill 433

In [None]:
# Open TensorBoard on the SciCite run. The logdir is relative to the current
# working directory while the Trainer above writes to the absolute path
# /content/drive/MyDrive/scicite, so this only finds the logs when the
# notebook runs from /content.
%reload_ext tensorboard 
%tensorboard --logdir ./drive/MyDrive/scicite/lightning_logs

In [None]:
import numpy as np

# Evaluate the trained SciCite model on the held-out test split.
# Relies on `config`, `trainer` and `scicite_model` from the SciCite cells
# above; the ACL-ARC section later rebinds `config` and `trainer`.
test_dataloaders = torch.utils.data.DataLoader(
    scicite_dataset['test'],
    batch_size=config['batch_size'],
)
trainer.test(scicite_model, dataloaders=test_dataloaders)

### ACL-ARC Data Inspection

In [None]:
class DataReader:
    """Toy JSON-lines reader used for data exploration.

    Attributes set by the constructor:
        raw: the lines of the file, unparsed.
        raw_objects: one decoded JSON object per line.
        df: a DataFrame built from ``raw_objects``.

    ``shuffle`` is accepted but not used.
    """

    def __init__(self, json_name, shuffle=False):
        with open(json_name, 'r') as handle:
            lines = handle.readlines()
        self.raw = lines
        self.raw_objects = [json.loads(line) for line in lines]
        self.df = pd.DataFrame(self.raw_objects)

    def get_stats(self):
        """Return the first rows of the frame (``DataFrame.head()``)."""
        preview = self.df.head()
        return preview

    def get_data(self):
        """Return the ``text`` and ``intent`` columns as a 2-tuple."""
        texts = self.df['text']
        intents = self.df['intent']
        return texts, intents

In [None]:
# Mount Google Drive; the ACL-ARC files below are read from ./drive/MyDrive.
# (The import cell at the top of the notebook performs the same mount.)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Locations of the ACL-ARC JSON-lines splits on the mounted Drive.
train_path = "./drive/MyDrive/DL project/project/acl-arc/train.jsonl"
val_path = "./drive/MyDrive/DL project/project/acl-arc/dev.jsonl"
test_path = "./drive/MyDrive/DL project/project/acl-arc/test.jsonl"

# One exploration frame per split; each name matches the file it is read from
# (`test_acl` used to be loaded from the dev file).
train_acl = DataReader(json_name=train_path).df
val_acl = DataReader(json_name=val_path).df
test_acl = DataReader(json_name=test_path).df

In [None]:
# Preview the first rows of the ACL-ARC training frame.
train_acl.head()

In [None]:
# Simple label view
# Sorted so the list order is the same on every run (iteration order of a set
# of strings is not).
labels = sorted(set(train_acl['intent']))
# Number of training examples per intent.
train_acl.groupby('intent')['text'].count().plot.bar()

In [None]:
# Intent class names; ACL_Dataset uses them as the names of its one-hot label
# columns, and the order here matches id2label in ACL_Classifier.
attributes = ['Background', 'CompareOrContrast', 'Extends', 'Future', 'Motivation', 'Uses']

### ACL-ARC Dataset

In [None]:
class ACL_Dataset(Dataset):
    """Torch dataset over an ACL-ARC JSON-lines file.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenized ``cleaned_cite_text`` field) and ``labels`` (a float32 one-hot
    vector over ``attributes``).

    Args:
        data_path: Path to a JSON-lines file with at least the fields
            ``intent`` and ``cleaned_cite_text``.
        tokenizer: Object exposing ``encode_plus`` (e.g. ``RobertaTokenizer``).
        attributes: Intent names; position ``i`` of the label vector
            corresponds to ``attributes[i]``.
        max_token_len: Length every example is truncated/padded to.

    Raises:
        NotImplementedError: If ``data_path`` ends with ``csv``.
    """

    def __init__(self, data_path, tokenizer, attributes, max_token_len=128):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.attribute = attributes
        self.max_token_len = max_token_len
        self._prepare_data()

    def _prepare_data(self):
        '''
        Place to add other data preparations (sampling / train&test separation)

        Loads the file into ``self.data`` and appends one 0/1 column per
        attribute.
        '''
        if self.data_path[-3:] == "csv":
            # Fail here rather than later with a missing `self.data` attribute.
            raise NotImplementedError("Not yet implemented for csv")

        with open(self.data_path, 'r') as json_file:
            # Blank lines (e.g. a trailing newline) are skipped.
            raw_objects = [json.loads(line) for line in json_file if line.strip()]
        self.data = pd.DataFrame(raw_objects)

        # Turn into one-hot encoding.
        # Each column is matched to its attribute by NAME, so the encoding is
        # the same for every split even if a split lacks some intent or
        # `attributes` is not alphabetically ordered. An intent that is not in
        # `attributes` yields an all-zero row.
        one_hot = pd.DataFrame(
            {name: (self.data['intent'] == name).astype(float) for name in self.attribute},
            index=self.data.index,
        )
        self.data = self.data.join(one_hot)

    def __len__(self):
        """Number of examples in the file."""
        return len(self.data)

    def __getitem__(self, index):
        '''
            samples and labels loaded here
        '''
        item = self.data.iloc[index]
        # A row of the mixed-type frame is an object Series; convert the label
        # cells to a float32 array explicitly before building the tensor.
        labels = torch.tensor(item[self.attribute].to_numpy(dtype="float32"))
        text = str(item.cleaned_cite_text)
        tokens = self.tokenizer.encode_plus(text, add_special_tokens=True,
                    return_tensors='pt', truncation=True, max_length=self.max_token_len,
                    padding="max_length", return_attention_mask=True)

        # encode_plus returns (1, max_token_len) tensors; flatten drops the batch axis.
        return {"input_ids": tokens.input_ids.flatten(), "attention_mask": tokens.attention_mask.flatten(), "labels": labels}

In [None]:
# Stand-alone tokenizer and training dataset, used by the display cell and the
# single-output sanity check further down.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
train = ACL_Dataset(train_path, tokenizer, attributes=attributes)


In [None]:
# Show each example's intent next to the one-hot label columns added by ACL_Dataset.
train.data[['intent', 'CompareOrContrast', 'Uses', 'Background', 'Future', 'Motivation', 'Extends']]

### ACL-ARC Data module

In [None]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

In [None]:
class ACL_DataLoader(pl.LightningDataModule):
    """Lightning data module for the ACL-ARC splits.

    Args:
        train_path: JSON-lines file with the training split.
        val_path: JSON-lines file with the validation split.
        test_path: JSON-lines file with the test split.
        attributes: Intent names forwarded to ``ACL_Dataset``.
        batch_size: Batch size of every dataloader.
        max_token_length: Token length forwarded to ``ACL_Dataset``.
        model_name: Name passed to ``RobertaTokenizer.from_pretrained``.
    """

    def __init__(self, train_path, val_path, test_path, attributes, batch_size: int = 32, max_token_length: int = 128, model_name="roberta-base"):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.batch_size = batch_size
        self.max_token_length = max_token_length
        self.model_name = model_name
        self.attributes = attributes
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)

    def _build_dataset(self, data_path):
        """Create an ``ACL_Dataset`` for ``data_path`` with this module's settings."""
        return ACL_Dataset(data_path, self.tokenizer, attributes=self.attributes,
                           max_token_len=self.max_token_length)

    def setup(self, stage=None):
        """Build the datasets needed for ``stage``.

        ``None``/``"fit"`` build train and validation, ``"validate"`` and
        ``"predict"`` build validation only, ``"test"`` builds the test set.
        Paths come from the constructor arguments, not from notebook globals.
        """
        if stage in (None, "fit"):
            self.train_dataset = self._build_dataset(self.train_path)

        if stage in (None, "fit", "validate", "predict"):
            self.val_dataset = self._build_dataset(self.val_path)

        if stage == "test":
            self.test_dataset = self._build_dataset(self.test_path)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=4, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the validation split."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

    def test_dataloader(self):
        """Ordered loader over the test split (requires ``setup("test")``)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

    def predict_dataloader(self):
        """Ordered loader over the validation split, used for prediction."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4, shuffle=False)

In [None]:
# Build the data module with its default batch size and create the train /
# validation datasets; the ACL config cell below reads from this instance.
acl_datamodule = ACL_DataLoader(train_path=train_path, val_path=val_path, test_path=test_path, attributes=attributes)
acl_datamodule.setup()
acl_dataloader = acl_datamodule.train_dataloader()

### ACL-ARC Model

In [None]:
# Display the intent names as the cell output.
attributes

In [None]:
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
from transformers import RobertaForSequenceClassification, AutoAdapterModel
from torchmetrics.functional import f1_score, accuracy
import torch.nn as nn
import math

class ACL_Classifier(pl.LightningModule):
    """RoBERTa adapter model ("parallel" config) with a 6-way head for ACL-ARC.

    Batches carry one-hot ``labels``; they are converted to class indices
    with ``argmax`` before being passed to the model and the metrics.

    Args:
        config: Hyper-parameter dictionary. Reads ``model_name``, ``lr``,
            ``weight_decay``, ``train_size`` (number of training samples),
            ``batch_size``, ``warmup`` (fraction of the total optimizer steps)
            and, optionally, ``n_epochs`` (defaults to 1).
    """

    def __init__(self, config: dict):
        super().__init__()
        self.config = config

        # Static classification head is used for finetuning
        # self.pre_trained_model = RobertaForSequenceClassification.from_pretrained(config['model_name'],
        #                                                                           problem_type="multi_label_classification",
        #                                                                           num_labels=self.config['n_labels'],
        #                                                                           return_dict = True)

        # May switch to dynamic ones
        self.pre_trained_model = AutoAdapterModel.from_pretrained(config['model_name'])
        self.pre_trained_model.add_adapter("citation-intent", config="parallel")
        self.pre_trained_model.add_classification_head("citation-intent",
                                                       num_labels=6,
                                                       id2label={0: 'Background',
                                                                 1: 'CompareOrContrast',
                                                                 2: 'Extends',
                                                                 3: 'Future',
                                                                 4: 'Motivation',
                                                                 5: 'Uses'})

        # Enable adapter training
        self.pre_trained_model.set_active_adapters(["citation-intent"])
        self.pre_trained_model.train_adapter("citation-intent")
        # Explicit class dimension (logits are (batch, 6)); avoids the
        # implicit-dim deprecation warning.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_ids, attention_mask, labels):
        """Run the model once and return ``(loss, class_probabilities)``.

        ``labels`` is one-hot, (batch, 6). The loss and the logits come from
        the same forward pass, and that pass receives the attention mask, so
        padding tokens are not attended to when the loss is computed.
        """
        target = torch.argmax(labels, dim=1)
        outputs = self.pre_trained_model(input_ids=input_ids,
                                         attention_mask=attention_mask,
                                         labels=target)
        return outputs.loss, self.softmax(outputs.logits)

    def training_step(self, batch, batch_index):
        """Compute loss and accuracy for one training batch and log both."""
        loss, output = self.forward(**batch)
        target = torch.argmax(batch["labels"], dim=1)
        acc = accuracy(output, target)
        self.log("train_loss", loss)
        self.log("train_acc_step", acc)
        return {"loss": loss, "accuracy": acc, "predictions": output, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        """Compute loss and macro-F1 for one validation batch and log both."""
        loss, output = self.forward(**batch)
        target = torch.argmax(batch["labels"], dim=1)
        f1 = f1_score(output, target, average="macro", num_classes=6)
        self.log("val_loss", loss)
        self.log("val_f1_score", f1)
        return {"val_loss": loss, "f1_score": f1, "predictions": output, "labels": batch["labels"]}

    def test_step(self, batch, batch_index):
        """Compute F1 for one test batch; logged under its own ``test_`` key."""
        _, output = self.forward(**batch)
        target = torch.argmax(batch["labels"], dim=1)
        # NOTE(review): validation uses average="macro"; this keeps the
        # library-default averaging the notebook had active - confirm which
        # one should be reported.
        # f1 = f1_score(output, target, average="macro", num_classes=6)
        f1 = f1_score(output, target)
        # Separate key so test results are not mixed into the validation curve.
        self.log("test_f1_score", f1)
        return output

    def configure_optimizers(self):
        """Build AdamW plus a cosine schedule with linear warm-up.

        The schedule length is expressed in optimizer steps over the whole run
        and the scheduler is stepped after every batch. Returned as a bare
        list, Lightning would step it once per epoch, which leaves the learning
        rate at 0 for the first epoch and inside the warm-up ramp afterwards.
        """
        optimizer = AdamW(self.parameters(), lr=self.config['lr'],
                          weight_decay=self.config['weight_decay'],
                          no_deprecation_warning=True, correct_bias=False)
        steps_per_epoch = math.ceil(self.config['train_size'] / self.config['batch_size'])
        total_steps = steps_per_epoch * self.config.get('n_epochs', 1)
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [None]:

config = {
    # Some randomly typed-in initial configs
    'model_name': 'roberta-base',
    'batch_size': 32,
    'lr': 1e-4,
    # 1e-4
    'warmup': 0.06,
    'weight_decay': 0.00001,
    'n_epochs': 30,
    # Number of training SAMPLES: ACL_Classifier.configure_optimizers divides
    # this by 'batch_size', so passing the number of batches would shrink the
    # schedule by another factor of the batch size.
    'train_size': len(acl_datamodule.train_dataset),
    # Taken from `attributes` rather than `labels`, which the sanity-check
    # cell below rebinds to a tensor.
    'n_labels': len(attributes)
}

In [None]:
# Single output sanity check: push one training example through a freshly
# built model and print the loss, the class probabilities and the target.
model = ACL_Classifier(config=config)
idx = 0
# Fetch (and tokenize) the example once instead of once per field.
sample = train[idx]
input_ids = sample['input_ids']
attention_mask = sample['attention_mask']
# Named `sample_labels` so the notebook-level `labels` list is not overwritten.
sample_labels = sample['labels']
# No gradients are needed for a smoke test.
with torch.no_grad():
    loss, output = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0), sample_labels.unsqueeze(0))
print("loss" + str(loss))
print("raw prediction: " + str(output))
print("label: " + str(sample_labels))

In [None]:
# Copy datamodule here - for convenience
# Rebuilt so the loaders use config['batch_size'], and a fresh model is created
# so training does not start from the sanity-check instance.
acl_datamodule = ACL_DataLoader(train_path=train_path, val_path=val_path, test_path=test_path, attributes=attributes, batch_size=config['batch_size'])
acl_datamodule.setup()
model = ACL_Classifier(config=config)

# Auto-Training loop
# Logs and checkpoints are written under the mounted Drive folder.
trainer = pl.Trainer(max_epochs=config['n_epochs'], gpus=1, num_sanity_val_steps=1, default_root_dir='/content/drive/MyDrive/aclarc')
trainer.fit(model, acl_datamodule)

In [None]:
# Open TensorBoard on the ACL-ARC run. The logdir is relative to the current
# working directory while the Trainer above writes to the absolute path
# /content/drive/MyDrive/aclarc, so this only finds the logs when the notebook
# runs from /content.
%reload_ext tensorboard
%tensorboard --logdir ./drive/MyDrive/aclarc/lightning_logs/

In [None]:
import numpy as np
# NOTE(review): predict_dataloader() serves the VALIDATION dataset, so this
# trainer.test run scores the model on the dev split, not on test.jsonl.
test_dataloaders = acl_datamodule.predict_dataloader()
trainer.test(model ,dataloaders=test_dataloaders)