In [1]:
import torch
from transformers import RobertaTokenizer, RobertaForMultipleChoice, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import json
import json_lines
import os
from tqdm import tqdm

In [2]:
class MultipleChoiceDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids, attention_mask, label = self.data[idx]
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label
        }

In [3]:
answer_map = {'A':0,'B':1,'C':2,'D':3,'E':4}
def load_og_data(file_path):
    data = []
    with open(file_path, 'rb') as f: 
        for item in json_lines.reader(f):
            data.append(item)

    processed_data = []
    for item in data:
        question = item['question']['stem']
        options = [_['text'] for _ in item['question']['choices']]
        examples = []
        for option in options:
            text = question + " " + option
            encoded = tokenizer.encode_plus(
                text,
                truncation=True,
                max_length=512,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt'
            )
            examples.append(encoded)
    
        input_ids = torch.stack([example['input_ids'] for example in examples]).squeeze()
        attention_mask = torch.stack([example['attention_mask'] for example in examples]).squeeze()

        label = torch.tensor(answer_map[item['answerKey']])

        processed_data.append((input_ids, attention_mask, label))

    return processed_data

In [4]:
def load_adv_data(file_path):
    with open(file_path, 'r') as f: 
        data = json.load(f)

    processed_data = []
    for item in data:
        question = item['question']
        options = [str(option) for option in item['choice_list']]
        examples = []
        for option in options:
            text = question + " " + option
            encoded = tokenizer.encode_plus(
                text,
                truncation=True,
                max_length=512,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt'
            )
            examples.append(encoded)
    
        input_ids = torch.stack([example['input_ids'] for example in examples]).squeeze()
        attention_mask = torch.stack([example['attention_mask'] for example in examples]).squeeze()

        label = torch.tensor(item['label'])

        processed_data.append((input_ids, attention_mask, label))

    return processed_data

In [5]:
model = RobertaForMultipleChoice.from_pretrained('roberta-large')
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# train_data = load_og_data("data/rs_train.jsonl")
# valid_data = load_og_data("data/rs_dev.jsonl")

train_data = load_adv_data("data/adversarial_rs_train.json")
valid_data = load_og_data("data/rs_dev.jsonl")

In [7]:
train_dataset = MultipleChoiceDataset(train_data)
valid_dataset = MultipleChoiceDataset(valid_data)

batch_size = 4

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=batch_size)

In [8]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cuda:1"
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [9]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    average_loss = 0
    for index, batch in tqdm(enumerate(train_loader)):
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)
        outputs = model(**inputs, labels=labels)
        if index ==0:
            print(outputs)
        loss = outputs.loss
        total_loss += loss.item()
        average_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        if index%100==0:
            print("###{}####: Average loss: {}".format(index,average_loss / 100))
            average_loss = 0

    avg_train_loss = total_loss / len(train_loader)

    model.eval()
    preds = []
    true_labels = []
    for batch in tqdm(valid_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"]
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).detach().cpu().numpy())
        true_labels.extend(labels.numpy())

    acc = accuracy_score(true_labels, preds)
    print(f'Epoch: {epoch+1}, Train loss: {avg_train_loss}, Validation accuracy: {acc}')
model.save_pretrained('/usr1/data/devanshj/brainteaser/checkpoints/roberta_rs_finetuned'.format(epoch))

0it [00:00, ?it/s]

MultipleChoiceModelOutput(loss=tensor(1.6194, device='cuda:1', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1231, -0.2414, -0.1179,  0.0579,  0.0723],
        [ 0.2343, -0.0635,  0.0789, -0.1069, -0.1093],
        [-0.2886,  0.1360,  0.1114, -0.0787,  0.1086],
        [ 0.0443,  0.0033,  0.0247,  0.0093, -0.1758]], device='cuda:1',
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


1it [00:01,  1.67s/it]

###0####: Average loss: 0.016194376945495605


101it [02:17,  1.38s/it]

###100####: Average loss: 1.633327763080597


201it [04:35,  1.38s/it]

###200####: Average loss: 1.640072886943817


301it [06:53,  1.38s/it]

###300####: Average loss: 1.6711408650875093


401it [09:11,  1.38s/it]

###400####: Average loss: 1.6418770849704742


501it [11:29,  1.38s/it]

###500####: Average loss: 1.6380310940742493


601it [13:47,  1.38s/it]

###600####: Average loss: 1.6205302941799165


701it [16:05,  1.38s/it]

###700####: Average loss: 1.6408723163604737


801it [18:22,  1.38s/it]

###800####: Average loss: 1.6535711371898651


901it [20:40,  1.38s/it]

###900####: Average loss: 1.6383634173870087


1001it [22:58,  1.38s/it]

###1000####: Average loss: 1.6374751091003419


1101it [25:16,  1.38s/it]

###1100####: Average loss: 1.65218292593956


1201it [27:34,  1.38s/it]

###1200####: Average loss: 1.6327853882312775


1301it [29:52,  1.38s/it]

###1300####: Average loss: 1.6293265473842622


1401it [32:10,  1.38s/it]

###1400####: Average loss: 1.6356316316127777


1501it [34:27,  1.38s/it]

###1500####: Average loss: 1.6161284244060516


1601it [36:45,  1.38s/it]

###1600####: Average loss: 1.5872885692119598


1701it [39:03,  1.38s/it]

###1700####: Average loss: 1.5333523386716843


1755it [40:18,  1.38s/it]
100%|██████████| 256/256 [02:02<00:00,  2.09it/s]


Epoch: 1, Train loss: 1.6293627437023694, Validation accuracy: 0.1851126346718903


0it [00:00, ?it/s]

MultipleChoiceModelOutput(loss=tensor(1.7430, device='cuda:1', grad_fn=<NllLossBackward0>), logits=tensor([[-0.1863, -0.5929, -0.1724, -0.6392, -1.1309],
        [-1.6457, -0.0419, -1.0845, -1.0261, -1.8096],
        [-0.3004, -1.0264, -0.3677, -1.1997, -0.2362],
        [-0.3636, -1.8944, -0.1251,  0.0121, -0.1777]], device='cuda:1',
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


1it [00:01,  1.37s/it]

###0####: Average loss: 0.01742978811264038


101it [02:19,  1.38s/it]

###100####: Average loss: 1.6179322063922883


201it [04:37,  1.38s/it]

###200####: Average loss: 1.5683157002925874


301it [06:54,  1.38s/it]

###300####: Average loss: 1.3665251302719117


401it [09:12,  1.38s/it]

###400####: Average loss: 1.2167497685551643


501it [11:30,  1.38s/it]

###500####: Average loss: 1.1131323109567166


601it [13:48,  1.38s/it]

###600####: Average loss: 1.0800309133529664


701it [16:06,  1.38s/it]

###700####: Average loss: 0.948767658919096


801it [18:24,  1.38s/it]

###800####: Average loss: 0.9747387173864991


901it [20:42,  1.38s/it]

###900####: Average loss: 0.9622946005407721


1001it [23:00,  1.38s/it]

###1000####: Average loss: 0.8045174706727266


1101it [25:17,  1.38s/it]

###1100####: Average loss: 0.9582158156484365


1201it [27:35,  1.38s/it]

###1200####: Average loss: 0.8362304078042507


1301it [29:53,  1.38s/it]

###1300####: Average loss: 0.8488803021423519


1401it [32:11,  1.38s/it]

###1400####: Average loss: 0.7387974759750068


1501it [34:29,  1.38s/it]

###1500####: Average loss: 0.7042732928320765


1601it [36:47,  1.38s/it]

###1600####: Average loss: 0.7668518692441285


1701it [39:05,  1.38s/it]

###1700####: Average loss: 0.7322750401403755


1755it [40:20,  1.38s/it]
100%|██████████| 256/256 [02:02<00:00,  2.08it/s]


Epoch: 2, Train loss: 1.0065598510152678, Validation accuracy: 0.5200783545543585


0it [00:00, ?it/s]

MultipleChoiceModelOutput(loss=tensor(0.7846, device='cuda:1', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7264, -1.3636,  0.3102,  1.1300, -0.4455],
        [-3.6481, -2.2580, -2.6616, -0.7726, -2.1941],
        [-2.7697, -0.7451, -1.3309,  4.0201,  0.0925],
        [-2.7175, -3.7272, -3.6943, -4.1817, -3.0349]], device='cuda:1',
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


1it [00:01,  1.37s/it]

###0####: Average loss: 0.007846204042434692


101it [02:19,  1.38s/it]

###100####: Average loss: 0.6384017168264836


201it [04:37,  1.38s/it]

###200####: Average loss: 0.5641478735720739


301it [06:55,  1.38s/it]

###300####: Average loss: 0.6082120141177438


401it [09:13,  1.38s/it]

###400####: Average loss: 0.6061448626080528


501it [11:31,  1.38s/it]

###500####: Average loss: 0.5971926446235739


601it [13:48,  1.38s/it]

###600####: Average loss: 0.5696255942765857


701it [16:06,  1.38s/it]

###700####: Average loss: 0.5025871004420333


801it [18:24,  1.38s/it]

###800####: Average loss: 0.5602027135739627


901it [20:42,  1.38s/it]

###900####: Average loss: 0.6076388692315959


1001it [23:00,  1.38s/it]

###1000####: Average loss: 0.4916388287645896


1101it [25:18,  1.38s/it]

###1100####: Average loss: 0.4779828909500247


1201it [27:36,  1.38s/it]

###1200####: Average loss: 0.5361312924255617


1301it [29:54,  1.38s/it]

###1300####: Average loss: 0.5912511105634621


1401it [32:12,  1.38s/it]

###1400####: Average loss: 0.5209339944865861


1501it [34:30,  1.38s/it]

###1500####: Average loss: 0.5788109049887862


1601it [36:48,  1.38s/it]

###1600####: Average loss: 0.4259777308237972


1701it [39:06,  1.38s/it]

###1700####: Average loss: 0.5753501329075515


1755it [40:20,  1.38s/it]
100%|██████████| 256/256 [02:02<00:00,  2.08it/s]


Epoch: 3, Train loss: 0.5555712995296058, Validation accuracy: 0.5494613124387855
