__Objective__: Fine-tune the BERTweet model (`vinai/bertweet-base`) on the CMV dataset and save the result to `MyDrive/DL/models/bertweet`.

__Runtime__: GPU

In [None]:
!nvidia-smi

In [None]:
# Mount Google Drive at /content/gdrive. The dataset and model paths used in
# this notebook all live under /content/gdrive/MyDrive. force_remount=True is
# passed so the mount is redone even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
import pickle as pkl
import matplotlib.pyplot as plt
from matplotlib import colors

# Training BERTweet on the CMV dataset

In [None]:
!pip install transformers

In [None]:
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AdamW

In [None]:
import os

def read_split(dir):
    """Load one labelled split from a two-column CSV file.

    The first row is treated as a header and skipped. Every remaining row must
    hold exactly two fields, ``label,text``. The label ``'AH'`` maps to 1 and
    every other label maps to 0. Both fields are stripped of surrounding
    whitespace.

    The file is parsed with the standard ``csv`` module instead of a plain
    ``split(",")``, so a quoted text field may itself contain commas or line
    breaks. Completely empty rows are skipped.

    Args:
        dir: Path of the CSV file. The name is kept for backward compatibility
            even though it shadows the ``dir`` builtin.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists: the text of
        every row and the matching 0/1 label.

    Raises:
        ValueError: If a non-empty data row does not have exactly two fields.
    """
    import csv  # local import: keeps this cell independent of the imports cell

    texts = []
    labels = []
    # newline='' lets the csv module handle line breaks inside quoted fields.
    with open(dir, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                continue  # blank line
            if len(row) != 2:
                # Explicit exception instead of assert: asserts vanish under -O.
                raise ValueError(
                    "%s: line %d: expected 2 fields (label,text), got %d"
                    % (dir, reader.line_num, len(row))
                )
            label = row[0].strip()
            text = row[1].strip()
            texts.append(text)
            labels.append(1 if label == 'AH' else 0)
    return texts, labels


# Read the training and test splits from the mounted Drive folder.
train_texts, train_labels = read_split(
    '/content/gdrive/MyDrive/DL/dataset/pytorch/train.csv'
)
test_texts, test_labels = read_split(
    '/content/gdrive/MyDrive/DL/dataset/pytorch/test.csv'
)

In [None]:
# Load the tokenizer that belongs to the "vinai/bertweet-base" checkpoint.
# use_fast=False is passed through to AutoTokenizer — presumably to select the
# slow (pure-Python) tokenizer implementation; confirm against the
# transformers documentation.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

In [None]:
# Every example is truncated / padded to exactly this many tokens.
max_seq_length = 64


def _encode(texts):
    """Tokenize ``texts`` with the fixed-length settings shared by both splits."""
    return tokenizer(
        texts,
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    )


train_encodings = _encode(train_texts)
test_encodings = _encode(test_texts)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs per-example encodings with labels.

    ``encodings`` is a mapping from feature name to an indexable collection
    holding one entry per example; ``labels`` is an indexable collection with
    one label per example. Items are converted to tensors lazily, on access.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus a ``'labels'`` entry."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so a DataLoader can batch them.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the pretrained "vinai/bertweet-base" checkpoint into a sequence-
# classification model, move it to the selected device, and switch it to
# training mode. model.train() stays the last expression of the cell.
model = RobertaForSequenceClassification.from_pretrained(
    "vinai/bertweet-base"
).to(device)
model.train()

In [None]:
# Shuffled mini-batches of 64 examples for training.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Use PyTorch's own AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of it. eps and weight_decay are set explicitly to the
# defaults of the transformers implementation (1e-6 and 0.0) so the
# optimisation settings stay as they were.
optim = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)

In [None]:
from tqdm import tqdm

# Fine-tune for 3 passes over the training data.
for epoch in range(3):
    # Re-assert training mode on every epoch: the evaluation cell calls
    # model.eval(), so re-running this cell afterwards would otherwise train
    # with the model still in evaluation mode.
    model.train()
    for batch in tqdm(train_loader):
        optim.zero_grad()  # clear gradients left over from the previous step
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model return the loss with its outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Attribute access, consistent with `outputs.logits` in the evaluation cell.
        loss = outputs.loss
        loss.backward()
        optim.step()

In [None]:
import numpy as np
!pip install datasets
from datasets import load_metric

In [None]:
# Score the fine-tuned model on the test split with the "accuracy" metric.
metric = load_metric("accuracy")
eval_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
model.eval()
for batch in eval_loader:
    # Move every tensor of the batch onto the model's device.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the last axis.
        predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
# Saving best practice: when the default file names are used, the model can be
# reloaded later with from_pretrained().

output_dir = '/content/gdrive/MyDrive/DL/models/bertweet'

# Create the output directory if needed. exist_ok=True replaces the separate
# os.path.exists() check and closes the gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save the trained model, its configuration and the tokenizer using
# `save_pretrained()`. They can then be reloaded using `from_pretrained()`.
# A wrapped model (e.g. DataParallel) exposes the real model as `.module`.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)