In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
import numpy as np
import torch.nn.functional as F
import random

# Seed the Python, NumPy and PyTorch random number generators with one shared
# value so that each of them starts from a fixed state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

class SST5Dataset(Dataset):
    """Map-style dataset that tokenizes one dataframe row per lookup.

    Rows are read positionally: column 0 is converted to ``str`` and used as
    the sentence, column 1 is converted to ``int`` and used as the label.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the rows, the tokenizer and the padded/truncated length.

        Args:
            dataframe: Object supporting ``len()`` and ``.iloc[row, col]``.
            tokenizer: Object exposing ``encode_plus`` (called per item).
            max_len: Value forwarded to the tokenizer as ``max_length``.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its model inputs and label.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` (each flattened to
            one dimension) plus ``'label'`` as a ``torch.long`` scalar tensor.
        """
        text = str(self.data.iloc[index, 0])
        target = int(self.data.iloc[index, 1])

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # Flatten to 1-D; presumably the tokenizer returns a leading batch
        # axis of size one when asked for 'pt' tensors -- confirm.
        sample = {
            key: torch.flatten(encoded[key])
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle):
    """Wrap ``df`` in an ``SST5Dataset`` and return a ``DataLoader`` over it.

    Args:
        df: Dataframe handed to ``SST5Dataset``.
        tokenizer: Tokenizer handed to ``SST5Dataset``.
        max_len: Token length handed to ``SST5Dataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data.

    Returns:
        DataLoader: Loader created with ``num_workers=0``.
    """
    return DataLoader(
        SST5Dataset(df, tokenizer, max_len),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0,
    )

def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch the model is run on ``input_ids`` / ``attention_mask``,
    the loss is computed on ``outputs.logits`` against ``label``, and the
    optimizer and scheduler are each stepped once.

    Args:
        model: Module whose call returns an object with a ``logits`` attribute.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        loss_fn: Callable ``(logits, labels) -> scalar loss tensor``.
        optimizer: Optimizer stepped after each backward pass.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        NumPy mean of the per-batch losses (NaN when there were no batches).
    """
    model = model.train()
    losses = []
    # Start from a tensor rather than the int 0 so that ``.double()`` below
    # still works when the loader yields no batches at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['label'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Predicted class = index of the largest logit in each row.
        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Clear gradients so the next batch does not accumulate onto them.
        optimizer.zero_grad()
    # np.mean([]) would warn and return NaN; return the NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else np.float64('nan')
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module whose call returns an object with a ``logits`` attribute.
        data_loader: Iterable of dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        loss_fn: Callable ``(logits, labels) -> scalar loss tensor``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        NumPy mean of the per-batch losses (NaN when there were no batches).
    """
    model = model.eval()
    losses = []
    # Start from a tensor rather than the int 0 so that ``.double()`` below
    # still works when the loader yields no batches at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['label'].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # Predicted class = index of the largest logit in each row.
            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    # np.mean([]) would warn and return NaN; return the NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else np.float64('nan')
    return correct_predictions.double() / n_examples, mean_loss

def main():
    """Fine-tune ``roberta-base`` as a 5-class classifier and print test accuracy.

    Reads ``train.csv``, ``validation.csv`` and ``test.csv`` from the working
    directory, trains with early stopping on validation loss, saves the
    weights with the lowest validation loss to ``Labeling_Roberta_model.bin``
    and finally evaluates those weights on the test split.
    """
    train_file = 'train.csv'
    validation_file = 'validation.csv'
    test_file = 'test.csv'

    train_df = pd.read_csv(train_file)
    val_df = pd.read_csv(validation_file)
    test_df = pd.read_csv(test_file)

    PRE_TRAINED_MODEL_NAME = 'roberta-base'
    tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

    MAX_LEN = 128
    BATCH_SIZE = 32
    EPOCHS = 20
    LEARNING_RATE = 1e-5
    # Single definition of the checkpoint file used for both save and reload.
    CHECKPOINT_PATH = 'Labeling_Roberta_model.bin'

    # Only the training split is shuffled; validation and test keep file order.
    train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
    val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=False)
    test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(torch.cuda.is_available())

    model = RobertaForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=5)
    model = model.to(device)

    # Freeze the whole RoBERTa backbone first ...
    for param in model.roberta.parameters():
        param.requires_grad = False

    # ... then re-enable training for the last three encoder layers ...
    for param in model.roberta.encoder.layer[-3:].parameters():
        param.requires_grad = True

    # ... and for the classification head.
    for param in model.classifier.parameters():
        param.requires_grad = True

    # Only parameters left trainable above are handed to the optimizer.
    # NOTE(review): transformers' AdamW is reportedly deprecated upstream in
    # favour of torch.optim.AdamW; switching would drop correct_bias=False,
    # so it is left unchanged here -- confirm against the installed version.
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE, correct_bias=False, weight_decay=0.01)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)

    # The scheduler is stepped once per batch, so it spans batches * epochs.
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    best_loss = np.inf
    early_stopping_patience = 3
    early_stopping_counter = 0
    # Tracks whether THIS run wrote a checkpoint, so a file left over from an
    # earlier run is never mistaken for the current best model.
    checkpoint_saved = False

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 50)

        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(train_df)
        )

        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(
            model,
            val_data_loader,
            loss_fn,
            device,
            len(val_df)
        )

        print(f'Validation loss {val_loss} accuracy {val_acc}')

        # Checkpoint on strict improvement of validation loss; otherwise count
        # the epoch towards the early-stopping patience.
        if val_loss < best_loss:
            torch.save(model.state_dict(), CHECKPOINT_PATH)
            checkpoint_saved = True
            best_loss = val_loss
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping")
            break

    if checkpoint_saved:
        # map_location keeps the reload valid regardless of the device the
        # tensors were saved from.
        model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
        model = model.to(device)
    else:
        # Validation loss never improved on inf (e.g. it was NaN), so there is
        # no checkpoint from this run; fall back to the in-memory weights.
        print('No checkpoint was saved during this run; evaluating the current weights')

    test_acc, _ = eval_model(
        model,
        test_data_loader,
        loss_fn,
        device,
        len(test_df)
    )

    print(f'Test Accuracy: {test_acc}')

# Run the train / validate / test pipeline only when this module is the entry
# point, not when it is imported.
if __name__ == "__main__":
    main()


True


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch 1/20
--------------------------------------------------
Train loss 1.2497323269254705 accuracy 0.44159644194756553
Validation loss 1.1100154825619288 accuracy 0.5113533151680291
Epoch 2/20
--------------------------------------------------
Train loss 1.0372185742810425 accuracy 0.5429541198501873
Validation loss 1.1381742204938616 accuracy 0.48501362397820164
Epoch 3/20
--------------------------------------------------
Train loss 0.9892496794797061 accuracy 0.5668305243445693
Validation loss 1.1419281670025416 accuracy 0.5158946412352406
Epoch 4/20
--------------------------------------------------
Train loss 0.952115599135781 accuracy 0.5812265917602997
Validation loss 1.1071717381477355 accuracy 0.5331516802906449
Epoch 5/20
--------------------------------------------------
Train loss 0.9207605357920186 accuracy 0.6052200374531835
Validation loss 1.140475709097726 accuracy 0.5322434150772025
Epoch 6/20
--------------------------------------------------
Train loss 0.8914668140