# RoBERTa
Use RoBERTa Model with pretrained RoBERTa tokenizer from huggingface.

## Import Packages and Environment Variables

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm.auto as tqdm
import transformers
from sklearn.model_selection import train_test_split
import os

In [2]:
DATA_PATH = "data/"
DIR_PATH = os.getcwd()
TRAIN_DATA_PATH = os.path.join(DIR_PATH, DATA_PATH, "train.csv")
TEST_DATA_PATH = os.path.join(DIR_PATH, DATA_PATH, "test.csv")

VALIDATION_RATIO = 0.05
SEED = 1234

REMOVE_DUPLICATE = False
TRANSFORMER_NAME = "roberta-base"

BATCH_SIZE = 32
LEARNING_RATE = 1e-7
EPOCHS = 7

In [3]:
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: cuda:0


In [4]:
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
tqdm.tqdm.pandas()# enable progress_apply and progress_map for pandas
torch.cuda.empty_cache()

## Data Preprocessed and Feature Engineering

In [6]:
train_data = pd.read_csv(TRAIN_DATA_PATH)
test = pd.read_csv(TEST_DATA_PATH)
train_data.head()

Unnamed: 0,review,sentiment
0,the single worst film i've ever seen in a thea...,negative
1,I was actually around 13 years old camping nea...,positive
2,A small town is attacked by a horde of bloodth...,negative
3,I think the problem with this show not getting...,positive
4,"Wow, this movie was horrible. As a Bills fan I...",negative


In [7]:
test.head()

Unnamed: 0,id,review
0,0,An expedition party made up of constantly bick...
1,1,"Well, I'll be honest: It is not exactly a Shol..."
2,2,"This is not a boring movie, the audience might..."
3,3,My boyfriend and I decided to go see this movi...
4,4,It's a shame this movie is rated PG 13--it is ...


In [8]:
# Remove Duplicated Data
if REMOVE_DUPLICATE:
    print(train_data.duplicated(subset=["review"]).sum())
    train_data.drop_duplicates(subset=["review"], inplace=True)
    train_data = train_data.reset_index(drop=True)

In [9]:
train_data["sentiment"] = train_data["sentiment"].progress_map(
    lambda x:1 if x == "positive" else 0
    )
train_data.head()

  0%|          | 0/40000 [00:00<?, ?it/s]

Unnamed: 0,review,sentiment
0,the single worst film i've ever seen in a thea...,0
1,I was actually around 13 years old camping nea...,1
2,A small town is attacked by a horde of bloodth...,0
3,I think the problem with this show not getting...,1
4,"Wow, this movie was horrible. As a Bills fan I...",0


In [10]:
# Tockenizer
tokenizer = transformers.RobertaTokenizer.from_pretrained(TRANSFORMER_NAME)

def tokenize_function(examples, tokenizer):
    return tokenizer(examples, truncation=True)["input_ids"] 

In [11]:
train_data["input_ids"] = train_data["review"].progress_map(lambda x :tokenize_function(x, tokenizer=tokenizer))
test["input_ids"] = test["review"].progress_map(lambda x :tokenize_function(x, tokenizer=tokenizer))    

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [12]:
train_data.head()

Unnamed: 0,review,sentiment,input_ids
0,the single worst film i've ever seen in a thea...,0,"[0, 627, 881, 2373, 822, 939, 348, 655, 450, 1..."
1,I was actually around 13 years old camping nea...,1,"[0, 100, 21, 888, 198, 508, 107, 793, 16724, 5..."
2,A small town is attacked by a horde of bloodth...,0,"[0, 250, 650, 1139, 16, 4487, 30, 10, 44666, 9..."
3,I think the problem with this show not getting...,1,"[0, 100, 206, 5, 936, 19, 42, 311, 45, 562, 5,..."
4,"Wow, this movie was horrible. As a Bills fan I...",0,"[0, 23692, 6, 42, 1569, 21, 11385, 4, 287, 10,..."


In [13]:
test.head()

Unnamed: 0,id,review,input_ids
0,0,An expedition party made up of constantly bick...,"[0, 4688, 25512, 537, 156, 62, 9, 5861, 741, 1..."
1,1,"Well, I'll be honest: It is not exactly a Shol...","[0, 8346, 6, 38, 581, 28, 5322, 35, 85, 16, 45..."
2,2,"This is not a boring movie, the audience might...","[0, 713, 16, 45, 10, 15305, 1569, 6, 5, 2437, ..."
3,3,My boyfriend and I decided to go see this movi...,"[0, 2387, 6578, 8, 38, 1276, 7, 213, 192, 42, ..."
4,4,It's a shame this movie is rated PG 13--it is ...,"[0, 243, 18, 10, 9208, 42, 1569, 16, 5211, 144..."


In [14]:
tokenizer.pad_token # Padding token

'<pad>'

In [15]:
tokenizer.pad_token_id # Padding token id

1

In [16]:
# Use this index as padding value
pad_index = tokenizer.pad_token_id 

## Split trainning  set and validation set

In [17]:
train_data, valid_data = train_test_split(train_data, 
                                          test_size=VALIDATION_RATIO,
                                          random_state=SEED)
train_data.reset_index(drop=True, inplace=True)
valid_data.reset_index(drop=True, inplace=True)

In [18]:
train_data.size, valid_data.size

(114000, 6000)

In [19]:
train_data.head()

Unnamed: 0,review,sentiment,input_ids
0,What an empty and lack lustre rendition of the...,0,"[0, 2264, 41, 5802, 8, 1762, 30864, 241, 25202..."
1,"Bad plot (though good for a B-movie), good fas...",0,"[0, 26954, 6197, 36, 18401, 205, 13, 10, 163, ..."
2,This is one of the worst movies ever made. Tri...,0,"[0, 713, 16, 65, 9, 5, 2373, 4133, 655, 156, 4..."
3,This series was a cut above the rest of the TV...,1,"[0, 713, 651, 21, 10, 847, 1065, 5, 1079, 9, 5..."
4,You can only describe this with one word and t...,0,"[0, 1185, 64, 129, 6190, 42, 19, 65, 2136, 8, ..."


## Dataset & DataLoader

In [20]:
class dataset(Dataset):
    """
    Custom dataset
    Args:
        data (pd.DataFrame): DataFrame containing 'input_ids' and 'sentiment' columns.
    """
    def __init__(self, data:pd.DataFrame):
        super().__init__()
        self.ids = data["input_ids"].to_list()
        self.labels = data["sentiment"].to_list()
    
    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, ind):
        ids_tensor = torch.tensor(self.ids[ind], dtype=torch.int64)

        labels_tensor = torch.tensor(self.labels[ind], dtype=torch.int64)
        return {"ids" : ids_tensor, "label" : labels_tensor}

### Make custon collate function for padding
Since the review length is different, we need to pad the input_ids to the same length in a batch.

In [21]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_ids = [i["ids"] for i in batch]
        batch_ids = nn.utils.rnn.pad_sequence(
            batch_ids, padding_value=pad_index, batch_first=True
        )
        batch_label = [i["label"] for i in batch]
        batch_label = torch.stack(batch_label)
        batch = {"ids": batch_ids, "label": batch_label}
        return batch

    return collate_fn

### Create DataLoader

In [22]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=True):
    """Get DataLoader with custom collate_fn for padding."""
    collate_fn = get_collate_fn(pad_index)
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn
    )
    return data_loader

In [23]:
train_dataset = dataset(train_data)
valid_dataset = dataset(valid_data)

train_loader = get_data_loader(train_dataset, BATCH_SIZE, 
                               pad_index, shuffle=True)
valid_loader = get_data_loader(valid_dataset, BATCH_SIZE, 
                               pad_index, shuffle=False) 


## Model 
Build and Fine-tune the pre-trained model.

In [24]:
class Transformer(nn.Module):
    def __init__(self, transformer_name, num_classes, freeze):
        super().__init__()
        self.transformer = transformers.RobertaModel.from_pretrained(transformer_name)
        self.classifier = nn.Linear(self.transformer.config.hidden_size,
                                    num_classes)
        if freeze:
            for param in self.transformer.parameters():
                param.requires_grad = False

    def forward(self, input_ids):
        outputs = self.transformer(input_ids=input_ids).last_hidden_state
        cls_output = outputs[:, 0, :]  # [CLS] token output
        logits = self.classifier(torch.tanh(cls_output))
        return logits

In [25]:
output_dim = len(train_data["sentiment"].unique())

In [26]:
model = Transformer(TRANSFORMER_NAME, output_dim, freeze=False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Trainning and Testing

### Trainning Setup

In [27]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [28]:
model = model.to(DEVICE)

## Trainning, Evaluating and Computing Accuracy

In [29]:
def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [30]:
def evaluate(data_loader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            label = batch["label"].to(device)
            prediction = model(ids)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)


In [31]:
def train(data_loader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_accs = []
    for batch in tqdm.tqdm(data_loader, desc="training..."):
        ids = batch["ids"].to(device)
        label = batch["label"].to(device)
        prediction = model(ids)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

In [32]:
best_valid_loss = float("inf")

metrics = {"train_losses": [],
           "train_accs": [],
           "valid_losses": [],
           "valid_accs": []}

for epoch in range(EPOCHS):
    train_loss, train_acc = train(
        train_loader, model, criterion, optimizer, DEVICE
    )
    valid_loss, valid_acc = evaluate(valid_loader, model, criterion, DEVICE)
    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "transformer.pt")
    print(f"epoch: {epoch}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")

training...:   0%|          | 0/1188 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


evaluating...:   0%|          | 0/63 [00:00<?, ?it/s]

epoch: 0
train_loss: 0.693, train_acc: 0.514
valid_loss: 0.691, valid_acc: 0.541


training...:   0%|          | 0/1188 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Plot

In [None]:
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(metrics["train_losses"], label="train loss")
ax.plot(metrics["valid_losses"], label="valid loss")
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.set_xticks(range(EPOCHS))
ax.legend()
ax.grid()

In [None]:
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(metrics["train_accs"], label="train accuracy")
ax.plot(metrics["valid_accs"], label="valid accuracy")
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.set_xticks(range(EPOCHS))
ax.legend()
ax.grid()

## Generate the test answer

In [None]:
class testDataset(Dataset):
    def __init__(self, data:pd.DataFrame):
        super().__init__()
        self.ids = data["input_ids"].to_list()
    
    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, ind):
        ids_tensor = torch.tensor(self.ids[ind], dtype=torch.int64)
        return ids_tensor

In [None]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_ids = [i for i in batch]
        batch_ids = nn.utils.rnn.pad_sequence(
            batch_ids, padding_value=pad_index, batch_first=True
        )
        batch = {"ids": batch_ids}
        return batch

    return collate_fn

In [None]:
def predict(data_loader, model, device):
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, desc="predicting..."):
            ids = batch["ids"].to(device)
            prediction = model(ids)
            preds.append(prediction)
    return torch.cat(preds)

In [None]:
# load the best model
model.load_state_dict(torch.load("transformer.pt"))
model = model.to(DEVICE)
test_dataset = testDataset(test)
test_loader = get_data_loader(test_dataset, BATCH_SIZE, 
                               pad_index, shuffle=False)
predictions = predict(test_loader, model, DEVICE)

In [None]:
prediction_classes = predictions.argmax(dim=-1)
answer = pd.DataFrame({"id" : test["id"],
                       "sentiment": ["positive" if i == 1 else "negative" for i in prediction_classes.cpu().numpy()]})
answer.to_csv("submission.csv", index=False)

### Clean GPU

In [None]:
del model
torch.cuda.empty_cache()