# Common imports


In [37]:
import torch

# Device on which the model and every batch are placed.
# Automatic GPU selection is kept below for reference but is disabled, so all
# work is pinned to the CPU:
#DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE = torch.device("cpu")

# Text Transformations

# Dataset

In [38]:
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url
import torch.nn as nn

class Custom_IMDB(Dataset):
    """Map-style dataset over one split of an IMDB review CSV.

    The CSV must provide a ``review`` column (raw text) and a ``sentiment``
    column containing ``"positive"`` or ``"negative"``. Rows are split by
    position: the first 70% are ``train``, the next 20% are ``val`` and the
    remainder is ``test``.

    Each item is a dict with:
        ``text``   -- token-id tensor produced by ``self.text_transform``
                      (BOS + tokens + EOS, padded to the maximum length),
        ``label``  -- one-hot float tensor, ``[1, 0]`` for positive and
                      ``[0, 1]`` for negative,
        ``length`` -- size of ``text`` along dim 0, measured after padding.
    """

    def __init__(self, data_split_type, data_path='../data/IMDB Dataset.csv'):
        """Load the requested split and build the tokenisation pipeline.

        Args:
            data_split_type: one of ``'train'``, ``'val'`` or ``'test'``.
            data_path: path of the CSV file to read.

        Raises:
            ValueError: if ``data_split_type`` is not a known split name.
        """
        self.data = self.get_data(data_split_type, data_path)

        # Special-token ids used by the transform pipeline below.
        padding_idx = 1
        bos_idx = 0
        eos_idx = 2
        max_seq_len = 512
        xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
        xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

        self.text_transform = T.Sequential(
            T.SentencePieceTokenizer(xlmr_spm_model_path),
            T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
            # Leave room for the BOS and EOS tokens added next.
            T.Truncate(max_seq_len - 2),
            T.AddToken(token=bos_idx, begin=True),
            T.AddToken(token=eos_idx, begin=False),
            T.ToTensor(),
            # Pad with the padding id, not 0: 0 is the BOS id, so padding with
            # it would make the padded positions look like real tokens.
            T.PadTransform(max_seq_len, pad_value=padding_idx),
        )

    def label_to_index(self, index):
        """Return the one-hot float label tensor for the row at ``index``.

        ``"positive"`` maps to ``[1, 0]`` and ``"negative"`` to ``[0, 1]``;
        any other sentiment value raises ``KeyError``.
        """
        label = self.data['sentiment'][index]
        label_to_index = {"positive": [1, 0], "negative": [0, 1]}

        return torch.tensor(label_to_index[label]).float()

    def __len__(self):
        """Return the number of rows in this split."""
        return len(self.data.index)

    def __getitem__(self, index):
        """Return the transformed review, its label and the tensor length."""
        review_text = self.data['review'][index]
        transformed_text = self.text_transform(review_text)

        label = self.label_to_index(index)

        return {'text': transformed_text, 'label': label, 'length': transformed_text.size(dim=0)}

    def get_data(self, data_split_type, data_path):
        """Read the CSV and return the rows belonging to one split.

        Also records the split boundaries on the instance as
        ``self.train_slice`` and ``self.val_slice``.

        Args:
            data_split_type: one of ``'train'``, ``'val'`` or ``'test'``.
            data_path: path of the CSV file to read.

        Returns:
            A DataFrame holding the split, re-indexed from 0 (the original
            row number is kept in an ``index`` column by ``reset_index``).

        Raises:
            ValueError: if ``data_split_type`` is not a known split name.
        """
        valid_splits = ('train', 'val', 'test')
        # Fail fast with a clear message instead of printing and returning
        # None, which only surfaced later as an unrelated error.
        if data_split_type not in valid_splits:
            raise ValueError(
                "data_split_type must be one of {}, got {!r}".format(
                    valid_splits, data_split_type
                )
            )

        data = pd.read_csv(data_path)
        self.train_slice = int(len(data) * 0.7)
        self.val_slice = self.train_slice + int(len(data) * 0.2)

        if data_split_type == 'train':
            return data[:self.train_slice].reset_index()
        if data_split_type == 'val':
            return data[self.train_slice:self.val_slice].reset_index()
        # 'test': everything after the validation rows.
        return data[self.val_slice:].reset_index()



# Data Preparations

In [39]:

# Number of reviews per batch, shared by all three loaders.
batch_size = 10

# One dataset per split, built in train -> val -> test order.
train_datapipe, val_datapipe, test_datapipe = (
    Custom_IMDB(data_split_type=split_name)
    for split_name in ("train", "val", "test")
)

# Loaders rely on DataLoader's default collation (collate_fn defaults to None).
# NOTE(review): no loader shuffles, so training batches follow the CSV row
# order -- confirm that this is intended.
train_dataloader = DataLoader(dataset=train_datapipe, batch_size=batch_size)
val_dataloader = DataLoader(dataset=val_datapipe, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_datapipe, batch_size=batch_size)

# Model Preparation

In [40]:
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

# Two sentiment classes; the head's input width is 768.
num_classes = 2
input_dim = 768

# Attach a fresh classification head to the XLM-R base encoder bundle, then
# move the assembled model to the configured device.
classifier_head = RobertaClassificationHead(input_dim=input_dim, num_classes=num_classes)
model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
model.to(DEVICE)

RobertaModel(
  (encoder): RobertaEncoder(
    (transformer): TransformerEncoder(
      (token_embedding): Embedding(250002, 768, padding_idx=1)
      (layers): TransformerEncoder(
        (layers): ModuleList(
          (0-11): 12 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (linear1): Linear(in_features=768, out_features=3072, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=3072, out_features=768, bias=True)
            (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (positional_embedding): PositionalEmbedding(
        (embedding): Embedding(5

# Training methods

In [41]:
import torchtext.functional as F
from torch.optim import AdamW

# Optimisation setup: cross-entropy loss, with AdamW updating every model
# parameter at a single learning rate.
learning_rate = 1e-5
criteria = nn.CrossEntropyLoss()
optim = AdamW(params=model.parameters(), lr=learning_rate)


def train_step(input, target):
    """Perform one optimisation step on a single batch.

    Clears the accumulated gradients, runs the module-level ``model`` on
    ``input``, scores the result against ``target`` with ``criteria``,
    back-propagates and lets ``optim`` update the parameters.

    Returns:
        None.
    """
    # Gradients are cleared before the forward pass; the forward pass does
    # not read them, so this is equivalent to clearing just before backward.
    optim.zero_grad()
    logits = model(input)
    batch_loss = criteria(logits, target)
    batch_loss.backward()
    optim.step()


def eval_step(input, target):
    """Score one batch with the module-level ``model`` and ``criteria``.

    Args:
        input: batch passed straight to ``model``.
        target: either class indices of shape ``(N,)`` or per-class rows of
            shape ``(N, C)`` such as one-hot labels.

    Returns:
        A ``(loss, correct)`` tuple: the batch loss as a Python float and the
        number of samples whose predicted class equals the target class.
    """
    output = model(input)
    loss = criteria(output, target).item()

    # output.argmax(1) has shape (N,). Comparing it directly with (N, C)
    # one-hot targets either fails to broadcast or counts the wrong thing,
    # so reduce such targets to class indices first.
    target_classes = target.argmax(1) if target.dim() > 1 else target
    correct = (output.argmax(1) == target_classes).type(torch.float).sum().item()

    return float(loss), correct


def evaluate():
    """Evaluate the module-level ``model`` on ``val_dataloader``.

    The model is switched to eval mode for the duration of the pass and is
    returned to whatever mode it was in before the call, so a later training
    epoch is not silently run in eval mode.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the batch losses averaged over the
        number of batches, and correct predictions divided by the number of
        samples seen.

    Raises:
        ZeroDivisionError: if ``val_dataloader`` yields no batches.
    """
    was_training = model.training
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    counter = 0
    try:
        with torch.no_grad():
            for batch in val_dataloader:
                input = batch['text'].to(DEVICE)
                target = batch['label'].to(DEVICE)
                loss, predictions = eval_step(input, target)
                total_loss += loss
                correct_predictions += predictions
                total_predictions += len(target)
                counter += 1
    finally:
        # Restore the caller's mode even if evaluation was interrupted.
        model.train(was_training)

    return total_loss / counter, correct_predictions / total_predictions

# Train

In [42]:
num_epochs = 1

for e in range(num_epochs):
    # evaluate() calls model.eval(); make sure every epoch trains in
    # training mode so the dropout layers are active.
    model.train()
    for index, batch in enumerate(train_dataloader):
        # Progress indicator, rewritten in place on a single line.
        print('index ', index, end=' \r')
        input = batch['text'].to(DEVICE)
        target = batch['label'].to(DEVICE)
        train_step(input, target)

    # Report validation metrics once per epoch.
    loss, accuracy = evaluate()
    print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy))

index  1 

KeyboardInterrupt: 