In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import HerbertTokenizer, RobertaModel
from torch import nn
from transformers import BertModel
from sklearn import metrics
from torch.optim import Adam
from tqdm import tqdm

In [7]:
reviews = pd.read_parquet('../data/reviews_sample_proc.parquet')
train = reviews.query('partition == "train"')
test = reviews.query('partition == "test"')

original_train = train[['original', 'rating']].rename(columns={'original': 'text'}).assign(**{'rating': lambda x: x['rating']-1})[:100]
original_test = test[['original', 'rating']].rename(columns={'original': 'text'}).assign(**{'rating': lambda x: x['rating']-1})

In [10]:
tokenizer = HerbertTokenizer.from_pretrained('allegro/herbert-klej-cased-tokenizer-v1')

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = df['rating'].to_list()
        self.texts = [
            tokenizer(text, padding='max_length', max_length = 512,
                truncation=True, return_tensors='pt') for text in df['text']
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y
    
class RobertaClassifier(nn.Module):

    def __init__(self):

        super().__init__()

        self.bert = RobertaModel.from_pretrained('allegro/herbert-klej-cased-v1')
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        linear_output = self.linear(pooled_output)
        final_layer = self.relu(linear_output)

        return final_layer

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMTokenizer'. 
The class this function is called from is 'HerbertTokenizer'.


In [11]:
%%time

torch.manual_seed(1234)

def train(model, train_data, learning_rate, epochs):

    train = Dataset(train_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=16, shuffle=True)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f}'
        )
                  
EPOCHS = 2
model = RobertaClassifier()

modules = [model.bert.embeddings, *model.bert.encoder.layer[:8]]
for module in modules:
    for param in module.parameters():
        param.requires_grad = False
        
LR = 5e-5
              
train(model, translated_train, LR, EPOCHS)

Downloading (…)lve/main/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/500M [00:00<?, ?B/s]

100%|██████████| 7/7 [00:39<00:00,  5.66s/it]


Epochs: 1 | Train Loss:  0.114             | Train Accuracy:  0.150


100%|██████████| 7/7 [00:38<00:00,  5.52s/it]

Epochs: 2 | Train Loss:  0.113             | Train Accuracy:  0.210
CPU times: user 3min 5s, sys: 49.4 s, total: 3min 55s
Wall time: 1min 35s





In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=16)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    y_pred = torch.empty(0) .to(device)

    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            y_pred = torch.cat([y_pred, output.argmax(dim=1)])
    
    return y_pred

In [None]:
%%time

results = evaluate(model, translated_test)

print(metrics.classification_report(translated_test['rating'].to_numpy(), results.cpu().numpy()))