In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())
print("Done!")

# Install the `transformers` package through a notebook shell escape.
# NOTE(review): the version is not pinned, so a re-run may install a different release.
!pip install transformers
print('transformers installed!')

True
Done!
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
transformers installed!


In [None]:
def get_active_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
# Resolve the device once; the evaluation and training code in later cells read
# `active_device` as a module-level global.
active_device = get_active_device()
print(active_device)

cuda


In [None]:
import pandas as pd
import copy
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from torch.utils.data import SequentialSampler
from torch.utils.data import TensorDataset

# Keys read from the tokenizer output in df_to_dataloader.
ATTENTION_MASK = "attention_mask"
INPUT_IDS = "input_ids"
# NOTE(review): not referenced in the visible code; df_to_dataloader passes the literal 'max_length'.
MAX_LENGTH = "max_length"
# Added to the longest word count when choosing the padded length
# (presumably room for the tokenizer's special tokens -- confirm).
NUM_ADDITIONAL_TOKENS = 2
# Value passed to the tokenizer as `return_tensors`.
PYTORCH_CODE = "pt"

# DataFrame column names.
LENGTH_COL = "lengths"  # whitespace-separated word count, computed in get_data_loaders
TEXT_COL = "review"
TOKENS_COL = "tokens"  # NOTE(review): not referenced in the visible code
LABEL_COL = "sentiment"
# Word budget per chunk produced by break_by_batch_size.
MINI_BATCH_SIZE = 4096
# Cap on the words kept per text and on the padded sequence length.
MAX_TRAIN_LENGTH = 512
# Batch size handed to every DataLoader.
BATCH_SIZE = 64

# Accepted values for the `sampling_type` argument of df_to_dataloader.
RANDOM_SAMPLING = "random_sampling"
SEQUENTIAL_SAMPLING = "sequential_sampling"

def break_by_batch_size(df: pd.DataFrame) -> list:
    """Split `df` into chunks whose capped word counts add up to roughly MINI_BATCH_SIZE.

    Rows are ordered by LENGTH_COL, longest first. Every text is cut down to its
    first MAX_TRAIN_LENGTH whitespace-separated words, and the length stored for
    it is capped at the same value. A chunk is closed once its accumulated
    length reaches MINI_BATCH_SIZE, so the last row added may overshoot the budget.

    Args:
        df: frame holding the TEXT_COL, LABEL_COL and LENGTH_COL columns.

    Returns:
        A list of DataFrames with columns TEXT_COL, LABEL_COL and LENGTH_COL.
    """
    word_budget = MINI_BATCH_SIZE
    length_cap = MAX_TRAIN_LENGTH

    ordered = df.sort_values(by=LENGTH_COL, axis=0, ascending=False, ignore_index=True)
    rows = list(zip(ordered[TEXT_COL].to_list(),
                    ordered[LABEL_COL].to_list(),
                    ordered[LENGTH_COL].to_list()))
    total_rows = len(rows)

    chunks = []
    position = 0
    while position < total_rows:
        chunk_texts, chunk_labels, chunk_lengths = [], [], []
        words_in_chunk = 0
        # Keep pulling rows until the word budget is spent or the data runs out.
        while words_in_chunk < word_budget and position < total_rows:
            raw_text, label, raw_length = rows[position]
            capped_length = min(length_cap, raw_length)
            words_in_chunk += capped_length
            chunk_texts.append(" ".join(raw_text.split()[:length_cap]))
            chunk_labels.append(label)
            chunk_lengths.append(capped_length)
            position += 1

        chunks.append(pd.DataFrame({TEXT_COL: chunk_texts,
                                    LABEL_COL: chunk_labels,
                                    LENGTH_COL: chunk_lengths}))

    return chunks


def df_to_dataloader(df: pd.DataFrame, tokenizer, sampling_type) -> "DataLoader | None":
    """Tokenize the texts of `df` and wrap ids, masks and labels in a DataLoader.

    Rows are ordered by LENGTH_COL (longest first). All texts are padded or
    truncated to one common length: the largest LENGTH_COL value plus
    NUM_ADDITIONAL_TOKENS, capped at MAX_TRAIN_LENGTH.

    Args:
        df: frame holding the TEXT_COL, LABEL_COL and LENGTH_COL columns.
        tokenizer: callable that accepts a list of strings together with the
            `padding`, `max_length`, `truncation` and `return_tensors` keywords
            and returns a mapping with INPUT_IDS and ATTENTION_MASK tensors
            (a Hugging Face tokenizer in this notebook).
        sampling_type: RANDOM_SAMPLING or SEQUENTIAL_SAMPLING.

    Returns:
        A DataLoader yielding (input_ids, attention_mask, labels) batches of
        BATCH_SIZE, or None (after printing a message) when `sampling_type`
        is not one of the two accepted values.
    """
    # Validate up front so a bad argument is reported before any tokenization work.
    if sampling_type not in (RANDOM_SAMPLING, SEQUENTIAL_SAMPLING):
        # f-string so that a non-string value is reported instead of raising TypeError.
        print(f'Wrong Sampling Type: {sampling_type}')
        return None

    sorted_df = df.sort_values(by=LENGTH_COL, axis=0, ascending=False, ignore_index=True)
    labels = sorted_df[LABEL_COL].values.tolist()
    texts = sorted_df[TEXT_COL].values.tolist()
    # int() turns the numpy scalar produced by .max() into a plain Python int.
    max_len = int(min(sorted_df[LENGTH_COL].max() + NUM_ADDITIONAL_TOKENS, MAX_TRAIN_LENGTH))

    # One batched call returns [num_texts, max_len] tensors directly, which avoids
    # the per-text squeeze()/stack() round trip.
    encoded = tokenizer(texts,
                        padding='max_length',
                        max_length=max_len,
                        truncation=True,
                        return_tensors=PYTORCH_CODE)
    indexed_texts = encoded[INPUT_IDS]
    masks = encoded[ATTENTION_MASK]
    labels = torch.tensor(labels)

    data = TensorDataset(indexed_texts, masks, labels)

    if sampling_type == RANDOM_SAMPLING:
        sampler = RandomSampler(data)
    else:
        sampler = SequentialSampler(data)

    return DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)

def get_data_loaders(input_df: pd.DataFrame,
                     sampling_type: str,
                     tokenizer) -> list:
    """Build one DataLoader per length-balanced chunk of `input_df`.

    The word count of every TEXT_COL entry is computed, the rows are split
    with break_by_batch_size, and each chunk is converted with df_to_dataloader.

    Args:
        input_df: frame holding the TEXT_COL and LABEL_COL columns. It is not modified.
        sampling_type: RANDOM_SAMPLING or SEQUENTIAL_SAMPLING, forwarded to df_to_dataloader.
        tokenizer: tokenizer forwarded to df_to_dataloader.

    Returns:
        A list with one df_to_dataloader result per chunk.
    """
    word_counts = input_df[TEXT_COL].apply(lambda x: len(x.split()))
    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain (or have overwritten) a LENGTH_COL column.
    with_lengths = input_df.assign(**{LENGTH_COL: word_counts})

    return [df_to_dataloader(chunk, tokenizer, sampling_type)
            for chunk in break_by_batch_size(with_lengths)]


In [None]:
import torch.nn as nn
import time

def test(model: nn.Module, dataloaders: list):
    corrects = 0
    evaluated = 0
    start_time = time.time()
    model.eval()
    for dl in dataloaders:
        for texts, masks, labels in dl:
            texts = texts.to(active_device)
            masks = masks.to(active_device)
            labels = labels.to(active_device)
            with torch.no_grad():
                logits = model(input_id=texts, mask=masks)
            preds = torch.argmax(logits, dim=1)
            corrects += (preds == labels).sum().item()
            evaluated += texts.shape[0]
        
    return (corrects / evaluated), (time.time() - start_time)


In [None]:
def end_train(last_model: nn.Module, opt_model: nn.Module, test_dl: list, val_dl: list, log_file):
    """Write and print the final evaluation of the last and the optimal model.

    The last model is scored on the test loaders; the optimal model is scored
    on both the test and the validation loaders. Each summary line is written
    to `log_file`, flushed, and then printed.
    """
    # Last model: accuracy and evaluation time on the test set.
    last_accuracy, last_seconds = test(last_model, test_dl)
    last_line = 'Last Model\t' + "{:.5f}".format(last_accuracy) + '\t' + "{:.1f}".format(last_seconds)
    log_file.write(last_line + '\n')
    log_file.flush()
    print(last_line)

    # Optimal model: accuracies only, the timings are discarded.
    opt_test_accuracy, _ = test(opt_model, test_dl)
    opt_val_accuracy, _ = test(opt_model, val_dl)
    test_part = "test: {:.5f}".format(opt_test_accuracy)
    val_part = "val: {:.5f}".format(opt_val_accuracy)
    opt_line = 'Optimal Model\tTest=' + test_part + '\tVal=' + val_part
    log_file.write(opt_line + '\n')
    log_file.flush()
    print(opt_line)


In [None]:
import random
import numpy as np

def set_seed(seed_value: int):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    A negative `seed_value` leaves every generator untouched.
    """
    if seed_value < 0:
        return

    random.seed(seed_value)
    np.random.seed(seed_value)
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed_value)


In [None]:
import torch.optim as optim
# Optimizer hyper-parameters read by get_optimizer.
RHO = 0.95              # Adadelta `rho`
LEARNING_RATE = 1e-5    # learning rate shared by all three optimizers
OPT_NAME = "adam"       # which optimizer get_optimizer builds
BETA_ONE = 0            # first Adam beta
BETA_TWO = 0.98         # second Adam beta
ADAM_EPS = 0.00000001   # Adam `eps`
# Names OPT_NAME is compared against.
ADADELATA_OPT = "adadelta"
SGD_OPT = "sgd"
ADAM_OPT = "adam"
def get_optimizer(parameters):
    """Build the optimizer selected by OPT_NAME for `parameters`.

    Returns an Adadelta, SGD or Adam optimizer configured from the module
    constants above. When OPT_NAME matches none of them, a message is printed
    and None is returned.
    """
    if OPT_NAME == ADADELATA_OPT:
        return optim.Adadelta(parameters, lr=LEARNING_RATE, rho=RHO)

    if OPT_NAME == SGD_OPT:
        return optim.SGD(parameters, LEARNING_RATE)

    if OPT_NAME == ADAM_OPT:
        adam_betas = (BETA_ONE, BETA_TWO)
        return optim.Adam(parameters, lr=LEARNING_RATE, betas=adam_betas, eps=ADAM_EPS)

    print('Wrong optimizer name: ' + OPT_NAME)
    return None

# Names accepted by get_loss_function.
CROSS_ENTROP_LOSS = "cross_entropy_loss"
BCE_LOSS = "bce_loss"
def get_loss_function(func_name: str):
    """Return a new loss module for `func_name`.

    CROSS_ENTROP_LOSS maps to nn.CrossEntropyLoss and BCE_LOSS to nn.BCELoss.
    For any other name a message is printed and None is returned.
    """
    loss_classes = {
        CROSS_ENTROP_LOSS: nn.CrossEntropyLoss,
        BCE_LOSS: nn.BCELoss,
    }
    loss_class = loss_classes.get(func_name)
    if loss_class is None:
        print('Wrong loss function name: ' + func_name)
        return None

    return loss_class()

In [1]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel
# Checkpoint name passed to RobertaModel.from_pretrained and RobertaTokenizer.from_pretrained.
BERT_CONFIG = "roberta-base"
# Label name -> class id; passed as `label2id`, and its size sets `num_labels`.
BERT_LABELS2ID = {"positive":0, "negative":1}

class BertClassifier(nn.Module):
    """Pretrained RoBERTa encoder followed by dropout and a linear classification layer.

    NOTE(review): forward() passes the linear layer's output through ReLU, so
    the returned scores are never negative -- confirm this is intended for the
    loss they are fed to.
    """

    def __init__(self, dropout: float):
        """Load the encoder named by BERT_CONFIG and build the classification head.

        Args:
            dropout: probability handed to nn.Dropout.
        """
        super().__init__()

        id_to_label = {label_id: label for label, label_id in BERT_LABELS2ID.items()}
        self.bert = RobertaModel.from_pretrained(
            BERT_CONFIG,
            label2id=BERT_LABELS2ID,
            id2label=id_to_label,
            num_labels=len(BERT_LABELS2ID),
        )
        self.dropout = nn.Dropout(dropout)
        encoder_config = self.bert.config
        self.linear = nn.Linear(encoder_config.hidden_size, encoder_config.num_labels)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the ReLU-activated class scores for a batch.

        Args:
            input_id: token ids, forwarded to the encoder as `input_ids`.
            mask: attention mask, forwarded to the encoder as `attention_mask`.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id,
                                     attention_mask=mask,
                                     return_dict=False)
        # [batch size, hidden dim]

        scores = self.linear(self.dropout(pooled_output))
        # [batch size, #classes]

        return self.relu(scores)

ModuleNotFoundError: ignored

In [None]:
# Upper bound on the number of training epochs.
NUM_EPOCHS = 10
# Early stopping is only allowed once the epoch number exceeds this value.
MIN_EPOCHS_TO_STOP = 2
# NOTE(review): not referenced in the visible code.
MAX_NO_IMP = 2
# Epochs whose average training loss is above this are skipped for best-model selection.
MAX_VALID_LOSS = 0.35
# Training stops early when the count of epochs without a new minimum training loss exceeds this.
EARLY_STOP_MAX_NO_IMP = 2
# Dropout probability passed to BertClassifier.
DROPOUT = 0.5
# File the trainer opens for writing and logs per-epoch metrics to.
LOG_FILE_NAME = "drive/MyDrive/ColabLogs/roberta_base_rt_verify.txt"
# Passed to set_seed, which only seeds for values >= 0, so -1 leaves the run unseeded.
SEED_VALUE = -1
# CSV files read with pd.read_csv for the train / validation / test splits.
TRAIN_SET = "drive/MyDrive/ColabData/rt_train_set.csv"
VALIDATION_SET = "drive/MyDrive/ColabData/rt_val_set.csv"
TEST_SET = "drive/MyDrive/ColabData/rt_test.csv"
# Epochs numbered below this value are skipped for best-model selection.
MIN_VALID_EPOCHS = 2

from transformers import RobertaTokenizer

class Trainer:
    """Fine-tunes BertClassifier on the configured CSV splits and logs per-epoch metrics."""

    def __init__(self):
        pass

    def train(self) -> tuple:
        """Run the full training loop and the final evaluation.

        Steps: seed the generators, load the tokenizer and model, build the
        data loaders for the train / validation / test CSVs, train for at most
        NUM_EPOCHS epochs, and report the results through end_train. Progress
        is printed and also written to LOG_FILE_NAME.

        Model selection: from epoch MIN_VALID_EPOCHS on, and only for epochs
        whose average training loss is at most MAX_VALID_LOSS, a deep copy of
        the model is kept whenever the validation accuracy improves.

        Early stopping: training stops when the average training loss has not
        reached a new minimum for more than EARLY_STOP_MAX_NO_IMP epochs and
        the epoch number exceeds MIN_EPOCHS_TO_STOP.

        Returns:
            (last_model, optimal_model, best_val_epoch). If no epoch met the
            selection criteria, optimal_model is the last model itself and
            best_val_epoch is -1.
        """
        print('bert trainer - start')
        # The context manager closes the log even when training raises part-way through.
        with open(LOG_FILE_NAME, "w", encoding="utf-8") as log_file:
            set_seed(SEED_VALUE)

            print("Load BERT model")
            print("\tTokenizer:")
            start_time = time.time()
            tokenizer = RobertaTokenizer.from_pretrained(BERT_CONFIG)
            print("\tload time = {:.2f}".format(time.time() - start_time))

            print("\tModel:")
            start_time = time.time()
            pending_model = BertClassifier(dropout=DROPOUT)
            pending_model = pending_model.to(active_device)
            optimal_model = None
            print("\tload time = {:.2f}".format(time.time() - start_time))

            print("load data frames")
            train_df = pd.read_csv(TRAIN_SET)
            val_df = pd.read_csv(VALIDATION_SET)
            test_df = pd.read_csv(TEST_SET)

            print("load data loaders")
            train_dl = get_data_loaders(train_df, RANDOM_SAMPLING, tokenizer)
            val_dl = get_data_loaders(val_df, SEQUENTIAL_SAMPLING, tokenizer)
            test_dl = get_data_loaders(test_df, SEQUENTIAL_SAMPLING, tokenizer)

            optimizer = get_optimizer(pending_model.parameters())
            loss_func = nn.CrossEntropyLoss()
            loss_func = loss_func.to(active_device)

            num_epochs = NUM_EPOCHS
            print('start training loops. #epochs = ' + str(num_epochs))

            # Build the table header once so the console and the log file always agree.
            header = f"{'Epoch':^7} | {'Train Loss':^12} | {'Train Acc':^11} | {'Test Acc':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
            separator = "-"*50
            print(header)
            print(separator)
            log_file.write(header + "\n")
            log_file.write(separator + "\n")

            best_val_acc = 0
            best_val_epoch = -1
            best_test_acc = 0
            best_test_epoch = -1
            # Infinity guarantees the first epoch always counts as an improvement.
            min_loss = float('inf')
            num_no_imp = 0
            for epoch in range(1, num_epochs + 1):
                epoch_start_time = time.time()
                total_loss = 0
                num_batches = 0

                # Visit the length-bucketed loaders in a different order every epoch.
                random.shuffle(train_dl)
                # test() leaves the model in eval mode, so switch back every epoch.
                pending_model.train()
                for dl in train_dl:
                    for texts, masks, labels in dl:
                        texts = texts.to(active_device)
                        masks = masks.to(active_device)
                        labels = labels.to(active_device)
                        optimizer.zero_grad()
                        logits = pending_model(input_id=texts, mask=masks)
                        loss = loss_func(logits, labels)
                        total_loss += loss.item()
                        num_batches += 1
                        loss.backward()
                        optimizer.step()

                avg_loss = total_loss / num_batches
                epoch_time = time.time() - epoch_start_time

                # Accuracies (as percentages) on all three splits after this epoch.
                val_acc, _ = test(pending_model, val_dl)
                train_acc, _ = test(pending_model, train_dl)
                test_acc, _ = test(pending_model, test_dl)
                val_acc *= 100
                train_acc *= 100
                test_acc *= 100

                # One shared row for console and log; the logged row previously
                # lacked the '|' between the loss and train-accuracy columns.
                row = f"{epoch:^7} | {avg_loss:^12.6f} | {train_acc:^9.2f} | {test_acc:^9.2f} |  {val_acc:^9.4f} | {epoch_time:^9.2f}"
                print(row)
                log_file.write(row + "\n")
                log_file.flush()

                # Early-stopping bookkeeping is driven by the training loss.
                if avg_loss < min_loss:
                    min_loss = avg_loss
                    num_no_imp = 0
                else:
                    num_no_imp += 1

                if num_no_imp > EARLY_STOP_MAX_NO_IMP and epoch > MIN_EPOCHS_TO_STOP:
                    print('early stop exit')
                    log_file.write('\tEarly Stop exit\n')
                    log_file.flush()
                    break

                # Epochs that are too early or still too lossy are not eligible for selection.
                if epoch < MIN_VALID_EPOCHS:
                    continue

                if avg_loss > MAX_VALID_LOSS:
                    continue

                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    optimal_model = copy.deepcopy(pending_model)
                    best_val_epoch = epoch

                # Tracked for reporting only; it does not influence which model is kept.
                if test_acc > best_test_acc:
                    best_test_acc = test_acc
                    best_test_epoch = epoch

            print('bert trainer - end')
            best_val_line = "Best Val Acc = {:.2f}".format(best_val_acc) + ", Best Val Epoch = " + str(best_val_epoch)
            best_test_line = "Best Test Acc = {:.2f}".format(best_test_acc) + ", Best Test Epoch = " + str(best_test_epoch)
            print(best_val_line)
            print(best_test_line)
            log_file.write(best_val_line + "\n")
            log_file.write(best_test_line + "\n")

            if optimal_model is None:
                # No epoch passed the selection filters; without this fallback
                # end_train would fail when it tries to evaluate None.
                fallback_note = 'No epoch met the model selection criteria; using the last model as the optimal model'
                print(fallback_note)
                log_file.write(fallback_note + "\n")
                optimal_model = pending_model

            end_train(pending_model, optimal_model, test_dl, val_dl, log_file)
            log_file.flush()

        return pending_model, optimal_model, best_val_epoch


In [None]:
# Run the whole pipeline. train() returns the last model, the model kept by
# validation-based selection, and the epoch that model came from.
trainer = Trainer()
last_model, opt_model, best_epoch = trainer.train()
print(best_epoch)

bert trainer - start
Load BERT model
	Tokenizer:
	load time = 0.34
	Model:


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


	load time = 1.52
load data frames
load data loaders
start training loops. #epochs = 10
 Epoch  |  Train Loss  |  Train Acc  |  Test Acc  |  Val Acc  |  Elapsed 
--------------------------------------------------
   1    |   0.520888   |   79.32   |   77.31   |   81.0690  |   9.30   
   2    |   0.341739   |   90.77   |   85.90   |   88.6414  |   9.12   
   3    |   0.277357   |   92.47   |   85.75   |   88.4187  |   9.14   
   4    |   0.270057   |   94.54   |   86.28   |   89.7550  |   9.19   
   5    |   0.202174   |   93.50   |   84.61   |   87.9733  |   9.11   
   6    |   0.178934   |   97.06   |   86.60   |   89.3096  |   9.15   
   7    |   0.134179   |   96.62   |   84.74   |   89.5323  |   9.24   
   8    |   0.118726   |   98.51   |   86.34   |   88.8641  |   9.19   
   9    |   0.092134   |   95.10   |   81.84   |   86.1915  |   9.10   
  10    |   0.091998   |   98.95   |   85.21   |   89.3096  |   9.09   
bert trainer - end
Best Val Acc = 89.76, Best Val Epoch = 4
Best Te