In [None]:
# https://github.com/gtfintechlab/fomc-hawkish-dovish/blob/main/code_model/bert_fine_tune_lm_hawkish_dovish_train_test.py

In [1]:
import os
import sys
from time import time, sleep
import pandas as pd
from transformers import (BertForSequenceClassification,
                          BertTokenizerFast,
                          RobertaTokenizerFast,
                          RobertaForSequenceClassification,
                          AutoTokenizer,
                          AutoModelForSequenceClassification)
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from sklearn.metrics import f1_score
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Mount Google Drive inside the Colab runtime and make the project folder the
# working directory, so the relative data/result paths used below resolve there.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/Comp545_FinalProject'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Mounted at /content/drive


In [3]:
def _hawkish_dovish_model_spec(language_model_to_use: str):
    """
    Description: Map a model key to (tokenizer class, model class, checkpoint name).
    Returns None when the key is not one of the supported models.
    """
    # An if-chain (rather than a dict literal) keeps class lookups lazy: only the
    # classes of the requested model are touched.
    if language_model_to_use == 'bert':
        return BertTokenizerFast, BertForSequenceClassification, 'bert-base-uncased'
    if language_model_to_use == 'roberta':
        return RobertaTokenizerFast, RobertaForSequenceClassification, 'roberta-base'
    if language_model_to_use == 'flangroberta':
        return AutoTokenizer, AutoModelForSequenceClassification, 'SALT-NLP/FLANG-Roberta'
    if language_model_to_use == 'finbert':
        return BertTokenizer, BertForSequenceClassification, 'yiyanghkust/finbert-tone'
    if language_model_to_use == 'flangbert':
        return BertTokenizerFast, BertForSequenceClassification, 'SALT-NLP/FLANG-BERT'
    if language_model_to_use == 'bert-large':
        return BertTokenizerFast, BertForSequenceClassification, 'bert-large-uncased'
    if language_model_to_use == 'roberta-large':
        return RobertaTokenizerFast, RobertaForSequenceClassification, 'roberta-large'
    return None


def _load_pretrained_with_retry(load, retry_delay_seconds: int = 600):
    """
    Description: Call `load()`; on failure print the error, wait, and try exactly once more.
    A failure of the second attempt propagates to the caller.
    """
    try:
        return load()
    except Exception as e:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt is not
        # swallowed and followed by a ten-minute sleep.
        print(e)
        sleep(retry_delay_seconds)
        return load()


def _encode_labeled_sentences(tokenizer, sentences, labels, max_length: int):
    """
    Description: Tokenize the string sentences and pair them with their labels.
    Returns a TensorDataset of (input_ids, attention_mask, labels).
    """
    kept_sentences = []
    kept_labels = []
    for sentence, label in zip(sentences, labels):
        # Non-string entries are skipped together with their label.
        if isinstance(sentence, str):
            kept_sentences.append(sentence)
            kept_labels.append(label)

    encoded = tokenizer(kept_sentences, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    label_tensor = torch.LongTensor(np.array(kept_labels))
    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], label_tensor)


def _evaluate_classifier(model, data_loader, device):
    """
    Description: Evaluate `model` over `data_loader` without gradient tracking.
    Returns (mean cross entropy, accuracy, weighted F1) over the loader's dataset.
    """
    model.eval()
    total_ce = 0
    total_correct = 0
    actual_batches = []
    pred_batches = []
    with torch.no_grad():
        for input_ids, attention_masks, batch_labels in data_loader:
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            batch_labels = batch_labels.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=batch_labels)
            batch_pred = torch.max(outputs.logits, 1)[1]
            # The model loss is a batch mean; weight it by batch size so the
            # final division yields a per-sample mean even with a short last batch.
            total_ce += outputs.loss.item() * input_ids.size(0)
            total_correct += torch.sum(batch_pred == batch_labels).item()
            actual_batches.append(batch_labels)
            pred_batches.append(batch_pred)

    num_samples = len(data_loader.dataset)
    actual = torch.cat(actual_batches, dim=0).cpu().numpy()
    pred = torch.cat(pred_batches, dim=0).cpu().numpy()
    return total_ce / num_samples, total_correct / num_samples, f1_score(actual, pred, average='weighted')


def train_lm_hawkish_dovish(gpu_numbers: str,
                            train_data_path: str,
                            test_data_path: str,
                            language_model_to_use: str,
                            batch_size: int,
                            learning_rate: float,
                            save_model_path: str):
    """
    Description: Run experiment over particular batch size, learning rate

    Fine-tunes a 3-label sequence classifier on the 'sentence'/'label' columns of
    the training spreadsheet (80/20 random train/validation split), with early
    stopping, then evaluates on the test spreadsheet.

    Args:
        gpu_numbers: value written to the CUDA_VISIBLE_DEVICES environment variable.
        train_data_path: Excel file with 'sentence' and 'label' columns.
        test_data_path: Excel file with the same columns, used for the final evaluation.
        language_model_to_use: one of 'bert', 'roberta', 'flangroberta', 'finbert',
            'flangbert', 'bert-large', 'roberta-large'.
        batch_size: batch size for the train, validation and test loaders.
        learning_rate: AdamW learning rate.
        save_model_path: directory to save model and tokenizer to, or None to skip saving.

    Returns:
        [learning_rate, batch_size, best val CE, best val accuracy, best val F1,
         test CE, test accuracy, test F1], or -1 if `language_model_to_use` is not
        supported (checked before any data is read or any model is downloaded).
    """
    model_spec = _hawkish_dovish_model_spec(language_model_to_use)
    if model_spec is None:
        return -1
    tokenizer_cls, model_cls, checkpoint = model_spec

    # set gpu
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_numbers)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("Device assigned: ", device)

    # load training data
    data_df = pd.read_excel(train_data_path)
    sentences = data_df['sentence'].to_list()
    labels = data_df['label'].to_numpy()

    # load test data
    data_df_test = pd.read_excel(test_data_path)
    sentences_test = data_df_test['sentence'].to_list()
    labels_test = data_df_test['label'].to_numpy()

    # load tokenizer
    tokenizer = _load_pretrained_with_retry(
        lambda: tokenizer_cls.from_pretrained(checkpoint, do_lower_case=True, do_basic_tokenize=True))

    # Fixed truncation length; FLANG-Roberta uses a shorter one.
    max_length = 128 if language_model_to_use == 'flangroberta' else 256

    dataset = _encode_labeled_sentences(tokenizer, sentences, labels, max_length)
    val_length = int(len(dataset) * 0.2)
    train_length = len(dataset) - val_length
    print(f'Train Size: {train_length}, Validation Size: {val_length}')

    # select language model
    model = _load_pretrained_with_retry(
        lambda: model_cls.from_pretrained(checkpoint, num_labels=3).to(device))

    # create train-val split
    train, val = torch.utils.data.random_split(dataset=dataset, lengths=[train_length, val_length])
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
    # Evaluation metrics do not depend on sample order, so no shuffling here.
    val_loader = DataLoader(val, batch_size=batch_size, shuffle=False)
    print(train_length, val_length)

    # select optimizer
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    max_num_epochs = 100
    max_early_stopping = 7
    early_stopping_count = 0
    best_ce = float('inf')
    best_accuracy = float('-inf')
    best_f1 = float('-inf')

    # Minimum change in a validation metric that counts as an improvement.
    eps = 1e-2

    for _ in range(max_num_epochs):
        if early_stopping_count >= max_early_stopping:
            break

        # ---- train ----
        model.train()
        # Counted up every epoch; reset below whenever any validation metric improves.
        early_stopping_count += 1
        for input_ids, attention_masks, batch_labels in train_loader:
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            batch_labels = batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=batch_labels)
            outputs.loss.backward()
            optimizer.step()

        # ---- validate ----
        curr_ce, curr_accuracy, currF1 = _evaluate_classifier(model, val_loader, device)
        if curr_ce <= best_ce - eps:
            best_ce = curr_ce
            early_stopping_count = 0
        if curr_accuracy >= best_accuracy + eps:
            best_accuracy = curr_accuracy
            early_stopping_count = 0
        if currF1 >= best_f1 + eps:
            best_f1 = currF1
            early_stopping_count = 0
        print("Val CE: ", curr_ce)
        print("Val Accuracy: ", curr_accuracy)
        print("Val F1: ", currF1)
        print("Early Stopping Count: ", early_stopping_count)

    ## ------------------testing---------------------
    # NOTE(review): the test set is scored with the weights from the last epoch;
    # no best-validation checkpoint is restored before this point.
    dataset_test = _encode_labeled_sentences(tokenizer, sentences_test, labels_test, max_length)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
    test_ce, test_accuracy, test_f1 = _evaluate_classifier(model, test_loader, device)
    experiment_results = [learning_rate, batch_size, best_ce, best_accuracy, best_f1, test_ce, test_accuracy, test_f1]

    # save model
    if save_model_path is not None:
        model.save_pretrained(save_model_path)
        tokenizer.save_pretrained(save_model_path)

    return experiment_results

In [4]:
def train_lm_price_change_experiments(gpu_numbers: str,
                                      train_data_path: str,
                                      test_data_path: str,
                                      language_model_to_use: str,
                                      data_category: str):
    """
    Description: Run experiments over different batch sizes, learning rates to find best hyperparameters

    Every (batch size, learning rate) combination is trained once via
    `train_lm_hawkish_dovish` (without saving the model). After each run the
    accumulated results are rewritten to
    grid_search_results/final_{data_category}_{language_model_to_use}.xlsx,
    so partial results survive an interrupted grid search.

    Args:
        gpu_numbers: forwarded to `train_lm_hawkish_dovish`.
        train_data_path: training spreadsheet path, forwarded unchanged.
        test_data_path: test spreadsheet path, forwarded unchanged.
        language_model_to_use: model key, forwarded unchanged and used in the output file name.
        data_category: dataset label used in the output file name.

    Raises:
        ValueError: if `train_lm_hawkish_dovish` reports an unsupported model (-1).
    """
    results = []
    batch_sizes = [32, 16]
    learning_rates = [1e-4, 1e-5]
    count = 0
    total_experiments = len(batch_sizes) * len(learning_rates)
    result_columns = ["Learning Rate",
                      "Batch Size",
                      "Val Cross Entropy",
                      "Val Accuracy",
                      "Val F1 Score",
                      "Test Cross Entropy",
                      "Test Accuracy",
                      "Test F1 Score"]

    # to_excel does not create missing parent directories.
    os.makedirs('grid_search_results', exist_ok=True)

    for batch_size in batch_sizes:
        for learning_rate in learning_rates:
            count += 1
            print(f'Experiment {count} of {total_experiments}:')

            experiment_result = train_lm_hawkish_dovish(gpu_numbers,
                                                        train_data_path,
                                                        test_data_path,
                                                        language_model_to_use,
                                                        batch_size,
                                                        learning_rate,
                                                        None)
            # -1 is the trainer's "unknown model" sentinel; fail with a clear
            # message instead of a shape error inside the DataFrame constructor.
            if experiment_result == -1:
                raise ValueError(f'Unsupported language model: {language_model_to_use!r}')

            results.append(experiment_result)
            df = pd.DataFrame(results, columns=result_columns)
            df.to_excel(f'grid_search_results/final_{data_category}_{language_model_to_use}.xlsx', index=False)

In [5]:
start_t = time()

# Grid search driver: one hyperparameter sweep per (model, dataset) pair.
#
# Model references:
#   roberta-large: https://huggingface.co/FacebookAI/roberta-large, https://arxiv.org/abs/1907.11692
#   finbert: https://huggingface.co/yiyanghkust/finbert-tone, https://arxiv.org/abs/2006.08097, https://github.com/yya518/FinBERT
#   flangbert: https://huggingface.co/SALT-NLP/FLANG-BERT
#   flangroberta: https://huggingface.co/SALT-NLP/FLANG-Roberta
#
# Full set of model keys:
#   ["roberta", "roberta-large", "bert", "bert-large", "finbert", "flangbert", "flangroberta"]
# Full set of data categories:
#   ["lab-manual-combine", "lab-manual-sp", "lab-manual-mm", "lab-manual-pc", "lab-manual-mm-split",
#    "lab-manual-pc-split", "lab-manual-sp-split", "lab-manual-split-combine"]
MODELS_TO_RUN = ["roberta", "flangbert", "flangroberta", "finbert"]
DATA_CATEGORIES_TO_RUN = ["lab-manual-combine", "lab-manual-split-combine"]

for language_model_to_use in MODELS_TO_RUN:
    print("Model name: ", language_model_to_use)
    for data_category in DATA_CATEGORIES_TO_RUN:
        print("Data category: ", data_category)
        train_data_path = f"data_PEFT/{data_category}-train.xlsx"
        test_data_path = f"data_PEFT/{data_category}-test.xlsx"
        train_lm_price_change_experiments(
            gpu_numbers="0",
            train_data_path=train_data_path,
            test_data_path=test_data_path,
            language_model_to_use=language_model_to_use,
            data_category=data_category,
        )

# Total wall-clock time of the whole sweep, in minutes.
elapsed_minutes = (time() - start_t) / 60.0
print(elapsed_minutes)

Model name:  roberta
Data category:  lab-manual-combine
Experiment 1 of 4:
Device assigned:  cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Train Size: 4568, Validation Size: 1141


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.8373459159126415
Val Accuracy:  0.6021034180543383
Val F1:  0.6108659530705703
Early Stopping Count:  0
Val CE:  0.7273007421823681
Val Accuracy:  0.6722173531989483
Val F1:  0.6756007403544934
Early Stopping Count:  0
Val CE:  0.6027446258600923
Val Accuracy:  0.7528483786152498
Val F1:  0.7496376528845967
Early Stopping Count:  0
Val CE:  0.5548595624676302
Val Accuracy:  0.78965819456617
Val F1:  0.789381905887164
Early Stopping Count:  0
Val CE:  0.47097479680668985
Val Accuracy:  0.8273444347063978
Val F1:  0.8282209310358699
Early Stopping Count:  0
Val CE:  0.4244201090072562
Val Accuracy:  0.8369851007887817
Val F1:  0.8328268746306547
Early Stopping Count:  0
Val CE:  0.3134758608786293
Val Accuracy:  0.9062226117440841
Val F1:  0.9058156954660502
Early Stopping Count:  0
Val CE:  0.3000045263328435
Val Accuracy:  0.9193689745836985
Val F1:  0.9193662817191672
Early Stopping Count:  0
Val CE:  0.3754232205078332
Val Accuracy:  0.8983347940403155
Val F1:  0

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.6788775463254696
Val Accuracy:  0.6985100788781771
Val F1:  0.7010178879123083
Early Stopping Count:  0
Val CE:  0.47586136277990315
Val Accuracy:  0.8168273444347064
Val F1:  0.8182415972092566
Early Stopping Count:  0
Val CE:  0.34047642342360995
Val Accuracy:  0.8983347940403155
Val F1:  0.8983107805936109
Early Stopping Count:  0
Val CE:  0.28353878681031264
Val Accuracy:  0.9193689745836985
Val F1:  0.9192928637565608
Early Stopping Count:  0
Val CE:  0.2641776782002604
Val Accuracy:  0.9316389132340053
Val F1:  0.9312872611648425
Early Stopping Count:  0
Val CE:  0.2465112256857515
Val Accuracy:  0.9421560035056967
Val F1:  0.9419576293015629
Early Stopping Count:  0
Val CE:  0.22989195410250363
Val Accuracy:  0.9482909728308502
Val F1:  0.9484995676658393
Early Stopping Count:  0
Val CE:  0.2263391201682699
Val Accuracy:  0.9439088518843121
Val F1:  0.9440514270278526
Early Stopping Count:  1
Val CE:  0.2401664230440344
Val Accuracy:  0.950920245398773
Val F

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  1.0532274252484495
Val Accuracy:  0.4864154250657318
Val F1:  0.31834971805599194
Early Stopping Count:  0
Val CE:  1.054910929238556
Val Accuracy:  0.4864154250657318
Val F1:  0.31834971805599194
Early Stopping Count:  1
Val CE:  1.0545592041103387
Val Accuracy:  0.4864154250657318
Val F1:  0.31834971805599194
Early Stopping Count:  2
Val CE:  1.0529193866158868
Val Accuracy:  0.4864154250657318
Val F1:  0.31834971805599194
Early Stopping Count:  3
Val CE:  1.0508060342486127
Val Accuracy:  0.4864154250657318
Val F1:  0.31834971805599194
Early Stopping Count:  4
Val CE:  1.0505467595811688
Val Accuracy:  0.4864154250657318
Val F1:  0.31834971805599194
Early Stopping Count:  5
Val CE:  1.0499594502549334
Val Accuracy:  0.4864154250657318
Val F1:  0.31834971805599194
Early Stopping Count:  6
Val CE:  1.051101714659949
Val Accuracy:  0.4864154250657318
Val F1:  0.31834971805599194
Early Stopping Count:  7
Experiment 4 of 4:
Device assigned:  cuda
Train Size: 4568, Vali

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.6410493348691675
Val Accuracy:  0.7309377738825592
Val F1:  0.7330346906209241
Early Stopping Count:  0
Val CE:  0.5569117360394922
Val Accuracy:  0.7852760736196319
Val F1:  0.7925535100768005
Early Stopping Count:  0
Val CE:  0.30383668230041716
Val Accuracy:  0.8957055214723927
Val F1:  0.8960637007051871
Early Stopping Count:  0
Val CE:  0.18374897612097177
Val Accuracy:  0.9491673970201577
Val F1:  0.9493006738961449
Early Stopping Count:  0
Val CE:  0.20994883509292403
Val Accuracy:  0.9395267309377738
Val F1:  0.9397722103412829
Early Stopping Count:  1
Val CE:  0.16235782766484513
Val Accuracy:  0.9570552147239264
Val F1:  0.9571937037810998
Early Stopping Count:  0
Val CE:  0.16476192787188815
Val Accuracy:  0.9596844872918493
Val F1:  0.9596914418956688
Early Stopping Count:  0
Val CE:  0.1325482136090788
Val Accuracy:  0.9640666082383874
Val F1:  0.9641436400537335
Early Stopping Count:  0
Val CE:  0.15998835716811624
Val Accuracy:  0.9666958808063103
Va

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  1.0312475607174785
Val Accuracy:  0.5134453781512605
Val F1:  0.3483788184901945
Early Stopping Count:  0
Val CE:  1.0346179228870809
Val Accuracy:  0.5134453781512605
Val F1:  0.3483788184901945
Early Stopping Count:  1
Val CE:  1.0367392971736042
Val Accuracy:  0.5134453781512605
Val F1:  0.3483788184901945
Early Stopping Count:  2
Val CE:  1.0359071905873403
Val Accuracy:  0.5134453781512605
Val F1:  0.3483788184901945
Early Stopping Count:  3
Val CE:  1.0322533908010532
Val Accuracy:  0.5134453781512605
Val F1:  0.3483788184901945
Early Stopping Count:  4
Val CE:  1.0347614986555917
Val Accuracy:  0.5134453781512605
Val F1:  0.3483788184901945
Early Stopping Count:  5
Val CE:  1.033316404879594
Val Accuracy:  0.5134453781512605
Val F1:  0.3483788184901945
Early Stopping Count:  6
Val CE:  1.0340785421243235
Val Accuracy:  0.5134453781512605
Val F1:  0.3483788184901945
Early Stopping Count:  7
Experiment 2 of 4:
Device assigned:  cuda
Train Size: 4762, Validation 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.7515016738106223
Val Accuracy:  0.646218487394958
Val F1:  0.6555096141517452
Early Stopping Count:  0
Val CE:  0.5280249156871764
Val Accuracy:  0.7974789915966387
Val F1:  0.8006331185687242
Early Stopping Count:  0
Val CE:  0.3422395408028314
Val Accuracy:  0.8949579831932774
Val F1:  0.8954225024387837
Early Stopping Count:  0
Val CE:  0.28113506055679643
Val Accuracy:  0.9117647058823529
Val F1:  0.9113766073334483
Early Stopping Count:  0
Val CE:  0.2613454771580315
Val Accuracy:  0.9285714285714286
Val F1:  0.9287220687386348
Early Stopping Count:  0
Val CE:  0.23502421816106603
Val Accuracy:  0.9436974789915966
Val F1:  0.9437619391452233
Early Stopping Count:  0
Val CE:  0.24573244733595045
Val Accuracy:  0.9436974789915966
Val F1:  0.9438693906701154
Early Stopping Count:  1
Val CE:  0.22895764148394976
Val Accuracy:  0.9470588235294117
Val F1:  0.9471739955465652
Early Stopping Count:  2
Val CE:  0.2752036043575832
Val Accuracy:  0.9369747899159664
Val F

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  1.0297427226515377
Val Accuracy:  0.5142857142857142
Val F1:  0.3493261455525606
Early Stopping Count:  0
Val CE:  1.0305934965109624
Val Accuracy:  0.5142857142857142
Val F1:  0.3493261455525606
Early Stopping Count:  1
Val CE:  1.0386543834910673
Val Accuracy:  0.5142857142857142
Val F1:  0.3493261455525606
Early Stopping Count:  2
Val CE:  1.0531610114233834
Val Accuracy:  0.5142857142857142
Val F1:  0.3493261455525606
Early Stopping Count:  3
Val CE:  1.0298762607975167
Val Accuracy:  0.5142857142857142
Val F1:  0.3493261455525606
Early Stopping Count:  4
Val CE:  1.029643952946703
Val Accuracy:  0.5142857142857142
Val F1:  0.3493261455525606
Early Stopping Count:  5
Val CE:  1.035824558514507
Val Accuracy:  0.5142857142857142
Val F1:  0.3493261455525606
Early Stopping Count:  6
Val CE:  1.0414548437134559
Val Accuracy:  0.5142857142857142
Val F1:  0.3493261455525606
Early Stopping Count:  7
Experiment 4 of 4:
Device assigned:  cuda
Train Size: 4762, Validation S

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.5819705973152353
Val Accuracy:  0.7714285714285715
Val F1:  0.7675209155117616
Early Stopping Count:  0
Val CE:  0.3568966226793137
Val Accuracy:  0.8638655462184874
Val F1:  0.8619134503342559
Early Stopping Count:  0
Val CE:  0.29412890536507136
Val Accuracy:  0.907563025210084
Val F1:  0.9091294495881942
Early Stopping Count:  0
Val CE:  0.233532745748007
Val Accuracy:  0.9403361344537815
Val F1:  0.9404611287253684
Early Stopping Count:  0
Val CE:  0.21377294633944496
Val Accuracy:  0.9436974789915966
Val F1:  0.9433191844268205
Early Stopping Count:  0
Val CE:  0.18972824358952647
Val Accuracy:  0.9571428571428572
Val F1:  0.9571146050372457
Early Stopping Count:  0
Val CE:  0.16355568669235507
Val Accuracy:  0.9605042016806723
Val F1:  0.9606321918601282
Early Stopping Count:  0
Val CE:  0.19182324384656116
Val Accuracy:  0.9588235294117647
Val F1:  0.9587211215755652
Early Stopping Count:  1
Val CE:  0.18632285809066115
Val Accuracy:  0.9563025210084034
Val 

tokenizer_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Train Size: 4568, Validation Size: 1141


config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.7207565999522113
Val Accuracy:  0.7037686240140227
Val F1:  0.6966787343734022
Early Stopping Count:  0
Val CE:  0.37283817145199655
Val Accuracy:  0.873794916739702
Val F1:  0.8740461427525286
Early Stopping Count:  0
Val CE:  0.2935763514167274
Val Accuracy:  0.908851884312007
Val F1:  0.9093743441372167
Early Stopping Count:  0
Val CE:  0.24909699719324122
Val Accuracy:  0.9219982471516214
Val F1:  0.9222461658976852
Early Stopping Count:  0
Val CE:  0.22485523329399637
Val Accuracy:  0.9421560035056967
Val F1:  0.9422667798986619
Early Stopping Count:  0
Val CE:  0.27061151348276424
Val Accuracy:  0.950920245398773
Val F1:  0.9508164530265452
Early Stopping Count:  1
Val CE:  0.3126270051924781
Val Accuracy:  0.9395267309377738
Val F1:  0.9395769927275428
Early Stopping Count:  2
Val CE:  0.3956307623597428
Val Accuracy:  0.9141104294478528
Val F1:  0.9153772351790376
Early Stopping Count:  3
Val CE:  0.3229977504189447
Val Accuracy:  0.9097283085013146
Val F1:

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.827396154978315
Val Accuracy:  0.5863277826468011
Val F1:  0.5552415406954002
Early Stopping Count:  0
Val CE:  0.5843799764602253
Val Accuracy:  0.7738825591586328
Val F1:  0.7748835220612309
Early Stopping Count:  0
Val CE:  0.3180313693991259
Val Accuracy:  0.9062226117440841
Val F1:  0.9058982444899082
Early Stopping Count:  0
Val CE:  0.23645053968473445
Val Accuracy:  0.9316389132340053
Val F1:  0.9319560146394178
Early Stopping Count:  0
Val CE:  0.21727057831405966
Val Accuracy:  0.9482909728308502
Val F1:  0.9484860416559263
Early Stopping Count:  0
Val CE:  0.20891801606762525
Val Accuracy:  0.9544259421560035
Val F1:  0.9544873204192077
Early Stopping Count:  1
Val CE:  0.21356558654127572
Val Accuracy:  0.9535495179666958
Val F1:  0.9536105141777391
Early Stopping Count:  2
Val CE:  0.20958271495978317
Val Accuracy:  0.9544259421560035
Val F1:  0.9544879931645626
Early Stopping Count:  3
Val CE:  0.21194466269402834
Val Accuracy:  0.9605609114811569
Val

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.736132757142382
Val Accuracy:  0.6765994741454864
Val F1:  0.6847146222777317
Early Stopping Count:  0
Val CE:  0.4489796206707708
Val Accuracy:  0.8518843120070114
Val F1:  0.8527537601886914
Early Stopping Count:  0
Val CE:  0.3312542209700468
Val Accuracy:  0.8930762489044698
Val F1:  0.8925642868291109
Early Stopping Count:  0
Val CE:  0.3093440648475229
Val Accuracy:  0.8957055214723927
Val F1:  0.8962617823234988
Early Stopping Count:  0
Val CE:  0.30659376183160686
Val Accuracy:  0.915863277826468
Val F1:  0.9159351541715671
Early Stopping Count:  0
Val CE:  0.38358367390532333
Val Accuracy:  0.9035933391761612
Val F1:  0.9044070712969221
Early Stopping Count:  1
Val CE:  0.29340588029206294
Val Accuracy:  0.9237510955302366
Val F1:  0.9233569482429574
Early Stopping Count:  0
Val CE:  0.35344839839209247
Val Accuracy:  0.9360210341805434
Val F1:  0.9362515676204652
Early Stopping Count:  0
Val CE:  0.25118142970299756
Val Accuracy:  0.9377738825591586
Val F

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.6859968065902081
Val Accuracy:  0.6993865030674846
Val F1:  0.6789918850294377
Early Stopping Count:  0
Val CE:  0.4083585449209556
Val Accuracy:  0.845749342681858
Val F1:  0.8448562983450243
Early Stopping Count:  0
Val CE:  0.3136683210769732
Val Accuracy:  0.9097283085013146
Val F1:  0.9108774344205713
Early Stopping Count:  0
Val CE:  0.2700405678043336
Val Accuracy:  0.9342681858019282
Val F1:  0.9346229114225628
Early Stopping Count:  0
Val CE:  0.2753878468712898
Val Accuracy:  0.9421560035056967
Val F1:  0.9425820939417551
Early Stopping Count:  1
Val CE:  0.2310410976462151
Val Accuracy:  0.9482909728308502
Val F1:  0.948249380156744
Early Stopping Count:  0
Val CE:  0.2632570993840877
Val Accuracy:  0.9421560035056967
Val F1:  0.9423697947805385
Early Stopping Count:  1
Val CE:  0.269761018956196
Val Accuracy:  0.9456617002629273
Val F1:  0.9458131978093104
Early Stopping Count:  2
Val CE:  0.2934999206984101
Val Accuracy:  0.9395267309377738
Val F1:  0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.7713949245565078
Val Accuracy:  0.6588235294117647
Val F1:  0.6452452893181859
Early Stopping Count:  0
Val CE:  0.42396717109725257
Val Accuracy:  0.8369747899159664
Val F1:  0.8335681139507558
Early Stopping Count:  0
Val CE:  0.29287950372119914
Val Accuracy:  0.8983193277310925
Val F1:  0.8973019134516168
Early Stopping Count:  0
Val CE:  0.23140282485665392
Val Accuracy:  0.9277310924369748
Val F1:  0.9275886437402731
Early Stopping Count:  0
Val CE:  0.21364081677268534
Val Accuracy:  0.9319327731092437
Val F1:  0.9322082574446968
Early Stopping Count:  0
Val CE:  0.2368074928919057
Val Accuracy:  0.9420168067226891
Val F1:  0.9414284014920427
Early Stopping Count:  0
Val CE:  0.2644346776842961
Val Accuracy:  0.9445378151260504
Val F1:  0.9445732491935557
Early Stopping Count:  1
Val CE:  0.24270558852483246
Val Accuracy:  0.9428571428571428
Val F1:  0.9424824614537834
Early Stopping Count:  2
Val CE:  0.20704684447841484
Val Accuracy:  0.9504201680672268
Va

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.8202627803097252
Val Accuracy:  0.6210084033613446
Val F1:  0.6290592265143612
Early Stopping Count:  0
Val CE:  0.5092212410534129
Val Accuracy:  0.7966386554621848
Val F1:  0.8001262930744343
Early Stopping Count:  0
Val CE:  0.3725060895711434
Val Accuracy:  0.8773109243697479
Val F1:  0.8795076341808591
Early Stopping Count:  0
Val CE:  0.30553414750875546
Val Accuracy:  0.9176470588235294
Val F1:  0.9186015228686596
Early Stopping Count:  0
Val CE:  0.23780798084423446
Val Accuracy:  0.9445378151260504
Val F1:  0.9446340447497323
Early Stopping Count:  0
Val CE:  0.2828846545342137
Val Accuracy:  0.9352941176470588
Val F1:  0.9355143807893342
Early Stopping Count:  1
Val CE:  0.29831250790710084
Val Accuracy:  0.9361344537815126
Val F1:  0.9364715355351744
Early Stopping Count:  2
Val CE:  0.30522149799943377
Val Accuracy:  0.9394957983193277
Val F1:  0.939679059154587
Early Stopping Count:  3
Val CE:  0.3065729468925895
Val Accuracy:  0.9436974789915966
Val F

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.8138530022957746
Val Accuracy:  0.6302521008403361
Val F1:  0.6395311769878476
Early Stopping Count:  0
Val CE:  0.5657204800293225
Val Accuracy:  0.7789915966386555
Val F1:  0.780810428752121
Early Stopping Count:  0
Val CE:  0.3959042639542027
Val Accuracy:  0.8747899159663866
Val F1:  0.8751551913829875
Early Stopping Count:  0
Val CE:  0.31045634515207854
Val Accuracy:  0.8966386554621849
Val F1:  0.897015145743355
Early Stopping Count:  0
Val CE:  0.35372550508805684
Val Accuracy:  0.907563025210084
Val F1:  0.9071122962108812
Early Stopping Count:  0
Val CE:  0.3640631645870935
Val Accuracy:  0.9302521008403362
Val F1:  0.9299242079068114
Early Stopping Count:  0
Val CE:  0.4081548671167688
Val Accuracy:  0.9193277310924369
Val F1:  0.9184201042712118
Early Stopping Count:  1
Val CE:  0.3166251351383804
Val Accuracy:  0.9294117647058824
Val F1:  0.9296678953176322
Early Stopping Count:  2
Val CE:  0.36777455941265247
Val Accuracy:  0.938655462184874
Val F1:  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.673767961474026
Val Accuracy:  0.7134453781512605
Val F1:  0.7188399371399079
Early Stopping Count:  0
Val CE:  0.41842037840049806
Val Accuracy:  0.8529411764705882
Val F1:  0.8557498643203333
Early Stopping Count:  0
Val CE:  0.3104764681105979
Val Accuracy:  0.9033613445378151
Val F1:  0.9044048528134572
Early Stopping Count:  0
Val CE:  0.22750980687413772
Val Accuracy:  0.9428571428571428
Val F1:  0.9432064130281774
Early Stopping Count:  0
Val CE:  0.2087395566069529
Val Accuracy:  0.9487394957983193
Val F1:  0.9487593016596484
Early Stopping Count:  0
Val CE:  0.24008035102248693
Val Accuracy:  0.9470588235294117
Val F1:  0.9469039093268672
Early Stopping Count:  1
Val CE:  0.2470633098319517
Val Accuracy:  0.9470588235294117
Val F1:  0.9470846408932355
Early Stopping Count:  2
Val CE:  0.25651922532349447
Val Accuracy:  0.9487394957983193
Val F1:  0.9490067718767958
Early Stopping Count:  3
Val CE:  0.24955924731843612
Val Accuracy:  0.9478991596638655
Val 

tokenizer_config.json:   0%|          | 0.00/311 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Token indices sequence length is longer than the specified maximum sequence length for this model (129 > 128). Running this sequence through the model will result in indexing errors


Train Size: 4568, Validation Size: 1141


config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-Roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  1.0446807773984597
Val Accuracy:  0.5048203330411919
Val F1:  0.3387029840788893
Early Stopping Count:  0
Val CE:  1.0455185449718907
Val Accuracy:  0.5048203330411919
Val F1:  0.3387029840788893
Early Stopping Count:  1
Val CE:  1.046448924685653
Val Accuracy:  0.5048203330411919
Val F1:  0.3387029840788893
Early Stopping Count:  2
Val CE:  1.0442368065861627
Val Accuracy:  0.5048203330411919
Val F1:  0.3387029840788893
Early Stopping Count:  3
Val CE:  1.0422367033051567
Val Accuracy:  0.5048203330411919
Val F1:  0.3387029840788893
Early Stopping Count:  4
Val CE:  1.0377451897085392
Val Accuracy:  0.5048203330411919
Val F1:  0.3387029840788893
Early Stopping Count:  5
Val CE:  1.0363764384668401
Val Accuracy:  0.5048203330411919
Val F1:  0.3387029840788893
Early Stopping Count:  6
Val CE:  1.0387883203056796
Val Accuracy:  0.5048203330411919
Val F1:  0.3387029840788893
Early Stopping Count:  7
Experiment 2 of 4:
Device assigned:  cuda


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Token indices sequence length is longer than the specified maximum sequence length for this model (129 > 128). Running this sequence through the model will result in indexing errors


Train Size: 4568, Validation Size: 1141


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-Roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.8585616183009303
Val Accuracy:  0.6170026292725679
Val F1:  0.6138012459978587
Early Stopping Count:  0
Val CE:  0.6487068275314585
Val Accuracy:  0.7423312883435583
Val F1:  0.7487118022525598
Early Stopping Count:  0
Val CE:  0.4699473983879992
Val Accuracy:  0.8247151621384751
Val F1:  0.8257396201554685
Early Stopping Count:  0
Val CE:  0.31896047295445834
Val Accuracy:  0.8965819456617002
Val F1:  0.8970392916766633
Early Stopping Count:  0
Val CE:  0.30688226779336786
Val Accuracy:  0.9106047326906223
Val F1:  0.911355829735364
Early Stopping Count:  0
Val CE:  0.28027352374651054
Val Accuracy:  0.9167397020157756
Val F1:  0.9177186536156103
Early Stopping Count:  0
Val CE:  0.2322501911864168
Val Accuracy:  0.9412795793163892
Val F1:  0.9413270795661332
Early Stopping Count:  0
Val CE:  0.219998252799279
Val Accuracy:  0.9465381244522348
Val F1:  0.9466578066323197
Early Stopping Count:  0
Val CE:  0.22237941114710258
Val Accuracy:  0.9500438212094654
Val F1

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Token indices sequence length is longer than the specified maximum sequence length for this model (129 > 128). Running this sequence through the model will result in indexing errors


Train Size: 4568, Validation Size: 1141


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-Roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.7148717717054536
Val Accuracy:  0.7063978965819456
Val F1:  0.7043403827124459
Early Stopping Count:  0
Val CE:  0.5119043467182741
Val Accuracy:  0.8194566170026293
Val F1:  0.8222044918028545
Early Stopping Count:  0
Val CE:  0.3977532289876528
Val Accuracy:  0.8781770376862401
Val F1:  0.8789371712853528
Early Stopping Count:  0
Val CE:  0.33101388835572654
Val Accuracy:  0.8992112182296231
Val F1:  0.8993155998764694
Early Stopping Count:  0
Val CE:  0.38361975851025526
Val Accuracy:  0.880806310254163
Val F1:  0.882448733571098
Early Stopping Count:  1
Val CE:  0.47500730053204177
Val Accuracy:  0.8501314636283961
Val F1:  0.8519085123050038
Early Stopping Count:  2
Val CE:  1.080957343032964
Val Accuracy:  0.4662576687116564
Val F1:  0.2965320738249865
Early Stopping Count:  3
Val CE:  1.068061079245508
Val Accuracy:  0.4662576687116564
Val F1:  0.2965320738249865
Early Stopping Count:  4
Val CE:  1.0692762629595063
Val Accuracy:  0.4662576687116564
Val F1:  

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Token indices sequence length is longer than the specified maximum sequence length for this model (129 > 128). Running this sequence through the model will result in indexing errors


Train Size: 4568, Validation Size: 1141


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-Roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4568 1141
Val CE:  0.7209962542698531
Val Accuracy:  0.6932515337423313
Val F1:  0.6914813790010621
Early Stopping Count:  0
Val CE:  0.4574301045103934
Val Accuracy:  0.8299737072743207
Val F1:  0.8307535049392122
Early Stopping Count:  0
Val CE:  0.30264785675712313
Val Accuracy:  0.9044697633654689
Val F1:  0.9046124400838854
Early Stopping Count:  0
Val CE:  0.3174146920259745
Val Accuracy:  0.9132340052585451
Val F1:  0.9135748076328227
Early Stopping Count:  1
Val CE:  0.2776401118423519
Val Accuracy:  0.9360210341805434
Val F1:  0.9358735319334318
Early Stopping Count:  0
Val CE:  0.2913593574803352
Val Accuracy:  0.936897458369851
Val F1:  0.9367987325033014
Early Stopping Count:  1
Val CE:  0.26162466135159246
Val Accuracy:  0.9447852760736196
Val F1:  0.9446840755396175
Early Stopping Count:  0
Val CE:  0.34246147997631615
Val Accuracy:  0.9281332164767747
Val F1:  0.9281261113453866
Early Stopping Count:  1
Val CE:  0.25954482134221013
Val Accuracy:  0.9386503067484663
Val F

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Token indices sequence length is longer than the specified maximum sequence length for this model (150 > 128). Running this sequence through the model will result in indexing errors


Train Size: 4762, Validation Size: 1190


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-Roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.6255399870772321
Val Accuracy:  0.7394957983193278
Val F1:  0.7346238641718076
Early Stopping Count:  0
Val CE:  0.5113187166823059
Val Accuracy:  0.8302521008403362
Val F1:  0.8326742352560709
Early Stopping Count:  0
Val CE:  0.31948210344094186
Val Accuracy:  0.8949579831932774
Val F1:  0.8953298193127092
Early Stopping Count:  0
Val CE:  0.26161271469871034
Val Accuracy:  0.9243697478991597
Val F1:  0.9249051069633443
Early Stopping Count:  0
Val CE:  0.28561358138637133
Val Accuracy:  0.9243697478991597
Val F1:  0.9248587599681065
Early Stopping Count:  1
Val CE:  0.22448013534127664
Val Accuracy:  0.9487394957983193
Val F1:  0.9486218354350459
Early Stopping Count:  0
Val CE:  0.28793499975097403
Val Accuracy:  0.9327731092436975
Val F1:  0.9328511697361872
Early Stopping Count:  1
Val CE:  0.28449264173943767
Val Accuracy:  0.9420168067226891
Val F1:  0.9421001326675085
Early Stopping Count:  2
Val CE:  0.23511311254393655
Val Accuracy:  0.9512605042016806
V

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Token indices sequence length is longer than the specified maximum sequence length for this model (150 > 128). Running this sequence through the model will result in indexing errors


Train Size: 4762, Validation Size: 1190


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-Roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.8208955647564736
Val Accuracy:  0.6495798319327731
Val F1:  0.6308955536437622
Early Stopping Count:  0
Val CE:  0.5827912825496256
Val Accuracy:  0.7722689075630252
Val F1:  0.7762795847668772
Early Stopping Count:  0
Val CE:  0.3772324567582427
Val Accuracy:  0.8705882352941177
Val F1:  0.8710518350301928
Early Stopping Count:  0
Val CE:  0.2664476354645581
Val Accuracy:  0.9201680672268907
Val F1:  0.9205557143369489
Early Stopping Count:  0
Val CE:  0.23221198843181634
Val Accuracy:  0.9361344537815126
Val F1:  0.9362122086220758
Early Stopping Count:  0
Val CE:  0.18979452347398557
Val Accuracy:  0.9571428571428572
Val F1:  0.9571148563475931
Early Stopping Count:  0
Val CE:  0.16933161314545558
Val Accuracy:  0.9571428571428572
Val F1:  0.9569470191325392
Early Stopping Count:  0
Val CE:  0.1748020256180786
Val Accuracy:  0.9613445378151261
Val F1:  0.961143462166176
Early Stopping Count:  1
Val CE:  0.1749661622381135
Val Accuracy:  0.9588235294117647
Val F1

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Token indices sequence length is longer than the specified maximum sequence length for this model (150 > 128). Running this sequence through the model will result in indexing errors


Train Size: 4762, Validation Size: 1190


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-Roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  1.0387317994061638
Val Accuracy:  0.5100840336134453
Val F1:  0.3445976721239414
Early Stopping Count:  0
Val CE:  1.0387891066174548
Val Accuracy:  0.5100840336134453
Val F1:  0.3445976721239414
Early Stopping Count:  1
Val CE:  1.0467033841028934
Val Accuracy:  0.5100840336134453
Val F1:  0.3445976721239414
Early Stopping Count:  2
Val CE:  0.8900371678737031
Val Accuracy:  0.6327731092436975
Val F1:  0.6337431072347818
Early Stopping Count:  0
Val CE:  0.8402870269382701
Val Accuracy:  0.692436974789916
Val F1:  0.6988555346524242
Early Stopping Count:  0
Val CE:  0.8263049723721352
Val Accuracy:  0.6823529411764706
Val F1:  0.630303728214043
Early Stopping Count:  0
Val CE:  0.737767300385387
Val Accuracy:  0.7436974789915967
Val F1:  0.7383773578978315
Early Stopping Count:  0
Val CE:  1.1318598140187623
Val Accuracy:  0.5100840336134453
Val F1:  0.3445976721239414
Early Stopping Count:  1
Val CE:  1.0332829848057081
Val Accuracy:  0.5100840336134453
Val F1:  0.

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Token indices sequence length is longer than the specified maximum sequence length for this model (150 > 128). Running this sequence through the model will result in indexing errors


Train Size: 4762, Validation Size: 1190


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-Roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4762 1190
Val CE:  0.707692256043939
Val Accuracy:  0.7058823529411765
Val F1:  0.7117893671897705
Early Stopping Count:  0
Val CE:  0.46061716039641565
Val Accuracy:  0.826890756302521
Val F1:  0.8286385872992507
Early Stopping Count:  0
Val CE:  0.3610503529300209
Val Accuracy:  0.8890756302521008
Val F1:  0.890189658511508
Early Stopping Count:  0
Val CE:  0.3790861420270776
Val Accuracy:  0.8857142857142857
Val F1:  0.8871719041137832
Early Stopping Count:  1
Val CE:  0.2693337010736225
Val Accuracy:  0.9361344537815126
Val F1:  0.9365425140357287
Early Stopping Count:  0
Val CE:  0.2494968156122892
Val Accuracy:  0.9411764705882353
Val F1:  0.9413310804266106
Early Stopping Count:  0
Val CE:  0.2408627792785899
Val Accuracy:  0.9445378151260504
Val F1:  0.9446201900128277
Early Stopping Count:  1
Val CE:  0.24390144961721757
Val Accuracy:  0.9487394957983193
Val F1:  0.9488100681442528
Early Stopping Count:  0
Val CE:  0.23673662987435942
Val Accuracy:  0.9529411764705882
Val F1: 

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

Train Size: 4568, Validation Size: 1141


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

4568 1141
Val CE:  0.5056246235431233
Val Accuracy:  0.8071866783523225
Val F1:  0.8076877107593649
Early Stopping Count:  0
Val CE:  0.2631292907320335
Val Accuracy:  0.9184925503943909
Val F1:  0.9187903889360027
Early Stopping Count:  0
Val CE:  0.25387894813425177
Val Accuracy:  0.9412795793163892
Val F1:  0.9412813947764086
Early Stopping Count:  0
Val CE:  0.2681989641313251
Val Accuracy:  0.9421560035056967
Val F1:  0.9421353838467235
Early Stopping Count:  1
Val CE:  0.3063881880473949
Val Accuracy:  0.9386503067484663
Val F1:  0.9385971144395652
Early Stopping Count:  2
Val CE:  0.33762269671500306
Val Accuracy:  0.9395267309377738
Val F1:  0.9395314743540556
Early Stopping Count:  3
Val CE:  0.3295049249838571
Val Accuracy:  0.9316389132340053
Val F1:  0.9312111714191436
Early Stopping Count:  4
Val CE:  0.284946071602711
Val Accuracy:  0.9430324276950044
Val F1:  0.943130680122113
Early Stopping Count:  5
Val CE:  0.3447395204374604
Val Accuracy:  0.9430324276950044
Val F1: 