In this notebook we test the hypothesis that pretraining a model on domain-specific data improves its performance when it is subsequently fine-tuned on a downstream task.

The three models we'll be using are:
1. FinBERT 2 v_1
2. FinBERT 2 v_2
3. DistilBERT

Note that the FinBERT models are based on the DistilBERT model.
The evaluation metrics we'll use are accuracy, cross-entropy loss, and macro-averaged F1.

In [None]:
from datasets import Dataset, load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, Pipeline
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub (`login` is imported
# from `huggingface_hub` above).
# NOTE(review): presumably required for the private/gated checkpoints loaded
# further down — confirm which repositories actually need a token.
login()

In [None]:
# Load the 'sentences_50agree' configuration of the Financial PhraseBank
# dataset from the Hugging Face Hub.
finetune_ds = load_dataset('takala/financial_phrasebank', 'sentences_50agree')

# An empty string is not a resolvable model identifier, so `from_pretrained('')`
# cannot succeed. Use the DistilBERT baseline named in the introduction.
# NOTE(review): confirm this is the checkpoint that was intended here.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenized_inputs(col, tokenizer):
    """Tokenize the ``'sentence'`` field of ``col`` with ``tokenizer``.

    Args:
        col: Mapping that provides the text under the key ``'sentence'``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding="max_length", truncation=True)``.

    Returns:
        Whatever ``tokenizer`` returns, unchanged.
    """
    sentence_text = col['sentence']
    encoded = tokenizer(sentence_text, padding="max_length", truncation=True)
    return encoded

# `map` calls the function with the example only, so the second parameter of
# `tokenized_inputs` has to be supplied through `fn_kwargs`; without it every
# call raises TypeError for the missing `tokenizer` argument.
finetune_ds = finetune_ds.map(tokenized_inputs, fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

In [None]:
def _make_config(name, checkpoint):
    """Bundle a display name, a checkpoint id and the tokenizer loaded from it."""
    return {
        'name': name,
        'model': checkpoint,
        'tokenizer': AutoTokenizer.from_pretrained(checkpoint),
    }


# One entry per model under comparison. Tokenizers are loaded eagerly, so an
# unresolvable checkpoint id fails right here.
# NOTE(review): the recorded output of this cell is an OSError for
# 'Czunzun/finBERT2_v2' — confirm the repository id and access rights.
configs = [
    _make_config('bert_base', 'google-bert/bert-base-uncased'),
    _make_config('finBERT2_v2', 'Czunzun/finBERT2_v2'),
]

# Maps each config name to the dataset encoded with that model's tokenizer.
tokenized_dataset = {}

for config in configs:
    def _encode(row):
        """Tokenize one example's sentence with the current config's tokenizer."""
        return config['tokenizer'](row['sentence'], padding="max_length", truncation=True)

    tokenized_dataset[config['name']] = finetune_ds.map(_encode)

OSError: Czunzun/finBERT2_v2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
#Setting up the training arguments

def training_step(config, test_size=0.2, seed=42):
    """Fine-tune one configured checkpoint for sequence classification.

    Args:
        config: Entry of ``configs``; the keys ``'name'``, ``'model'``
            (checkpoint id) and ``'tokenizer'`` are read.
        test_size: Fraction of the examples held out for evaluation.
        seed: Seed for the train/eval split, so that every model is trained
            and evaluated on the same partition.

    Returns:
        The ``Trainer`` instance after ``train()`` has finished, so the caller
        can run ``evaluate()`` / ``predict()`` on it.
    """
    from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

    # Train on the copy encoded with this model's own tokenizer rather than
    # the shared `finetune_ds`, whose token ids come from a different vocabulary.
    encoded = tokenized_dataset[config['name']]['train']

    # No 'eval' split is created anywhere in the notebook, so hold one out here.
    # Stratifying keeps the class proportions equal in both parts.
    splits = encoded.train_test_split(
        test_size=test_size,
        seed=seed,
        stratify_by_column='label',
    )

    data_collator = DataCollatorWithPadding(tokenizer=config['tokenizer'])

    # Size the classification head from the dataset's label feature; the
    # library default is a 2-class head.
    num_labels = encoded.features['label'].num_classes
    model = AutoModelForSequenceClassification.from_pretrained(
        config['model'],
        num_labels=num_labels,
    )

    training_args = TrainingArguments(
        # Separate directory per model so checkpoints of different runs do
        # not overwrite each other.
        output_dir=f"./results/{config['name']}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
    )

    # NOTE(review): recent transformers releases rename `tokenizer=` to
    # `processing_class=` — confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        tokenizer=config['tokenizer'],
        train_dataset=splits['train'],
        eval_dataset=splits['test'],
    )
    trainer.train()
    return trainer
    