In [1]:
from datasets import load_dataset

dataset = load_dataset("takala/financial_phrasebank", "sentences_allagree")
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})

In [2]:
data = dataset['train'].shuffle(seed = 42)

split = data.train_test_split(test_size = 0.3)
train_data = split['train'] # training data 70%
temp = split['test'] # temporary data to split further

test_split = temp.train_test_split(test_size = 0.5)
val_data = test_split['train'] # validation data 15%
test_data = test_split['test'] # testing data 15%

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'yiyanghkust/finbert-tone'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)



In [4]:
# Tokenizing text

def preprocess(examples):
    return tokenizer(examples["sentence"], padding = "max_length", truncation = True)

train_tokenized = train_data.map(preprocess, batched = True, batch_size = 32)
val_tokenized = val_data.map(preprocess, batched = True, batch_size = 32)
test_tokenized = test_data.map(preprocess, batched = True, batch_size = 32)

# Set format for PyTorch -> to train with PyTorch

train_tokenized.set_format("torch", columns = ["input_ids", "attention_mask", "label"])
val_tokenized.set_format("torch", columns = ["input_ids", "attention_mask", "label"])
test_tokenized.set_format("torch", columns = ["input_ids", "attention_mask", "label"])

Map:   0%|                       | 0/1584 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1584/1584 [00:00<00:00, 6391.81 examples/s]
Map: 100%|████████████| 340/340 [00:00<00:00, 5937.30 examples/s]
Map: 100%|████████████| 340/340 [00:00<00:00, 6148.84 examples/s]


In [5]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = '../models/finbert_finetuned1',
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    logging_dir = '../logs',
    load_best_model_at_end = True
)