In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import torch
from torch import tensor
import torch.nn.functional as F
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer
from typing import Tuple
from datasets import load_dataset, Dataset
from tqdm import tqdm
model_name = 'microsoft/deberta-v3-large'
output_dir = "data/"


data = pd.read_csv(output_dir + "finance_sentiment.csv")
data_sample = data.sample(n=3000, random_state=42)

In [2]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = model.config.max_position_embeddings
print("model parameters:" + str(sum(p.numel() for p in model.parameters())))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model parameters:435063810




In [3]:
labels = data_sample["label"].tolist()
labels = [0 if x == 0 else 1 for x in labels] # Labels: 0 -> Negative; 1 -> Positive
# convert labels to one hot vectors
labels = np.eye(2)[labels]
labels

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [4]:

train_data,val_data, train_labels, val_labels = train_test_split(data_sample["text"], labels, test_size=1000/len(data_sample), random_state=42)
dataset = Dataset.from_list([{'text': text, 'labels': label} for text, label in zip(train_data, train_labels)])
val_dataset = Dataset.from_list([{'text': text, 'labels': label} for text, label in zip(val_data, val_labels)])

def tokenize_function(examples):
    return tokenizer(examples['text'])

dataset = dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
dataset

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [5]:
from transformers import TrainerCallback, TrainerState, TrainerControl,training_args

class CustomCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        # Assuming the evaluation dataset has 'labels' and 'predictions' fields
        eval_dataloader = kwargs['eval_dataloader']
        model = kwargs['model']
        tokenizer = kwargs['tokenizer']
        
        model.eval()
        correct = 0
        total = 0
        
        for batch in eval_dataloader:
            inputs = batch['input_ids'].to(args.device)
            labels = batch['labels'].to(args.device)
            
            with torch.no_grad():
                outputs = model(inputs)
                predictions = torch.argmax(outputs.logits, dim=-1)
            
            labels = torch.argmax(labels, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
        
        accuracy = correct / total
        print(f"Evaluation Accuracy: {accuracy:.4f}")



trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim=training_args.OptimizerNames.ADAMW_TORCH,
        learning_rate=5e-5,
        weight_decay=0.001,
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir="outputs",
        num_train_epochs=1,
        # report_to="wandb",
        report_to="none",
        group_by_length=True,
        evaluation_strategy="steps",
        eval_steps=20,
    ),
    callbacks=[CustomCallback()],
)
trainer_stats = trainer.train()

Step,Training Loss,Validation Loss
20,0.6935,0.659165
40,0.4187,0.253105
60,0.2755,0.22252


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Evaluation Accuracy: 0.6700
Evaluation Accuracy: 0.9260
Evaluation Accuracy: 0.9420


In [6]:
torch.save(model, output_dir + "bert.pt")
torch.save(tokenizer, output_dir + "bert_tokenizer.pt")

In [7]:
# model = torch.load(output_dir + "bert.pt")
# tokenizer = torch.load(output_dir + "bert_tokenizer.pt")

In [8]:
model = model.cuda()
model = model.eval()

In [9]:
with torch.no_grad():
    batch = val_data[:50].tolist()
    tokens = tokenizer(batch, padding=True, return_tensors="pt")
    tokens = {k: v.cuda() for k, v in tokens.items()}
    output = model(**tokens)
    logits = output[0].cpu()
    scores = F.softmax(logits, dim=1)[:,1]
    for i, text in enumerate(batch):
        print(f"Text: {text}\nScore: {scores[i].item()}\n")

Text: $OLLI - Ollie's Bargain Outlet EPS beats by $0.03, beats on revenue https://t.co/hgb3KCVBXc
Score: 0.9726799726486206

Text: Twitter Beats Revenue, User Growth Estimates in Fourth Quarter
Score: 0.971839427947998

Text: Highlight: “There’s going to be a bloodbath in terms of synergy savings and retrenchment…” @InvescoUS's… https://t.co/YPgyDW3rMX
Score: 0.06798333674669266

Text: $XLF $FAS $FAZ - Banks may face legal actions over margin calls - FT https://t.co/SAACWM7yNa
Score: 0.06219761073589325

Text: $USA $CRF $SCHX - It's time to buy stocks - Morgan Stanley's Wilson https://t.co/sk5Ll4yTei
Score: 0.9677207469940186

Text: $WTRH back over $1
Score: 0.9764747023582458

Text: Oil boosted by renewed hopes for global production cut https://t.co/4tAO1U31nz
Score: 0.9700902700424194

Text: $OIBR.C - Oi S.A. Is Transforming Into A Leading Telecom Infrastructure Wholesaler For 5G In Brazil. Sign up for up… https://t.co/XQk239OsBs
Score: 0.9704325199127197

Text: China to Suspend Addi