## FinBERT

In [1]:
import re
import torch
import gradio as gr
import numpy as np
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix


In [2]:

def clean_text(text):
    """Remove non-ASCII chars (keep x20~x7E)."""
    return re.sub(r"[^\x20-\x7E]", "", text)


file_path = "/Users/tim/Desktop/self-learning/LLM_tune/all-data.csv"


raw_dataset = load_dataset(
    "csv",
    data_files={"full": file_path},
    encoding="ISO-8859-1",
    column_names=["sentiment", "text"]
)["full"]

print("=== Raw dataset example ===")
print(raw_dataset[0])


def map_label_for_finbert(example):

    old_label = example["sentiment"]
    text      = example["text"]  
    str2num = {"negative": 0, "neutral": 1, "positive": 2}
    old_num = str2num.get(old_label, 1)  # default to 1 if something odd
    old2new = {0: 1, 1: 2, 2: 0}
    new_label = old2new[old_num]
    text_cleaned = clean_text(text)
    example["text"]   = text_cleaned
    example["labels"] = new_label
    return example

mapped_dataset = raw_dataset.map(map_label_for_finbert)

# split (80% train, 10% val, 10% test)
split_dataset = mapped_dataset.train_test_split(test_size=0.2, seed=42)
train_data    = split_dataset["train"]
temp_data     = split_dataset["test"]

temp_split = temp_data.train_test_split(test_size=0.5, seed=42)
valid_data = temp_split["train"]
test_data  = temp_split["test"]

print("\n=== Final splits ===")
print("Train rows:", len(train_data))
print("Valid rows:", len(valid_data))
print("Test rows: ", len(test_data))

=== Raw dataset example ===
{'sentiment': 'neutral', 'text': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'}


Map:   0%|          | 0/4846 [00:00<?, ? examples/s]


=== Final splits ===
Train rows: 3876
Valid rows: 485
Test rows:  485


In [3]:
model_name = "ProsusAI/finbert"
tokenizer  = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    out = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )
    out["labels"] = examples["labels"]
    return out

train_tokenized = train_data.map(tokenize_function, batched=True)
valid_tokenized = valid_data.map(tokenize_function, batched=True)
test_tokenized  = test_data.map(tokenize_function,  batched=True)

print("\n=== Tokenized example (train) ===")
print(train_tokenized[0])

Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]


=== Tokenized example (train) ===
{'sentiment': 'neutral', 'text': "Under the terms of the agreement , Bunge will acquire Raisio 's Keiju , Makuisa and Pyszny Duet brands and manufacturing plants in Finland and Poland .", 'labels': 2, 'input_ids': [101, 2104, 1996, 3408, 1997, 1996, 3820, 1010, 21122, 3351, 2097, 9878, 15547, 20763, 1005, 1055, 26679, 9103, 1010, 5003, 5283, 14268, 1998, 1052, 7274, 2480, 4890, 11979, 9639, 1998, 5814, 4264, 1999, 6435, 1998, 3735, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [4]:
# FinBERT originally: 0=pos,1=neg,2=neu
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3, 
)

# We'll do an "inference-only" Trainer to measure baseline
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
baseline_model.to(device)
print("Using device =>", device)

def baseline_compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc   = accuracy_score(labels, preds)
    f1    = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

inference_args = TrainingArguments(
    output_dir="./finbert_baseline_check",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=8,
    logging_steps=10,
)

baseline_trainer = Trainer(
    model=baseline_model,
    args=inference_args,
    compute_metrics=baseline_compute_metrics
)

print("\n=== Evaluating Baseline FinBERT (no fine-tune) on Test set ===")
baseline_results = baseline_trainer.evaluate(eval_dataset=test_tokenized)
print("Baseline =>", baseline_results)

pred_output = baseline_trainer.predict(test_tokenized)
pred_labels = np.argmax(pred_output.predictions, axis=-1)
true_labels = pred_output.label_ids

cm = confusion_matrix(true_labels, pred_labels)
print("\n=== Confusion Matrix (Baseline) ===")
print(cm)
print("\nReminder: indices 0=pos,1=neg,2=neu. We mapped your data so 0->pos,1->neg,2->neu.\n")

Using device => mps

=== Evaluating Baseline FinBERT (no fine-tune) on Test set ===


Baseline => {'eval_loss': 0.33012115955352783, 'eval_model_preparation_time': 0.0013, 'eval_accuracy': 0.8865979381443299, 'eval_f1': 0.8879352378109027, 'eval_runtime': 14.9928, 'eval_samples_per_second': 32.349, 'eval_steps_per_second': 4.069}

=== Confusion Matrix (Baseline) ===
[[131   4   8]
 [  1  57   2]
 [ 30  10 242]]

Reminder: indices 0=pos,1=neg,2=neu. We mapped your data so 0->pos,1->neg,2->neu.



In [5]:
finetune_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)
finetune_model.to(device)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc   = accuracy_score(labels, preds)
    f1    = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

training_args = TrainingArguments(
    output_dir="./finbert-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    learning_rate=3e-5,
    warmup_steps=500,
    fp16=False,   # for MPS, typically keep fp16=False
    push_to_hub=False,
    logging_steps=50,
)

trainer = Trainer(
    model=finetune_model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.0
    )]
)

print("\n=== Fine-tuning FinBERT on your data (mapped to 0=pos,1=neg,2=neu) ===")
train_result = trainer.train()
finetune_model.save_pretrained("./finbert-finetuned")
tokenizer.save_pretrained("./finbert-finetuned")




=== Fine-tuning FinBERT on your data (mapped to 0=pos,1=neg,2=neu) ===


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4043,0.562838,0.868041,0.868326
2,0.4119,0.599194,0.868041,0.868962
3,0.1408,0.688088,0.863918,0.865336
4,0.1714,0.765658,0.882474,0.881439
5,0.0127,0.843127,0.872165,0.8731
6,0.0475,0.935173,0.868041,0.86687


('./finbert-finetuned/tokenizer_config.json',
 './finbert-finetuned/special_tokens_map.json',
 './finbert-finetuned/vocab.txt',
 './finbert-finetuned/added_tokens.json',
 './finbert-finetuned/tokenizer.json')

In [6]:
test_results = trainer.evaluate(eval_dataset=test_tokenized)
print("\n=== Test set evaluation (Fine-tuned) ===")
print(test_results)

pred_output_ft = trainer.predict(test_tokenized)
pred_labels_ft = np.argmax(pred_output_ft.predictions, axis=-1)
true_labels_ft = pred_output_ft.label_ids

cm_ft = confusion_matrix(true_labels_ft, pred_labels_ft)
print("\n=== Confusion Matrix (Fine-tuned) ===")
print(cm_ft)
print("\nIndices: 0=pos,1=neg,2=neu.\n")


=== Test set evaluation (Fine-tuned) ===
{'eval_loss': 0.7654902338981628, 'eval_accuracy': 0.8721649484536083, 'eval_f1': 0.8720654105936723, 'eval_runtime': 14.1292, 'eval_samples_per_second': 34.326, 'eval_steps_per_second': 8.635, 'epoch': 6.0}

=== Confusion Matrix (Fine-tuned) ===
[[121   2  20]
 [  6  47   7]
 [ 20   7 255]]

Indices: 0=pos,1=neg,2=neu.



In [7]:
label_map = {0: "positive", 1: "negative", 2: "neutral"}

def predict_sentiment(text):
    # We'll use the fine-tuned model
    inputs = tokenizer(
        text, return_tensors="pt",
        truncation=True, padding="max_length", max_length=512
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = finetune_model(**inputs)
        pred_label_idx = outputs.logits.argmax(dim=-1).item()
    return label_map[pred_label_idx]

demo = gr.Interface(
    fn=predict_sentiment,
    inputs="text",
    outputs="text",
    title="FinBERT Sentiment Analysis (Fine-tuned)",
    description="Indices: 0=positive,1=negative,2=neutral. Data was remapped so baseline aligns with FinBERT."
)
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7865


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on public URL: https://90caeb0f58900a3be9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


