## FinBERT

In [1]:
import os
import re

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, AutoTokenizer, AutoModelForSequenceClassification


In [2]:
# Text cleaning: keep only printable ASCII (space 0x20 through tilde 0x7E).
# Non-ASCII characters and ASCII control characters (tabs, newlines, DEL) are dropped.
def clean_text(text):
    """Return `text` with every character outside 0x20-0x7E removed."""
    return "".join(ch for ch in text if "\x20" <= ch <= "\x7e")

# Load CSV
# The location can be overridden with the FINBERT_DATA_CSV environment variable
# so the notebook runs on other machines; the default is the original local path.
file_path = os.environ.get(
    "FINBERT_DATA_CSV",
    "/Users/tim/Desktop/self-learning/LLM_tune/all-data.csv",
)

# The file is read as ISO-8859-1 and its two columns are named explicitly
# via `column_names`.
dataset = load_dataset(
    "csv",
    data_files={"full": file_path},
    encoding="ISO-8859-1",
    column_names=["sentiment", "text"]
)["full"]

print("Dataset loaded. First example:")
print(dataset[0])

# Label mapping & cleaning
def map_label_and_clean(example):
    """Clean one row's text and attach its integer class id.

    The row's "text" is replaced by `clean_text(text)` and a new "labels"
    field is added: negative -> 0, neutral -> 1, positive -> 2.
    A sentiment value outside those three raises KeyError.
    """
    label_ids = dict(negative=0, neutral=1, positive=2)
    example["text"] = clean_text(example["text"])
    example["labels"] = label_ids[example["sentiment"]]
    return example

dataset = dataset.map(map_label_and_clean)

# Split data (80% Train, 10% Val, 10% Test):
# first hold out 20%, then cut that hold-out in half. Both cuts use seed 42.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, temp_dataset = split_dataset["train"], split_dataset["test"]

temp_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
validation_dataset, test_dataset = temp_split["train"], temp_split["test"]

print("\nDataset split:")
for split_label, split_rows in (
    ("Train rows:", train_dataset),
    ("Validation rows:", validation_dataset),
    ("Test rows:", test_dataset),
):
    print(split_label, len(split_rows))

Dataset loaded. First example:
{'sentiment': 'neutral', 'text': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'}

Dataset split:
Train rows: 3876
Validation rows: 485
Test rows: 485


In [3]:
# Load the FinBERT tokenizer.
# `model_name` is the checkpoint identifier; the same name is used later to load the model.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of rows for the classifier.

    Every text in `examples["text"]` is truncated and padded to exactly
    512 tokens, and the batch's "labels" are copied onto the encoded output.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    encoded["labels"] = examples["labels"]
    return encoded

# Tokenize each split in batched mode, in train / validation / test order.
train_tokenized, validation_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset, test_dataset)
)

print("\nTokenized train first example:")
print(train_tokenized[0])



Tokenized train first example:
{'sentiment': 'neutral', 'text': "Under the terms of the agreement , Bunge will acquire Raisio 's Keiju , Makuisa and Pyszny Duet brands and manufacturing plants in Finland and Poland .", 'labels': 1, 'input_ids': [101, 2104, 1996, 3408, 1997, 1996, 3820, 1010, 21122, 3351, 2097, 9878, 15547, 20763, 1005, 1055, 26679, 9103, 1010, 5003, 5283, 14268, 1998, 1052, 7274, 2480, 4890, 11979, 9639, 1998, 5814, 4264, 1999, 6435, 1998, 3735, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [4]:
# Load the FinBERT model with a 3-class sequence-classification head.
# The label names are passed explicitly so that the config saved alongside the
# fine-tuned weights agrees with the ids used in this notebook
# (0 = negative, 1 = neutral, 2 = positive). Without this the saved config keeps
# whatever id-to-label mapping ships with the checkpoint.
# NOTE(review): the published checkpoint is believed to use a different order
# (positive / negative / neutral) - confirm against its config.json.
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    arg-max over the last logits axis. Returns a dict with plain accuracy
    under "accuracy" and the weighted-average F1 under "f1".
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "f1": f1_score(labels, predicted_ids, average="weighted"),
    }

In [5]:
# Training parameters
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir="./finbert-finetuned-es",
    push_to_hub=False,
    # Optimisation schedule and batch sizes.
    num_train_epochs=10,
    learning_rate=3e-5,
    warmup_steps=500,
    #weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=False,  # Disable for CPU/MPS
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the highest "f1" when training ends.
    # NOTE(review): newer transformers releases spell the first option
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Log every 50 steps.
    logging_steps=50,
)

# Trainer with Early Stopping
# Patience of 2 with a zero threshold: training stops once the monitored metric
# has failed to improve for 2 evaluations in a row.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Trainer initialised with EarlyStopping & hyperparam adjustments.")




Trainer initialised with EarlyStopping & hyperparam adjustments.


In [6]:
# Fine-tuning
train_result = trainer.train()

# Save the model and tokenizer side by side in the same directory.
# The final expression is left bare so the cell still displays the saved tokenizer files.
best_model_dir = "./finbert-finetuned-es"
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4578,0.613873,0.847423,0.847851
2,0.4822,0.587523,0.861856,0.861591
3,0.1185,0.689293,0.865979,0.868914
4,0.102,0.860924,0.872165,0.870521
5,0.0463,0.833237,0.863918,0.865505
6,0.0873,0.913344,0.872165,0.870511


('./finbert-finetuned-es/tokenizer_config.json',
 './finbert-finetuned-es/special_tokens_map.json',
 './finbert-finetuned-es/vocab.txt',
 './finbert-finetuned-es/added_tokens.json',
 './finbert-finetuned-es/tokenizer.json')

In [7]:
# Evaluate on test set
# A single `predict` pass yields both the metrics and the raw predictions, so
# the test set is no longer run through the model twice. The "eval" prefix
# keeps the metric keys (eval_loss, eval_accuracy, eval_f1, ...) unchanged.
predictions = trainer.predict(test_tokenized, metric_key_prefix="eval")
test_results = predictions.metrics
print("\nTest set evaluation results:")
print(test_results)

# Confusion matrix
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

# Fix the label order (0=negative, 1=neutral, 2=positive) so the matrix is
# always 3x3, even if a class is absent from both the labels and the predictions.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1, 2])
print("\nConfusion Matrix:")
print(cm)


Test set evaluation results:
{'eval_loss': 0.8068486452102661, 'eval_accuracy': 0.8701030927835052, 'eval_f1': 0.8694192576283578, 'eval_runtime': 16.8677, 'eval_samples_per_second': 28.753, 'eval_steps_per_second': 7.233, 'epoch': 6.0}

Confusion Matrix:
[[ 49  10   1]
 [  3 258  21]
 [  5  23 115]]


In [8]:
# Gradio Interface

# Pick the best available device instead of assuming Apple-silicon MPS, so the
# demo also runs on CUDA or CPU-only machines. MPS is tried first, which keeps
# the original behaviour wherever it is available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
model.eval()  # inference mode: disables dropout so predictions are deterministic

# Labels & inference function
# Index i is the name of class id i (same order as the training label mapping).
label_names = ["negative", "neutral", "positive"]

def predict_sentiment(text):
    """Classify one piece of text and return its sentiment label.

    Args:
        text: Raw input string from the UI. `None` is treated as an empty string.

    Returns:
        One of the entries of `label_names`, chosen by the arg-max of the logits.
    """
    # Apply the same cleaning the training data went through, so the model sees
    # inference text in the form it was fine-tuned on.
    cleaned = clean_text(text or "")

    # A single sequence needs no padding; truncation still caps it at 512 tokens.
    inputs = tokenizer(
        cleaned,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    return label_names[logits.argmax(dim=-1).item()]

# Create Gradio interface: one text box in, one text box out, backed by predict_sentiment.
interface_options = dict(
    fn=predict_sentiment,
    inputs="text",
    outputs="text",
    title="FinBERT Sentiment Analysis",
)
demo = gr.Interface(**interface_options)

# Launch interface
# share=True also requests a public share link in addition to the local URL.
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://b991a220f0ee878399.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


