In [None]:
# Mount Google Drive to save model
# `google.colab` is provided by the Colab runtime, so this cell is expected to run there.
# The drive is mounted at /content/drive; the save cell writes the fine-tuned model
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Install necessary libraries
!pip install transformers datasets torch scikit-learn pandas numpy yfinance snscrape


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting snscrape
  Downloading snscrape-0.7.0.20230622-py3-none-any.whl.metadata (4.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1

In [None]:
# Imports
import re
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch


In [None]:
# Load dataset
# Fetches the financial-tweet sentiment corpus by its Hugging Face Hub id.
dataset = load_dataset(path="zeroshot/twitter-financial-news-sentiment")

# Cleaning function
# Abbreviation -> long form. Matching is case-insensitive, so lowercase words that
# happen to spell an abbreviation (e.g. "mom", "pt") are expanded as well.
finance_abbreviations = {
    "PT": "Price Target", "EPS": "Earnings Per Share",
    "QoQ": "Quarter over Quarter", "YoY": "Year over Year",
    "MoM": "Month over Month", "FY": "Fiscal Year",
    "P/E": "Price-to-Earnings Ratio", "EBITDA": "Earnings Before Interest, Taxes, Depreciation, and Amortization"
}


def _expand_abbreviations(text, abbreviations):
    """Replace every standalone occurrence of each abbreviation with its long form."""
    for abbr, full_form in abbreviations.items():
        # re.escape keeps regex metacharacters in the abbreviation literal, and the
        # lookarounds behave like \b for alphanumeric edges while still working for
        # abbreviations that start or end with punctuation (e.g. "U.S.").
        pattern = rf"(?<!\w){re.escape(abbr)}(?!\w)"
        # A callable replacement inserts the long form verbatim; a plain string
        # would be parsed as a template and choke on backslashes.
        text = re.sub(pattern, lambda _match, repl=full_form: repl, text, flags=re.IGNORECASE)
    return text


def clean_financial_tweet(text, abbreviations=None):
    """Normalise a financial tweet for tokenisation.

    Steps, in order: replace URLs with a "[URL]" placeholder, strip the leading
    "#" / "$" from hashtags and cashtags, drop @mentions, expand finance
    abbreviations, collapse repeated "!", "?", "." and "," into one, lowercase,
    and squeeze whitespace.

    Args:
        text: The raw tweet. Non-string values are returned unchanged.
        abbreviations: Optional mapping of abbreviation -> long form. Defaults to
            the module-level ``finance_abbreviations``.

    Returns:
        The cleaned, lowercased string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    if abbreviations is None:
        abbreviations = finance_abbreviations

    text = re.sub(r"http\S+|www\S+", "[URL]", text)
    text = re.sub(r"#(\w+)", r"\1", text)       # "#earnings" -> "earnings"
    text = re.sub(r"\$(\w+)", r"\1", text)      # "$AAPL" -> "AAPL"
    text = re.sub(r"@\w+", "", text)            # drop mentions entirely
    text = _expand_abbreviations(text, abbreviations)
    text = re.sub(r"([!?.,])\1+", r"\1", text)  # "!!!" -> "!"
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run the cleaner over every example, overwriting the "text" column with the cleaned string.
dataset = dataset.map(
    lambda example: {"text": clean_financial_tweet(example["text"])}
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

sent_train.csv:   0%|          | 0.00/859k [00:00<?, ?B/s]

sent_valid.csv:   0%|          | 0.00/217k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2388 [00:00<?, ? examples/s]

Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [None]:
# Tokenization
# The tokenizer comes from the same checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained(
    "ahmedrachid/FinancialBERT-Sentiment-Analysis"
)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padding/truncating to 128 tokens."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

# Tokenize in batches, then drop the raw "text" column.
# NOTE: because "text" is removed here, re-running this cell on its own fails;
# re-run the load/clean cell first.
dataset = dataset.map(tokenize_function, batched=True).remove_columns(["text"])
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Fixed-seed subsamples: 3000 training examples and 300 validation examples.
train_dataset = dataset["train"].shuffle(seed=42).select(range(3000))
eval_dataset = dataset["validation"].shuffle(seed=42).select(range(300))


tokenizer_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/464k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2388 [00:00<?, ? examples/s]

In [None]:
# Fine-tune model
# The checkpoint ships its own id2label mapping, which is stored in the config and
# written out again by save_pretrained. The dataset's label ids follow a different
# order, so the mapping is overridden here to keep the saved model's label names
# in line with what it was fine-tuned to predict.
# NOTE(review): 0=Bearish, 1=Bullish, 2=Neutral is taken from the dataset card of
# zeroshot/twitter-financial-news-sentiment — confirm before relying on it.
model = AutoModelForSequenceClassification.from_pretrained(
    "ahmedrachid/FinancialBERT-Sentiment-Analysis",
    num_labels=3,
    id2label={0: "Bearish", 1: "Bullish", 2: "Neutral"},
    label2id={"Bearish": 0, "Bullish": 1, "Neutral": 2},
)

# Evaluate and checkpoint once per epoch so that load_best_model_at_end can pick
# the best checkpoint (save and eval strategies must match for that option).
# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword
# (renamed in transformers 4.41).
# NOTE(review): the recorded run shows "No log" for training loss — presumably the
# default logging interval is longer than an epoch here; consider
# logging_strategy="epoch" if per-epoch training loss is wanted.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs"
)

def compute_metrics(pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Reads `pred.label_ids` as the reference labels and takes the argmax over the
    last axis of `pred.predictions` as the predicted class.

    Returns:
        dict with keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Wire the model, training arguments, data subsets and metric callback together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()


config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myuhan-zeng123[0m ([33myuhan-zeng123-university-of-toronto[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.374454,0.846667,0.841432
2,No log,0.410991,0.836667,0.835154


TrainOutput(global_step=376, training_loss=0.5073771375290891, metrics={'train_runtime': 189.253, 'train_samples_per_second': 31.704, 'train_steps_per_second': 1.987, 'total_flos': 394670126592000.0, 'train_loss': 0.5073771375290891, 'epoch': 2.0})

In [None]:
# Save model to Google Drive
# Model weights/config and tokenizer files go into the same folder so the
# directory can be loaded back as a single checkpoint.
model_path = "/content/drive/MyDrive/financialbert_finetuned_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)
print("✅ Model saved to Google Drive.")


✅ Model saved to Google Drive.
