In [1]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BertTokenizer, 
    BertForSequenceClassification
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import evaluate
import torch
from tqdm import tqdm
from datetime import datetime
tqdm.pandas() # Utile pour avoir les bars de chargement avec pandas

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Config
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Paramètres de fine-tuning
MODEL_NAME = "ProsusAI/finbert"
DATA_PATH = "./Data/sentiment_annotated_with_texts.csv"  # <- mets ton fichier ici
OUTPUT_DIR = "./finbert_forex_finetuned"
MAX_LENGTH = 512
NUM_EPOCHS = 3
BATCH_SIZE_TRAIN = 16
BATCH_SIZE_EVAL = 32
LEARNING_RATE = 2e-5

np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Torch device: {DEVICE}")


Torch device: cuda


In [2]:
df = pd.read_csv(DATA_PATH)
df['title'] = df['title'].astype(str).str.strip()
df.head()

Unnamed: 0,published_at,ticker,true_sentiment,title,author,url,source,text,finbert_sentiment,finbert_sent_score
0,2023-01-12 07:47:00,EURCHF,Positive,Euro to benefit from the ECBs pronounced hawki...,FXStreet Insights Team,https://www.fxstreet.com/news/euro-to-benefit-...,FX Street,The Euro was able to appreciate particularly s...,Positive,0.85
1,2023-01-12 10:34:00,EURCHF,Positive,EURCHF Trend higher may remain in place – ING,FXStreet Insights Team,https://www.fxstreet.com/news/eur-chf-trend-hi...,FX Street,EUR/CHF yesterday broke above 1.00. Economists...,Positive,0.51
2,2023-01-12 11:40:00,EURCHF,Neutral,Does a jump in EURCHF point to a break above 1...,FXStreet Insights Team,https://www.fxstreet.com/news/does-a-jump-in-e...,FX Street,EUR/CHF vaults parity for the first time since...,Neutral,0.37
3,2023-01-12 15:32:00,EURCHF,Positive,EURCHF could extend its advance back to levels...,FXStreet Insights Team,https://www.fxstreet.com/news/eur-chf-could-ex...,FX Street,EUR/CHF climbs back above parity. Economists a...,Positive,0.64
4,2023-01-13 11:37:00,EURCHF,Positive,EURCHF to head higher towards 10130 and projec...,FXStreet Insights Team,https://www.fxstreet.com/news/eur-chf-to-head-...,FX Street,EUR/CHF has broken out above the sideways rang...,Positive,0.83


In [3]:
le = LabelEncoder()
df['label_str'] = df['true_sentiment'].astype(str)
df['label'] = le.fit_transform(df['label_str'])
label_map = {int(i): c for i, c in enumerate(le.classes_)}
num_labels = len(le.classes_)

print("Mapping labels (index -> label):", label_map)
print("Nombre de classes :", num_labels)
df[['title','label_str','label']].head(6)

Mapping labels (index -> label): {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
Nombre de classes : 3


Unnamed: 0,title,label_str,label
0,Euro to benefit from the ECBs pronounced hawki...,Positive,2
1,EURCHF Trend higher may remain in place – ING,Positive,2
2,Does a jump in EURCHF point to a break above 1...,Neutral,1
3,EURCHF could extend its advance back to levels...,Positive,2
4,EURCHF to head higher towards 10130 and projec...,Positive,2
5,EURCHF Room for the Euro to extend the move hi...,Positive,2


In [4]:
# Split train/val/test
# Si published_at est présent et parseable, on fait une séparation temporelle (80/10/10).
use_time_split = False
if 'published_at' in df.columns:
    # essaie de parser la date
    try:
        df['published_at_parsed'] = pd.to_datetime(df['published_at'], utc=True, errors='coerce')
        n_na_dates = df['published_at_parsed'].isna().sum()
        if n_na_dates == 0:
            use_time_split = True
        else:
            print(f"[WARN] {n_na_dates} lignes ont des dates non parsables ; on utilisera un split aléatoire stratifié.")
    except Exception as e:
        print("[WARN] Erreur parsing dates :", e)

if use_time_split:
    # tri par date, découpage temporel
    df = df.sort_values('published_at_parsed').reset_index(drop=True)
    n = len(df)
    i_train = int(n * 0.80)
    i_val = int(n * 0.90)
    train_df = df.iloc[:i_train].reset_index(drop=True)
    val_df   = df.iloc[i_train:i_val].reset_index(drop=True)
    test_df  = df.iloc[i_val:].reset_index(drop=True)
    print(f"Time split utilisé. Train {len(train_df)}, Val {len(val_df)}, Test {len(test_df)}")
else:
    # split stratifié aléatoire (80/10/10)
    train_temp, test_df = train_test_split(df, test_size=0.10, random_state=SEED, stratify=df['label'])
    train_df, val_df = train_test_split(train_temp, test_size=0.1111, random_state=SEED, stratify=train_temp['label'])  # ~80/10/10
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    print(f"Random stratified split utilisé. Train {len(train_df)}, Val {len(val_df)}, Test {len(test_df)}")


Time split utilisé. Train 1832, Val 229, Test 230


In [5]:
# Convertir en datasets Hugging Face et renommer colonnes utiles
hf_train = Dataset.from_pandas(train_df[['title','label']].rename(columns={'title':'text'}))
hf_val   = Dataset.from_pandas(val_df[['title','label']].rename(columns={'title':'text'}))
hf_test  = Dataset.from_pandas(test_df[['title','label']].rename(columns={'title':'text'}))

print("Exemples (train) :")
hf_train[0]

Exemples (train) :


{'text': 'Euro to benefit from the ECBs pronounced hawkish determination – Commerzbank',
 'label': 2}

In [6]:
# Tokenizer + préparation des inputs
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

def preprocess_fn(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH)

hf_train = hf_train.map(preprocess_fn, batched=True, remove_columns=['text'])
hf_val   = hf_val.map(preprocess_fn, batched=True, remove_columns=['text'])
hf_test  = hf_test.map(preprocess_fn, batched=True, remove_columns=['text'])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map:   0%|          | 0/1832 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

In [7]:
# Métriques
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "precision_macro": precision.compute(predictions=preds, references=labels, average="macro")["precision"],
        "recall_macro": recall.compute(predictions=preds, references=labels, average="macro")["recall"],
    }


In [8]:
# Configuration du Trainer
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE_TRAIN,
    per_device_eval_batch_size=BATCH_SIZE_EVAL,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    save_total_limit=2,
    seed=SEED,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Entraînement
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,No log,0.783408,0.650655,0.611952,0.717914,0.601635
2,No log,0.628378,0.737991,0.741858,0.749137,0.758562
3,No log,0.593195,0.781659,0.782987,0.781865,0.788648


TrainOutput(global_step=345, training_loss=0.7527831146682518, metrics={'train_runtime': 264.7313, 'train_samples_per_second': 20.761, 'train_steps_per_second': 1.303, 'total_flos': 1446071343833088.0, 'train_loss': 0.7527831146682518, 'epoch': 3.0})

In [9]:
# Évaluation finale (test set)
print("Évaluation sur le jeu test :")
metrics = trainer.predict(hf_test)
print(metrics.metrics)


Évaluation sur le jeu test :


{'test_loss': 0.7011784911155701, 'test_accuracy': 0.7565217391304347, 'test_f1_macro': 0.7470736006042781, 'test_precision_macro': 0.7518518518518519, 'test_recall_macro': 0.7509839176905405, 'test_runtime': 3.6255, 'test_samples_per_second': 63.44, 'test_steps_per_second': 2.207}


In [10]:
import json

In [11]:
# Sauvegarde modèle + tokenizer + mapping labels + sample preds
os.makedirs(OUTPUT_DIR, exist_ok=True)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

with open(os.path.join(OUTPUT_DIR, "label_map.json"), "w", encoding="utf-8") as f:
    json.dump(label_map, f, ensure_ascii=False, indent=2)

print("Modèle sauvegardé dans :", OUTPUT_DIR)
print("Mapping labels sauvegardé dans label_map.json")


Modèle sauvegardé dans : ./finbert_forex_finetuned
Mapping labels sauvegardé dans label_map.json


In [12]:
ft_model = trainer.model.to(DEVICE)

def ft_eval(row) :
    text = row['title']
    with torch.no_grad() :
        prob = torch.softmax(ft_model(**tokenizer(text, return_tensors="pt").to(DEVICE)).logits, dim=1).to('cpu')
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    predicted_sentiment_index = torch.argmax(prob, dim=1).item()
    predicted_sentiment = sentiment_labels[predicted_sentiment_index]
    return predicted_sentiment

df["ft_pred"] = df.progress_apply(ft_eval, axis=1)

100%|██████████| 2291/2291 [00:12<00:00, 176.40it/s]


In [None]:
(df["ft_pred"] == df["true_sentiment"]).mean()

np.float64(0.8489742470536883)

In [14]:
df_eval = pd.read_csv("./Data/Chatgpt.csv")
df_eval['title'] = df_eval['paraphrased_title']
df_eval["ft_pred"] = df_eval.progress_apply(ft_eval, axis=1)

100%|██████████| 2264/2264 [00:12<00:00, 178.86it/s]


In [None]:
(df_eval["ft_pred"] == df_eval["true_sentiment"]).mean()

True     1677
False     587
Name: count, dtype: int64