In [1]:
# Importar librerías necesarias
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import TrainerCallback


In [2]:
dataset = pd.read_parquet("hf://datasets/TimKoornstra/synthetic-financial-tweets-sentiment/data/train-00000-of-00001.parquet")


In [3]:
print(dataset.shape)
dataset.head()


(1428771, 2)


Unnamed: 0,tweet,sentiment
0,💰 Cashing out stocks today has left me with a ...,2
1,💸 Losing half my investments in the last month...,2
2,📉 Crypto bubble finally bursts! The crypto tra...,2
3,🔥 The blazing inferno engulfs the stock market...,2
4,🥶 My stocks are stuck in an icy grip today. Fi...,2


In [11]:
# Preprocesar el dataset
dataset = dataset[['tweet', 'sentiment']]  # Asegúrate de que el dataframe solo tenga las columnas necesarias

# Reducir el tamaño del dataset al 5% de forma aleatoria
reduced_dataset = dataset.sample(frac=0.05, random_state=42)

# Dividir en entrenamiento y validación
train_df, val_df = train_test_split(reduced_dataset, test_size=0.2, random_state=42)

# Convertir a formato de Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.rename_column("sentiment", "labels")
val_dataset = val_dataset.rename_column("sentiment", "labels")

print(train_dataset.shape)

(57151, 3)


In [12]:
# Cargar el tokenizador de BERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Preprocesar los datos: Tokenizar los tweets
def preprocess_function(examples):
    return tokenizer(examples['tweet'], truncation=True, padding=True, max_length=128)

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/57151 [00:00<?, ? examples/s]

Map:   0%|          | 0/14288 [00:00<?, ? examples/s]

In [13]:
# Cargar el modelo de BERT preentrenado para clasificación de secuencias
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3, dropout= 0.3)

# Forzar el uso de GPU si está disponible
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Running in {device}")

# Definir los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir='./results',          # Directorio de salida
    num_train_epochs=3,              # Número de épocas
    per_device_train_batch_size=32,   # Tamaño del batch por dispositivo
    per_device_eval_batch_size=64,    # Tamaño del batch para la evaluación
    warmup_steps=500,                # Número de pasos de warm-up
    weight_decay=0.1,               # Decaimiento del peso
    logging_dir='./logs',            # Directorio para los registros
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluación al final de cada época
    save_strategy="epoch",           # Guardar el modelo al final de cada época
    load_best_model_at_end=True,     # Cargar el mejor modelo al final
    fp16=True                        # Habilitar cálculos en precisión mixta para mayor velocidad en GPU
)

# Función de métricas
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


from transformers import AdamW

# Cambiar el optimizador a AdamW
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.1)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)  # Asignar el optimizador
)

# Entrenar el modelo
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running in cuda


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1978,0.250201,0.907475,0.907959,0.910494,0.907475
2,0.1458,0.242904,0.913214,0.913643,0.916845,0.913214
3,0.2344,0.230939,0.919793,0.919973,0.921203,0.919793


TrainOutput(global_step=5358, training_loss=0.2541519622831213, metrics={'train_runtime': 22729.4969, 'train_samples_per_second': 7.543, 'train_steps_per_second': 0.236, 'total_flos': 5677033005483840.0, 'train_loss': 0.2541519622831213, 'epoch': 3.0})

In [18]:
from transformers import pipeline

# Ruta del mejor checkpoint (generalmente se encuentra en ./results/checkpoint-xxxx)
best_checkpoint = './results/checkpoint-5358'  # Reemplaza con el mejor checkpoint

# Cargar modelo y tokenizador desde el mejor checkpoint
model = DistilBertForSequenceClassification.from_pretrained(best_checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained(best_checkpoint)

# Crear pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

# Tweets de prueba
tweets = [
    "The stock market is doing great today! 🚀",
    "I can't believe I lost so much money. 😔",
    "Not sure about investing anymore, the market is too volatile."
]

# Predicción
predictions = classifier(tweets)

label_mapping = {"LABEL_0": "Neutral", "LABEL_1": "Positivo", "LABEL_2": "Negativo"}

# Ajustar la predicción
for tweet, pred in zip(tweets, predictions):
    sentiment = label_mapping[pred['label']]
    print(f"Tweet: {tweet}")
    print(f"Predicción: {sentiment} (Confianza: {pred['score']:.4f})\n")



Device set to use cuda:0


Tweet: The stock market is doing great today! 🚀
Predicción: Positivo (Confianza: 0.9974)

Tweet: I can't believe I lost so much money. 😔
Predicción: Negativo (Confianza: 0.9991)

Tweet: Not sure about investing anymore, the market is too volatile.
Predicción: Negativo (Confianza: 0.9993)

