#### Imports

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, pipeline
from datasets import load_dataset, load_from_disk
import json

#### Check for GPU availability

In [2]:
if not torch.cuda.is_available():
    raise SystemExit("GPU (CUDA) not available. Aborting training process.")

print(f"Using GPU: {torch.cuda.get_device_name(0)}")

Using GPU: NVIDIA GeForce GTX 1070


#### Load models and dataset

In [None]:
base_model_name = "ProsusAI/finbert"
finetuned_model_name = "./models/finbert-finetuned"
dataset_name = "modestus/bitcoin_sentiment_analysis"
summarizer_name = "facebook/bart-large-cnn"

In [None]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
    id2label={0: "negative", 1: "positive"},
    label2id={"negative": 0, "positive": 1},
).to("cuda")

In [None]:
# Load summarizer model and tokenizer
summarizer = pipeline("summarization", model=summarizer_name, device=0)
summarizer_tokenizer = summarizer.tokenizer

In [20]:
# Load dataset
dataset = load_dataset(dataset_name)

#### Preprocess dataset

In [21]:
# Summarize text and format as "title - summary"
def summarize_and_format(examples):
    contents = examples["content"]
    
    # Truncate contents in batch
    truncated_contents = []
    for content in contents:
        tokens = summarizer_tokenizer.encode(content, truncation=True)
        if len(tokens) > 1000:
            content = summarizer_tokenizer.decode(tokens[:1000])
        truncated_contents.append(content)
    
    # Process contents in batch
    processed_texts = []
    summaries = summarizer(truncated_contents, max_length=64, min_length=24)
    for i, content in enumerate(contents):
        title = content[:200].split('.')[0] + '.'
        summary = summaries[i]['summary_text']
        processed_texts.append(f"{title} - {summary}")
    
    return {"text": processed_texts}

# Extract sentiment from metrics column
def extract_sentiment(example):
    metrics = example["metrics"]
    positive_label = metrics[2]["label"] 
    negative_label = metrics[3]["label"]  
    sentiment = positive_label + (negative_label * -1)    # 1 for positive, -1 for negative, 0 for neutral
    mapping = {-1: 0, 0: -1, 1: 1}
    
    return {"labels": mapping[sentiment]}

# Filter out neutral examples (label -1)
def filter_neutral(example):
    return int(example["labels"]) != -1

# Filter non-Bitcoin examples
def filter_examples(example):
    return example["metrics"][0]["label"] != 0

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

In [None]:
# Apply preprocessing functions to each split
for split in ["train", "test"]:
    dataset[split] = dataset[split].filter(filter_examples)
    dataset[split] = dataset[split].map(extract_sentiment)
    dataset[split] = dataset[split].filter(filter_neutral)
    dataset[split] = dataset[split].map(summarize_and_format, batched=True, batch_size=32)
    dataset[split] = dataset[split].map(tokenize_function, batched=True)
    dataset[split].set_format("torch", columns=["input_ids", "attention_mask", "labels"])

dataset.save_to_disk("./models/processed_bitcoin_sentiment")

In [None]:
# Split test into validation and test
dataset = load_from_disk("./models/processed_bitcoin_sentiment")
test_val_split = dataset["test"].train_test_split(test_size=0.5, seed=11)
train_dataset = dataset["train"]
eval_dataset = test_val_split["train"]  
test_dataset = test_val_split["test"]   

#### Model finetuning

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=finetuned_model_name,
    num_train_epochs=10,
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,
    eval_strategy="steps", 
    eval_steps=100,  
    save_strategy="steps",
    save_steps=200, 
    learning_rate=1e-5,
    weight_decay=0.03, 
    logging_steps=100,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    fp16=True,
    dataloader_num_workers=4, 
    report_to="none"
)

# Create early stopping callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=6,
    early_stopping_threshold=0.001
)

In [None]:
# Train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping] 
)

trainer.train() 
model.save_pretrained(finetuned_model_name)
tokenizer.save_pretrained(finetuned_model_name)

#### Test model

In [23]:
from sklearn.metrics import accuracy_score, classification_report

def predict(dataset, classifier):
    results = classifier(dataset["text"], truncation=True, max_length=256, batch_size=16)
    true_labels = dataset["labels"]
    label_map = {"negative": 0, "positive": 1}
    pred_labels = [label_map[result["label"]] for result in results]
    
    return true_labels, pred_labels

In [None]:
# Test models on evaluation dataset
original_classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
true_labels, original_pred_labels = predict(dataset["test"], original_classifier)

finetuned_classifier = pipeline("sentiment-analysis", model=finetuned_model_name, tokenizer=finetuned_model_name)
true_labels, finetuned_pred_labels = predict(dataset["test"], finetuned_classifier)

In [25]:
# Calculate metrics
original_accuracy = accuracy_score(true_labels, original_pred_labels)
original_report = classification_report(true_labels, original_pred_labels, target_names=["negative", "positive"])

finetuned_accuracy = accuracy_score(true_labels, finetuned_pred_labels)
finetuned_report = classification_report(true_labels, finetuned_pred_labels, target_names=["negative", "positive"])

print(f"Original model accuracy: {original_accuracy:.4f}")
print(f"Finetuned model accuracy: {finetuned_accuracy:.4f}")
print(f"Original model report:\n {original_report}")
print(f"Finetuned model report:\n {finetuned_report}")


Original model accuracy: 0.4599
Finetuned model accuracy: 0.8368
Original model report:
               precision    recall  f1-score   support

    negative       0.24      0.40      0.30       303
    positive       0.66      0.48      0.56       745

    accuracy                           0.46      1048
   macro avg       0.45      0.44      0.43      1048
weighted avg       0.54      0.46      0.49      1048

Finetuned model report:
               precision    recall  f1-score   support

    negative       0.73      0.69      0.71       303
    positive       0.88      0.90      0.89       745

    accuracy                           0.84      1048
   macro avg       0.80      0.79      0.80      1048
weighted avg       0.83      0.84      0.84      1048

