## bert-base-uncased

In [1]:
import re
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
# Text cleaning(Remove non-ASCII characters, keep x20~x7E)
def clean_text(text):
    return re.sub(r"[^\x20-\x7E]", "", text)

# CSV file path
file_path = "/Users/tim/Desktop/self-learning/LLM_tune/all-data.csv"

# Load CSV using load_dataset
dataset = load_dataset(
    "csv",
    data_files={"full": file_path},
    encoding="ISO-8859-1",
    column_names=["sentiment", "text"]
)["full"]

print("Dataset loaded:")
print(dataset[0])

# Label mapping & text cleaning
def map_label_and_clean(example):
    mapping = {"negative": 0, "neutral": 1, "positive": 2}
    example["text"] = clean_text(example["text"])
    example["labels"] = mapping[example["sentiment"]]
    return example

dataset = dataset.map(map_label_and_clean)

# Split data (80% train, 10% val, 10% test)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
temp_dataset = split_dataset["test"]

temp_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
validation_dataset = temp_split["train"]
test_dataset = temp_split["test"]

print("\nDataset split:")
print("Train rows:", len(train_dataset))
print("Validation rows:", len(validation_dataset))
print("Test rows:", len(test_dataset))


Dataset loaded:
{'sentiment': 'neutral', 'text': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'}


Map:   0%|          | 0/4846 [00:00<?, ? examples/s]


Dataset split:
Train rows: 3876
Validation rows: 485
Test rows: 485


In [3]:
# Load tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"], 
        padding="max_length",  # Ensure uniform sequence length
        truncation=True,  # Truncate longer texts
        max_length=512  # Set max token length
    )
    tokenized["labels"] = examples["labels"]  # Preserve labels
    return tokenized

# Apply tokenization
train_tokenized = train_dataset.map(tokenize_function, batched=True)
validation_tokenized = validation_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3876 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

Map:   0%|          | 0/485 [00:00<?, ? examples/s]

In [4]:
# Load pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get predicted class
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Training parameters
training_args = TrainingArguments(
    output_dir="./bert-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,  # Reload best checkpoint at the end
    metric_for_best_model="f1",  # Use F1 as selection metric
    greater_is_better=True,  # Higher F1 is better
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=False,
    push_to_hub=False,
    logging_steps=50,
)

# Trainer with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(
        early_stopping_patience=1,  # Stop if F1 doesn't improve for 1 epoch
        early_stopping_threshold=0.0
    )]
)

print("Trainer initialised with EarlyStoppingCallback.")




Trainer initialised with EarlyStoppingCallback.


In [6]:
# Start training 
train_result = trainer.train()

# Save the model and tokenizer
model.save_pretrained("./bert-finetuned")
tokenizer.save_pretrained("./bert-finetuned")

# Evaluate on the test set
test_results = trainer.evaluate(eval_dataset=test_tokenized)
print("\nTest set evaluation results:")
print(test_results)

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4622,0.691732,0.818557,0.821499
2,0.5305,0.737521,0.8,0.805337



Test set evaluation results:
{'eval_loss': 0.743843138217926, 'eval_accuracy': 0.8041237113402062, 'eval_f1': 0.807031407707751, 'eval_runtime': 14.4495, 'eval_samples_per_second': 33.565, 'eval_steps_per_second': 8.443, 'epoch': 2.0}


In [7]:
# Predict on the test set with confusion matrix
predictions = trainer.predict(test_tokenized)
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

cm = confusion_matrix(true_labels, pred_labels)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 44   8   8]
 [ 10 223  49]
 [  4  16 123]]


## roberta-base model + Warmup

In [12]:
# Use RoBERTa-base model
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"], 
        padding="max_length",  # Ensure uniform sequence length
        truncation=True,  # Truncate longer texts
        max_length=512  # Set max token length
    )
    tokenized["labels"] = examples["labels"]  # Preserve labels
    return tokenized

# Apply tokenization
train_tokenized = train_dataset.map(tokenize_function, batched=True)
validation_tokenized = validation_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

print("\nTokenized train first example:")
print(train_tokenized[0])

Map:   0%|          | 0/485 [00:00<?, ? examples/s]


Tokenized train first example:
{'sentiment': 'neutral', 'text': "Under the terms of the agreement , Bunge will acquire Raisio 's Keiju , Makuisa and Pyszny Duet brands and manufacturing plants in Finland and Poland .", 'labels': 1, 'input_ids': [0, 17245, 5, 1110, 9, 5, 1288, 2156, 16757, 1899, 40, 6860, 4833, 354, 1020, 128, 29, 3350, 2161, 257, 2156, 256, 7386, 6619, 8, 221, 2459, 329, 2855, 5620, 594, 3595, 8, 3021, 3451, 11, 12587, 8, 6508, 479, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [13]:
# Load RoBERTa-base model for classification (3 classes)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get predicted class
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1}

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Training parameters for RoBERTa-base
training_args = TrainingArguments(
    output_dir="./roberta-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,  # Load best model based on F1
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,  
    learning_rate=2e-5,  # Lower than 3e-5
    weight_decay=0.01,
    warmup_steps=500,  # Enable warmup
    fp16=False,  
    push_to_hub=False,
    logging_steps=50,
)

# Trainer with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(
        early_stopping_patience=1,  # Stop if no improvement for 1 epoch
        early_stopping_threshold=0.0
    )]
)

print("Trainer initialised with roberta-base, 5 epochs, warmup, and early stopping.")




Trainer initialised with roberta-base, 5 epochs, warmup, and early stopping.


In [15]:
# Start training
train_result = trainer.train()

# Save model & tokenizer
model.save_pretrained("./roberta-finetuned")
tokenizer.save_pretrained("./roberta-finetuned")

# Final evaluation on test set
test_results = trainer.evaluate(eval_dataset=test_tokenized)
print("\nTest set evaluation results:")
print(test_results)

# Compute and print confusion matrix
predictions = trainer.predict(test_tokenized)
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

cm = confusion_matrix(true_labels, pred_labels)
print("\nConfusion Matrix:")
print(cm)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.485,0.735218,0.839175,0.838952
2,0.6596,0.621853,0.853608,0.853971
3,0.2816,0.834224,0.851546,0.853155



Test set evaluation results:
{'eval_loss': 0.738659679889679, 'eval_accuracy': 0.8247422680412371, 'eval_f1': 0.8256257419556389, 'eval_runtime': 353.6134, 'eval_samples_per_second': 1.372, 'eval_steps_per_second': 0.345, 'epoch': 3.0}

Confusion Matrix:
[[ 46  11   3]
 [ 12 235  35]
 [  2  22 119]]
