

Step 1: Data Preprocessing and Tokenization

In [3]:
import pandas as pd
import torch
import emoji
import re
from transformers import AutoTokenizer
from datasets import Dataset

# Step 1: Load the balanced dataset
file_path = "C:/Users/verma/Documents/SML PROJECT/DataCleaning/balanced_dataset.csv"  # Path to your balanced dataset
df = pd.read_csv(file_path)

# Step 2: Preprocess text (if needed)
def preprocess_text_for_berttweet(text):
    """
    Preprocess text for BERTweet:
    - Remove unnecessary whitespace.
    - Preserve social media markers like hashtags, mentions, and emojis.
    """
    text = text.lower()
    text = emoji.demojize(text, delimiters=(" ", " "))  # Convert emojis to text
    text = re.sub(r"@\w+", " @mention ", text)  # Replace mentions
    text = re.sub(r"#(\w+)", r" \1 ", text)  # Keep hashtags
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s:]", "", text)  # Remove special characters
    text = text.strip()  # Remove leading/trailing spaces
    text = ' '.join(text.split())  # Replace multiple spaces/newlines with a single space
    return text

# Apply preprocessing
df['cleaned_text'] = df['text'].apply(preprocess_text_for_berttweet)

# Step 3: Map sentiment labels to integers
label_mapping = {'happy': 0, 'sad': 1, 'anger': 2}
df['mapped_labels'] = df['sentiment'].map(label_mapping)

# Drop rows with missing values (if any)
df = df.dropna(subset=['cleaned_text', 'mapped_labels'])

# Step 4: Tokenizer Initialization
tokenizer = AutoTokenizer.from_pretrained("D:/SML Project/BERTweet/bertweet-tokenizer")  # Pretrained BERTweet tokenizer

# Step 5: Tokenize the text
MAX_LENGTH = 128  # Adjusted based on word count distribution (short texts mostly)

def tokenize_function(batch):
    """
    Tokenize text for BERTweet with padding and truncation.
    """
    return tokenizer(
        batch["cleaned_text"], 
        padding="max_length", 
        truncation=True, 
        max_length=MAX_LENGTH
    )

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
columns_to_remove = ["cleaned_text", "text", "sentiment", "word_count", "__index_level_0__"]
columns_to_remove = [col for col in columns_to_remove if col in tokenized_dataset.column_names]
tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)

# Ensure the `labels` column matches the required format
if 'labels' not in tokenized_dataset.column_names:  # Avoid duplicate renaming
    tokenized_dataset = tokenized_dataset.rename_column("mapped_labels", "labels")

# Set format for PyTorch tensors
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Step 6: Save the tokenized dataset
output_path = "tokenized_dataset_balanced"  # Directory to save the tokenized dataset
tokenized_dataset.save_to_disk(output_path)

print(f"Tokenized dataset saved to: {output_path}")

Map:   0%|          | 0/384798 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/384798 [00:00<?, ? examples/s]

Tokenized dataset saved to: tokenized_dataset_balanced


# Step 2: Model Training and Evaluation

In [4]:
import torch
from transformers import (
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding, 
    AutoTokenizer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_from_disk, ClassLabel

# Step 1: Load the Tokenized Dataset
tokenized_dataset_path = "tokenized_dataset_balanced"  # Path to your tokenized balanced dataset
tokenized_dataset = load_from_disk(tokenized_dataset_path)

# Step 2: Convert Labels to ClassLabel
if not isinstance(tokenized_dataset.features["labels"], ClassLabel):
    # Define the class names based on your dataset
    class_names = ["happy", "sad", "anger"]  # Update with your actual class names
    class_label = ClassLabel(names=class_names)
    tokenized_dataset = tokenized_dataset.cast_column("labels", class_label)

# Step 3: Split the Dataset
# Train-test split with stratification to ensure label balance
train_test_split = tokenized_dataset.train_test_split(
    test_size=0.2, 
    stratify_by_column="labels", 
    seed=42
)
test_valid_split = train_test_split["test"].train_test_split(
    test_size=0.5, 
    seed=42
)

train_dataset = train_test_split["train"]
val_dataset = test_valid_split["train"]
test_dataset = test_valid_split["test"]

print(f"Dataset splits - Train: {len(train_dataset)}, Validation: {len(val_dataset)}, Test: {len(test_dataset)}")

# Step 4: Load Pretrained Tokenizer
tokenizer = AutoTokenizer.from_pretrained("D:/SML Project/BERTweet/bertweet-tokenizer")

# Step 5: Load BERTweet Model
model = AutoModelForSequenceClassification.from_pretrained(
    "D:/SML Project/BERTweet/bertweet-model", 
    num_labels=3,  # Number of sentiment classes (happy, sad, anger)
    ignore_mismatched_sizes=True  # Ignore size mismatches
)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Step 6: Create Data Collator
# Automatically pads inputs to the maximum length for batch processing
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

# Step 7: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",                 # Directory for logs and checkpoints
    evaluation_strategy="epoch",           # Evaluate after each epoch
    learning_rate=5e-5,                     # Learning rate
    per_device_train_batch_size=16,        # Batch size for training
    per_device_eval_batch_size=64,         # Batch size for evaluation
    num_train_epochs=3,                    # Number of epochs
    weight_decay=0.01,                     # Weight decay for regularization
    logging_dir="./logs",                  # Log directory
    save_strategy="epoch",                 # Save checkpoints every epoch
    save_total_limit=2,                    # Keep only the last 2 checkpoints
    load_best_model_at_end=True,           # Load the best model at the end of training
    metric_for_best_model="eval_loss",     # Use validation loss to select the best model
    greater_is_better=False,               # Lower eval_loss is better
    fp16=torch.cuda.is_available()         # Enable mixed precision if supported
)

# Step 8: Define Metrics
def compute_metrics(p):
    """
    Compute evaluation metrics.
    """
    preds = p.predictions.argmax(axis=-1)  # Convert logits to class predictions
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    accuracy = accuracy_score(labels, preds)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Step 9: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Step 10: Train the Model
print("Training started...")
trainer.train()

# Step 11: Evaluate the Model on the Test Dataset
print("Evaluating on the test dataset...")
test_results = trainer.evaluate(test_dataset)
print("Test Evaluation Results:", test_results)

# Step 12: Save the Fine-tuned Model and Tokenizer
output_model_dir = "./fine_tuned_bertweet_model_balanced"
output_tokenizer_dir = "./fine_tuned_bertweet_tokenizer_balanced"

model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_tokenizer_dir)

print(f"Model saved to {output_model_dir}")
print(f"Tokenizer saved to {output_tokenizer_dir}")

Casting the dataset:   0%|          | 0/384798 [00:00<?, ? examples/s]

Dataset splits - Train: 307838, Validation: 38480, Test: 38480


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at D:/SML Project/BERTweet/bertweet-model and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training started...


Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 