In [7]:
# This is for the installation of pytorch
!pip install torch

In [None]:
# This is for the uninstallation of pytorch
!pip uninstall torch -y

We encountered issues when installing TensorFlow and PyTorch in the same environment (TensorFlow is used for the CNN model and PyTorch for the DistilBERT transformer). That is, if you want to run the CNN with TensorFlow, you first need to uninstall the PyTorch package, and vice versa. We also suggest running each model separately.

In [6]:
# Load Packages for DistilBERT
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset


<h3>1.1 Load and preprocess the dataset:</h3><p>

In [18]:
# Load Dataset for both models
# The data directory can be overridden with the EMOTION_DATA_DIR environment
# variable so the notebook runs on other machines; when the variable is unset
# the original local download location is used.
DATA_DIR = os.environ.get(
    "EMOTION_DATA_DIR",
    "/Users/jiaxuanliu/Downloads/cs-3780-5780-how-do-you-feel",
)

# Training split: input text and its label
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_text = train["text"]
train_label = train["label"]

# Test split: ids (kept for the submission file) and input text, no labels
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
test_id = test["id"]
test_text = test["text"]

<h2>Model1: DistilBERT</h2><p>

In [None]:
# TODO


# Model1 DistilBERT


# Turn the raw training labels into integer class ids
label_encoder = LabelEncoder()
train_label_encoded = label_encoder.fit_transform(train_label)

# The number of distinct classes sizes the classification head later on
num_labels = len(label_encoder.classes_)

# Hold out 20% of the training data for validation (80/20 split, fixed seed)
(
    train_text_split,
    val_text_split,
    train_label_split,
    val_label_split,
) = train_test_split(
    train_text,
    train_label_encoded,
    random_state=42,
    test_size=0.2,
)

# Define Dataset class
class EmotionDataset(Dataset):
    """Dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Input strings; a pandas Series (read positionally via ``.iloc``)
            or any sequence supporting integer indexing.
        labels: Integer class ids aligned positionally with ``texts``, or
            ``None`` for unlabeled data such as the test split.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the idx-th element by position."""
        # A pandas Series keeps its original (shuffled) index after
        # train_test_split, so values[idx] would be a label lookup there.
        return values.iloc[idx] if hasattr(values, "iloc") else values[idx]

    def __getitem__(self, idx):
        """Return the tokenized item at position ``idx``.

        The result always holds ``"input_ids"`` and ``"attention_mask"`` with
        the leading batch dimension removed. ``"label"`` (a ``torch.long``
        scalar) is included only when the dataset was built with labels.
        """
        text = self._item_at(self.texts, idx)

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        item = {
            # squeeze(0) drops the batch dimension added by return_tensors="pt"
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

        # Unlabeled data gets no "label" key at all: a -1 placeholder is not a
        # valid class index and would be fed to the model's loss as a target.
        if self.labels is not None:
            label = self._item_at(self.labels, idx)
            item["label"] = torch.tensor(label, dtype=torch.long)

        return item

# Pretrained checkpoint shared by the tokenizer and the classifier
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Sequence-classification model whose output size is the number of encoded classes
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Wrap each split in an EmotionDataset; every text is padded/truncated to max_len
max_len = 128
train_dataset = EmotionDataset(
    texts=train_text_split,
    labels=train_label_split,
    tokenizer=tokenizer,
    max_len=max_len,
)
val_dataset = EmotionDataset(
    texts=val_text_split,
    labels=val_label_split,
    tokenizer=tokenizer,
    max_len=max_len,
)
# The test split has no labels
test_dataset = EmotionDataset(
    texts=test_text,
    labels=None,
    tokenizer=tokenizer,
    max_len=max_len,
)

# Define function to compute metrics
def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the index of its largest logit along
    the last axis. Returns ``{"accuracy": <score>}``.
    """
    scores, references = eval_pred
    predicted_classes = torch.tensor(scores).argmax(dim=-1).numpy()
    return {"accuracy": accuracy_score(references, predicted_classes)}

# Training configuration
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keeping a single checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # After training, reload the checkpoint with the highest validation accuracy
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Trainer wiring: model, configuration, train/validation data and the metric hook
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Score the resulting model on the validation split and report the metrics
eval_results = trainer.evaluate()
print("Validation Results:", eval_results)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8062,0.936063,0.7465
2,0.6643,0.824572,0.772
3,0.6266,0.814632,0.773


Validation Results: {'eval_loss': 0.8146315813064575, 'eval_accuracy': 0.773, 'eval_runtime': 123.7404, 'eval_samples_per_second': 16.163, 'eval_steps_per_second': 1.01, 'epoch': 3.0}


In [None]:
# Save the trained model and tokenizer
# Both are written to the same ./trained_model directory.
# The tokenizer call stays last: its return value (the tuple of written file
# paths) is what the cell displays as output.
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json')

<h2>Model1: DistilBERT (test-set prediction)</h2><p>

In [12]:
# Prediction on the test set with the fine-tuned DistilBERT model
# Define test dataset class
class EmotionDataset(Dataset):
    """Dataset that tokenizes one text per item, with optional labels.

    Args:
        texts: Input strings; a pandas Series (read positionally via ``.iloc``)
            or any sequence supporting integer indexing.
        labels: Optional integer class ids aligned positionally with
            ``texts``. Leave as ``None`` for unlabeled data such as the test
            split.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the idx-th element by position."""
        # A pandas Series may carry a non-contiguous index, so it must be read
        # by position rather than by index label.
        return values.iloc[idx] if hasattr(values, "iloc") else values[idx]

    def __getitem__(self, idx):
        """Return the tokenized item at position ``idx``.

        The result always holds ``"input_ids"`` and ``"attention_mask"`` with
        the leading batch dimension removed. ``"label"`` (a ``torch.long``
        scalar) is added only when labels were supplied, so labels passed to
        the constructor are no longer silently dropped.
        """
        text = self._item_at(self.texts, idx)

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        item = {
            # squeeze(0) drops the batch dimension added by return_tensors="pt"
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

        if self.labels is not None:
            label = self._item_at(self.labels, idx)
            item["label"] = torch.tensor(label, dtype=torch.long)

        return item

# Create test dataset (no labels are available for the test split)
test_dataset = EmotionDataset(test_text, labels=None, tokenizer=tokenizer, max_len=128)

# Generate predictions with the fine-tuned model.
# Trainer is already imported in the notebook's import cell, so it is not
# re-imported here. A fresh Trainer with default arguments is enough because
# it is only used for inference.
trainer = Trainer(model=model)
predictions = trainer.predict(test_dataset)

# Decode predictions: take the highest-scoring class id per row, then map the
# ids back to the original label values with the fitted label encoder
test_preds = torch.argmax(torch.tensor(predictions.predictions), dim=1)
test_labels_decoded = label_encoder.inverse_transform(test_preds.numpy())

In [13]:
# Pair each test id with its decoded predicted label and write the submission file
submission_BERT = pd.DataFrame({"id": test_id}).assign(label=test_labels_decoded)
submission_BERT.to_csv("submission_BERT.csv", index=False)

<h2>Part 4: Resources and Literature Used</h2><p>

https://huggingface.co/docs/hub/transformers


https://huggingface.co/docs/transformers/en/model_doc/distilbert