# AUEB M.Sc. in Data Science (part-time)

### 2024.04 - 2024.06

## PART 06
### EXERCISE 01: Fine-Tuning BERT for Text Classification and Named Entity Recognition using HuggingFace Transformers


**Course**: Text Analytics   
**Authors**:
Anagnos Theodoros (p3352323) -
Michalopoulos Ioannis (p3352314) -
Kafantaris Panagiotis (p3352328) -  
Vigkos Ioannis (p3352326)

**Date**: 2024-06-11

### Settings and installation

In [None]:
!pip install optuna



In [None]:
!pip install contractions



In [None]:
!pip install transformers datasets accelerate



In [None]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


### Libraries

In [None]:
# NLTK Downloads and Imports
# Downloading necessary NLTK resources for tokenization, stopwords, lemmatization, and POS tagging
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# General Purpose Libraries
import pandas as pd
import numpy as np
import zipfile
import re
import string
from bs4 import BeautifulSoup

# Text Preprocessing Libraries
import contractions

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, precision_recall_fscore_support, average_precision_score, precision_recall_curve, auc
from sklearn.preprocessing import label_binarize

# Transformers Libraries for BERT
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback

# Deep Learning Libraries
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Input, concatenate
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Optimization Libraries
import optuna

# Visualization Libraries
import matplotlib.pyplot as plt

# PyTorch Libraries
import torch

### Functions

In [None]:
# Defining a function for converting chat words to their full forms
def convert_chat_words(text, mapping=None):
    """Expand chat abbreviations in ``text`` to their full forms.

    The text is split on whitespace, every token is looked up
    case-insensitively in ``mapping`` and replaced on an exact match, and the
    tokens are re-joined with single spaces (so runs of whitespace collapse).

    Args:
        text: The string to process.
        mapping: Optional dict from lowercase chat word to its expansion.
            Defaults to the module-level ``chat_words_dict`` so existing
            single-argument callers keep working.

    Returns:
        The text with every matching token replaced by its expansion.
    """
    if mapping is None:
        # Resolved at call time, so the dictionary may be defined after this function
        mapping = chat_words_dict

    # Tokens without an entry in the mapping are kept exactly as they were
    return " ".join(mapping.get(word.lower(), word) for word in text.split())

In [None]:
# Defining a function for cleaning text by removing punctuation, numbers, extra spaces, and repetitions of punctuation
def clean_text(text):
    """Return ``text`` without ASCII punctuation, digits or redundant whitespace.

    The steps run in this order: delete every character listed in
    ``string.punctuation``, delete digit runs, collapse whitespace to single
    spaces (trimming both ends), and finally squeeze any run of one repeated
    non-word character down to a single occurrence.
    """
    # Translation table that deletes all ASCII punctuation characters
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = text.translate(punctuation_table)

    # Digit runs are dropped entirely
    without_digits = re.sub(r'\d+', '', without_punctuation)

    # split() with no argument discards leading/trailing/repeated whitespace
    normalized = ' '.join(without_digits.split())

    # Whatever non-word characters survived are de-duplicated when repeated
    return re.sub(r'(\W)\1+', r'\1', normalized)

In [None]:
# Defining a function for removing special characters from the text
def remove_special_characters(text):
    """Return ``text`` with every character that is neither a word character nor whitespace removed."""
    special_characters = re.compile(r"[^\w\s]")
    return special_characters.sub('', text)

In [None]:
# Defining a function for performing lemmatization on text
def lemmatize_text(text):
    """Lemmatize tokens using their part-of-speech tags.

    ``text`` is handed to ``nltk.pos_tag`` as-is; in this file it is called
    with the token lists of the 'tokens' column. The first letter of each tag
    is translated through the module-level ``wordnet_map`` (falling back to
    noun) and the token is lemmatized with the module-level ``lemmatizer``.

    Returns:
        A list with one lemma per tagged token, in the original order.
    """
    return [
        lemmatizer.lemmatize(token, pos=wordnet_map.get(pos_tag[0].upper(), wordnet.NOUN))
        for token, pos_tag in nltk.pos_tag(text)
    ]

In [None]:
def model_init():
    """Build a fresh 'bert-base-uncased' sequence classifier.

    The classification head is sized to one output per entry of the
    module-level ``label_mapping``.
    """
    n_labels = len(label_mapping)
    return BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=n_labels,
    )

In [None]:
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Prediction object exposing ``predictions`` (per-class scores; the
            arg-max over the last axis is used as the predicted class) and
            ``label_ids`` (the true class ids).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'; the
        last three are support-weighted averages over the classes.
    """
    # Predicted class = index of the highest score along the last axis
    preds = p.predictions.argmax(-1)
    labels = p.label_ids

    # zero_division=0 keeps the score scikit-learn already assigns (0) to a
    # class with no predicted/true samples, but without raising an
    # UndefinedMetricWarning on every evaluation early in training.
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
def objective(trial):
    """Optuna objective: fine-tune BERT with sampled hyperparameters.

    Samples learning rate, weight decay, warm-up steps, epoch count and batch
    size from ``trial``, trains a model created by ``model_init`` on the
    module-level ``train_dataset`` and evaluates it on ``val_dataset``.

    Returns:
        The 'eval_accuracy' value reported for the validation dataset.
    """
    # Sampling the search space (the order of the suggest calls is unchanged)
    lr = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    decay = trial.suggest_float('weight_decay', 0.01, 0.1, log=True)
    warmup = trial.suggest_int('warmup_steps', 0, 500)
    epochs = trial.suggest_int('num_train_epochs', 3, 5)
    per_device_batch = trial.suggest_categorical('batch_size', [16, 32])

    trial_args = TrainingArguments(
        # Output and logging locations
        output_dir='./results',
        logging_dir='./logs',
        logging_steps=10,
        # Sampled optimisation settings
        learning_rate=lr,
        weight_decay=decay,
        warmup_steps=warmup,
        num_train_epochs=epochs,
        per_device_train_batch_size=per_device_batch,
        per_device_eval_batch_size=per_device_batch,
        # Evaluating and saving every 100 steps, with accuracy selecting the
        # best model to load at the end
        evaluation_strategy="steps",
        eval_steps=100,
        save_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
    )

    # model_init (rather than a model instance) is passed to the Trainer
    tuner = Trainer(
        model_init=model_init,
        args=trial_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    tuner.train()

    # The validation accuracy is the quantity the study optimises
    return tuner.evaluate(eval_dataset=val_dataset)['eval_accuracy']

In [None]:
def evaluate_classifier(y_test, y_pred, y_proba, classes):
    """Print macro-averaged metrics, macro PR-AUC and a classification report.

    Args:
        y_test: True labels, one per sample.
        y_pred: Predicted labels, one per sample.
        y_proba: Per-class scores; column ``i`` is scored against the
            one-vs-rest indicator of ``classes[i]``.
        classes: Class labels in the same order as the columns of ``y_proba``.

    Returns:
        None. All results are printed.
    """
    # Calculating macro-averaged precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    print(f'Macro-averaged Precision: {precision:.4f}')
    print(f'Macro-averaged Recall: {recall:.4f}')
    print(f'Macro-averaged F1 Score: {f1:.4f}')

    # Building the one-vs-rest indicator matrix with exactly one column per
    # entry of `classes`. label_binarize collapses the two-class case into a
    # single column, which would pair the positive-class indicator with
    # y_proba[:, 0] and leave the other class unscored.
    y_true = np.asarray(y_test)
    y_proba = np.asarray(y_proba)
    class_ids = np.asarray(classes)
    y_test_binarized = (y_true[:, None] == class_ids[None, :]).astype(int)

    # Computing PR-AUC (average precision) for each class
    pr_auc_scores = [
        average_precision_score(y_test_binarized[:, i], y_proba[:, i])
        for i in range(len(class_ids))
    ]

    # Calculating macro-averaged PR-AUC
    macro_pr_auc = np.mean(pr_auc_scores)
    print(f'Macro-averaged PR-AUC: {macro_pr_auc:.4f}')

    # Printing detailed classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
# Defining a function to compute metrics
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Prediction object exposing ``predictions`` (per-class scores; the
            arg-max over the last axis is used as the predicted class) and
            ``label_ids`` (the true class ids).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'; the
        last three are support-weighted averages over the classes.
    """
    preds = p.predictions.argmax(-1)
    labels = p.label_ids

    # zero_division=0 keeps the score scikit-learn already assigns (0) to a
    # class with no predicted/true samples, but without raising an
    # UndefinedMetricWarning on every evaluation early in training.
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Function to plot Precision-Recall AUC for each class
def plot_precision_recall_auc(y_test, y_proba, class_names, model_name):
    """Plot a one-vs-rest precision-recall curve for every class.

    Class ``i`` is identified by its position in ``class_names``: samples
    whose label equals ``i`` are the positives and ``y_proba[:, i]`` supplies
    the scores. The area under each curve is printed and shown in the legend.

    Args:
        y_test: Integer class labels, one per sample.
        y_proba: Per-class scores with one column per class.
        class_names: Display names, ordered by class index.
        model_name: Name used in the figure title.
    """
    # Coercing to arrays so that `y_true == i` is element-wise even when a
    # plain list is passed (a list compared with an int is the scalar False)
    y_true = np.asarray(y_test)
    y_proba = np.asarray(y_proba)

    # Calculating the number of classes and setting up the subplot grid
    num_classes = len(class_names)
    num_cols = 3
    num_rows = (num_classes + num_cols - 1) // num_cols  # ceiling division

    # Creating a subplot grid
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 5))
    axes = axes.flatten()

    # Calculating Precision-Recall AUC for each class and plotting the curve
    for i, class_name in enumerate(class_names):
        precision, recall, _ = precision_recall_curve(y_true == i, y_proba[:, i])
        pr_auc = auc(recall, precision)
        print(f'Class {class_name}: Precision-Recall AUC = {pr_auc:.4f}')

        # Plotting the Precision-Recall curve in the corresponding subplot
        axes[i].plot(recall, precision, label=f'AUC={pr_auc:.4f}')
        axes[i].set_title(f'Class {class_name}')
        axes[i].set_xlabel('Recall')
        axes[i].set_ylabel('Precision')
        axes[i].set_ylim([0.0, 1.05])
        axes[i].set_xlim([0.0, 1.0])
        axes[i].legend(loc="lower left")

    # Removing the unused trailing subplots; sliced from num_classes instead
    # of relying on the loop variable still being defined after the loop
    for unused_ax in axes[num_classes:]:
        fig.delaxes(unused_ax)

    # Adjusting layout and setting the main title
    plt.tight_layout()
    plt.suptitle(f'Precision-Recall Curves for {model_name}', y=1.02)
    plt.show()

## Data Preprocessing and Preparation

In [None]:
# Lifting pandas' display limits so that no columns, rows or cell contents are truncated
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### Loading the dataset

In [None]:
# Reading the labelled captions from the Excel workbook and retaining only
# the caption text and its label
df = pd.read_excel('LabeledText.xlsx')[['Caption', 'LABEL']]

### Preprocessing

In [None]:
# Creating 'text_cleaned' as a lowercase copy of every caption
df['text_cleaned'] = df['Caption'].map(lambda caption: caption.lower())

In [None]:
# Showing the distinct labels present in the dataset, followed by a separator
print('Categories', df.LABEL.unique(), "-------------", sep='\n')

# Showing the first rows of the dataset
print('Dataset Sample')
df.head()

Categories
['negative' 'positive' 'neutral']
-------------
Dataset Sample


Unnamed: 0,Caption,LABEL,text_cleaned
0,How I feel today #legday #jelly #aching #gym,negative,how i feel today #legday #jelly #aching #gym
1,@ArrivaTW absolute disgrace two carriages from Bangor half way there standing room only #disgraced,negative,@arrivatw absolute disgrace two carriages from bangor half way there standing room only #disgraced
2,This is my Valentine's from 1 of my nephews. I am elated; sometimes the little things are the biggest & best things!,positive,this is my valentine's from 1 of my nephews. i am elated; sometimes the little things are the biggest & best things!
3,betterfeelingfilms: RT via Instagram: First day of filming #powerless back in 2011. Can't ¡­,neutral,betterfeelingfilms: rt via instagram: first day of filming #powerless back in 2011. can't ¡­
4,Zoe's first love #Rattled @JohnnyHarper15,positive,zoe's first love #rattled @johnnyharper15


In [None]:
# Removing URLs from the text. The dot after "www" is escaped so that only a
# literal "www." prefix is treated as a URL; unescaped, it matched any
# character and also deleted ordinary tokens that merely start with "www".
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: re.sub(r'http\S+|www\.\S+', '', x))

# Removing HTML tags from the text by keeping only the parsed text content
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: BeautifulSoup(x, "html.parser").text)

  df['text_cleaned'] = df['text_cleaned'].apply(lambda x: BeautifulSoup(x, "html.parser").text)


In [None]:
# Lookup table from lowercase chat abbreviations to their spelled-out forms.
# Only a handful of common examples are listed; more entries can be added.
chat_words_dict = dict(
    imo="in my opinion",
    cyaa="see you",
    idk="I don't know",
    rn="right now",
    afaik="as far as I know",
)

In [None]:
# Converting chat words to their full forms in the 'text_cleaned' column
# NOTE: convert_chat_words matches whole whitespace-separated tokens, and
# punctuation is only stripped in a later step, so an abbreviation with
# punctuation attached (e.g. "idk,") is left unchanged here.
df['text_cleaned'] = df['text_cleaned'].apply(convert_chat_words)

In [None]:
# Stripping punctuation, digits and redundant whitespace from every entry of 'text_cleaned'
df['text_cleaned'] = df['text_cleaned'].map(clean_text)

In [None]:
# Dropping any remaining characters that are neither word characters nor whitespace
df['text_cleaned'] = df['text_cleaned'].map(remove_special_characters)

In [None]:
# Expanding contractions in the 'text_cleaned' column
# NOTE(review): clean_text has already removed ASCII punctuation (apostrophes
# included) by this point, so the input contains forms such as "cant" rather
# than "can't". Whether contractions.fix still expands every such form depends
# on the library — confirm, or consider running this step before the
# punctuation removal.
df['text_cleaned'] = df['text_cleaned'].apply(lambda x: contractions.fix(x))

In [None]:
# Splitting each cleaned text into a list of word tokens stored in 'tokens'
df['tokens'] = df['text_cleaned'].apply(word_tokenize)

In [None]:
# Building a set of English stop words for constant-time membership checks
stop_words = set(stopwords.words('english'))

# Keeping only the tokens that are not stop words
df['tokens'] = df['tokens'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Printing the last 20 rows of the 'tokens' column after stop-word removal
print(df['tokens'].tail(20))
print("\n")

# Printing the first few rows of the DataFrame
print(df.head())
print("\n")

# Printing the schema of the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own; wrapping it in print() appended a stray "None" line.
df.info()

4849              [get, friday, night, look, sorted, newin, lbd, littleblackdress, strappy, plunge, neckline, mini, black, bodycon]
4850                                                                                         [rt, nneagoe, love, caring, beautiful]
4851               [february, winter, rainy, stormy, windy, wednesday, morning, love, happy, positive, passionate, reading, coffee]
4852    [rt, thatguykai, honored, pittsburgh, pirates, consultant, coachotip, speak, ball, club, passionate, relentless, ownership]
4853                                              [genghis, khan, ily, relatable, king, passionate, yeet, yas, sogengrn, apgenghis]
4854                 [february, winter, rainy, stormy, windy, wednesday, evening, love, happy, positive, passionate, calm, fun, uk]
4855                  [february, winter, rainy, stormy, windy, wednesday, evening, love, happy, positive, passionate, calm, coffee]
4856                                                   [rt, bishopcarrollhs,

In [None]:
# Mapping from the first letter of a POS tag to the matching WordNet POS
# constant (noun, verb, adjective, adverb); used by lemmatize_text
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

# Creating the WordNetLemmatizer instance used by lemmatize_text
lemmatizer = WordNetLemmatizer()

# Applying lemmatization to the 'tokens' column
df['tokens'] = df['tokens'].apply(lemmatize_text)

# Printing the updated 'tokens' column after lemmatization
print(df['tokens'].tail(20))
print("\n")

# Printing the first few rows of the DataFrame after lemmatization
print(df.head())
print("\n")

# Printing the schema of the DataFrame after lemmatization.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own; wrapping it in print() appended a stray "None" line.
df.info()

4849             [get, friday, night, look, sort, newin, lbd, littleblackdress, strappy, plunge, neckline, mini, black, bodycon]
4850                                                                                        [rt, nneagoe, love, care, beautiful]
4851            [february, winter, rainy, stormy, windy, wednesday, morning, love, happy, positive, passionate, reading, coffee]
4852    [rt, thatguykai, honor, pittsburgh, pirate, consultant, coachotip, speak, ball, club, passionate, relentless, ownership]
4853                                           [genghis, khan, ily, relatable, king, passionate, yeet, yas, sogengrn, apgenghis]
4854                 [february, winter, rainy, stormy, windy, wednesday, even, love, happy, positive, passionate, calm, fun, uk]
4855                  [february, winter, rainy, stormy, windy, wednesday, even, love, happy, positive, passionate, calm, coffee]
4856                                                   [rt, bishopcarrollhs, great, bishopcarroll

In [None]:
# Displaying the first rows to inspect the cleaned text and the lemmatized tokens
df.head()

Unnamed: 0,Caption,LABEL,text_cleaned,tokens
0,How I feel today #legday #jelly #aching #gym,negative,how i feel today legday jelly aching gym,"[feel, today, legday, jelly, ache, gym]"
1,@ArrivaTW absolute disgrace two carriages from Bangor half way there standing room only #disgraced,negative,arrivatw absolute disgrace two carriages from bangor half way there standing room only disgraced,"[arrivatw, absolute, disgrace, two, carriage, bangor, half, way, stand, room, disgrace]"
2,This is my Valentine's from 1 of my nephews. I am elated; sometimes the little things are the biggest & best things!,positive,this is my valentines from of my nephews i am elated sometimes the little things are the biggest best things,"[valentine, nephew, elate, sometimes, little, thing, big, best, thing]"
3,betterfeelingfilms: RT via Instagram: First day of filming #powerless back in 2011. Can't ¡­,neutral,betterfeelingfilms rt via instagram first day of filming powerless back in cannot,"[betterfeelingfilms, rt, via, instagram, first, day, film, powerless, back]"
4,Zoe's first love #Rattled @JohnnyHarper15,positive,zoes first love rattled johnnyharper,"[zoes, first, love, rattle, johnnyharper]"


### Data splitting

In [None]:
# Holding out 30% of the rows first, then halving that hold-out into the
# validation and test parts (features: token lists, targets: string labels).
# These names are assigned again by the later split on df['text'] / df['label'].
X_train, X_temp, y_train, y_temp = train_test_split(
    df['tokens'], df['LABEL'], test_size=0.3, random_state=12547392
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=12547392
)

# Reporting how many rows ended up in each split
print(
    f"Training set size: {len(X_train)}",
    f"Validation set size: {len(X_val)}",
    f"Test set size: {len(X_test)}",
    sep='\n',
)

Training set size: 3408
Validation set size: 730
Test set size: 731


### Converting Text Data into BERT's Input Format

In [None]:
# Rebuilding one space-separated string per row from its token list
df['text'] = df['tokens'].apply(' '.join)

# Assigning each distinct label an integer id in order of first appearance
label_mapping = {name: position for position, name in enumerate(df['LABEL'].unique())}
df['label'] = df['LABEL'].map(label_mapping)

# Showing the label-to-id assignment
print("Label mapping:", label_mapping, sep='\n')

# 70% training split, with the remaining 30% halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    df['text'], df['label'], test_size=0.3, random_state=12547392
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=12547392
)

# Loading the tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encoding each split (train, validation, test) as padded, truncated PyTorch tensors
train_encodings, val_encodings, test_encodings = (
    tokenizer(list(split_texts), truncation=True, padding=True, return_tensors='pt')
    for split_texts in (X_train, X_val, X_test)
)

# Turning each split's integer labels into a tensor
train_labels_tensor, val_labels_tensor, test_labels_tensor = (
    torch.tensor(split_labels.tolist())
    for split_labels in (y_train, y_val, y_test)
)

Label mapping:
{'negative': 0, 'positive': 1, 'neutral': 2}


## BERT Classifier

### Custom dataset class

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection and
    ``labels`` is an indexable collection; item ``idx`` combines the ``idx``-th
    entry of every field with the ``idx``-th label under the key 'labels'.
    """

    def __init__(self, encodings, labels):
        # Both collections are stored as given; nothing is copied
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset has as many items as there are labels
        return len(self.labels)

    def __getitem__(self, idx):
        # as_tensor is used so entries that are already tensors are not re-wrapped
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.as_tensor(values[idx])
        sample['labels'] = torch.as_tensor(self.labels[idx])
        return sample

# Wrapping each split's encodings and label tensor in a CustomDataset
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels_tensor),
        (val_encodings, val_labels_tensor),
        (test_encodings, test_labels_tensor),
    )
)

### Training the model

In [None]:
class MetricsCallback(TrainerCallback):
    """Trainer callback that collects logged losses and accuracies.

    Each list grows by one value whenever a log entry contains the
    corresponding key, so the lists are indexed by log event and may have
    different lengths.
    """

    def __init__(self):
        self.train_losses = []  # values logged under 'loss'
        self.val_losses = []    # values logged under 'eval_loss'
        self.train_acc = []     # values logged under 'accuracy' (stays empty if that key is never logged)
        self.val_acc = []       # values logged under 'eval_accuracy'

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the metrics present in ``logs``; a missing or empty ``logs`` is ignored."""
        # The signature allows logs to be omitted; without this guard the
        # membership tests below would raise TypeError on None
        if not logs:
            return
        if 'loss' in logs:
            self.train_losses.append(logs['loss'])
        if 'eval_loss' in logs:
            self.val_losses.append(logs['eval_loss'])
        if 'eval_accuracy' in logs:
            self.val_acc.append(logs['eval_accuracy'])
        if 'accuracy' in logs:
            self.train_acc.append(logs['accuracy'])

In [None]:
# Loading the BERT model for sequence classification
# (one output per entry of label_mapping)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_mapping))

# Training arguments with logging steps
# NOTE(review): with the 3408 training rows reported above and a batch size of
# 16, one epoch is roughly 213 steps on a single device, so 500 warm-up steps
# would span most of the 3 epochs — confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    evaluation_strategy="steps",     # evaluate on a step schedule
    eval_steps=100,                  # evaluate every 100 steps
    save_steps=100,                  # save a checkpoint every 100 steps
    load_best_model_at_end=True,     # load the best model once training ends
    metric_for_best_model="accuracy" # metric that decides which model is best
)

# Initialize metrics callback (collects logged losses/accuracies for the plots below)
metrics_callback = MetricsCallback()

# Create the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # custom compute metrics function
    callbacks=[metrics_callback]         # callback to save metrics
)

# Starting the training process
trainer.train()

# Evaluating the model on the validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)

# Evaluating the model on the test set
# NOTE(review): presumably these evaluate() calls are also logged and therefore
# append to metrics_callback.val_losses / val_acc — confirm before plotting.
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Helper that prints a titled, indented listing of numeric results
def print_formatted_results(results, title):
    """Print ``title`` after a blank line, then one '  key: value' line per entry.

    Every value is rendered with four decimal places.
    """
    print("\n{}:".format(title))
    for metric_name, metric_value in results.items():
        print("  {}: {:.4f}".format(metric_name, metric_value))

# Printing formatted validation and test results
# (every value in both result dicts is rendered with four decimal places)
print_formatted_results(val_results, "Validation Results")
print_formatted_results(test_results, "Test Results")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,1.0421,1.030709,0.471233,0.410724,0.592851,0.471233
200,0.7359,0.766671,0.657534,0.649501,0.667733,0.657534


KeyboardInterrupt: 

### Testing the BERT model

In [None]:
# Running the trained model on the held-out test dataset to collect its metrics
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Listing the loss, the classification metrics, the throughput figures and the
# epoch counter, each with four decimal places
print("Test Results:")
print("\n".join(
    f"  {metric_name}: {test_results[metric_name]:.4f}"
    for metric_name in (
        'eval_loss',
        'eval_accuracy',
        'eval_f1',
        'eval_precision',
        'eval_recall',
        'eval_runtime',
        'eval_samples_per_second',
        'eval_steps_per_second',
        'epoch',
    )
))

### Plotting accuracy and loss

In [None]:
# Plotting the accuracies
# NOTE(review): MetricsCallback appends one value per log event, not per epoch,
# so the x-axis is the index of the log entry. With logging_steps=10 and
# eval_steps=100 the training and validation series presumably have different
# lengths and spacing — confirm before reading the two curves against each
# other or trusting the 'Epochs' axis label.
# Creating a figure with a specified size
plt.figure(figsize=(10, 5))

# Plotting training accuracy
# (train_acc is only filled when a log entry contains an 'accuracy' key, so this
# series may be empty)
plt.plot(metrics_callback.train_acc, 'bo-', label='Training accuracy')

# Plotting validation accuracy
plt.plot(metrics_callback.val_acc, 'go-', label='Validation accuracy')

# Adding title and labels
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

# Adding a legend to the plot
plt.legend()

# Adding a grid to the plot
plt.grid(True)

# Displaying the plot
plt.show()

# Plotting the losses
# Creating a figure with a specified size
plt.figure(figsize=(10, 5))

# Plotting training loss (one point per logged 'loss' value)
plt.plot(metrics_callback.train_losses, 'bo-', label='Training loss')

# Plotting validation loss (one point per logged 'eval_loss' value)
plt.plot(metrics_callback.val_losses, 'go-', label='Validation loss')

# Adding title and labels
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')

# Adding a legend to the plot
plt.legend()

# Adding a grid to the plot
plt.grid(True)

# Displaying the plot
plt.show()

## RNN Classifier (Baseline)

In [None]:
# Bidirectional-LSTM baseline trained on the same text/label columns as BERT.
# Assuming df is already loaded and preprocessed as done in the BERT implementation

# Splitting the data into training, validation, and test sets
# (same proportions and random_state as the BERT split)
X_train, X_temp, y_train, y_temp = train_test_split(df['text'], df['label'], test_size=0.3, random_state=12547392)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=12547392)

# Preparing data for RNN model
MAX_SEQUENCE_LENGTH = 256
EMBEDDING_DIM = 300

# Converting data to numpy arrays
train_data = np.array(X_train)
val_data = np.array(X_val)
test_data = np.array(X_test)

# One-Hot Encoding labels
# NOTE(review): each split is encoded independently, which assumes every split
# contains all classes so that the column layouts agree — confirm.
y_train_1_hot = pd.get_dummies(y_train).values
y_val_1_hot = pd.get_dummies(y_val).values
y_test_1_hot = pd.get_dummies(y_test).values  # not referenced again in this cell

# Initializing and adapting the TextVectorization layer
# (the vocabulary is learned from the training texts only)
vectorizer = TextVectorization(max_tokens=100000, output_mode='int', ngrams=1, output_sequence_length=MAX_SEQUENCE_LENGTH)
vectorizer.adapt(train_data)

# Initializing the embedding matrix with zeros
# NOTE: this matrix is not passed to the Embedding layer below, which is
# created without explicit weights and is trainable.
embedding_matrix = np.zeros((100000, EMBEDDING_DIM))

# Building the RNN model with the provided optimal hyperparameters
model_rnn = Sequential()
model_rnn.add(Input(shape=(1,), dtype=tf.string))  # Adding input layer
model_rnn.add(vectorizer)  # Adding text vectorization layer
model_rnn.add(Embedding(100000, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True))  # Adding embedding layer
model_rnn.add(Dropout(0.2))  # Adding dropout layer
model_rnn.add(Bidirectional(LSTM(200)))  # Adding bidirectional LSTM layer
model_rnn.add(Dropout(0.2))  # Adding dropout layer
model_rnn.add(Dense(50, activation='relu'))  # Adding dense layer with ReLU activation
model_rnn.add(Dropout(0.2))  # Adding dropout layer
model_rnn.add(Dense(len(y_train.unique()), activation='softmax'))  # Adding output layer with softmax activation

# Compiling the model
model_rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

# Training the model with the optimal hyperparameters
history = model_rnn.fit(train_data, y_train_1_hot, validation_data=(val_data, y_val_1_hot), batch_size=256, epochs=8, shuffle=True)

# Evaluating the model on the test set
# (the arg-max column index is compared directly with the integer labels in y_test)
print("RNN Model Classification Report:")
predictions = model_rnn.predict(test_data)
print(classification_report(y_test, np.argmax(predictions, axis=1)))

# Plotting training history for accuracy
plt.plot(history.history['categorical_accuracy'])
plt.plot(history.history['val_categorical_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plotting training history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

## CNN Classifier (Baseline)

In [None]:
# Multi-filter 1D-CNN baseline trained on the same text/label columns as BERT.
# Assuming df is already loaded and preprocessed as done in the BERT implementation

# Splitting the data into training, validation, and test sets
# (same proportions and random_state as the BERT split)
X_train, X_temp, y_train, y_temp = train_test_split(df['text'], df['label'], test_size=0.3, random_state=12547392)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=12547392)

# One-Hot Encoding labels
# Encoding labels for training, validation, and test sets
# NOTE(review): each split is encoded independently, which assumes every split
# contains all classes so that the column layouts agree — confirm.
y_train_1_hot = pd.get_dummies(y_train).values
y_val_1_hot = pd.get_dummies(y_val).values
y_test_1_hot = pd.get_dummies(y_test).values  # not referenced again in this cell

# Converting text (sequence of words) to sequence of indexes and padding the sequences
MAX_WORDS = 100000
MAX_SEQUENCE_LENGTH = 256
EMBEDDING_DIM = 300

# Initializing and adapting the TextVectorization layer
# (the vocabulary is learned from the training texts only)
vectorizer = TextVectorization(max_tokens=MAX_WORDS, output_mode='int', output_sequence_length=MAX_SEQUENCE_LENGTH)
vectorizer.adapt(X_train)

# Initializing the embedding matrix with zeros
# NOTE: this matrix is not passed to the Embedding layer below, which is
# created without explicit weights and is trainable.
embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))

# Building CNN model with optimal hyperparameters
FILTERS = 224
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.01

# Creating model inputs (raw strings, vectorized inside the model)
inputs = Input(shape=(1,), dtype=tf.string)
inputs_seq = vectorizer(inputs)

# Creating embeddings and applying dropout
embeddings = Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True)(inputs_seq)
dropped_embeddings = Dropout(rate=DROPOUT_RATE)(embeddings)

# Creating multi-filter CNNs
pooled_convs = []
filter_sizes = [2, 3, 4]

# Creating convolutional and pooling layers for each filter size
# (each branch reads the same dropped embeddings; the kernel size is the n-gram width)
for n_gram in filter_sizes:
    conv = Conv1D(filters=FILTERS, kernel_size=n_gram, activation='relu')(dropped_embeddings)
    pool = GlobalMaxPooling1D()(conv)
    pooled_convs.append(pool)

# Concatenating results from all filters and applying dropout
concat = concatenate(pooled_convs)
concat = Dropout(rate=DROPOUT_RATE)(concat)
outputs = Dense(len(y_train.unique()), activation='softmax')(concat)

# Compiling the model
model_cnn = Model(inputs=inputs, outputs=outputs)
model_cnn.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=LEARNING_RATE), metrics=["categorical_accuracy"])

# Converting numpy arrays to tensors for TensorFlow
train_data = tf.convert_to_tensor(np.array(X_train), dtype=tf.string)
val_data = tf.convert_to_tensor(np.array(X_val), dtype=tf.string)
test_data = tf.convert_to_tensor(np.array(X_test), dtype=tf.string)

# Initializing early stopping callback
# (monitors validation accuracy with patience 10 and restores the best weights)
early_stopping = EarlyStopping(patience=10, verbose=2, monitor="val_categorical_accuracy", mode="max", restore_best_weights=True)

# Training the model with the optimal hyperparameters
history_cnn = model_cnn.fit(train_data, y_train_1_hot, validation_data=(val_data, y_val_1_hot), batch_size=128, epochs=20, shuffle=True, callbacks=[early_stopping])

# Evaluating the model on the test set
# (the arg-max column index is compared directly with the integer labels in y_test)
print("CNN Model Classification Report:")
predictions = np.argmax(model_cnn.predict(test_data), axis=1)
print(classification_report(y_test, predictions))

# Plotting training history for accuracy
plt.plot(history_cnn.history['categorical_accuracy'])
plt.plot(history_cnn.history['val_categorical_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plotting training history for loss
plt.plot(history_cnn.history['loss'])
plt.plot(history_cnn.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

## BERT model hyperparameter tuning

In [None]:
# Running the Optuna search: 10 trials, keeping the one with the highest objective value
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Fetching the winning trial once so it can be both reported and reused below
best_trial = study.best_trial
best_params = best_trial.params

# Reporting the best objective value and the hyperparameters that produced it
print(f"Best trial: {best_trial.value}")
print("Best hyperparameters: ", best_params)

# Building the final training configuration from the tuned hyperparameters.
# The same tuned batch size is used for both training and evaluation.
tuned_batch_size = best_params['batch_size']
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Values chosen by the Optuna search
    learning_rate=best_params['learning_rate'],
    num_train_epochs=best_params['num_train_epochs'],
    weight_decay=best_params['weight_decay'],
    warmup_steps=best_params['warmup_steps'],
    per_device_train_batch_size=tuned_batch_size,
    per_device_eval_batch_size=tuned_batch_size,
    # Evaluate and checkpoint every 100 steps so the best checkpoint
    # (by accuracy) can be reloaded when training finishes
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Creating the Trainer for the final run; validation data drives the periodic evaluation
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tuning the model with the selected configuration
trainer.train()

# Scoring the final model on the held-out test set and printing the formatted results
test_results = trainer.evaluate(eval_dataset=test_dataset)
print_formatted_results(test_results, "Test Results")

## Macro-averaged precision, recall, F1, and PR-AUC scores for each classifier

In [None]:
# Class labels present in the test set, computed once and shared by all three evaluations
test_classes = np.unique(y_test)

# Example usage for BERT
# Running inference on the test dataset ONCE and deriving both the scores and
# the predicted label indices from that single pass (previously the full test
# set was pushed through the model twice to obtain the same output)
bert_output = trainer.predict(test_dataset)

# NOTE(review): `predictions` here appear to be the model's raw output scores
# (logits), not normalised probabilities like the Keras softmax outputs used
# for the RNN/CNN below - confirm evaluate_classifier accepts unnormalised scores
y_proba_bert = bert_output.predictions
y_pred_bert = y_proba_bert.argmax(-1)

# Evaluating the BERT model's performance using the custom evaluation function
evaluate_classifier(y_test, y_pred_bert, y_proba_bert, classes=test_classes)

# Example usage for RNN
# Generating predictions for the RNN model on the test dataset
predictions_rnn = model_rnn.predict(test_data)

# Converting the predictions to label indices
y_pred_rnn = np.argmax(predictions_rnn, axis=1)

# Using the raw prediction probabilities for PR-AUC calculation
y_proba_rnn = predictions_rnn

# Evaluating the RNN model's performance using the custom evaluation function
evaluate_classifier(y_test, y_pred_rnn, y_proba_rnn, classes=test_classes)

# Example usage for CNN
# Generating predictions for the CNN model on the test dataset
predictions_cnn = model_cnn.predict(test_data)

# Converting the predictions to label indices
y_pred_cnn = np.argmax(predictions_cnn, axis=1)

# Using the raw prediction probabilities for PR-AUC calculation
y_proba_cnn = predictions_cnn

# Evaluating the CNN model's performance using the custom evaluation function
evaluate_classifier(y_test, y_pred_cnn, y_proba_cnn, classes=test_classes)

## Precision-Recall AUC for each class and plotting the Precision-Recall curve

In [None]:
# Display names passed to every plot, one per class
pr_class_names = ["Negative", "Neutral", "Positive"]

# Generating BERT outputs on the test dataset with a SINGLE inference pass;
# the scores and the predicted label indices both come from the same result
# (previously trainer.predict was executed twice for identical output)
bert_output = trainer.predict(test_dataset)

# NOTE(review): these appear to be raw model scores (logits) rather than
# normalised probabilities - confirm plot_precision_recall_auc is meant to
# receive unnormalised scores for BERT
y_proba_bert = bert_output.predictions
y_pred_bert = y_proba_bert.argmax(-1)

# Plotting Precision-Recall AUC for the BERT model
plot_precision_recall_auc(y_test, y_proba_bert, pr_class_names, "BERT")

# Generating predictions for the RNN model on the test dataset
predictions_rnn = model_rnn.predict(test_data)

# Converting the predictions to label indices
y_pred_rnn = np.argmax(predictions_rnn, axis=1)

# Using the raw prediction probabilities for PR-AUC calculation
y_proba_rnn = predictions_rnn

# Plotting Precision-Recall AUC for the RNN model
plot_precision_recall_auc(y_test, y_proba_rnn, pr_class_names, "RNN")

# Generating predictions for the CNN model on the test dataset
predictions_cnn = model_cnn.predict(test_data)

# Converting the predictions to label indices
y_pred_cnn = np.argmax(predictions_cnn, axis=1)

# Using the raw prediction probabilities for PR-AUC calculation
y_proba_cnn = predictions_cnn

# Plotting Precision-Recall AUC for the CNN model
plot_precision_recall_auc(y_test, y_proba_cnn, pr_class_names, "CNN")