In [None]:
import os

# Echo the full path of every file found under the Kaggle input directory.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))
        
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import optuna
import random
import time
import datetime
import re, string, html
# import logging
# logging.basicConfig(level=logging.INFO)

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import AdamW

from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup

# Shared tokenizer for the uncased BERT-base checkpoint; used both to measure
# sequence lengths and to encode the train/val/test texts further down.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import (accuracy_score, precision_recall_curve, f1_score,
    roc_curve, confusion_matrix, classification_report, ConfusionMatrixDisplay, auc)

from sklearn.manifold import TSNE
from keras.preprocessing.sequence import pad_sequences

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Reproducibility: seed Python's `random`, NumPy and PyTorch (CPU and every
# CUDA device) with the same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Select the compute device with PyTorch only.
# A TensorFlow GPU probe is deliberately not used here: it aborts the notebook
# with SystemError on CPU-only machines (making the CPU fallback below
# unreachable) and initialises TensorFlow on the GPU, which by default reserves
# GPU memory that the PyTorch model needs.
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # No GPU: everything still runs, only (much) slower.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
# Load datasets. The competition directory is named once so the three paths
# cannot drift apart.
DATA_DIR = '/kaggle/input/ai-2-dl-for-nlp-2025-homework-3'
train_ds = pd.read_csv(os.path.join(DATA_DIR, 'train_dataset.csv'))
val_ds = pd.read_csv(os.path.join(DATA_DIR, 'val_dataset.csv'))
test_ds = pd.read_csv(os.path.join(DATA_DIR, 'test_dataset.csv'))

# Check data is imported correctly. `head` has to be *called*; printing the
# bare attribute shows the bound method's repr instead of the first rows.
print(train_ds.head())

In [None]:
# Data preprocessing helpers (carried over from HW1); the cleaned text is fed
# to the BERT tokenizer afterwards.
tk = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)
lemmatizer = WordNetLemmatizer()
# English stop words. The stop-word filter inside `preprocessing` is currently
# disabled, so this set is built but not applied there.
stop_words = {*stopwords.words('english')}

def preprocessing(text):
    """Normalise one raw tweet into a cleaned, lemmatised, space-joined string.

    Steps, in order: HTML-entity decoding, lower-casing, e-mail and URL
    replacement by placeholder words, removal of Twitter handles, punctuation,
    digits and every other non-letter character, whitespace collapsing,
    squeezing of characters repeated three or more times down to two,
    tokenisation with the module-level ``tk`` and lemmatisation with the
    module-level ``lemmatizer``.

    The placeholders are plain letters (``emailxxx`` / ``httpxxx``) so that the
    later handle- and punctuation-removal steps cannot tear them apart. The
    repeated-character squeeze shortens their trailing ``xxx`` to ``xx``, so
    the tokens that reach the model are ``emailxx`` and ``httpxx``.

    Args:
        text: Raw text of one example. Non-string values (for example the NaN
            pandas produces for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned text as a single string; ``''`` for non-string input.
    """
    # Empty CSV cells arrive as float NaN; html.unescape would raise on them.
    if not isinstance(text, str):
        return ''

    # Decode HTML entities, then lowercase.
    text = html.unescape(text).lower()
    # Replace e-mail addresses BEFORE stripping handles: an '@'-containing
    # placeholder would lose its "@domain" part to the handle regex below.
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', 'emailxxx', text)
    # Remove Twitter usernames.
    text = re.sub(r'@\w+', '', text)
    # Replace URLs with a placeholder word.
    text = re.sub(r'https?://\S+|www\.\S+', 'httpxxx', text)
    # Remove punctuation.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Remove numbers.
    text = re.sub(r'\d+', '', text)
    # Remove everything that is neither an ASCII letter nor whitespace (emojis included).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    # Squeeze characters repeated 3+ times down to 2.
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # Tokenize, lemmatize each token and glue the result back together.
    tokens = tk.tokenize(text)
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

# Add a cleaned-text column to every split.
train_ds['preprocessed_text'] = train_ds['Text'].apply(preprocessing)
val_ds['preprocessed_text'] = val_ds['Text'].apply(preprocessing)
test_ds['preprocessed_text'] = test_ds['Text'].apply(preprocessing)

# Check data is valid and correctly loaded. `head` must be called; printing
# the bare attribute shows the bound method's repr, not the first rows.
print(train_ds.head())

In [None]:
# Tokenization for BERT: pull the cleaned texts and the labels out of the
# frames as arrays (the test split has no labels here).
train_texts, val_texts, test_texts = (
    frame['preprocessed_text'].values for frame in (train_ds, val_ds, test_ds)
)

train_labels, val_labels = (frame['Label'].values for frame in (train_ds, val_ds))

def get_max_len(texts):
    """Return the length of the longest token-id sequence among `texts`.

    Each text is encoded with the module-level `tokenizer`, special tokens
    included. Raises ValueError (from `max`) when `texts` is empty.
    """
    def _encoded_length(text):
        return len(tokenizer.encode(text, add_special_tokens=True))

    return max(map(_encoded_length, texts))

# Report the longest encoded sequence of each split.
_observed_max_len = {}
for _split_name, _split_texts in (("train", train_texts), ("val", val_texts), ("test", test_texts)):
    _observed_max_len[_split_name] = get_max_len(_split_texts)
    print("Maximum sentence length in {} set: ".format(_split_name), _observed_max_len[_split_name])

max_len_train = _observed_max_len["train"]
max_len_val = _observed_max_len["val"]
max_len_test = _observed_max_len["test"]

# Values noted by the author on their run:
#   maximum sentence length in train set: 54
#   maximum sentence length in val set:   49
#   maximum sentence length in test set:  52
# so the padding length is set manually to 64.
MAX_LEN = 64
def bert_encode(texts, tokenizer, MAX_LEN):
    """Encode `texts` into padded BERT input ids and attention masks.

    All texts are encoded in a single batched `tokenizer(...)` call (the
    per-text `encode_plus` API is deprecated in favour of `__call__`), with
    special tokens added and every sequence padded/truncated to exactly
    `MAX_LEN` tokens.

    Args:
        texts: Iterable of strings (list, NumPy array, pandas Series, ...).
        tokenizer: Hugging Face tokenizer; must be callable on a list of
            strings and return a mapping with 'input_ids' and 'attention_mask'.
        MAX_LEN: Target sequence length.

    Returns:
        Tuple `(input_ids, attention_masks)`, each a tensor of shape
        `(len(texts), MAX_LEN)`. For empty `texts` two empty `(0, MAX_LEN)`
        long tensors are returned instead of raising.
    """
    texts = list(texts)
    if not texts:
        # Nothing to encode: keep the (N, MAX_LEN) contract with N == 0.
        return (
            torch.empty((0, MAX_LEN), dtype=torch.long),
            torch.empty((0, MAX_LEN), dtype=torch.long),
        )

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Encode every split to fixed-length input ids + attention masks.
(
    (input_ids_train, attention_masks_train),
    (input_ids_val, attention_masks_val),
    (input_ids_test, attention_masks_test),
) = [bert_encode(split_texts, tokenizer, MAX_LEN) for split_texts in (train_texts, val_texts, test_texts)]

# Labels as tensors (only train and validation have labels).
labels_train, labels_val = torch.tensor(train_labels), torch.tensor(val_labels)

# Shape report for a quick sanity check.
for _split_label, _split_ids in (
    ("Train", input_ids_train),
    ("Validation", input_ids_val),
    ("Test", input_ids_test),
):
    print("{} input_ids shape:".format(_split_label), _split_ids.shape)

In [None]:
# Bundle the encoded tensors per split; the test split carries no labels.
train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
val_dataset = TensorDataset(input_ids_val, attention_masks_val, labels_val)
test_dataset = TensorDataset(input_ids_test, attention_masks_test)

# Batch size shared by all three loaders.
BATCH_SIZE = 32

# Training batches are drawn in random order ...
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_dataset),
)

# ... while validation and test are read sequentially, so predictions stay
# aligned with the row order of the source frames.
val_dataloader, test_dataloader = (
    DataLoader(_split, batch_size=BATCH_SIZE, sampler=SequentialSampler(_split))
    for _split in (val_dataset, test_dataset)
)

In [None]:
# Pretrained BERT with a sequence-classification head on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device selected earlier. Using `device` instead of an
# unconditional `.cuda()` keeps the CPU fallback working, and matches the
# `.to(device)` calls applied to every batch in the training loop.
model.to(device)

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# NOTE: the slices below presumably mirror the parameter ordering of
# bert-base (5 embedding tensors, 16 tensors per encoder layer, 4 tensors at
# the end); they are for display only.
print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [None]:
# Optimizer & learning-rate scheduler.

# Number of full passes over the training set.
epochs = 3

# AdamW over every model parameter.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # learning rate chosen for this run
    eps=1e-8,  # Adam epsilon
)

# One scheduler step is taken per training batch, so the total number of
# steps is [number of batches] x [number of epochs].
total_steps = epochs * len(train_dataloader)

# Warm up during the first 10% of the steps, then decay linearly.
num_warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
def flat_accuracy(preds, labels):
    """Fraction of rows of `preds` whose arg-max column equals the label.

    `preds` is a 2-D NumPy array of per-class scores; `labels` is a NumPy
    array of class ids that is flattened before the comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    hits = np.equal(predicted_classes, true_classes)
    return np.sum(hits) / len(true_classes)

def format_time(elapsed):
    """Render a duration given in seconds as an `h:mm:ss` string.

    The value is rounded to the nearest whole second first; the string form
    is the one produced by `datetime.timedelta`.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)
    # We'll store a number of quantities such as training and validation loss,

In [None]:
# Per-epoch bookkeeping: training loss, validation loss / accuracy / F1 and
# timings. The plotting cells further down read these containers.
training_stats = []
train_losses = []
val_losses = []
val_accuracies = []
val_f1s = []
model_outputs = []   # positive-class probabilities of the most recent validation pass

# Measure the total training time for the whole run.
total_t0 = time.time()

# Training Loop
for epoch_i in range(epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_train_loss = 0
    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear any previously calculated gradients before the backward pass.
        model.zero_grad()

        # Passing `labels` makes the model return the loss with the logits.
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)
        loss = result.loss

        # Accumulate the training loss over all of the batches.
        total_train_loss += loss.item()

        # Backward pass, parameter update, then learning-rate update
        # (the scheduler is stepped once per batch).
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.4f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_loss = 0

    # NOTE: `val_labels` is rebound here to the list of labels seen during
    # validation (replacing the array created in the tokenization cell); the
    # plotting cells read it together with `val_preds` and `model_outputs`.
    val_preds = []
    val_labels = []
    model_outputs_epoch = []

    # Evaluate data for one epoch
    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No compute graph is needed for evaluation.
        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        # Accumulate the validation loss.
        total_eval_loss += result.loss.item()

        # Move logits and labels to CPU. The logits are the scores before
        # the softmax; the softmax turns them into class probabilities.
        logits_cpu = result.logits.detach().cpu()
        probs = F.softmax(logits_cpu, dim=1).numpy()
        model_outputs_epoch.extend(probs[:, 1])  # P(class 1), for ROC / PR curves
        logits = logits_cpu.numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Collect predictions and labels for the epoch-level metrics.
        batch_preds = np.argmax(logits, axis=1).flatten()
        val_preds.extend(batch_preds)
        val_labels.extend(label_ids)

    # Accuracy over ALL validation examples. Averaging per-batch accuracies
    # would over-weight the examples of a smaller final batch.
    avg_val_accuracy = accuracy_score(val_labels, val_preds)

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(val_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    # For the plots
    val_f1 = f1_score(val_labels, val_preds, average='macro')
    val_accuracies.append(avg_val_accuracy)
    val_losses.append(avg_val_loss)
    val_f1s.append(val_f1)
    model_outputs = model_outputs_epoch  # update most recent for ROC/PR curve

    print("  Accuracy: {0:.4f}".format(avg_val_accuracy))
    print("  Validation Loss: {0:.4f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    print("\nClassification Report:")
    print(classification_report(val_labels, val_preds, target_names=['Negative', 'Positive']))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Valid. F1': val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )


print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# One row per epoch, indexed by the (1-based) epoch number.
df_stats = pd.DataFrame(training_stats)
df_stats = df_stats.set_index('epoch')

# Plot styling: seaborn dark grid with enlarged fonts, wide default figures.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
# Training vs. validation loss, one point per epoch. Epochs are numbered from
# 1 so the axis agrees with the training log and with `training_stats`; the
# ticks follow the number of recorded points rather than the configured
# `epochs`, so a shortened run is still labelled correctly.
plt.plot(range(1, len(train_losses) + 1), train_losses, 'b-o', label="Training Loss")
plt.plot(range(1, len(val_losses) + 1), val_losses, 'g-o', label="Validation Loss")
plt.title("Training vs. Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks(range(1, len(train_losses) + 1))
plt.savefig("train_val_loss_over_epochs.png")
plt.show()

In [None]:
# Validation accuracy over epochs. Epochs are numbered from 1 so the axis
# agrees with the training log; ticks follow the number of recorded points.
plt.plot(range(1, len(val_accuracies) + 1), val_accuracies, 'r-o', label="Validation Accuracy")
plt.title("Validation Accuracy Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.grid(True)
plt.legend()
plt.xticks(range(1, len(val_accuracies) + 1))
plt.savefig("accuracy_over_epochs.png")
plt.show()

In [None]:
# Validation macro-F1 over epochs. Epochs are numbered from 1 so the axis
# agrees with the training log, and the ticks are derived from the plotted
# series itself (`val_f1s`) rather than from a different list.
plt.plot(range(1, len(val_f1s) + 1), val_f1s, color='purple', label="Validation F1 Score")
plt.title("Validation F1 Score Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("F1 Score")
plt.grid(True)
plt.legend()
plt.xticks(range(1, len(val_f1s) + 1))
plt.savefig("f1_score_over_epochs.png")
plt.show()

# Confusion matrix of the last validation pass (labels / predictions collected
# by the training loop).
cm = confusion_matrix(val_labels, val_preds)
disp = ConfusionMatrixDisplay(
    cm,
    display_labels=["Negative", "Positive"],
)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.savefig("confusion_matrix.png")
plt.show()

# ROC curve of the last validation pass.
# `model_outputs` holds the softmax probability of class 1 for every
# validation example, as stored by the training loop.
fpr, tpr, _ = roc_curve(val_labels, model_outputs)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label="ROC Curve (AUC = {:.2f})".format(roc_auc))
# Diagonal reference line (chance level).
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid(True)
plt.legend()
plt.savefig("roc_curve.png")
plt.show()

# Precision-recall curve of the last validation pass, computed from the
# positive-class probabilities in `model_outputs`.
precision, recall, _ = precision_recall_curve(val_labels, model_outputs)

plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
# The title previously contained a mis-decoded en dash; it is restored here.
plt.title("Precision–Recall Curve")
plt.grid(True)
plt.savefig("precision_recall_curve.png")
plt.show()

In [None]:
# Prediction on test set
print('Predicting labels for {:,} test sentences...'.format(len(input_ids_test)))

# Evaluation mode: dropout is disabled.
model.eval()

# One array of logits per batch, in dataloader (i.e. row) order.
predictions = []

for batch in test_dataloader:
    # Test batches hold (input ids, attention mask) only -- no labels.
    b_input_ids, b_input_mask = (tensor.to(device) for tensor in batch)

    # No gradients are needed for inference; skipping them saves memory and time.
    with torch.no_grad():
        result = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            return_dict=True,
        )

    # Keep the raw logits on the CPU as NumPy arrays.
    batch_logits = result.logits.detach().cpu().numpy()
    predictions.append(batch_logits)

print('    DONE.')

In [None]:
# Stack the per-batch logits into one (num_examples, num_classes) array.
flat_predictions = np.concatenate(predictions, axis=0)

# The predicted label of each example is the class (0 or 1) with the larger logit.
predicted_labels = flat_predictions.argmax(axis=1).flatten()

In [None]:
# Build the submission table: the test ids (column renamed to 'Id') next to
# the predicted labels, in test-set row order.
submission_df = (
    test_ds[['ID']]
    .rename(columns={'ID': 'Id'})
    .assign(Label=predicted_labels)
)

# Write it out without the index column.
submission_df.to_csv("submission.csv", index=False)

print("Submission file created succesfully.")