In [1]:
!pip install keras
!pip install tensorflow
!pip install transformers



In [2]:
# ‚úÖ PyTorch core
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import optim

# ‚úÖ NLP processing
import nltk
nltk.download('punkt')  # For word tokenization
nltk.download('stopwords')  # For removing stopwords
nltk.download('wordnet')  # For lemmatization (WordNet)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import re  # For regular expressions in text cleaning
import spacy  # For advanced tokenization (optional)

# ‚úÖ Data handling
import pandas as pd
import numpy as np

# ‚úÖ Plotting & Visualization
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

# ‚úÖ Evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ‚úÖ Text vectorization (if we use BoW/TF-IDF baselines)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# ‚úÖ Optional classical baselines
from sklearn.naive_bayes import MultinomialNB  # For Naive Bayes classification
from sklearn.linear_model import LogisticRegression  # For Logistic Regression classification
from sklearn.svm import LinearSVC  # For Support Vector Classification
from sklearn.ensemble import RandomForestClassifier  # For Random Forest classification

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import os
from dotenv import load_dotenv

# Load variables from .env
load_dotenv()

# Get token
hf_token = os.getenv("HF_TOKEN")

# Login securely
from huggingface_hub import login
login(token=hf_token)

In [6]:
# Install Hugging Face datasets if not already
!pip install datasets --quiet

# Load the dataset
from datasets import load_dataset

# Load mirage-news dataset
dataset = load_dataset("anson-huang/mirage-news")

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m491.2/491.2 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/655M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/143M [00:00<?, ?B/s]

test1_nyt_mj-00000-of-00001.parquet:   0%|          | 0.00/20.2M [00:00<?, ?B/s]

test2_bbc_dalle-00000-of-00002.parquet:   0%|          | 0.00/560M [00:00<?, ?B/s]

test2_bbc_dalle-00001-of-00002.parquet:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

test3_cnn_dalle-00000-of-00002.parquet:   0%|          | 0.00/559M [00:00<?, ?B/s]

test3_cnn_dalle-00001-of-00002.parquet:   0%|          | 0.00/25.8M [00:00<?, ?B/s]

test4_bbc_sdxl-00000-of-00001.parquet:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

test5_cnn_sdxl-00000-of-00001.parquet:   0%|          | 0.00/54.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2500 [00:00<?, ? examples/s]

Generating test1_nyt_mj split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test2_bbc_dalle split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test3_cnn_dalle split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test4_bbc_sdxl split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test5_cnn_sdxl split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
from datasets import load_dataset

# Load Mirage-News
dataset = load_dataset("anson-huang/mirage-news")

# Convert to pandas
df = dataset['train'].to_pandas()

# Show column names
print("Available columns:", df.columns.tolist())

# Show sample rows
df.head()

Available columns: ['image', 'label', 'text']


Unnamed: 0,image,label,text
0,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,1,"Andal Ampatuan Jr., a notorious crime lord in ..."
1,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,1,"Senator Christopher J. Dodd, with his wife and..."
2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,1,An immigrant viciously attacks medical personn...
3,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,1,Violent clashes break out in San Francisco on ...
4,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,1,A man closes the door of the house after plant...


In [8]:
print("Columns in dataset:", df.columns.tolist())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Sample label distribution
print("\nLabel distribution:")
print(df['label'].value_counts())

Columns in dataset: ['image', 'label', 'text']

Missing values:
image    0
label    0
text     0
dtype: int64

Label distribution:
label
1    5000
0    5000
Name: count, dtype: int64


In [9]:
# Install datasets library if needed
!pip install datasets --quiet

import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Step 1: Load Mirage-News dataset
dataset = load_dataset("anson-huang/mirage-news")
df = dataset["train"].to_pandas()

# Step 2: Clean ‚Äî drop rows with missing or empty text/image
df = df.dropna(subset=["text", "image"])
df = df[df["text"].str.strip() != ""]

print("‚úÖ Total usable samples:", len(df))

# Step 3: Stratified split ‚Äî 70% train, 15% validation, 15% test
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df["label"],
    random_state=42
)

valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,  # 50% of 30% = 15%
    stratify=temp_df["label"],
    random_state=42
)

# Step 4: Rename columns for compatibility with original CNN code
train_df["clean_title"] = train_df["text"]
valid_df["clean_title"] = valid_df["text"]
test_df["clean_title"]  = test_df["text"]

train_df["6_way_label"] = train_df["label"]
valid_df["6_way_label"] = valid_df["label"]
test_df["6_way_label"]  = test_df["label"]

# Step 5: Extract lists for preprocessing
train_news = list(train_df["clean_title"])
train_labels = list(train_df["6_way_label"])

valid_news = list(valid_df["clean_title"])
valid_labels = list(valid_df["6_way_label"])

test_news = list(test_df["clean_title"])
test_labels = list(test_df["6_way_label"])

print("‚úÖ Split complete:")
print(f"Train size: {len(train_news)}")
print(f"Valid size: {len(valid_news)}")
print(f"Test size : {len(test_news)}")

‚úÖ Total usable samples: 10000
‚úÖ Split complete:
Train size: 7000
Valid size: 1500
Test size : 1500


In [10]:
import nltk

# Download everything needed for:
# - Tokenizing words (punkt)
# - Lemmatizing nouns/verbs/adjectives (wordnet)
# - POS tagging for lemmatization (averaged_perceptron_tagger)
# - Stopwords removal (stopwords)

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')

import nltk

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [11]:
import re

def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)
    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

# Apply cleaning
train_news_clean_1 = [preprocess_text(new) for new in train_news]
valid_news_clean_1 = [preprocess_text(new) for new in valid_news]
test_news_clean_1  = [preprocess_text(new) for new in test_news]

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def remove_stopwords_lem(text):
    text = word_tokenize(text)
    # Remove stopwords
    text = [word for word in text if word not in stop_words]
    # Lemmatize: noun ‚Üí verb ‚Üí adjective
    lemmatized_text = []
    for word in text:
        word1 = lemmatizer.lemmatize(word, pos="n")
        word2 = lemmatizer.lemmatize(word1, pos="v")
        word3 = lemmatizer.lemmatize(word2, pos="a")
        lemmatized_text.append(word3)
    return ' '.join(lemmatized_text)

# Apply stopword removal + lemmatization
train_stwrd_lem = [remove_stopwords_lem(text) for text in train_news_clean_1]
valid_stwrd_lem = [remove_stopwords_lem(text) for text in valid_news_clean_1]
test_stwrd_lem  = [remove_stopwords_lem(text) for text in test_news_clean_1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import torch
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Load BERT tokenizer
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Preprocessing function to tokenize using BERT tokenizer
def bert_tokenize(text):
    encoded_text = tokenizer.encode(
                        text,
                        add_special_tokens=True,   # Add [CLS] and [SEP]
                        max_length=512,            # Set max length for BERT tokens
                        truncation=True,           # Truncate longer sequences
                        padding='max_length'       # Pad shorter sequences
                    )
    return encoded_text

Loading BERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
train_tokenized = [bert_tokenize(new) for new in train_news]
valid_tokenized = [bert_tokenize(new) for new in valid_news]
test_tokenized = [bert_tokenize(new) for new in test_news]

In [None]:
MAX_LEN = 25  # Set the maximum length for padding

# Padding sequences manually (optional as BERT handles padding, but for control)
train_padded = pad_sequences(train_tokenized, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
valid_padded = pad_sequences(valid_tokenized, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_padded = pad_sequences(test_tokenized, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

In [16]:
def attention_mask(text):
    return [int(token_id > 0) for token_id in text]  # Mask is 1 for tokens and 0 for padding

# Generate attention masks for `ourmodel`
train_mask = [attention_mask(new) for new in train_padded]
valid_mask = [attention_mask(new) for new in valid_padded]
test_mask = [attention_mask(new) for new in test_padded]

In [17]:
# Concatenate train and validation padded sequences, masks, and labels
train_valid_padded = np.concatenate([train_padded, valid_padded], axis=0)
train_valid_mask = train_mask + valid_mask
train_valid_labels = train_labels + valid_labels

In [18]:
# Convert to torch tensors directly
train_inputs_tensor = torch.tensor(train_padded)
train_masks_tensor = torch.tensor(train_mask)
train_labels_tensor = torch.tensor(train_labels)

valid_inputs_tensor = torch.tensor(valid_padded)
valid_masks_tensor = torch.tensor(valid_mask)
valid_labels_tensor = torch.tensor(valid_labels)

test_inputs_tensor = torch.tensor(test_padded)
test_masks_tensor = torch.tensor(test_mask)
test_labels_tensor = torch.tensor(test_labels)

# Train + Validation Tensors
train_valid_inputs_tensor = torch.tensor(train_valid_padded)
train_valid_masks_tensor = torch.tensor(train_valid_mask)
train_valid_labels_tensor = torch.tensor(train_valid_labels)

In [19]:
batch_size = 32

# Train DataLoader for `ourmodel`
train_dataload = TensorDataset(train_inputs_tensor, train_masks_tensor, train_labels_tensor)
train_sampler = RandomSampler(train_dataload)
trainloader = DataLoader(train_dataload, sampler=train_sampler, batch_size=batch_size)

# Validation DataLoader for `ourmodel`
valid_dataload = TensorDataset(valid_inputs_tensor, valid_masks_tensor, valid_labels_tensor)
valid_sampler = RandomSampler(valid_dataload)
validloader = DataLoader(valid_dataload, sampler=valid_sampler, batch_size=batch_size)

# Test DataLoader for `ourmodel`
test_dataload = TensorDataset(test_inputs_tensor, test_masks_tensor, test_labels_tensor)
test_sampler = RandomSampler(test_dataload)
testloader = DataLoader(test_dataload, sampler=test_sampler, batch_size=batch_size)

# Train + Validation DataLoader for `ourmodel`
train_valid_dataload = TensorDataset(train_valid_inputs_tensor, train_valid_masks_tensor, train_valid_labels_tensor)
train_valid_sampler = RandomSampler(train_valid_dataload)
train_valid_loader = DataLoader(train_valid_dataload, sampler=train_valid_sampler, batch_size=batch_size)

In [20]:
# Print some information for confirmation
print(f"Train batches for `model`: {len(trainloader)}")
print(f"Validation batches for `model`: {len(validloader)}")
print(f"Test batches for `model`: {len(testloader)}")

Train batches for `model`: 219
Validation batches for `model`: 47
Test batches for `model`: 47


In [None]:
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertConfig
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import numpy as np
import time
import datetime
import random
from sklearn.metrics import accuracy_score

# Model configuration and initialization
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Scheduler setup
epochs = 8
total_steps = len(train_valid_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Accuracy function
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Format time function
def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Set seed for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Initialize loss tracking and early stopping variables
loss_values = []
patience = 3  # Early stopping patience
best_val_loss = float('inf')
patience_counter = 0

# Training loop
for epoch_i in range(epochs):

    print(f"======== Epoch {epoch_i + 1} / {epochs} ========")
    print("Training...")

    # Timer for epoch
    t0 = time.time()

    # Reset total loss and accuracy for this epoch
    total_loss = 0
    total_train_accuracy = 0

    # Set model to training mode
    model.train()

    # tqdm progress bar for training
    for step, batch in tqdm(enumerate(trainloader), total=len(trainloader), desc="Training", unit="batch"):

        # Unpack the batch
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Zero gradients
        model.zero_grad()

        # Forward pass
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # Get loss value
        loss = outputs[0]

        # Add loss to total loss
        total_loss += loss.item()

        # Backward pass (calculate gradients)
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()

        # Update learning rate
        scheduler.step()

        # Calculate accuracy for this batch
        logits = outputs[1]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_train_accuracy += flat_accuracy(logits, label_ids)

    # Calculate average loss and accuracy for the epoch
    avg_train_loss = total_loss / len(trainloader)
    avg_train_accuracy = total_train_accuracy / len(trainloader)

    loss_values.append(avg_train_loss)

    print(f"  Average training loss: {avg_train_loss:.2f}")
    print(f"  Average training accuracy: {avg_train_accuracy:.2f}")
    print(f"  Training epoch took: {format_time(time.time() - t0)}")

    ######################
    # Validation
    ######################
    print("Running Validation...")

    t0 = time.time()

    # Set model to evaluation mode
    model.eval()

    # Tracking variables for evaluation
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validloader:
        batch = tuple(t.to(device) for t in batch)

        # Unpack the batch
        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2]

        with torch.no_grad():
            outputs = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask)
            logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Compute accuracy
        eval_accuracy += flat_accuracy(logits, label_ids)

    print(f"Validation accuracy: {eval_accuracy / len(validloader)}")
    print(f"Validation took: {format_time(time.time() - t0)}")

    # Early stopping: Check if validation loss improved
    if eval_loss < best_val_loss:
        best_val_loss = eval_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping activated")
        break

print("Training complete!")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training...


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 219/219 [10:31<00:00,  2.88s/batch]


  Average training loss: 0.51
  Average training accuracy: 0.80
  Training epoch took: 0:10:32
Running Validation...
Validation accuracy: 0.8779445288753799
Validation took: 0:00:46
Training...


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 219/219 [10:34<00:00,  2.90s/batch]


  Average training loss: 0.25
  Average training accuracy: 0.91
  Training epoch took: 0:10:34
Running Validation...
Validation accuracy: 0.8749050151975684
Validation took: 0:00:46
Training...


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 219/219 [10:34<00:00,  2.90s/batch]


  Average training loss: 0.16
  Average training accuracy: 0.95
  Training epoch took: 0:10:34
Running Validation...
Validation accuracy: 0.8509688449848024
Validation took: 0:00:46
Training...


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 219/219 [10:34<00:00,  2.90s/batch]


  Average training loss: 0.11
  Average training accuracy: 0.97
  Training epoch took: 0:10:34
Running Validation...
Validation accuracy: 0.8802241641337386
Validation took: 0:00:46
Early stopping activated
Training complete!


In [22]:
import random
import torch
import numpy as np
import time
import datetime
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

# Reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model definition
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=6,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 5
total_steps = len(train_valid_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))

# Early Stopping setup
patience = 2
best_val_loss = float('inf')
patience_counter = 0

############# STEP 1: TRAINING #############
print("Starting training...\n")
for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    print("Training...")

    t0 = time.time()
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_valid_loader):

        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(f'  Batch {step}  of  {len(train_valid_loader)}.    Elapsed: {elapsed}.')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_valid_loader)
    print(f"\n  Average training loss: {avg_train_loss:.2f}")
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    ############# STEP 2: VALIDATION (for early stopping) #############
    print("\nRunning Validation...")
    model.eval()
    val_loss = 0

    for batch in validloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels
            )
        loss = outputs.loss
        val_loss += loss.item()

    avg_val_loss = val_loss / len(validloader)
    print(f"  Validation loss: {avg_val_loss:.2f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        print("  ‚úÖ Validation loss improved. Model saved.")
    else:
        patience_counter += 1
        print(f"  ‚ö†Ô∏è No improvement. Patience counter: {patience_counter}")

    if patience_counter >= patience:
        print("\nüî¥ Early stopping triggered!")
        break

print("\n‚úÖ Training complete.")

############# STEP 3: FINAL TESTING ON 1500 SAMPLES #############

print("\nRunning Final Test...")

model.eval()

predictions = np.array([], dtype=int)
labels_test = np.array([], dtype=int)

t0 = time.time()

for i, batch in enumerate(testloader):
    if i * testloader.batch_size >= 1500:
        break

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()

    predictions = np.concatenate((predictions, pred_flat))
    labels_test = np.concatenate((labels_test, label_ids))

print("  Final test took: {:}".format(format_time(time.time() - t0)))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Training...
  Batch 40  of  266.    Elapsed: 0:01:54.
  Batch 80  of  266.    Elapsed: 0:03:50.
  Batch 120  of  266.    Elapsed: 0:05:46.
  Batch 160  of  266.    Elapsed: 0:07:42.
  Batch 200  of  266.    Elapsed: 0:09:38.
  Batch 240  of  266.    Elapsed: 0:11:34.

  Average training loss: 0.47
  Training epoch took: 0:12:49

Running Validation...
  Validation loss: 0.21
  ‚úÖ Validation loss improved. Model saved.

Training...
  Batch 40  of  266.    Elapsed: 0:01:54.
  Batch 80  of  266.    Elapsed: 0:03:50.
  Batch 120  of  266.    Elapsed: 0:05:46.
  Batch 160  of  266.    Elapsed: 0:07:42.
  Batch 200  of  266.    Elapsed: 0:09:38.
  Batch 240  of  266.    Elapsed: 0:11:35.

  Average training loss: 0.23
  Training epoch took: 0:12:50

Running Validation...
  Validation loss: 0.12
  ‚úÖ Validation loss improved. Model saved.

Training...
  Batch 40  of  266.    Elapsed: 0:01:54.
  Batch 80  of  266.    Elapsed: 0:03:50.
  Batch 120  of  266.    Elapsed: 0

In [23]:
print("\nClassification Report:")
print(classification_report(labels_test, predictions))


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       750
           1       0.87      0.87      0.87       750

    accuracy                           0.87      1500
   macro avg       0.87      0.87      0.87      1500
weighted avg       0.87      0.87      0.87      1500



In [24]:
print("\nConfusion Matrix:")
print(confusion_matrix(labels_test, predictions))


Confusion Matrix:
[[652  98]
 [ 95 655]]
