PART **C**

---




In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score, classification_report
import pandas as pd

In [2]:
import random


def create_dataset():
    from google.colab import drive
    import pandas as pd
    import json
    import os

    drive.mount('/content/drive')

    # Load original captions
    original_captions_path = "/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/test.csv"
    original_captions_df = pd.read_csv(original_captions_path)
    original_captions = original_captions_df['caption'].tolist()

    # List of files and their models/perturbation percentages
    files_info = [
        ("/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/PARTB_Results/SmolVLM_captions_10.csv", "Model A (SmolVLM)", 10),
        ("/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/PARTB_Results/SmolVLM_captions_50.csv", "Model A (SmolVLM)", 50),
        ("/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/PARTB_Results/SmolVLM_captions_80.csv", "Model A (SmolVLM)", 80),
        ("/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/PARTB_Results/custom_model_occluded_captions_10.json", "Model B (Custom)", 10),
        ("/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/PARTB_Results/custom_model_occluded_captions_50.json", "Model B (Custom)", 50),
        ("/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/PARTB_Results/custom_model_occluded_captions_80.json", "Model B (Custom)", 80)
    ]

    dataset = []

    for file_path, model_label, perturbation in files_info:
        file_ext = os.path.splitext(file_path)[1].lower()
        original_idx = 0

        # Process CSV files (SmolVLM)
        if file_ext == '.csv':
            df = pd.read_csv(file_path)

            for _, row in df.iterrows():
                caption = row['Caption'] if 'Caption' in df.columns else row['caption']

                # Extract the actual description from SmolVLM format
                if "User:<image>Describe this image" in caption:
                    generated_caption = caption.split("User:<image>Describe this image")[1].strip()
                else:
                    generated_caption = caption.strip()

                if original_idx < len(original_captions):
                    original_caption = original_captions[original_idx].strip()
                    # Format the input text as required
                    input_text = f"{original_caption} <SEP> {generated_caption} <SEP> {perturbation}"

                    # Add to dataset
                    dataset.append({
                        "input_text": input_text,
                        "output_label": model_label
                    })

                original_idx += 1

        # Process JSON files (Custom)
        elif file_ext == '.json':
            with open(file_path, 'r') as f:
                captions_dict = json.load(f)

            # Process captions from JSON dictionary
            for image_name, caption in captions_dict.items():
                generated_caption = caption.strip()

                if original_idx < len(original_captions):
                    original_caption = original_captions[original_idx].strip()
                    # Format the input text as required
                    input_text = f"{original_caption} <SEP> {generated_caption} <SEP> {perturbation}"

                    # Add to dataset
                    dataset.append({
                        "input_text": input_text,
                        "output_label": model_label
                    })

                original_idx += 1

    # Convert to DataFrame
    result_df = pd.DataFrame(dataset)

    # Shuffle the dataset
    result_df = result_df.sample(frac=1).reset_index(drop=True)

    return result_df

# Create and display the dataset
dataset = create_dataset()
print(f"Dataset created with {len(dataset)} examples")
print("\nFirst 5 examples:")
print(dataset.head(5))

# Save the dataset to a CSV file
dataset.to_csv("/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/caption_classification_dataset.csv", index=False)
print("\nDataset saved to /content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/caption_classification_dataset.csv")


Mounted at /content/drive
Dataset created with 5568 examples

First 5 examples:
                                          input_text       output_label
0  There is a bamboo fence next to a concrete are...  Model A (SmolVLM)
1  A large double deck bus is driving on the stre...   Model B (Custom)
2  This is a picture of a catcher on a baseball f...   Model B (Custom)
3  A person is wearing a tan jacket. They have a ...   Model B (Custom)
4  A man with his face painted red and white has ...   Model B (Custom)

Dataset saved to /content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/caption_classification_dataset.csv


In [3]:
class CaptionClassifier(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased"):
        super(CaptionClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)

        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)  # Single logit for BCEWithLogitsLoss
        )

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # shape: [batch_size, hidden_size]
        logits = self.classifier(pooled_output)
        return logits


In [4]:
def train_classifier(model, dataloader, optimizer, criterion, device, epochs):
    model.to(device)
    model.train()

    for epoch in range(epochs):
        running_loss = 0.0

        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].float().to(device)  # Ensure float labels for BCE

            optimizer.zero_grad()

            logits = model(input_ids=input_ids, attention_mask=attention_mask)  # [batch_size, 1]

            loss = criterion(logits.squeeze(), labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    print("Training complete!")


In [5]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

def evaluate_classifier(model, dataloader, device, show_report=False):
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = (outputs.view(-1) > 0.5).long()

            all_labels.extend(labels.cpu().numpy().flatten().tolist())
            all_preds.extend(preds.cpu().numpy().flatten().tolist())

    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    acc = accuracy_score(all_labels, all_preds)

    if show_report:
        print("\nClassification Report:")
        print(classification_report(all_labels, all_preds, target_names=["Custom Model", "SmolVLM"]))

        cm = confusion_matrix(all_labels, all_preds)
        print("\nConfusion Matrix:")
        print(cm)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }


In [6]:
from sklearn.model_selection import train_test_split

# Load the full dataset
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/caption_classification_dataset.csv')

# Extract original captions for splitting
df['Original_Caption'] = df['input_text'].apply(lambda x: x.split('<SEP>')[0].strip())

# Get unique original captions
unique_captions = df['Original_Caption'].unique().tolist()

# Split unique captions into train/val/test (70:10:20)
train_caps, temp_caps = train_test_split(unique_captions, test_size=0.3, random_state=42)
val_caps, test_caps = train_test_split(temp_caps, test_size=0.67, random_state=42)  # 0.67 * 0.3 = 0.2 of total

# Create dataframes for each split based on unique captions
train_df = df[df['Original_Caption'].isin(train_caps)].reset_index(drop=True)
val_df = df[df['Original_Caption'].isin(val_caps)].reset_index(drop=True)
test_df = df[df['Original_Caption'].isin(test_caps)].reset_index(drop=True)

# Print dataset statistics
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

# Verify no caption overlap between splits
print(f"Train-val overlap: {len(set(train_caps) & set(val_caps))}")
print(f"Train-test overlap: {len(set(train_caps) & set(test_caps))}")
print(f"Val-test overlap: {len(set(val_caps) & set(test_caps))}")

# Extract exactly what the user requested
# For training
train_text = train_df['input_text'].tolist()  # Already in the "<original> <SEP> <generated> <SEP> <perturbation>" format
train_label = [1 if label == "Model B (Custom)" else 0 for label in train_df['output_label']]

# For validation
val_text = val_df['input_text'].tolist()
val_label = [1 if label == "Model B (Custom)" else 0 for label in val_df['output_label']]

# For testing
test_text = test_df['input_text'].tolist()
test_label = [1 if label == "Model B (Custom)" else 0 for label in test_df['output_label']]

# Print sample to verify
print("\nSample train_text:", train_text[0])
print("Sample train_label:", train_label[0])
print("\nClass distribution:")
print(f"Train: {sum(train_label)} Model B (Custom) / {len(train_label)} total")
print(f"Val: {sum(val_label)} Model B (Custom) / {len(val_label)} total")
print(f"Test: {sum(test_label)} Model B (Custom) / {len(test_label)} total")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train set size: 3894
Validation set size: 552
Test set size: 1122
Train-val overlap: 0
Train-test overlap: 0
Val-test overlap: 0

Sample train_text: There is a bamboo fence next to a concrete are brick tan wall. There are plant growing out of the fence. The ground is made of compacted dirt. There is black and white zebra munching on leaves of the plants that are on the fence. Directly behind the zebra there is small pond. there is small amount of water in the pond and there is greenery growing in the pond. The zebras reflection can be seen in the pond. <SEP> The image is a collage of different images, each of which is a close-up of a zebra. The zebra is black and white, with a distinctive pattern of stripes on its coat. The zebra is standing in a grassy field, and there are trees and bushes in the background. The zebra is looking at the camera, and its ears a

In [7]:
class CaptionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


In [8]:
# Step 4: Train the model CaptionClassifier with train_generated_captions and train_labels

from torch.nn import BCEWithLogitsLoss

# Initialize tokenizer and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = CaptionClassifier("bert-base-uncased").to(device)

# Tokenize training data
train_encodings = tokenizer(
    train_text,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Dataset and DataLoader
train_dataset = CaptionDataset(train_encodings, train_label)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_encodings = tokenizer(
    val_text,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

val_dataset = CaptionDataset(val_encodings, val_label)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
from sklearn.metrics import accuracy_score

# Hyperparameter options to test
learning_rates = [1e-5, 3e-5]  # Experiment with different learning rates
batch_sizes = [16, 32]  # Test different batch sizes
epochs_options = [3, 5]  # Test different epoch numbers

# Initialize variables to track the best result
best_model = None
best_accuracy = 0.0
best_params = {}

# Loop over hyperparameters and test
for lr in learning_rates:
    for batch_size in batch_sizes:
        for epochs in epochs_options:
            print(f"Training with lr={lr}, batch_size={batch_size}, epochs={epochs}")

            # Reinitialize the model and tokenizer
            model = CaptionClassifier(bert_model_name="bert-base-uncased").to(device)
            optimizer = AdamW(model.parameters(), lr=lr)
            criterion = BCEWithLogitsLoss()

            # Create DataLoader with current batch size
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

            # Train the model
            train_classifier(model, train_loader, optimizer, criterion, device, epochs)

            # Evaluate the model on the validation set
            val_scores = evaluate_classifier(model, val_loader, device)
            val_accuracy = val_scores['f1_score']  # Choose F1-score or another metric for comparison

            # Save the best model based on validation F1-score
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_model = model
                best_params = {'lr': lr, 'batch_size': batch_size, 'epochs': epochs}

# Output the best parameters and model accuracy
print(f"Best Hyperparameters: {best_params}")
print(f"Best Validation F1-Score: {best_accuracy}")

Training with lr=1e-05, batch_size=16, epochs=3
Epoch 1/3, Loss: 0.2442
Epoch 2/3, Loss: 0.0665
Epoch 3/3, Loss: 0.0493
Training complete!
Training with lr=1e-05, batch_size=16, epochs=5
Epoch 1/5, Loss: 0.2224
Epoch 2/5, Loss: 0.0579
Epoch 3/5, Loss: 0.0457
Epoch 4/5, Loss: 0.0353
Epoch 5/5, Loss: 0.0329
Training complete!
Training with lr=1e-05, batch_size=32, epochs=3
Epoch 1/3, Loss: 0.3032
Epoch 2/3, Loss: 0.0970
Epoch 3/3, Loss: 0.0607
Training complete!
Training with lr=1e-05, batch_size=32, epochs=5
Epoch 1/5, Loss: 0.3358
Epoch 2/5, Loss: 0.0968
Epoch 3/5, Loss: 0.0586
Epoch 4/5, Loss: 0.0464
Epoch 5/5, Loss: 0.0412
Training complete!
Training with lr=3e-05, batch_size=16, epochs=3
Epoch 1/3, Loss: 0.1412
Epoch 2/3, Loss: 0.0455
Epoch 3/3, Loss: 0.0354
Training complete!
Training with lr=3e-05, batch_size=16, epochs=5
Epoch 1/5, Loss: 0.1510
Epoch 2/5, Loss: 0.0460
Epoch 3/5, Loss: 0.0446
Epoch 4/5, Loss: 0.0440
Epoch 5/5, Loss: 0.0347
Training complete!
Training with lr=3e-05

In [10]:
# Step 6: Evaluate the model on the test set using the same process as above
# Tokenize the test data
test_encodings = tokenizer(
    test_text,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)


test_dataset = CaptionDataset(test_encodings, test_label)
test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'], shuffle=False)

print("\n=== Test Set Evaluation ===")
test_scores = evaluate_classifier(best_model, test_loader, device, show_report=True)
print("Test Scores:", test_scores)

# Save evaluation results to CSV
from datetime import datetime

# Create results dataframe with test metrics
results = {
    'metric': ['accuracy', 'precision', 'recall', 'f1_score'],
    'value': [
        test_scores['accuracy'],
        test_scores['precision'],
        test_scores['recall'],
        test_scores['f1_score']
    ]
}

# Add hyperparameters to results
for param_name, param_value in best_params.items():
    results['metric'].append(f'best_{param_name}')
    results['value'].append(param_value)

# Add timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_df = pd.DataFrame(results)

# Save to CSV
csv_path = f'/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/model_evaluation_results_{timestamp}.csv'
results_df.to_csv(csv_path, index=False)
print(f"Evaluation results saved to {csv_path}")

# Save detailed results with predictions
all_labels = []
all_preds = []

best_model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = best_model(input_ids, attention_mask)
        preds = (outputs.view(-1) > 0.5).long()

        all_labels.extend(labels.cpu().numpy().flatten().tolist())
        all_preds.extend(preds.cpu().numpy().flatten().tolist())

# Create dataframe with predictions
detailed_results = pd.DataFrame({
    'true_label': all_labels,
    'predicted_label': all_preds,
    'correct': [1 if l == p else 0 for l, p in zip(all_labels, all_preds)]
})

# Map numeric labels to model names
detailed_results['true_model'] = detailed_results['true_label'].apply(
    lambda x: "Model B (Custom)" if x == 1 else "Model A (SmolVLM)"
)
detailed_results['predicted_model'] = detailed_results['predicted_label'].apply(
    lambda x: "Model B (Custom)" if x == 1 else "Model A (SmolVLM)"
)

# Save detailed results
detailed_csv_path = f'/content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/detailed_predictions_{timestamp}.csv'
detailed_results.to_csv(detailed_csv_path, index=False)
print(f"Detailed predictions saved to {detailed_csv_path}")


=== Test Set Evaluation ===

Classification Report:
              precision    recall  f1-score   support

Custom Model       0.97      0.99      0.98       561
     SmolVLM       0.99      0.97      0.98       561

    accuracy                           0.98      1122
   macro avg       0.98      0.98      0.98      1122
weighted avg       0.98      0.98      0.98      1122


Confusion Matrix:
[[553   8]
 [ 18 543]]
Test Scores: {'accuracy': 0.9768270944741533, 'precision': 0.985480943738657, 'recall': 0.9679144385026738, 'f1_score': 0.9766187050359713}
Evaluation results saved to /content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/model_evaluation_results_20250414_185142.csv
Detailed predictions saved to /content/drive/MyDrive/custom_captions_dataset/custom_captions_dataset/detailed_predictions_20250414_185142.csv
