### Author: Alec Sirkin
### net-id: aws4934

In [1]:
# Achieved 75.31% Accuracy

import torch

# Probe CUDA once and reuse the answer for both the report and the device choice.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Only query the device name when a GPU exists; the call raises otherwise,
    # which would prevent the CPU fallback below from ever being reached.
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available; using CPU.")

# Device string used by every later `.to(device)` call in the notebook.
device = "cuda" if cuda_available else "cpu"

True
NVIDIA A100-SXM4-40GB


In [2]:
!pip install datasets -q

In [3]:
import torchvision
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
import time
import copy
import tqdm
import random
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms, models
from datasets import load_dataset

# Reproducibility: seed the Python, NumPy and PyTorch generators with one value
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    # Seed the current GPU and then every visible GPU
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its benchmark mode
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:


############################################
# Step 1: Load Data and Create index_label_dict
############################################

# Fetch the ACAC-4K dataset from the Hugging Face Hub
dataset = load_dataset('hmdliu/ACAC-4K')

# Label bookkeeping keyed by each training example's own 'idx' field:
#   'label'    -> the class id stored with the example
#   'ai_label' -> 1 when that class id equals 5, otherwise 0
index_label_dict = {}
for example in dataset['train']:
    index_label_dict[example['idx']] = {
        'label': example['label'],
        'ai_label': int(example['label'] == 5),
    }

# Image preprocessing shared by every dataset view in this notebook.
# NOTE(review): the resize target is 244x244, not the conventional 224x224 —
# confirm this is intentional.
_resize = transforms.Resize((244, 244))
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(
    mean=[0.6205, 0.5431, 0.4475],
    std=[0.1667, 0.1615, 0.1647],
)
transform = transforms.Compose([_resize, _to_tensor, _normalize])


class CustomHFDataset(Dataset):
    """Map-style dataset over rows that expose an 'idx' and an 'image' field.

    What ``__getitem__`` returns depends on the flags, checked in this order:

    * ``test_flag``   -> ``(image, idx_tensor)``; no label lookup is performed.
    * ``return_full`` -> ``(image, ai_label, label, original_idx)``, used by the
      pseudo-labeling steps.
    * ``binary_flag`` -> ``(image, ai_label)``, used by the binary classifier.
    * none of them    -> ``(image, label)``, used for main model training.

    Labels are read from ``index_label_dict`` at access time, keyed by the
    row's own 'idx' value rather than by its position.
    """

    def __init__(self, hf_dataset, index_label_dict, transform=None, return_full=False, test_flag=False, binary_flag=False):
        self.hf_dataset = hf_dataset
        self.index_label_dict = index_label_dict
        self.transform = transform
        self.return_full = return_full
        self.test_flag = test_flag
        self.binary_flag = binary_flag

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Fetch row ``idx``, transform its image, and attach the requested labels."""
        record = self.hf_dataset[idx]
        original_idx = record['idx']
        image = record['image']
        if self.transform:
            image = self.transform(image)

        # Test rows carry no labels: hand back the id so predictions can be matched up
        if self.test_flag:
            return image, torch.tensor(original_idx, dtype=torch.long)

        ai_label = self.index_label_dict[original_idx]['ai_label']
        label = self.index_label_dict[original_idx]['label']

        if self.return_full:
            return image, ai_label, label, original_idx
        if self.binary_flag:
            return image, ai_label
        return image, label



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:

############################################
# Step 3: Create Full Training Dataset and Balanced Subset for Binary Classifier (retrain on whole dataset or just new split?)
############################################


def create_balanced_binary_dataset(full_dataset, index_label_dict):
    """Create a balanced dataset of AI and non-AI images.

    Every row whose current ``ai_label`` is 1 is kept, and an equally sized
    random sample (via ``random.sample``) of rows whose ``ai_label`` is 0 is
    added. The combined selection is shuffled with ``random.shuffle``.

    Args:
        full_dataset: dataset exposing ``hf_dataset`` (rows with an 'idx'
            field) and ``__len__``; intended to be a CustomHFDataset built
            with return_full=False, binary_flag=True.
        index_label_dict: dict mapping each row's 'idx' to a dict that
            contains 'ai_label'.

    Returns:
        A ``Subset`` of ``full_dataset`` with equal numbers of AI and non-AI rows.

    Raises:
        ValueError: if there are no AI-labeled rows, or fewer non-AI rows
            than AI rows.
    """
    # Read each row's original id exactly once. Row access on the wrapped
    # dataset can be expensive (it returns the whole row, not just 'idx'), so
    # the id -> position map below is derived from this list rather than from
    # a second pass over the dataset.
    all_original_indices = [full_dataset.hf_dataset[i]['idx'] for i in range(len(full_dataset))]

    # Separate AI vs non-AI ids according to the current labels
    ai_label_1_indices = [i for i in all_original_indices if index_label_dict[i]['ai_label'] == 1]
    ai_label_0_indices = [i for i in all_original_indices if index_label_dict[i]['ai_label'] == 0]

    if len(ai_label_1_indices) == 0:
        raise ValueError("No AI-labeled images found; cannot create a balanced dataset.")

    # Sample as many non-AI ids as there are AI ids
    if len(ai_label_0_indices) < len(ai_label_1_indices):
        raise ValueError("Not enough non-AI images to balance with AI images.")
    sampled_ai_label_0_indices = random.sample(ai_label_0_indices, len(ai_label_1_indices))

    balanced_original_indices = ai_label_1_indices + sampled_ai_label_0_indices
    random.shuffle(balanced_original_indices)

    # Map original ids back to positional indices (last position wins on duplicates)
    orig_to_pos = {orig_idx: pos for pos, orig_idx in enumerate(all_original_indices)}
    balanced_indices = [orig_to_pos[orig_idx] for orig_idx in balanced_original_indices]

    return Subset(full_dataset, balanced_indices)


def train_binary_classifier(model, dataloader, criterion, optimizer, epochs=5):
    """Run ``epochs`` passes of training for the binary (AI vs non-AI) model.

    Batches are ``(images, ai_labels)``. Targets are moved to the module-level
    ``device``, cast to float and given a trailing dimension before being
    passed to ``criterion``. The mean batch loss is printed after each epoch.
    """
    model.train()
    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1
        loss_total = 0.0
        progress = tqdm(dataloader, desc=f'Binary Classifier Epoch {epoch_no}/{epochs}')
        for images, ai_labels in progress:
            images = images.to(device)
            ai_labels = ai_labels.to(device).float().unsqueeze(1)

            optimizer.zero_grad()
            batch_loss = criterion(model(images), ai_labels)
            batch_loss.backward()
            optimizer.step()

            loss_total += batch_loss.item()
        mean_loss = loss_total / len(dataloader)
        print(f"Epoch {epoch_no}/{epochs}, Loss: {mean_loss:.4f}")


def collect_predictions(model, dataloader, index_label_dict):
    """Score every batch and keep the outputs for rows still labeled non-AI.

    Batches are ``(images, ai_labels, labels, original_indices)``. The model
    output is squeezed on dim 1 and paired with each row's original id.
    Returns a list of ``(prob, original_id)`` tuples restricted to ids whose
    ``ai_label`` in ``index_label_dict`` is 0.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for images, _ai_labels, _labels, original_indices in dataloader:
            scores = model(images.to(device)).squeeze(1).cpu().numpy()
            ids = original_indices.numpy()
            predictions.extend(
                (score, oid)
                for score, oid in zip(scores, ids)
                if index_label_dict[oid]['ai_label'] == 0
            )
    return predictions


def update_ai_labels(index_label_dict, predictions, threshold):
    """Flip confidently predicted non-AI entries to AI, in place.

    For each ``(prob, oid)`` with ``prob`` strictly above ``threshold`` whose
    entry still has ``ai_label == 0``, set ``ai_label`` to 1 and ``label`` to 5.
    Entries are only looked up when the probability clears the threshold.

    Returns the number of entries that were changed.
    """
    flipped = 0
    for prob, oid in predictions:
        if not (prob > threshold):
            continue
        entry = index_label_dict[oid]
        if entry['ai_label'] == 0:
            entry['ai_label'] = 1
            entry['label'] = 5
            flipped += 1
    return flipped


############################################
# Iterative Process Setup
############################################

# Settings for the relabeling loop
num_iterations = 3                                       # train / relabel rounds
threshold = 0.9                                          # relabel when predicted probability exceeds this
binary_epochs = 5                                        # epochs per binary classifier
save_model_path_template = "binary_model_iter_{}.pth"    # filled with the iteration number

# Full-information view of the training split: (image, ai_label, label, original_idx)
train_full_dataset = CustomHFDataset(
    hf_dataset=dataset['train'],
    index_label_dict=index_label_dict,
    transform=transform,
    return_full=True,
)

def get_binary_dataset():
    """Build the (image, ai_label) view of the training split for the binary classifier."""
    view_flags = dict(return_full=False, test_flag=False, binary_flag=True)
    return CustomHFDataset(dataset['train'], index_label_dict, transform=transform, **view_flags)

############################################
# Iterative Training and Relabeling Loop
############################################

for iteration in range(1, num_iterations + 1):
    print(f"=== Iteration {iteration}/{num_iterations} ===")

    # Balanced AI / non-AI training subset drawn from the labels as they stand now
    binary_dataset = get_binary_dataset()
    balanced_binary_dataset = create_balanced_binary_dataset(binary_dataset, index_label_dict)
    binary_train_loader = DataLoader(balanced_binary_dataset, shuffle=True, batch_size=16)

    # New ResNet-18 each round (no pretrained weights) with a single sigmoid output
    binary_model = models.resnet18(weights=None)
    fc_in_features = binary_model.fc.in_features
    binary_model.fc = nn.Sequential(nn.Linear(fc_in_features, 1), nn.Sigmoid())
    binary_model = binary_model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(binary_model.parameters(), lr=0.0005)
    train_binary_classifier(
        binary_model, binary_train_loader, criterion, optimizer, epochs=binary_epochs
    )

    # Score the whole training split in a fixed order, then relabel confident non-AI rows
    binary_eval_loader = DataLoader(train_full_dataset, shuffle=False, batch_size=16)
    predictions = collect_predictions(binary_model, binary_eval_loader, index_label_dict)
    updated_count = update_ai_labels(index_label_dict, predictions, threshold)
    print(f"Iteration {iteration}: Relabeled {updated_count} images from non-AI to AI.")

    # Keep this round's classifier weights on disk
    checkpoint_path = save_model_path_template.format(iteration)
    torch.save(binary_model.state_dict(), checkpoint_path)
    print(f"Model weights saved for iteration {iteration}.")

print("Final relabeling complete.")


=== Iteration 1/3 ===


Binary Classifier Epoch 1/5:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch 1/5, Loss: 0.4773


Binary Classifier Epoch 2/5:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch 2/5, Loss: 0.3739


Binary Classifier Epoch 3/5:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch 3/5, Loss: 0.4121


Binary Classifier Epoch 4/5:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch 4/5, Loss: 0.3506


Binary Classifier Epoch 5/5:   0%|          | 0/50 [00:00<?, ?it/s]

Epoch 5/5, Loss: 0.3716
Iteration 1: Relabeled 43 images from non-AI to AI.
Model weights saved for iteration 1.
=== Iteration 2/3 ===


Binary Classifier Epoch 1/5:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 1/5, Loss: 0.4828


Binary Classifier Epoch 2/5:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 2/5, Loss: 0.4020


Binary Classifier Epoch 3/5:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 3/5, Loss: 0.4054


Binary Classifier Epoch 4/5:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 4/5, Loss: 0.3669


Binary Classifier Epoch 5/5:   0%|          | 0/56 [00:00<?, ?it/s]

Epoch 5/5, Loss: 0.3495
Iteration 2: Relabeled 189 images from non-AI to AI.
Model weights saved for iteration 2.
=== Iteration 3/3 ===


Binary Classifier Epoch 1/5:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 1/5, Loss: 0.4023


Binary Classifier Epoch 2/5:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 2/5, Loss: 0.2901


Binary Classifier Epoch 3/5:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 3/5, Loss: 0.2853


Binary Classifier Epoch 4/5:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 4/5, Loss: 0.2768


Binary Classifier Epoch 5/5:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 5/5, Loss: 0.2795
Iteration 3: Relabeled 64 images from non-AI to AI.
Model weights saved for iteration 3.
Final relabeling complete.


In [7]:
# Recalculate class counts and calculate class weights

num_classes = 6

# Tally how many training examples carry each class id after relabeling
class_counts = np.zeros(num_classes, dtype=int)
for info in index_label_dict.values():
    class_counts[info['label']] += 1

total_samples = class_counts.sum()
print("Updated class counts:", class_counts)
print("Total samples:", total_samples)

# Inverse-frequency weights: total / (num_classes * count) for each class
class_weights = total_samples / (num_classes * class_counts)
print("Class weights:", class_weights)


Updated class counts: [ 84 726 248 544 902 696]
Total samples: 3200
Class weights: [6.34920635 0.73461892 2.15053763 0.98039216 0.59127864 0.76628352]


In [8]:
def train_main_model(model, dataloader, criterion, optimizer, epochs=10, lrs=None):
    """Train the main classifier for ``epochs`` passes over ``dataloader``.

    Args:
        model: network to train; it is put into training mode.
        dataloader: yields ``(images, ai_labels, labels, original_idx)``
            batches; only ``images`` and ``labels`` are used.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
        epochs: number of passes over ``dataloader``.
        lrs: optional learning-rate scheduler, stepped once per epoch. When
            given, its current learning rate is printed after each step. When
            left as ``None`` the learning rate is not scheduled.

    Prints the mean batch loss after every epoch. Tensors are moved to the
    module-level ``device``.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for images, _ai_labels, labels, _ in tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}'):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # The scheduler is optional: its default is None, so calling
        # lrs.step() unconditionally would raise AttributeError.
        if lrs is not None:
            lrs.step()
            curr = lrs.get_last_lr()[0]
            print(curr)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(dataloader):.4f}")

In [9]:
############################################
# Step 6: Train Main Model with Updated Labels
############################################

from torch.optim.lr_scheduler import CosineAnnealingLR

# ResNet-50 without pretrained weights, with its final layer resized to the 6 classes
main_model = models.resnet50(weights=None)
num_classes = 6
main_model.fc = nn.Linear(main_model.fc.in_features, num_classes)
main_model = main_model.to(device)

# Re-create the training dataset with updated labels; return_full=True yields
# the 4-tuple batches that train_main_model unpacks
main_train_dataset = CustomHFDataset(dataset['train'], index_label_dict, transform=transform, return_full=True)
main_train_loader = DataLoader(main_train_dataset, batch_size=16, shuffle=True)

# Class-weighted cross-entropy using the weights computed from the updated class counts
main_criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float, device=device))
main_optimizer = optim.Adam(main_model.parameters(), lr=0.0001) #learning rate adjustment 10^-4

# One constant drives both the epoch count and the cosine schedule length, so
# the two cannot drift apart when the training length is changed.
main_epochs = 15
main_scheduler = CosineAnnealingLR(main_optimizer, T_max=main_epochs, eta_min=1e-6)

# Train the main model with updated pseudo-labels
train_main_model(main_model, main_train_loader, main_criterion, main_optimizer, epochs=main_epochs, lrs=main_scheduler)


Epoch 1/15:   0%|          | 0/200 [00:00<?, ?it/s]

9.891830623632339e-05
Epoch 1/15, Loss: 1.5963


Epoch 2/15:   0%|          | 0/200 [00:00<?, ?it/s]

9.572050015330874e-05
Epoch 2/15, Loss: 1.3981


Epoch 3/15:   0%|          | 0/200 [00:00<?, ?it/s]

9.05463412215599e-05
Epoch 3/15, Loss: 1.3275


Epoch 4/15:   0%|          | 0/200 [00:00<?, ?it/s]

8.362196501476348e-05
Epoch 4/15, Loss: 1.2571


Epoch 5/15:   0%|          | 0/200 [00:00<?, ?it/s]

7.525e-05
Epoch 5/15, Loss: 1.2374


Epoch 6/15:   0%|          | 0/200 [00:00<?, ?it/s]

6.57963412215599e-05
Epoch 6/15, Loss: 1.1184


Epoch 7/15:   0%|          | 0/200 [00:00<?, ?it/s]

5.567415893174885e-05
Epoch 7/15, Loss: 1.0730


Epoch 8/15:   0%|          | 0/200 [00:00<?, ?it/s]

4.532584106825116e-05
Epoch 8/15, Loss: 0.9936


Epoch 9/15:   0%|          | 0/200 [00:00<?, ?it/s]

3.5203658778440106e-05
Epoch 9/15, Loss: 0.9649


Epoch 10/15:   0%|          | 0/200 [00:00<?, ?it/s]

2.5750000000000013e-05
Epoch 10/15, Loss: 0.8186


Epoch 11/15:   0%|          | 0/200 [00:00<?, ?it/s]

1.7378034985236535e-05
Epoch 11/15, Loss: 0.7963


Epoch 12/15:   0%|          | 0/200 [00:00<?, ?it/s]

1.0453658778440107e-05
Epoch 12/15, Loss: 0.7219


Epoch 13/15:   0%|          | 0/200 [00:00<?, ?it/s]

5.279499846691251e-06
Epoch 13/15, Loss: 0.6753


Epoch 14/15:   0%|          | 0/200 [00:00<?, ?it/s]

2.0816937636766186e-06
Epoch 14/15, Loss: 0.6269


Epoch 15/15:   0%|          | 0/200 [00:00<?, ?it/s]

1e-06
Epoch 15/15, Loss: 0.5831


In [13]:
import pandas as pd

# Test split view: items are (image, idx tensor); iterated in dataset order
test_dataset = CustomHFDataset(
    hf_dataset=dataset['test'],
    index_label_dict=index_label_dict,
    transform=transform,
    test_flag=True,
)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)


def generate_predictions(model, test_loader, device, output_csv='xxx.csv'):
    """Predict a class for every test item and write the result to a CSV file.

    ``test_loader`` yields ``(images, indices)`` batches. For each batch the
    class with the highest model output along dim 1 is taken as the
    prediction. The resulting table has columns ``idx`` and
    ``predicted_label``, is sorted by ``idx``, and is written to
    ``output_csv`` without the DataFrame index.
    """
    model.eval()
    all_indices, all_preds = [], []

    with torch.no_grad():
        for images, indices in tqdm(test_loader, desc='Predicting'):
            outputs = model(images.to(device))
            preds = outputs.max(1).indices
            all_preds.extend(preds.cpu().numpy())
            all_indices.extend(indices.cpu().numpy())

    table = {'idx': all_indices, 'predicted_label': all_preds}
    df = pd.DataFrame(table).sort_values('idx').reset_index(drop=True)

    df.to_csv(output_csv, index=False)
    print(f"Predictions successfully saved to {output_csv}")

# Run inference on the test split with the trained main model and write the submission file
generate_predictions(
    model=main_model,
    test_loader=test_loader,
    device=device,
    output_csv='submission_56.csv',
)

Predicting:   0%|          | 0/25 [00:00<?, ?it/s]

Predictions successfully saved to submission_57.csv


In [11]:
# Save the trained main model's parameters (state dict only) to disk
main_model_state = main_model.state_dict()
torch.save(main_model_state, "main_model.pth")