In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import MNIST, FashionMNIST
import copy
from tqdm import tqdm
import os

# ===== CHANGED: Load pre-trained watermarked models instead of clean models =====
# Previously: Started with clean baseline models
# Now: Load models that already have watermarks embedded during training

def load_watermarked_models(model_dir='./models', map_location=None):
    """Load the two pre-trained watermarked models saved by the training phase.

    Args:
        model_dir: Directory containing the checkpoints. Defaults to
            ``./models``, the location this notebook has always used.
        map_location: Forwarded to ``torch.load``. When left as ``None`` and
            no CUDA device is available, tensors are remapped to the CPU so a
            checkpoint saved on a GPU machine can still be opened.

    Returns:
        Tuple ``(watermarked_model_mnist, watermarked_model_fashion)``.

    Raises:
        FileNotFoundError: If either checkpoint file is missing.
    """
    if map_location is None and not torch.cuda.is_available():
        # A CUDA-saved checkpoint cannot be restored on a CPU-only host
        # unless its storages are remapped.
        map_location = 'cpu'

    def _load_checkpoint(filename):
        checkpoint_path = os.path.join(model_dir, filename)
        # NOTE(security): these files hold fully pickled nn.Module objects, so
        # loading them executes arbitrary pickle code. Only open checkpoints
        # produced by your own training run.
        # weights_only=False is passed explicitly because recent PyTorch
        # releases default to weights_only=True, which rejects whole-module
        # pickles.
        return torch.load(checkpoint_path, map_location=map_location, weights_only=False)

    # These should be the models saved after watermark embedding in notebook 02_
    watermarked_model_mnist = _load_checkpoint('MNIST_SN_finetuned_baseline.pth')
    watermarked_model_fashion = _load_checkpoint('FMNIST_SN_finetuned_baseline.pth')

    return watermarked_model_mnist, watermarked_model_fashion

# Load the watermarked models.
# load_watermarked_models() returns (MNIST model, FashionMNIST model) in that
# order; both names are notebook globals consumed by the attack-execution cell.
watermarked_modelMNIST, watermarked_modelFashionMNIST = load_watermarked_models()

# Status line only; it prints unconditionally once the loads above return.
print("✓ Loaded pre-trained watermarked models for attack evaluation")


✓ Loaded pre-trained watermarked models for attack evaluation


In [3]:
# ===== CHANGED: Use clean datasets for attacks, not trigger sets =====
# Previously: Used trigger sets for fine-tuning
# Now: Use original clean datasets to simulate realistic attack scenarios

# Preprocessing for the clean attack data: resize to 224x224, replicate the
# single grey channel to three channels, convert to a tensor and normalise.
# NOTE(review): the mean/std values match the usual ImageNet statistics and are
# presumably the same ones used when the watermarked models were trained —
# confirm against the training notebook.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),  # exact target height and width
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Mini-batch size shared by both clean loaders
bsize = 64

# MNIST: clean training split (no watermark integration) and its shuffled loader
clean_dsMNIST = MNIST(
    root='./data/raw/MNIST',
    train=True,
    download=True,
    transform=transform,
)
clean_trainloaderMNIST = DataLoader(dataset=clean_dsMNIST, batch_size=bsize, shuffle=True)

# FashionMNIST: clean training split (no watermark integration) and its shuffled loader
clean_dsFashionMNIST = FashionMNIST(
    root='./data/raw/FashionMNIST',
    train=True,
    download=True,
    transform=transform,
)
clean_trainloaderFashionMNIST = DataLoader(dataset=clean_dsFashionMNIST, batch_size=bsize, shuffle=True)

print("✓ Prepared clean datasets for watermark removal attacks")


✓ Prepared clean datasets for watermark removal attacks


In [7]:


def _select_attack_device():
    """Return the CUDA device when available (CPU otherwise) and print the choice."""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    return device


def _set_requires_grad(module, enabled):
    """Switch gradient tracking on or off for every parameter of ``module``."""
    for param in module.parameters():
        param.requires_grad = enabled


def _reinitialize_weights(module):
    """Re-initialise every sub-layer of ``module`` that exposes a ``weight`` attribute.

    Weights with two or more dimensions get Xavier-uniform values and their
    bias (if any) is zeroed. ``xavier_uniform_`` cannot compute fan-in/fan-out
    for tensors with fewer than two dimensions, so layers with a 1-D or absent
    weight (e.g. normalisation layers) are reset through their own
    ``reset_parameters()`` instead of crashing.
    """
    for layer in module.modules():
        if not hasattr(layer, 'weight'):
            continue
        weight = layer.weight
        if isinstance(weight, torch.Tensor) and weight.dim() >= 2:
            nn.init.xavier_uniform_(weight)
            bias = getattr(layer, 'bias', None)
            if bias is not None:
                nn.init.zeros_(bias)
        elif callable(getattr(layer, 'reset_parameters', None)):
            layer.reset_parameters()


def _train_on_clean_data(model, clean_dataloader, optimizer, device, num_epochs,
                         attack_name, scheduler=None, frozen_modules=()):
    """Run the supervised training loop shared by all four attacks.

    Args:
        model: Model to train; already on ``device``.
        clean_dataloader: Iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimiser holding the parameters that should be updated.
        device: Device every batch is moved to.
        num_epochs: Number of passes over ``clean_dataloader``.
        attack_name: Short tag shown in the progress-bar description.
        scheduler: Optional LR scheduler stepped once per epoch.
        frozen_modules: Sub-modules kept in eval mode during training so that
            layers such as BatchNorm/Dropout inside them do not change.

    Returns:
        The trained ``model`` (same object).
    """
    criterion = nn.CrossEntropyLoss()

    model.train()
    # requires_grad=False does not stop BatchNorm running statistics from
    # updating in train mode; eval mode is what actually freezes those layers.
    for frozen in frozen_modules:
        frozen.eval()

    for epoch in range(num_epochs):
        correct = 0
        total = 0

        loop = tqdm(clean_dataloader, desc=f"{attack_name} Attack Epoch {epoch+1}/{num_epochs}")
        for inputs, labels in loop:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Progress bar shows the last batch loss and the running epoch accuracy.
            loop.set_postfix(loss=loss.item(), acc=100.*correct/total)

        if scheduler is not None:
            scheduler.step()

    return model


def attack_ftll(watermarked_model, clean_dataloader, num_epochs=10, lr=0.01):
    """
    Fine-Tune Last Layer (FTLL) Attack
    Attempts to remove watermarks by only modifying the classification head.

    Works on a deep copy, so ``watermarked_model`` is left untouched.
    ``model.features`` is frozen (no gradients, eval mode) and every parameter
    of ``model.classifier`` is trained with SGD (momentum 0.9).

    Args:
        watermarked_model: Model exposing ``features`` and ``classifier`` sub-modules.
        clean_dataloader: Iterable yielding ``(inputs, labels)`` batches of clean data.
        num_epochs: Number of fine-tuning epochs.
        lr: SGD learning rate.

    Returns:
        The attacked copy of the model.
    """
    model = copy.deepcopy(watermarked_model)
    device = _select_attack_device()
    model.to(device)

    _set_requires_grad(model.features, False)
    _set_requires_grad(model.classifier, True)

    optimizer = optim.SGD(model.classifier.parameters(), lr=lr, momentum=0.9)

    return _train_on_clean_data(
        model, clean_dataloader, optimizer, device, num_epochs, "FTLL",
        frozen_modules=(model.features,),
    )


def attack_ftal(watermarked_model, clean_dataloader, num_epochs=15, lr=0.01):
    """
    Fine-Tune All Layers (FTAL) Attack
    Attempts to overwrite watermark patterns by fine-tuning every parameter.

    Works on a deep copy, so ``watermarked_model`` is left untouched. All
    parameters are trained with SGD (momentum 0.9, weight decay 1e-4) and the
    learning rate is halved every 5 epochs.

    Args:
        watermarked_model: Model to attack.
        clean_dataloader: Iterable yielding ``(inputs, labels)`` batches of clean data.
        num_epochs: Number of fine-tuning epochs.
        lr: Initial SGD learning rate.

    Returns:
        The attacked copy of the model.
    """
    model = copy.deepcopy(watermarked_model)
    device = _select_attack_device()
    model.to(device)

    _set_requires_grad(model, True)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    return _train_on_clean_data(
        model, clean_dataloader, optimizer, device, num_epochs, "FTAL",
        scheduler=scheduler,
    )


def attack_rtll(watermarked_model, clean_dataloader, num_epochs=10, lr=0.01):
    """
    Retrain Last Layer (RTLL) Attack
    Reinitializes the classification head before training - more aggressive than FTLL.

    Works on a deep copy, so ``watermarked_model`` is left untouched.
    ``model.features`` is frozen (no gradients, eval mode); the layers of
    ``model.classifier`` are re-initialised and then trained with SGD
    (momentum 0.9).

    Args:
        watermarked_model: Model exposing ``features`` and ``classifier`` sub-modules.
        clean_dataloader: Iterable yielding ``(inputs, labels)`` batches of clean data.
        num_epochs: Number of retraining epochs.
        lr: SGD learning rate.

    Returns:
        The attacked copy of the model.
    """
    model = copy.deepcopy(watermarked_model)
    device = _select_attack_device()
    model.to(device)

    _set_requires_grad(model.features, False)

    # .modules() is used inside the helper, so the head may be a Sequential
    # or a single layer.
    _reinitialize_weights(model.classifier)
    _set_requires_grad(model.classifier, True)

    optimizer = optim.SGD(model.classifier.parameters(), lr=lr, momentum=0.9)

    return _train_on_clean_data(
        model, clean_dataloader, optimizer, device, num_epochs, "RTLL",
        frozen_modules=(model.features,),
    )


def attack_rtal(watermarked_model, clean_dataloader, num_epochs=20, lr=0.01):
    """
    Retrain All Layers (RTAL) Attack
    Complete model reinitialization and retraining - most aggressive attack.

    Works on a deep copy, so ``watermarked_model`` is left untouched. Every
    layer with a ``weight`` is re-initialised, then all parameters are trained
    with SGD (momentum 0.9, weight decay 1e-4) and the learning rate is halved
    every 7 epochs.

    Args:
        watermarked_model: Model to attack.
        clean_dataloader: Iterable yielding ``(inputs, labels)`` batches of clean data.
        num_epochs: Number of retraining epochs.
        lr: Initial SGD learning rate.

    Returns:
        The attacked copy of the model.
    """
    model = copy.deepcopy(watermarked_model)
    device = _select_attack_device()
    model.to(device)

    _reinitialize_weights(model)
    _set_requires_grad(model, True)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.5)

    return _train_on_clean_data(
        model, clean_dataloader, optimizer, device, num_epochs, "RTAL",
        scheduler=scheduler,
    )


In [8]:


def execute_attacks(watermarked_model, clean_dataloader, dataset_name, attacks=('FTLL',)):
    """Run the selected watermark-removal attacks on a watermarked model.

    Each attack function works on its own deep copy of ``watermarked_model``,
    so the input model is never modified and no state snapshot or restore is
    needed between attacks.

    Args:
        watermarked_model: The watermarked model to attack.
        clean_dataloader: Clean-data loader handed to every attack.
        dataset_name: Label used in the progress header.
        attacks: Names of the attacks to run, in order. Valid names are
            ``'FTLL'``, ``'FTAL'``, ``'RTLL'`` and ``'RTAL'``. Defaults to
            ``('FTLL',)``, which is what this notebook has been running; pass
            all four names to run the full suite.

    Returns:
        Dict mapping each requested attack name to its attacked model.

    Raises:
        ValueError: If ``attacks`` contains an unknown attack name.
    """
    registry = {
        'FTLL': ("Fine-Tune Last Layer", attack_ftll),
        'FTAL': ("Fine-Tune All Layers", attack_ftal),
        'RTLL': ("Retrain Last Layer", attack_rtll),
        'RTAL': ("Retrain All Layers", attack_rtal),
    }

    # Accept a single name as a convenience instead of iterating its characters.
    if isinstance(attacks, str):
        attacks = (attacks,)

    unknown = [name for name in attacks if name not in registry]
    if unknown:
        raise ValueError(
            f"Unknown attack(s) {unknown}; expected a subset of {list(registry)}"
        )

    print(f"\n=== Executing attacks on {dataset_name} watermarked model ===")

    attacked_models = {}
    for position, name in enumerate(attacks, start=1):
        title, attack_fn = registry[name]
        print(f"\n{position}. {title} ({name}) Attack")
        attacked_models[name] = attack_fn(watermarked_model, clean_dataloader)

    return attacked_models

# Execute attacks on both datasets.
# Each call returns a dict keyed by attack name; as execute_attacks is
# currently configured, only the 'FTLL' entry is produced.
# The captured output below shows roughly 1m40s per epoch on CUDA.
attacked_models_mnist = execute_attacks(watermarked_modelMNIST, clean_trainloaderMNIST, "MNIST")
attacked_models_fashion = execute_attacks(watermarked_modelFashionMNIST, clean_trainloaderFashionMNIST, "FashionMNIST")



=== Executing attacks on MNIST watermarked model ===

1. Fine-Tune Last Layer (FTLL) Attack
cuda


FTLL Attack Epoch 1/10: 100%|██████████| 938/938 [01:42<00:00,  9.14it/s, acc=97, loss=0.477]     
FTLL Attack Epoch 2/10: 100%|██████████| 938/938 [01:39<00:00,  9.43it/s, acc=98, loss=2.03e-5]   
FTLL Attack Epoch 3/10: 100%|██████████| 938/938 [01:39<00:00,  9.46it/s, acc=98.1, loss=2.05e-7] 
FTLL Attack Epoch 4/10: 100%|██████████| 938/938 [01:36<00:00,  9.70it/s, acc=98, loss=0.219]     
FTLL Attack Epoch 5/10: 100%|██████████| 938/938 [01:39<00:00,  9.47it/s, acc=98.1, loss=2.72e-7] 
FTLL Attack Epoch 6/10: 100%|██████████| 938/938 [01:37<00:00,  9.62it/s, acc=98.1, loss=0.0051]  
FTLL Attack Epoch 7/10: 100%|██████████| 938/938 [01:34<00:00,  9.91it/s, acc=98.1, loss=1.23]    
FTLL Attack Epoch 8/10: 100%|██████████| 938/938 [01:39<00:00,  9.39it/s, acc=98, loss=1.21]      
FTLL Attack Epoch 9/10: 100%|██████████| 938/938 [01:40<00:00,  9.35it/s, acc=97.9, loss=0.00153] 
FTLL Attack Epoch 10/10: 100%|██████████| 938/938 [01:42<00:00,  9.14it/s, acc=98.1, loss=2.24e-8] 



=== Executing attacks on FashionMNIST watermarked model ===

1. Fine-Tune Last Layer (FTLL) Attack
cuda


FTLL Attack Epoch 1/10: 100%|██████████| 938/938 [01:39<00:00,  9.46it/s, acc=88.5, loss=0.703] 
FTLL Attack Epoch 2/10: 100%|██████████| 938/938 [01:41<00:00,  9.24it/s, acc=89.5, loss=0.756] 
FTLL Attack Epoch 3/10: 100%|██████████| 938/938 [01:36<00:00,  9.69it/s, acc=89.7, loss=0.244] 
FTLL Attack Epoch 4/10: 100%|██████████| 938/938 [01:39<00:00,  9.43it/s, acc=89.9, loss=0.147] 
FTLL Attack Epoch 5/10: 100%|██████████| 938/938 [01:33<00:00, 10.03it/s, acc=90, loss=0.252]   
FTLL Attack Epoch 6/10: 100%|██████████| 938/938 [01:41<00:00,  9.26it/s, acc=90.2, loss=0.512] 
FTLL Attack Epoch 7/10: 100%|██████████| 938/938 [01:36<00:00,  9.73it/s, acc=90, loss=0.352]   
FTLL Attack Epoch 8/10: 100%|██████████| 938/938 [01:37<00:00,  9.66it/s, acc=90.1, loss=0.282] 
FTLL Attack Epoch 9/10: 100%|██████████| 938/938 [01:39<00:00,  9.45it/s, acc=90.2, loss=0.214] 
FTLL Attack Epoch 10/10: 100%|██████████| 938/938 [01:39<00:00,  9.44it/s, acc=90.2, loss=0.309] 


In [9]:


def save_attacked_models(attacked_models, dataset_name):
    """Write every attacked model to ``./models/attacked/`` for later evaluation.

    Args:
        attacked_models: Dict mapping attack name to the attacked model.
        dataset_name: Dataset label; lower-cased to form the file-name prefix.

    Each model is stored whole (``torch.save`` on the module object) under
    ``<dataset>_<attack>_attacked.pth`` and a confirmation line is printed.
    """
    # Create the output directory on first use; a no-op when it already exists.
    os.makedirs('./models/attacked/', exist_ok=True)

    for attack_type in attacked_models:
        attacked = attacked_models[attack_type]
        model_path = f'./models/attacked/{dataset_name.lower()}_{attack_type.lower()}_attacked.pth'
        torch.save(attacked, model_path)
        print(f"✓ Saved {attack_type} attacked model: {model_path}")

# Save all attacked models.
# Files land in ./models/attacked/ as <dataset>_<attack>_attacked.pth, one per
# entry in each dict returned by execute_attacks.
save_attacked_models(attacked_models_mnist, "MNIST")
save_attacked_models(attacked_models_fashion, "FashionMNIST")

# Status lines only; they print regardless of how many attacks were actually run.
print("\n✓ All watermark removal attacks completed and models saved")
print("✓ Ready for evaluation in notebook 04_")


✓ Saved FTLL attacked model: ./models/attacked/mnist_ftll_attacked.pth
✓ Saved FTLL attacked model: ./models/attacked/fashionmnist_ftll_attacked.pth

✓ All watermark removal attacks completed and models saved
✓ Ready for evaluation in notebook 04_
