In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.datasets import CIFAR10
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Subset, TensorDataset
import random
import numpy as np
from sklearn.decomposition import PCA
from typing import Tuple
from tqdm import tqdm
from scipy.spatial import distance_matrix
import pandas as pd

In [2]:
root = "/Users/xiaoy/OneDrive/Desktop/P7/p7 project/data/HPO_Validering_3/CIFAR/"
# load data
X_hpo = torch.tensor(np.load(root + "X_hpo_Cifar.npy" ))                       
y_hpo = np.load(root + "y_hpo_CIfar.npy" )

X_valid = torch.tensor( np.load(root + "X_val_Cifar.npy" ) )
y_valid = np.load(root + "y_val_Cifar.npy")

X_hpo_tensor = torch.tensor(X_hpo, dtype=torch.long)
y_hpo_tensor = torch.tensor(y_hpo, dtype=torch.long)

X_valid_tensor = torch.tensor(X_valid, dtype=torch.long)
y_valid_tensor = torch.tensor(y_valid, dtype=torch.long)

hpo_dataset = TensorDataset(X_hpo_tensor, y_hpo_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)

hpo_loader = DataLoader(hpo_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=True)


  X_hpo_tensor = torch.tensor(X_hpo, dtype=torch.long)
  X_valid_tensor = torch.tensor(X_valid, dtype=torch.long)


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
losses = []
maps = []

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SUBIC_encoder(nn.Module): 
    def __init__(self, input_size=4096, bits=48, num_classes=10, num_blocks=8, block_size=6):
        super(SUBIC_encoder, self).__init__()
       
        assert bits % num_blocks == 0, "Bits must be divisible by num_blocks"

        self.input_size = input_size
        self.bits = bits 
        self.num_blocks = num_blocks
        self.block_size = block_size
        
        # Define the encoder structure
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 256), 
            nn.ReLU(),
            nn.Linear(256, bits)
        )  # Outputs binary feature vectors
        
        self.fc3 = nn.Linear(bits, num_classes)  # Logits for num_classes
    
    def block_softmax(self, x):
        
        batch_size = x.shape[0]
        block_size = x.shape[1] // self.num_blocks
        
        # Ensure that x has the expected shape
        assert x.shape[1] == self.bits, f"Expected shape [batch_size, {self.bits}], got {x.shape}"
        
        # Reshape and apply softmax
        x = x.view(batch_size, self.num_blocks, block_size)
        x = F.softmax(x, dim=-1) 
        return x.view(batch_size, -1) #-1 refers to the value that will match the original elements 
    
    def block_one_hot(self, x):
        batch_size = x.shape[0]

        x = x.view(batch_size, self.num_blocks, self.block_size)
        max_indices = x.argmax(dim=-1, keepdim=True)
        
        # Create one-hot encoding
        one_hot = torch.zeros_like(x).scatter_(-1, max_indices, 1)

        return one_hot.view(batch_size, self.bits)
    
    def forward(self, x, use_one_hot=False):
        # Ensure x is a flat tensor before passing to encoder
        batch_size = x.shape[0]
        x = x.view(batch_size, -1)  # Flatten if necessary

        z = self.encoder(x)

        if use_one_hot:
            binary_codes = self.block_one_hot(z)
        else:
            binary_codes = self.block_softmax(z)

        class_probs = F.softmax(self.fc3(binary_codes), dim=-1) 

        return class_probs, binary_codes


In [4]:
from itertools import product # create cartesian products for all params

bits_12_param_grid = {
'bits':12,
'num_blocks':
[3, 4],
'gamma':[0.2, 0.6, 0.8],
'mu':[0.2, 0.6, 0.8],
'learning_rate': [0.005, 0.01]
}

bits_24_param_grid = {
'bits': 24,
'num_blocks':
[3, 6, 12],
'gamma':[0.2, 0.6, 0.8],
'mu':[0.2, 0.6, 0.8],
'learning_rate': [0.005, 0.01]
}

bits_32_param_grid = {
'bits':32,
'num_blocks':
[4, 8, 16],
'gamma':[0.2, 0.6, 0.8],
'mu':[0.2, 0.6, 0.8],
'learning_rate': [0.005, 0.01]
}

bits_48_param_grid = {
'bits':48,
'num_blocks':
[6, 12, 24],
'gamma':[0.2, 0.6, 0.8],
'mu':[0.2, 0.6, 0.8],
'learning_rate': [0.005, 0.01]
}

def generate_combinations(param_grid):
    keys = list(param_grid.keys())
    values = [v if isinstance(v, list) else [v] for v in param_grid.values()] # make list of dicts, if value is a list then take value from the list if not then take that scalar value
    gamma_idx = keys.index("gamma")
    mu_idx = keys.index("mu")
    gamma_mu_pairs = list(zip(values[gamma_idx], values[mu_idx]))
    combined_values = values[:gamma_idx] + [gamma_mu_pairs] + values[mu_idx+1:]
    combined_keys = keys[:gamma_idx] + ["gamma_mu"] + keys[mu_idx+1:]
    comb = list(product(*combined_values))
    param_combinations_dicts = [
        {
            **dict(zip(combined_keys, comb_item)),
            'gamma': comb_item[combined_keys.index('gamma_mu')][0],
            'mu': comb_item[combined_keys.index('gamma_mu')][1]
        } for comb_item in comb
    ]
    for comb_dict in param_combinations_dicts:
        del comb_dict['gamma_mu']
    
    return param_combinations_dicts

In [5]:
def one_hot_encode(a):
    
    if isinstance(a, torch.Tensor):
         a = a.cpu().numpy()
    b = np.zeros((a.size, a.max() + 1))
    b[np.arange(a.size), a] = 1
    
    return b
def meanAveragePrecision(test_hashes, training_hashes, test_labels, training_labels):
    aps = []
    if len(training_labels.shape) == 1:
        training_labels = one_hot_encode(training_labels)
        test_labels = one_hot_encode(test_labels)
    for i, test_hash in enumerate(tqdm(test_hashes)):
        label = test_labels[i]
        distances = np.abs(training_hashes - test_hashes[i]).sum(axis=1)
        tp = np.where((training_labels*label).sum(axis=1)>0, 1, 0)
        hash_df = pd.DataFrame({"distances":distances, "tp":tp}).reset_index()
        hash_df = hash_df.sort_values(["distances", "index"]).reset_index(drop=True)
        hash_df = hash_df.drop(["index", "distances"], axis=1).reset_index()
        hash_df = hash_df[hash_df["tp"]==1]
        hash_df["tp"] = hash_df["tp"].cumsum()
        hash_df["index"] = hash_df["index"] +1 
        precision = np.array(hash_df["tp"]) / np.array(hash_df["index"])
        ap = precision.mean()
        aps.append(ap)
    
    return np.array(aps).mean()

In [6]:
def entropy(p):
    entropy_result = -torch.sum(p * torch.log2(p + 1e-30), dim=-1)
    return entropy_result

def cross_entropy(class_prob, target):
    if len(target.shape) >1:
        s = (class_prob*target).mean(axis=1)
    else:
        s = class_prob[torch.arange(len(target)), target]
    return -torch.log2(s)/torch.log2(torch.FloatTensor([class_prob.shape[1]]))

def compute_total_loss(class_probs, target, binary_codes, num_blocks, block_size, gamma=0.5, mu=0.5):
    """
    Computes the total loss, which includes:
    - Cross-entropy classification loss
    - Mean entropy loss (encouraging one-hot encoding within each block)
    - Batch entropy loss (encouraging uniform distribution across blocks)
    
    Parameters:
    - class_probs: The class probabilities from the classification layer.
    - target: The true labels.
    - binary_codes: The binary codes generated by the encoder.
    - num_blocks: The number of blocks in the binary codes.
    - block_size: The size of each block in the binary codes.
    - gamma: Weight for the mean entropy loss.
    - mu: Weight for the batch entropy loss.
    
    """

    classification_loss = cross_entropy(class_probs, target)

    batch_size = binary_codes.shape[0]
    binary_codes = binary_codes.view(batch_size, num_blocks, block_size) #used in structure encoding

    #Mean Entropy Loss (encourages each block to resemble a one-hot vector) using softmax binary code
    mean_entropy_loss = entropy(binary_codes).mean(dim=1)

    #Batch Entropy Loss (encourages uniform distribution across blocks)
    average_support = binary_codes.mean(dim=0)  
    batch_entropy_loss = entropy(average_support).mean(dim=0)

    #Combine losses with weights gamma and mu
    entropy_loss = (gamma * mean_entropy_loss - mu * batch_entropy_loss)/torch.log2(torch.FloatTensor([block_size]))
    total_loss = (classification_loss + entropy_loss).mean()
    
    
    return total_loss

#logits, binary_codes = model(X_train, use_one_hot=False)
#loss = compute_total_loss(logits, y_train_tensor, binary_codes, num_blocks=8, block_size=4, gamma=0.5, mu=0.05)

In [7]:
def loss_function(epochs, bits, num_blocks, block_size, gamma, mu, learning_rate):
    # Initialize model
    model = SUBIC_encoder(
        bits=bits,
        input_size=X_hpo.shape[1],
        num_classes=10,
        num_blocks=num_blocks,
        block_size=block_size
    )
    model.to(device)  # Move model to the specified device
    
    # Initialize optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Track losses
    losses = []

    for epoch in range(epochs):
        model.train()  # Set model to training mode
        total_loss = 0.0

        for images, labels in hpo_loader:
            images, labels = images.to(device), labels.to(device)
            images, labels = images.to(torch.float), labels.to(torch.long)

            # Forward pass
            class_probs, binary_codes = model(images, use_one_hot=False)

            # Compute loss
            loss = compute_total_loss(
                class_probs, labels, binary_codes, num_blocks, block_size, gamma, mu
            )

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Accumulate loss
            total_loss += loss.item()

        # Compute average loss for the epoch
        avg_loss = total_loss / len(hpo_loader)
        losses.append(avg_loss)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    return model, losses

def map_on_call(model):
    
    model.eval()
    all_query_codes, all_query_labels = [], []
    all_db_codes, all_db_labels = [], []

    with torch.no_grad():
        for images, labels in valid_loader:
            images, labels = images.to(device), labels.to(device)
            images, labels = images.to(torch.float), labels.to(torch.long)

            _, binary_codes = model(images, use_one_hot=True)

            # Ensure binary_codes is a tensor
            if not isinstance(binary_codes, torch.Tensor):
                raise TypeError("Expected binary_codes to be a tensor.")
            
            all_db_codes.append(binary_codes)
            all_db_labels.append(labels)
            if len(all_query_codes) == 0:  
                all_query_codes.append(binary_codes.clone())  
                all_query_labels.append(labels.clone())

    # Concatenate all tensors
    all_query_codes = torch.cat(all_query_codes, dim=0)
    all_query_labels = torch.cat(all_query_labels, dim=0)
    all_db_codes = torch.cat(all_db_codes, dim=0)
    all_db_labels = torch.cat(all_db_labels, dim=0)

    # Calculate MAP Score
    map_score = meanAveragePrecision(
        all_query_codes,
        all_db_codes,
        all_query_labels,
        all_db_labels
        )

    return map_score 
        

In [8]:
bits_12_param_dicts = generate_combinations(bits_12_param_grid) 
bits_24_param_dicts = generate_combinations(bits_24_param_grid) 
bits_32_param_dicts = generate_combinations(bits_32_param_grid) 
bits_48_param_dicts = generate_combinations(bits_48_param_grid) 

In [None]:
results = []
epochs = 100
map_results = {}

# iterate over each combination, train the model and evaluate its performance 
# initailize model with current hyperparameters

def hyper_tuning(param_grid, epochs, hpo_train, hpo_loader, device):
    results = {}  # To store MAP scores for each hyperparameter set
    best_map_score = 0  # Initialize with a very low value
    best_params = None  # To store the best parameter set

    for params in param_grid:
        # Extract parameters from the current set
        bits = params['bits']
        num_blocks = params['num_blocks']
        block_size = bits // num_blocks
        gamma = params['gamma']
        mu = params['mu']
        learning_rate = params['learning_rate']

        print(f"Testing hyperparameters: {params}")

        # Initialize model with the current hyperparameters
        model = SUBIC_encoder(bits=bits, input_size= hpo_train.shape[1], num_classes=10, num_blocks=num_blocks, block_size=block_size)
        model.to(device) 

        # Train the model with the current parameter set
        trained_model, _ = loss_function(epochs, bits, num_blocks, block_size, gamma, mu, learning_rate)

        # Evaluate the model using MAP score
        map_score = map_on_call(trained_model)
        print(f"MAP score: {map_score:.4f}")

        # Store the MAP score for the current parameter set
        results[tuple(params.items())] = map_score

        if map_score > best_map_score:
            best_map_score = map_score
            best_params = params
            
    return best_params, best_map_score, results




In [None]:
hyper_tuning(param_grid=bits_32_param_dicts, epochs=100, hpo_train=X_hpo, hpo_loader=hpo_loader, device=device)

Testing hyperparameters: {'bits': 12, 'num_blocks': 3, 'learning_rate': 0.005, 'gamma': 0.2, 'mu': 0.2}
Epoch 1/100, Loss: 0.8455
Epoch 2/100, Loss: 0.7225
Epoch 3/100, Loss: 0.6825
Epoch 4/100, Loss: 0.6424
Epoch 5/100, Loss: 0.6512
Epoch 6/100, Loss: 0.6363
Epoch 7/100, Loss: 0.6138
Epoch 8/100, Loss: 0.6132
Epoch 9/100, Loss: 0.5881
Epoch 10/100, Loss: 0.6076
Epoch 11/100, Loss: 0.6635
Epoch 12/100, Loss: 0.6004
Epoch 13/100, Loss: 0.6803
Epoch 14/100, Loss: 0.5850
Epoch 15/100, Loss: 0.5699
Epoch 16/100, Loss: 0.5828
Epoch 17/100, Loss: 0.5598
Epoch 18/100, Loss: 0.5740
Epoch 19/100, Loss: 0.5706
Epoch 20/100, Loss: 0.5604
Epoch 21/100, Loss: 0.5802
Epoch 22/100, Loss: 0.5657
Epoch 23/100, Loss: 0.5478
Epoch 24/100, Loss: 0.5577
Epoch 25/100, Loss: 0.5642
Epoch 26/100, Loss: 0.5351
Epoch 27/100, Loss: 0.5377
Epoch 28/100, Loss: 0.5669
Epoch 29/100, Loss: 0.5009
Epoch 30/100, Loss: 0.5226
Epoch 31/100, Loss: 0.5095
Epoch 32/100, Loss: 0.5163
Epoch 33/100, Loss: 0.4610
Epoch 34/100, 

KeyboardInterrupt: 

In [None]:
hyper_tuning(param_grid=bits_48_param_dicts, epochs=100, hpo_train=X_hpo, hpo_loader=hpo_loader, device=device)