In [28]:
import zipfile
from PIL import Image
import io
import torch
from torchvision import transforms
from tqdm import tqdm  # For progress bar
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from sklearn.model_selection import train_test_split


In [None]:
# The Nus wide dataset is loaded from kaggle, the structure is funky, so ... 
zip_path = r'C:\Users\Test\Desktop\P7_Data\Nus_Wide\archive.zip'
image_txt_file = 'NUS-WIDE/database_img.txt'
label_txt_file = "NUS-WIDE/database_label.txt"

# Step 1: Read the image paths from database_img.txt
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    with zip_ref.open(image_txt_file) as file:
        image_paths = [line.decode('utf-8').strip() for line in file]
# Step 1: Read the image paths from database_img.txt
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    with zip_ref.open(label_txt_file) as file:
        labels = [line.decode('utf-8').strip() for line in file]

# Ensure image_paths and labels match in length (for simplicity)
assert len(image_paths) == len(labels), "Mismatch between number of images and labels" # This would be  a problem 


In [None]:
df = pd.DataFrame(list(zip(labels, image_paths)),columns = ["labels","img_path"]) 
df_1 = df[df['labels'].apply(lambda x: len(x) == 1)] # Only keep images with single label (multi label out of scope!)

In [None]:

images_folder_in_zip = "NUS-WIDE/"

# The classic transform! 
transform = transforms.Compose([
    transforms.Resize((224, 224)),             # Resize to 224x224
    transforms.ToTensor(),                     # Convert to a PyTorch tensor and scale to [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

In [None]:
# Extract the filtered image paths and their corresponding labels
image_paths_filtered = df_1['img_path'].tolist()
labels_filtered = df_1['labels'].tolist()  
image_label_tuples = []

with zipfile.ZipFile(zip_path, 'r') as zip_ref: # Preprocess the relevant images, heavy computation, but should work with 32 gb RAM. 
    # List all files in the 'images' folder inside the zip
    image_files = [f for f in zip_ref.namelist() if f.startswith(images_folder_in_zip) and f.endswith('.jpg')]
    for image_file in tqdm(image_files, desc="Processing images"):
        # Check if the current image is in the filtered list
        img_path = image_file.replace(images_folder_in_zip, "")  # Remove folder part for matching
        if img_path in image_paths_filtered: # Check if relevant image, (1 label only)
            idx = image_paths_filtered.index(img_path)
            label = labels_filtered[idx][0]  # Since the labels are stored as lists with one element
            with zip_ref.open(image_file) as img_file:
                # Open image, apply transformations
                img = Image.open(io.BytesIO(img_file.read())).convert("RGB")  # Convert to RGB
                img_tensor = transform(img)  # Transform to tensor and normalize
                
                # Append a tuple (image_tensor, label) to the list
                image_label_tuples.append((img_tensor, label))  

Processing images: 100%|██████████| 269648/269648 [06:57<00:00, 646.63it/s]


In [None]:
# Adam the doofus had string labels, so he makes it integer based
unique_labels = set(label for _, label in image_label_tuples)  # Extract unique string labels
label_to_int = {label: idx for idx, label in enumerate(unique_labels)}  # Map labels to integers
image_label_tuples_int = [(img_tensor, label_to_int[label]) for img_tensor, label in image_label_tuples]


In [None]:
# ved ikke om denne class er vigtig
class ImageLabelDataset(Dataset):
    """Custom Dataset for loading images and their labels."""
    
    def __init__(self, image_label_tuples):
        self.image_label_tuples = image_label_tuples
    
    def __len__(self):
        return len(self.image_label_tuples)
    
    def __getitem__(self, idx):
        image_tensor, label = self.image_label_tuples[idx]
        return image_tensor, label

batch_size = 128
dataset = ImageLabelDataset(image_label_tuples_int)  # image_label_tuples_int is a list of (image_tensor, int_label)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Function to extract features
def extract_features(dataloader, model, device):
    features = []
    labels = []

    with torch.no_grad():  # Disable gradient computation for inference
        # Wrap the DataLoader with tqdm for progress bar
        for inputs, targets in tqdm(dataloader, desc="Extracting features", unit="batch"):
            inputs = inputs.to(device)

            # Forward pass through the VGG-16 model
            outputs = model(inputs)

            # Flatten the output from conv layers to (batch_size, feature_dim)
            outputs = outputs.view(outputs.size(0), -1)

            features.append(outputs.cpu())  # Move to CPU and store
            labels.append(targets)

    # Concatenate all features and labels
    features = torch.cat(features)
    labels = torch.cat(labels)

    return features, labels

# Example usage:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg16 = models.vgg16(pretrained=True).to(device)

# Modify VGG-16 for feature extraction (keep only the convolutional layers)
vgg16.classifier = vgg16.classifier[:4]  # Remove fully connected layers
vgg16.eval()  # Set to evaluation mode

# Extract features using the dataloader and the pre-trained VGG-16 model
features, labels = extract_features(dataloader, vgg16, device)

# Print the shape of the extracted features
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")


Extracting features: 100%|██████████| 636/636 [05:41<00:00,  1.86batch/s]


Features shape: torch.Size([81314, 4096])
Labels shape: torch.Size([81314])


In [26]:
features = features.cpu().detach().numpy()
labels = labels.cpu().detach().numpy()

In [29]:

# Step 1: Split into train and test (75% train, 16.67% test)
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.1667, stratify=labels, random_state=42
)

# Step 2: Split the train set into train and validation (75% train, 8.33% validation)
train_features, val_features, train_labels, val_labels = train_test_split(
    train_features, train_labels, test_size=0.1111, stratify=train_labels, random_state=42
)

# Check for the stratisfied sampling

In [31]:
train_features.shape

(60230, 4096)

In [32]:
test_features.shape

(13556, 4096)

In [33]:
val_features.shape

(7528, 4096)

In [None]:
pd.Series(test_labels).value_counts() # Kig på det her, Det er repræsentativt !!!! 

1    6117
0    3174
8    1264
2     721
7     627
4     433
3     422
9     403
6     370
5      25
Name: count, dtype: int64

In [None]:
pd.Series(labels).value_counts() # Ift denne basis, 

1    36690
0    19036
8     7585
2     4327
7     3759
4     2597
3     2532
9     2420
6     2220
5      148
Name: count, dtype: int64

In [40]:
import numpy as np

# Saving the files

In [43]:
r"""
np.save(r'C:\Users\Test\Desktop\P7_Data\Nus_Wide_preprocessed\train_features_vgg16_NUSWIDE.npy', train_features)
np.save(r'C:\Users\Test\Desktop\P7_Data\Nus_Wide_preprocessed\test_features_vgg16_NUSWIDE.npy', test_features)
np.save(r'C:\Users\Test\Desktop\P7_Data\Nus_Wide_preprocessed\train_labels_vgg16_NUSWIDE.npy', train_labels)
np.save(r'C:\Users\Test\Desktop\P7_Data\Nus_Wide_preprocessed\test_labels_vgg16_NUSWIDE.npy', test_labels)
np.save(r'C:\Users\Test\Desktop\P7_Data\Nus_Wide_preprocessed\val_features_vgg16_NUSWIDE.npy', val_features)
np.save(r'C:\Users\Test\Desktop\P7_Data\Nus_Wide_preprocessed\val_labels_vgg16_NUSWIDE.npy', val_labels)
"""

"\nnp.save(r'C:\\Users\\Test\\Desktop\\P7_Data\\Nus_Wide_preprocessed\\train_features_vgg16_NUSWIDE.npy', train_features)\nnp.save(r'C:\\Users\\Test\\Desktop\\P7_Data\\Nus_Wide_preprocessed\\test_features_vgg16_NUSWIDE.npy', test_features)\nnp.save(r'C:\\Users\\Test\\Desktop\\P7_Data\\Nus_Wide_preprocessed\\train_labels_vgg16_NUSWIDE.npy', train_labels)\nnp.save(r'C:\\Users\\Test\\Desktop\\P7_Data\\Nus_Wide_preprocessed\\test_labels_vgg16_NUSWIDE.npy', test_labels)\nnp.save(r'C:\\Users\\Test\\Desktop\\P7_Data\\Nus_Wide_preprocessed\\val_features_vgg16_NUSWIDE.npy', val_features)\nnp.save(r'C:\\Users\\Test\\Desktop\\P7_Data\\Nus_Wide_preprocessed\\val_labels_vgg16_NUSWIDE.npy', val_labels)\n"

In [27]:
features

array([[-1.5032818 , -0.7674331 , -2.7721295 , ..., -0.9128053 ,
        -0.6340852 , -0.38517237],
       [-2.2796998 , -1.3048555 , -2.2760553 , ..., -0.13423966,
         1.7606338 , -1.4704621 ],
       [-1.1428131 , -0.8343891 , -1.9038177 , ..., -1.6542923 ,
         0.2152445 ,  0.02428141],
       ...,
       [-0.46369258, -0.972303  , -2.1136277 , ..., -1.2902718 ,
        -0.21800174,  0.01005483],
       [-1.9904902 , -1.0046502 , -1.3107908 , ..., -1.8287351 ,
         1.3140104 , -0.9812664 ],
       [-2.1479685 , -2.970149  , -1.8542986 , ..., -2.571137  ,
        -1.0085739 ,  0.6855662 ]], dtype=float32)

In [22]:
features[0]

tensor([-1.5033, -0.7674, -2.7721,  ..., -0.9128, -0.6341, -0.3852])

In [23]:
len(labels)

193734

# Proof of multiclass, and class imbalance 

In [24]:
one_class = 0
zero_class = 0
multi_class = 0 
one_label = []
for i in range(len(labels)):
    if len(labels[i])==1:
        one_class += 1 
        one_label.append(labels[i])
    elif len(labels[i]) == 0: 
        zero_class += 1 
    else:
        multi_class += 1 


In [26]:
import pandas as pd

In [27]:
z = pd.Series(one_label)

In [28]:
z.value_counts()

2    36690
4    19036
0     7585
7     4327
3     3759
1     2597
5     2532
6     2420
8     2220
9      148
Name: count, dtype: int64

In [17]:
one_class

81314

In [None]:
multi_class