In [None]:
!pip install tqdm
!pip install sentence-transformers
!pip install annoy
!pip install torchsummary

In [141]:
import os
import torch
import torch.nn as nn
import torchvision.models as models
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
from sklearn.model_selection import train_test_split
from torchsummary import summary

In [142]:
# Define a custom dataset
class ImageDataset(Dataset):
    def __init__(self, image_paths, transform):
        self.image_paths = image_paths
        self.transform = transform
    def __len__(self):
        return len(self.image_paths)
    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path)
        return self.transform(image), img_path # return (image, path)

In [143]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        resnet50 = models.resnet50(pretrained=True)
        self.encoder = nn.Sequential(*(list(resnet50.children())[:-1])) # remove fc layer used for classification

        # freeze layers up to 3 to retain information learned from pretrained weights
        for name, layer in self.encoder.named_children():
            if name in['0', '1', '2', '3']:
                for param in layer.parameters():
                    param.requires_grad = False
            
    def forward(self, x):
        latent = self.encoder(x).view(x.size(0), -1)
        return latent

class Decoder(nn.Module):
    def __init__(self, latent_dim=2048):
        super(Decoder, self).__init__()
        self.decoder = nn.Sequential(
            # Fully connected layer to expand the latent vector
            nn.Linear(latent_dim, 8 * 8 * 256),  # 8x8 spatial dimension and 256 channels
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(8 * 8 * 256),
            
            # Reshape to (B, 256, 8, 8) via view
            nn.Unflatten(1, (256, 8, 8)),
            
            # Upsampling layers (transpose convolutions)
            nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=2),    # 8x8 -> 14x14
            nn.ReLU(inplace=True),
            
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),     # 14x14 -> 28x28
            nn.ReLU(inplace=True),
            
            nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2, padding=1),      # 28x28 -> 56x56
            nn.ReLU(inplace=True),
            
            nn.ConvTranspose2d(32, 16, kernel_size=4, stride=2, padding=1),      # 56x56 -> 112x112
            nn.ReLU(inplace=True),
            
            nn.ConvTranspose2d(16, 3, kernel_size=4, stride=2, padding=1),       # 112x112 -> 224x224
            nn.Sigmoid()  # Scaling the output to [0, 1] for RGB images
        )
    
    def forward(self, x):
        return self.decoder(x)

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(latent_dim=2048)

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return latent, reconstructed

In [144]:
decoder = Autoencoder().decoder
summary(decoder, (2048,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 16384]      33,570,816
              ReLU-2                [-1, 16384]               0
       BatchNorm1d-3                [-1, 16384]          32,768
         Unflatten-4            [-1, 256, 8, 8]               0
   ConvTranspose2d-5          [-1, 128, 14, 14]         524,416
              ReLU-6          [-1, 128, 14, 14]               0
   ConvTranspose2d-7           [-1, 64, 28, 28]         131,136
              ReLU-8           [-1, 64, 28, 28]               0
   ConvTranspose2d-9           [-1, 32, 56, 56]          32,800
             ReLU-10           [-1, 32, 56, 56]               0
  ConvTranspose2d-11         [-1, 16, 112, 112]           8,208
             ReLU-12         [-1, 16, 112, 112]               0
  ConvTranspose2d-13          [-1, 3, 224, 224]             771
          Sigmoid-14          [-1, 3, 2

In [145]:
# Define constants
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
batch_size=32
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [146]:
model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

# Train, val, test splits (60, 20, 20)
image_folder = "./fashion-dataset/images"
image_paths = [os.path.join(image_folder, fname) for fname in os.listdir(image_folder)]

train_val_paths, test_paths = train_test_split(image_paths, test_size=0.2)
train_paths, val_paths = train_test_split(train_val_paths, test_size=0.25)

train_dataset = ImageDataset(train_paths, transform)
val_dataset = ImageDataset(val_paths, transform)
test_dataset = ImageDataset(test_paths, transform)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [160]:
# Training loop
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    # Define the mean and std for un-normalization (match your transform normalization)
    mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1).to(device)
    std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1).to(device)
    
    for images, paths in tqdm(dataloader, desc="Training loop", unit='batch', leave=True):
        images = images.to(device)
        optimizer.zero_grad()
        latent, reconstructed = model(images)

        # Un-normalize the input images
        unnormalized_images = images * std + mean
        
        loss = criterion(reconstructed, unnormalized_images)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

epochs = 30
for epoch in range(epochs):
    train_loss = train_epoch(model, train_dataloader, criterion, optimizer, device)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}")

Training loop: 100%|████████████████████████████████████████████████████| 834/834 [1:14:48<00:00,  5.38s/batch]


Epoch 1/30, Loss: 0.0083


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [39:34<00:00,  2.85s/batch]


Epoch 2/30, Loss: 0.0080


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [40:37<00:00,  2.92s/batch]


Epoch 3/30, Loss: 0.0077


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [40:38<00:00,  2.92s/batch]


Epoch 4/30, Loss: 0.0075


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [40:49<00:00,  2.94s/batch]


Epoch 5/30, Loss: 0.0073


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [40:46<00:00,  2.93s/batch]


Epoch 6/30, Loss: 0.0069


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [40:35<00:00,  2.92s/batch]


Epoch 7/30, Loss: 0.0067


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [40:38<00:00,  2.92s/batch]


Epoch 8/30, Loss: 0.0065


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [39:49<00:00,  2.87s/batch]


Epoch 9/30, Loss: 0.0063


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [40:40<00:00,  2.93s/batch]


Epoch 10/30, Loss: 0.0061


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [39:50<00:00,  2.87s/batch]


Epoch 11/30, Loss: 0.0060


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [41:07<00:00,  2.96s/batch]


Epoch 12/30, Loss: 0.0059


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [40:11<00:00,  2.89s/batch]


Epoch 13/30, Loss: 0.0058


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [39:43<00:00,  2.86s/batch]


Epoch 14/30, Loss: 0.0057


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [39:45<00:00,  2.86s/batch]


Epoch 15/30, Loss: 0.0056


Training loop: 100%|████████████████████████████████████████████████████| 834/834 [1:01:57<00:00,  4.46s/batch]


Epoch 16/30, Loss: 0.0055


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [57:28<00:00,  4.13s/batch]


Epoch 17/30, Loss: 0.0054


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [22:16<00:00,  1.60s/batch]


Epoch 18/30, Loss: 0.0053


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [37:49<00:00,  2.72s/batch]


Epoch 19/30, Loss: 0.0052


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [41:04<00:00,  2.95s/batch]


Epoch 20/30, Loss: 0.0052


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [41:50<00:00,  3.01s/batch]


Epoch 21/30, Loss: 0.0051


Training loop: 100%|██████████████████████████████████████████████████████| 834/834 [26:02<00:00,  1.87s/batch]


Epoch 22/30, Loss: 0.0050


Training loop:   0%|▎                                                       | 4/834 [00:06<22:11,  1.60s/batch]


KeyboardInterrupt: 

In [161]:
torch.save(model.state_dict(), "resnet50_autoencoder.pth")

In [163]:
def evaluate_model_loss(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    # Define the mean and std for un-normalization (match your transform normalization)
    mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1).to(device)
    std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1).to(device)
    for images, paths in tqdm(dataloader, desc="Evaluation loop", unit='batch', leave=True):
        images = images.to(device)
        latent, reconstructed = model(images)

        # Un-normalize the input images
        unnormalized_images = images * std + mean
        
        loss = criterion(reconstructed, unnormalized_images)
        running_loss += loss.item()
    return running_loss / len(dataloader)

val_loss = evaluate_model_loss(model, val_dataloader, criterion, device)
print(val_loss)

Evaluation loop: 100%|████████████████████████████████████████████████████| 278/278 [07:08<00:00,  1.54s/batch]


0.01054813314845665


In [164]:
def embed_image_dataset(dataloader, model, device, save_to_file=False, filename=""):
    latent_representations = {}
    model.to(device)
    model.eval()
    with torch.no_grad():
        for images, paths in tqdm(dataloader, desc="Processing Images", unit='batch'):
            images = images.to(device) # Output: [batch_size, 3, 224, 224]
            features = model(images)[0].squeeze() # Output: [batch_size, 2048]
            for path, feature in zip(paths, features.cpu()):
                latent_representations[path] = feature.numpy()
    if save_to_file:
        np.save(filename, latent_representations)

    return latent_representations

In [165]:
# load the fashion dataset and compute embeddings
# NOTE: without a GPU, this cell could take hours to finish
fd_image_folder = "./fashion-dataset/images"
image_paths = [os.path.join(fd_image_folder, fname) for fname in os.listdir(fd_image_folder)]
fd_dataset = ImageDataset(image_paths, transform)
fd_dataloader = DataLoader(fd_dataset, batch_size=batch_size, shuffle=False)
fd_lat_rep = embed_image_dataset(fd_dataloader, model, device, True, "lat_rep_fd_ft.npy") # save embeddings to a file

Processing Images: 100%|████████████████████████████████████████████████| 1389/1389 [24:57<00:00,  1.08s/batch]


In [166]:
# load the wardrobe dataset and compute embeddings
wardrobe_folder = "./sample-wardrobe/images"
wardrobe_paths = [os.path.join(wardrobe_folder, fname) for fname in os.listdir(wardrobe_folder)]
wardrobe_dataset = ImageDataset(wardrobe_paths, transform)
wardrobe_dataloader = DataLoader(wardrobe_dataset, batch_size=batch_size, shuffle=False)
wardrobe_lat_rep = embed_image_dataset(wardrobe_dataloader, model, device)

Processing Images: 100%|██████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.64s/batch]


In [167]:
# create list of fashion dataset embeddings and paths
latent_fd_images = np.load("lat_rep_fd_ft.npy", allow_pickle=True).item()
fd_img_paths = list(latent_fd_images.keys())
fd_features = np.array(list(latent_fd_images.values())) # Output: (44441, 2048)

# create list of wardrobe embeddings and paths
wardrobe_paths = list(wardrobe_lat_rep.keys())
wardrobe_features = np.array(list(wardrobe_lat_rep.values()))

# get the mean embedding of all items in wardrobe
mean_embedding = np.mean(wardrobe_features, axis=0)

mean_embedding = wardrobe_features[0]
print(wardrobe_paths[0])

./sample-wardrobe/lelabo.jpg


In [168]:
# Perform Annoy
embedding_dim = 2048  # Original dimensionality
annoy_index = AnnoyIndex(embedding_dim, metric='euclidean')

# Add all items to Annoy index
for i, embedding in enumerate(fd_features):
    annoy_index.add_item(i, embedding)

# Build the index
n_trees = 50
annoy_index.build(n_trees)  # Number of trees

# Query the index
n_neighbors = 10
indices = annoy_index.get_nns_by_vector(mean_embedding, n_neighbors, include_distances=True)

print("Recommended indices:", indices[0])
for idx in indices[0]:
    # print(fd_img_paths[idx])
    im = Image.open(fd_img_paths[idx])
    im.show()

Recommended indices: [25122, 16514, 16893, 42606, 7033, 42873, 27679, 19385, 5440, 22821]


In [None]:
# Encoder for metadata

In [170]:
from sentence_transformers import SentenceTransformer

In [171]:
model = SentenceTransformer("all-MiniLM-L6-v2")
metadata = ["Nike", "Nike Pegasus 40 White/Black"] # dim = d
embedding = model.encode(metadata) # shape = [d x 384]
print(embedding.shape)

(2, 384)


In [None]:
# concat embeddings

In [None]:
# perform ANNOY on weighted embedding X' and inventory embeddings Y_1, ... Y_n

In [None]:
# output recommendations