In [None]:
!pip install tqdm
!pip install sentence-transformers
!pip install annoy

In [72]:
import os
import torch
import torch.nn as nn
import torchvision.models as models
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex

In [None]:
# Build the encoder for images using ResNet50
resnet50 = models.resnet50(pretrained=True)
feature_extractor = nn.Sequential(*(list(resnet50.children())[:-1])) # remove fc layer used for classification
feature_extractor.eval()

In [74]:
# Define a custom dataset
class ImageDataset(Dataset):
    def __init__(self, image_folder, transform):
        self.image_folder = image_folder
        self.image_paths = [os.path.join(image_folder, f) for f in os.listdir(image_folder)]
        self.transform = transform
    def __len__(self):
        return len(self.image_paths)
    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path)
        return self.transform(image), img_path # return (image, path)

In [None]:
# Define constants
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
batch_size=32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_extractor.to(device)

In [76]:
def embed_image_dataset(image_folder, save_to_file=False, filename=""):
    dataset = ImageDataset(image_folder, transform)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    latent_representations = {}
    with torch.no_grad():
        for images, paths in tqdm(dataloader, desc="Processing Images", unit='batch'):
            images = images.to(device) # Output: [batch_size, 3, 224, 224]
            features = feature_extractor(images).squeeze() # Output: [batch_size, 2048]
            for path, feature in zip(paths, features.cpu()):
                latent_representations[path] = feature.numpy()
    if save_to_file:
        np.save(filename, latent_representations)

    return latent_representations

In [None]:
# load the fashion dataset and compute embeddings
fd_image_folder = "./fashion-dataset/images"
fd_lat_rep = embed_image_dataset(fd_iamge_folder, True, "lat_rep_fd_nft.npy") # save embeddings to a file

In [None]:
# load the wardrobe dataset and compute embeddings
wardrobe_folder = "./sample-wardrobe"
wardrobe_lat_rep = embed_image_dataset(wardrobe_folder)

In [78]:
# create list of fashion dataset embeddings and paths
latent_fd_images = np.load("lat_rep_fd_nft.npy", allow_pickle=True).item()
fd_img_paths = list(latent_fd_images.keys())
fd_features = np.array(list(latent_fd_images.values())) # Output: (44441, 2048)

# create list of wardrobe embeddings and paths
wardrobe_paths = list(wardrobe_lat_rep.keys())
wardrobe_features = np.array(list(wardrobe_lat_rep.values()))

# get the mean embedding of all items in wardrobe
mean_embedding = np.mean(wardrobe_features, axis=0)

In [79]:
# Perform Annoy
embedding_dim = 2048  # Original dimensionality
annoy_index = AnnoyIndex(embedding_dim, metric='euclidean')

# Add all items to Annoy index
for i, embedding in enumerate(fd_features):
    annoy_index.add_item(i, embedding)

# Build the index
n_trees = 50
annoy_index.build(n_trees)  # Number of trees

# Query the index
n_neighbors = 10
indices = annoy_index.get_nns_by_vector(mean_embedding, n_neighbors, include_distances=True)

print("Recommended indices:", indices[0])
for idx in indices[0]:
    print(fd_img_paths[idx])

Recommended indices: [22758, 25579, 6241, 7707, 34351, 38216, 26620, 19066, 5721, 41925]
./fashion-dataset/images/13326.jpg
./fashion-dataset/images/13325.jpg
./fashion-dataset/images/34044.jpg
./fashion-dataset/images/34045.jpg
./fashion-dataset/images/31090.jpg
./fashion-dataset/images/40131.jpg
./fashion-dataset/images/36205.jpg
./fashion-dataset/images/35890.jpg
./fashion-dataset/images/50717.jpg
./fashion-dataset/images/52122.jpg


In [80]:
im = Image.open("./fashion-dataset/images/52122.jpg")
im.show()

In [None]:
# TODO: fine tune resnet50 model on fashion dataset

In [48]:
# Build the encoder for metadata

In [59]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
metadata = ["Nike", "Nike Pegasus 40 White/Black"] # dim = d
embedding = model.encode(metadata) # shape = [d x 384]
print(embedding.shape)

(2, 384)


In [None]:
# concat embeddings

In [None]:
# perform ANNOY on weighted embedding X' and inventory embeddings Y_1, ... Y_n

In [None]:
# output recommendations