# This is v2 of the recommender system.

To improve on v1, we fine-tuned the ResNet50 model by training it as an autoencoder that encodes and reconstructs images from the fashion dataset (https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-dataset). In addition to embedding images, we also embedded each product's metadata (name, brand) and factored that information into the recommendations.

In [None]:
!pip install tqdm
!pip install sentence-transformers
!pip install annoy
!pip install torchsummary

In [None]:
import os
import torch
import torch.nn as nn
import torchvision.models as models
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
from sklearn.model_selection import train_test_split
from torchsummary import summary

In [None]:
# Define a custom dataset
class ImageDataset(Dataset):
    """Map-style dataset that opens images from disk and applies a transform.

    Each item is a ``(transformed_image, image_path)`` pair; the path is
    returned with the image so callers can key results by source file.
    """

    def __init__(self, image_paths, transform):
        """Store the image locations and the transform applied on access.

        Args:
            image_paths: Sequence of paths to image files.
            transform: Callable applied to each opened PIL image.
        """
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        """Return the number of image paths in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(transform(image), path)`` for the image at position ``idx``."""
        img_path = self.image_paths[idx]
        # Image.open is lazy and keeps the file open; the context manager guarantees the
        # handle is released even if the transform raises.
        with Image.open(img_path) as image:
            # Read the pixel data now so the image stays usable after the file is closed.
            image.load()
            return self.transform(image), img_path  # return (image, path)

In [None]:
class Encoder(nn.Module):
    """ResNet50 backbone that maps a batch of images to flat latent vectors.

    The ImageNet-pretrained ResNet50 is used without its final fully connected
    (classification) layer, and the remaining output is flattened to one
    vector per image.
    """

    def __init__(self):
        super().__init__()
        # `pretrained=True` is deprecated since torchvision 0.13. IMAGENET1K_V1 is the
        # weight set that flag selected, so the starting weights are unchanged.
        resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        # remove fc layer used for classification
        self.encoder = nn.Sequential(*list(resnet50.children())[:-1])

        # Freeze the first four children ('0'..'3') to retain what the pretrained weights
        # learned. In torchvision's ResNet these are the stem (conv1, bn1, relu, maxpool),
        # not the residual stages layer1..layer3; everything after them stays trainable.
        # NOTE: requires_grad=False only stops weight updates; BatchNorm running statistics
        # in the frozen layers still change while the module is in train mode.
        for name, layer in self.encoder.named_children():
            if name in ('0', '1', '2', '3'):
                for param in layer.parameters():
                    param.requires_grad = False

    def forward(self, x):
        """Encode ``x`` and return a 2-D tensor with one flattened row per sample."""
        return torch.flatten(self.encoder(x), start_dim=1)

class Decoder(nn.Module):
    """Expand a flat latent vector into a 3x224x224 image with values in [0, 1].

    A fully connected layer lifts the latent vector to a 256-channel 8x8
    feature map, which five transposed convolutions upsample step by step
    (8 -> 14 -> 28 -> 56 -> 112 -> 224) before a sigmoid bounds the output.
    """

    def __init__(self, latent_dim=2048):
        super().__init__()
        seed_channels, seed_size = 256, 8
        flat_features = seed_size * seed_size * seed_channels

        layers = [
            # Project the latent vector, then reshape it to (B, 256, 8, 8)
            nn.Linear(latent_dim, flat_features),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(flat_features),
            nn.Unflatten(1, (seed_channels, seed_size, seed_size)),
            # padding=2 on the first upsampling step yields 8x8 -> 14x14
            nn.ConvTranspose2d(seed_channels, 128, kernel_size=4, stride=2, padding=2),
            nn.ReLU(inplace=True),
        ]

        # Each of these steps doubles the spatial size: 14 -> 28 -> 56 -> 112
        for in_channels, out_channels in ((128, 64), (64, 32), (32, 16)):
            layers.append(
                nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1)
            )
            layers.append(nn.ReLU(inplace=True))

        # Final doubling to 224x224 with 3 channels; sigmoid scales the output to [0, 1]
        layers.append(nn.ConvTranspose2d(16, 3, kernel_size=4, stride=2, padding=1))
        layers.append(nn.Sigmoid())

        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        """Decode a batch of latent vectors into reconstructed images."""
        return self.decoder(x)

class Autoencoder(nn.Module):
    """Chain the Encoder and Decoder into a single reconstruction model."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(latent_dim=2048)

    def forward(self, x):
        """Return ``(latent, reconstructed)`` for the input batch ``x``."""
        code = self.encoder(x)
        return code, self.decoder(code)

In [None]:
# Print a layer-by-layer summary of the decoder for a 2048-dimensional latent input.
# Building a bare Decoder avoids instantiating (and possibly downloading) the pretrained
# ResNet50 encoder only to throw it away.
decoder = Decoder(latent_dim=2048)
# torchsummary defaults to device="cuda"; this freshly built module lives on the CPU.
summary(decoder, (2048,), device="cpu")

In [None]:
# Define constants
def convert_to_rgb(image):
    """Return ``image`` in RGB mode.

    An image that is already RGB is returned as-is (the same object); any
    other mode, such as RGBA or grayscale, is converted via ``convert("RGB")``.
    """
    return image if image.mode == "RGB" else image.convert("RGB")

# Preprocessing shared by every dataset: force RGB, resize, centre-crop to 224x224,
# convert to a tensor, then normalise each channel with the mean/std given below.
transform = transforms.Compose([
    transforms.Lambda(convert_to_rgb),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
batch_size = 32

# Pick the fastest available backend: CUDA first, then Apple-silicon MPS, else the CPU.
# (Previously a CUDA machine silently fell back to the CPU.)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

In [None]:
model = Autoencoder().to(device)
criterion = nn.MSELoss()
# Only parameters that are still trainable (requires_grad=True) are handed to Adam.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

# Train, val, test splits (60, 20, 20)
image_folder = "./fashion-dataset/images"
# os.listdir returns entries in arbitrary order, so sort for a stable listing, and skip
# hidden files (e.g. macOS ".DS_Store") that PIL cannot open.
image_paths = sorted(
    os.path.join(image_folder, fname)
    for fname in os.listdir(image_folder)
    if not fname.startswith(".")
)

# A fixed seed keeps the split identical across sessions, so a checkpoint that is reloaded
# later is validated/tested on images it was not trained on.
split_seed = 42
train_val_paths, test_paths = train_test_split(image_paths, test_size=0.2, random_state=split_seed)
# 25% of the remaining 80% is the 20% validation share.
train_paths, val_paths = train_test_split(train_val_paths, test_size=0.25, random_state=split_seed)

train_dataset = ImageDataset(train_paths, transform)
val_dataset = ImageDataset(val_paths, transform)
test_dataset = ImageDataset(test_paths, transform)

# Only the training loader is shuffled; evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Training loop
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    ``model`` must return ``(latent, reconstructed)``. The reconstruction is
    compared with the un-normalised input images using ``criterion``.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()

    # Per-channel statistics matching the transform's Normalize step, shaped
    # (1, 3, 1, 1) so they broadcast over a batch of images.
    channel_mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1)
    channel_std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1)

    total_loss = 0.0
    for images, _ in tqdm(dataloader, desc="Training loop", unit='batch', leave=True):
        images = images.to(device)
        # Undo the input normalisation to obtain the reconstruction targets
        targets = images * channel_std + channel_mean

        optimizer.zero_grad()
        _, reconstructed = model(images)
        batch_loss = criterion(reconstructed, targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    return total_loss / len(dataloader)

# Fine-tune for a fixed number of epochs, reporting the mean batch loss after each one.
epochs = 30
for epoch in range(epochs):
    train_loss = train_epoch(
        model=model,
        dataloader=train_dataloader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}")

In [None]:
# Persist only the state dict (parameters and buffers), not the module object itself.
torch.save(obj=model.state_dict(), f="resnet50_autoencoder.pth")

In [None]:
# Rebuild the architecture, then restore the fine-tuned weights from the checkpoint.
model = Autoencoder().to(device)
# map_location remaps tensors that were saved on a different device (e.g. a checkpoint
# written on a GPU/MPS machine and reloaded on a CPU-only one) onto the current `device`.
state_dict = torch.load("resnet50_autoencoder.pth", map_location=device, weights_only=True)
model.load_state_dict(state_dict)

In [None]:
def evaluate_model_loss(model, dataloader, criterion, device):
    """Return the mean batch reconstruction loss of ``model`` over ``dataloader``.

    The model is put in eval mode and must return ``(latent, reconstructed)``.
    The reconstruction is compared with the un-normalised input images.

    Args:
        model: Autoencoder-style module to evaluate.
        dataloader: Iterable yielding ``(images, paths)`` batches.
        criterion: Loss called as ``criterion(reconstructed, target)``.
        device: Device the batches are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    running_loss = 0.0
    # Define the mean and std for un-normalization (match your transform normalization)
    mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1)
    # Evaluation needs no gradients. Without no_grad every forward pass builds an autograd
    # graph, wasting memory and time.
    with torch.no_grad():
        for images, _ in tqdm(dataloader, desc="Evaluation loop", unit='batch', leave=True):
            images = images.to(device)
            _, reconstructed = model(images)

            # Un-normalize the input images
            unnormalized_images = images * std + mean

            loss = criterion(reconstructed, unnormalized_images)
            running_loss += loss.item()
    return running_loss / len(dataloader)

# Mean reconstruction loss of the current `model` on the validation split.
val_loss = evaluate_model_loss(
    model=model,
    dataloader=val_dataloader,
    criterion=criterion,
    device=device,
)
print(val_loss)

In [None]:
def embed_image_dataset(dataloader, model, device, save_to_file=False, filename=""):
    """Encode every image yielded by ``dataloader`` and map its path to its embedding.

    Args:
        dataloader: Iterable yielding ``(images, paths)`` batches.
        model: Module whose forward pass returns a tuple with the latent batch first.
        device: Device the model and batches are moved to.
        save_to_file: When true, also write the mapping to ``filename`` with ``np.save``.
        filename: Destination passed to ``np.save``; only used when ``save_to_file`` is true.

    Returns:
        A dict mapping each image path to a 1-D numpy embedding.
    """
    latent_representations = {}
    model.to(device)
    model.eval()
    with torch.no_grad():
        for images, paths in tqdm(dataloader, desc="Processing Images", unit='batch'):
            images = images.to(device)  # [batch_size, 3, 224, 224]
            # Flatten per sample rather than squeeze(): squeeze() also removes the batch
            # axis of a single-image batch (e.g. a trailing batch of size 1), which made
            # zip() below pair the path with one scalar instead of the whole embedding.
            features = model(images)[0].flatten(start_dim=1)  # [batch_size, latent_dim]
            for path, feature in zip(paths, features.cpu()):
                latent_representations[path] = feature.numpy()
    if save_to_file:
        # The dict is stored as a pickled object array; np.load needs allow_pickle=True.
        np.save(filename, latent_representations)

    return latent_representations

In [None]:
# load the wardrobe dataset and compute embeddings
wardrobe_folder = "./sample-wardrobe/images"
# Sort for a stable order (os.listdir order is arbitrary) and skip hidden files such as
# macOS ".DS_Store", which are not images and have no metadata row.
wardrobe_paths = sorted(
    os.path.join(wardrobe_folder, fname)
    for fname in os.listdir(wardrobe_folder)
    if not fname.startswith(".")
)
wardrobe_dataset = ImageDataset(wardrobe_paths, transform)
wardrobe_dataloader = DataLoader(wardrobe_dataset, batch_size=batch_size, shuffle=False)
wardrobe_lat_rep = embed_image_dataset(wardrobe_dataloader, model, device)

In [None]:
# Embed the inventory images and cache the result on disk.
inventory_folder = "./nordstrom-data/images"
# Sort for a stable order (os.listdir order is arbitrary), which also makes the item ids
# of any index built from these embeddings reproducible. Hidden files such as macOS
# ".DS_Store" are skipped because PIL cannot open them.
inventory_paths = sorted(
    os.path.join(inventory_folder, fname)
    for fname in os.listdir(inventory_folder)
    if not fname.startswith(".")
)
inventory_dataset = ImageDataset(inventory_paths, transform)
inventory_dataloader = DataLoader(inventory_dataset, batch_size=batch_size, shuffle=False)
inventory_lat_rep = embed_image_dataset(
    inventory_dataloader,
    model,
    device,
    save_to_file=True,
    filename="lat_rep_inventory_ft.npy",
)

In [None]:
# Reload the cached inventory image embeddings and split them into parallel
# path / feature collections. The file stores a pickled dict, hence
# allow_pickle=True and .item(); only load files this notebook produced.
inventory_lat_rep = np.load("lat_rep_inventory_ft.npy", allow_pickle=True).item()
inventory_img_paths = list(inventory_lat_rep)
inventory_features = np.array([inventory_lat_rep[p] for p in inventory_img_paths])

# Same layout for the wardrobe embeddings computed above
wardrobe_paths = list(wardrobe_lat_rep)
wardrobe_features = np.array([wardrobe_lat_rep[p] for p in wardrobe_paths])

# The query vector is the average of all wardrobe item embeddings
mean_embedding = wardrobe_features.mean(axis=0)

In [None]:
# Perform Annoy
# Take the dimensionality from the data instead of hard-coding it, so the index always
# matches the embeddings that are actually being indexed.
embedding_dim = inventory_features.shape[1]
annoy_index = AnnoyIndex(embedding_dim, metric='euclidean')

# Add all items to Annoy index; item id i corresponds to inventory_img_paths[i]
for i, embedding in enumerate(inventory_features):
    annoy_index.add_item(i, embedding)

# Build the index
n_trees = 50
annoy_index.build(n_trees)  # Number of trees

# Query the index with the mean wardrobe embedding
n_neighbors = 10
# With include_distances=True the result is an (ids, distances) pair
indices = annoy_index.get_nns_by_vector(mean_embedding, n_neighbors, include_distances=True)
neighbor_ids, neighbor_distances = indices

print("Recommended indices:", neighbor_ids)
for idx in neighbor_ids:
    # Close each file once its image has been handed to the viewer
    with Image.open(inventory_img_paths[idx]) as im:
        im.show()

In [None]:
# Encoder for metadata

In [None]:
from sentence_transformers import SentenceTransformer
import json
import csv

In [None]:
# Sentence-embedding model used to encode product metadata.
# NOTE(review): this rebinds `model`, which earlier cells use for the image autoencoder.
# Re-running an image-embedding cell after this one would pass the text model instead.
sentence_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(sentence_model_name)
# Example: a list of d strings is encoded to an array of shape [d x 384]
# metadata = ["Nike", "Nike Pegasus 40 White/Black"] # d = 2
# embedding = model.encode(metadata) # shape = [2 x 384]

In [None]:
# Embed the (brand, name) metadata of every inventory product, keyed by image path.
inventory_metadata_rep = {}
# JSON text is UTF-8; state it explicitly instead of relying on the platform default
# encoding, which would mis-decode non-ASCII product names on some systems.
with open('./nordstrom-data/nordstrom_data.json', 'r', encoding='utf-8') as f:
    formatted_products = json.load(f)

for path in tqdm(inventory_img_paths):
    # Products are looked up by image file name
    base_path = os.path.basename(path)
    product = formatted_products[base_path]
    metadata = [product['brand'], product['name']]
    # encode() yields one vector per string; flatten joins the brand and name vectors
    inventory_metadata_rep[path] = np.array(model.encode(metadata)).flatten()

# Stored as a pickled dict; np.load needs allow_pickle=True to read it back
np.save("lat_rep_inventory_metadata.npy", inventory_metadata_rep)

In [None]:
# Embed the (brand, name) metadata of every wardrobe item, keyed by image path.
wardrobe_metadata_rep = {}
wardrobe_products = {}
# The csv module expects files opened with newline="" so quoted fields containing line
# breaks are parsed correctly; the encoding is explicit rather than platform-dependent.
with open("./sample-wardrobe/metadata.csv", mode="r", newline="", encoding="utf-8") as file:
    csv_reader = csv.DictReader(file)  # DictReader reads rows as dictionaries
    for row in csv_reader:
        # Index each row by its image file name
        src = row['filename']
        wardrobe_products[src] = row

for path in wardrobe_paths:
    base_path = os.path.basename(path)
    product = wardrobe_products[base_path]
    metadata = [product['brand'], product['name']]
    # encode() yields one vector per string; flatten joins the brand and name vectors
    wardrobe_metadata_rep[path] = np.array(model.encode(metadata)).flatten()

In [None]:
# Join each item's image embedding with its metadata embedding (image part first)
inventory_concat_embedding = {
    path: np.concatenate((inventory_lat_rep[path], inventory_metadata_rep[path]))
    for path in inventory_img_paths
}

wardrobe_concat_embedding = {
    path: np.concatenate((wardrobe_lat_rep[path], wardrobe_metadata_rep[path]))
    for path in wardrobe_paths
}

In [None]:
# OPTIONAL: cache the concatenated embeddings on disk (each dict is stored pickled)
np.save(file="inventory_concat_embed_v2.npy", arr=inventory_concat_embedding)
np.save(file="wardrobe_concat_embed_v2.npy", arr=wardrobe_concat_embedding)

In [None]:
# Reload the cached dicts. They were saved as pickled objects, so allow_pickle=True is
# required and .item() unwraps the 0-d object array; only load files this notebook wrote.
inventory_concat_embedding = np.load(
    file="inventory_concat_embed_v2.npy", allow_pickle=True
).item()
wardrobe_concat_embedding = np.load(
    file="wardrobe_concat_embed_v2.npy", allow_pickle=True
).item()

In [None]:
# Split the inventory embeddings into parallel path / feature collections
inventory_img_paths = list(inventory_concat_embedding)
inventory_features = np.array([inventory_concat_embedding[p] for p in inventory_img_paths])

# Same layout for the wardrobe embeddings
wardrobe_paths = list(wardrobe_concat_embedding)
wardrobe_features = np.array([wardrobe_concat_embedding[p] for p in wardrobe_paths])

# The query vector is the average of all wardrobe item embeddings
mean_embedding = wardrobe_features.mean(axis=0)

In [None]:
# Perform Annoy
# Take the dimensionality from the data instead of hard-coding it, so the index stays
# correct if the image or text embedding size changes.
embedding_dim = inventory_features.shape[1]
annoy_index = AnnoyIndex(embedding_dim, metric='euclidean')

# Add all items to Annoy index; item id i corresponds to inventory_img_paths[i]
for i, embedding in enumerate(inventory_features):
    annoy_index.add_item(i, embedding)

# Build the index
n_trees = 50
annoy_index.build(n_trees)  # Number of trees

# Query the index with the mean wardrobe embedding
n_neighbors = 10
# With include_distances=True the result is an (ids, distances) pair
indices = annoy_index.get_nns_by_vector(mean_embedding, n_neighbors, include_distances=True)
neighbor_ids, neighbor_distances = indices

for idx in neighbor_ids:
    # Close each file once its image has been handed to the viewer
    with Image.open(inventory_img_paths[idx]) as im:
        im.show()