# Annoy indexes for @recommend endpoint

### ANNOY - Approximate Nearest Neighbors Oh Yeah (created by Spotify)

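This notebook creates the index files the API needs (`fashion_index.ann` and `metadata.json`) in `api/indexes` (`../api/indexes` relative to this notebook). If these files already exist, there is no need to run this notebook again: the build step is skipped.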

In [20]:
import pandas as pd
from annoy import AnnoyIndex
import json
import os
from pathlib import Path

In [21]:
# Candidate embeddings files; the large one is preferred whenever it is present.
LARGE_EMB_PATH = "../data/clip_embeddings_large.csv"
SMALL_EMB_PATH = "../data/clip_embeddings.csv"

use_large = Path(LARGE_EMB_PATH).exists()
EMB_PATH = LARGE_EMB_PATH if use_large else SMALL_EMB_PATH
print("Using large embeddings file" if use_large else "Using small embeddings file")

# Directory that receives the generated index files; created when absent.
INDEX_DIR = "../api/indexes"
Path(INDEX_DIR).mkdir(parents=True, exist_ok=True)


Using large embeddings file


In [22]:
# --- Load embeddings ---
def _to_vector(text):
    """Convert a comma-separated string of numbers into a list of floats."""
    return [float(part) for part in text.split(",")]


df = pd.read_csv(EMB_PATH)
# The "embedding" column is stored as text in the CSV; parse each cell into a float list.
df["embedding"] = df["embedding"].apply(_to_vector)

# Every vector is expected to share the length of the first one, which fixes
# the dimensionality of the Annoy index.
first_vector = df["embedding"].iloc[0]
emb_dim = len(first_vector)
index = AnnoyIndex(emb_dim, "angular")

print(f"Embedding dim = {emb_dim}")

Embedding dim = 512


In [23]:

# Build the Annoy index and its companion metadata file, unless both are
# already on disk.
INDEX_PATH = Path(INDEX_DIR, "fashion_index.ann")
META_PATH = Path(INDEX_DIR, "metadata.json")
N_TREES = 20  # More trees → better precision, slower build

# Rebuild when EITHER artifact is missing. Checking only the .ann file would
# leave metadata.json permanently absent after a run that was interrupted
# between the two writes.
if not (INDEX_PATH.exists() and META_PATH.exists()):
    print("Building Annoy index...")

    # Use the row position as the Annoy item id so that item i lines up with
    # meta[i] in the records list written below. Iterating the column directly
    # also avoids the per-row Series construction done by DataFrame.iterrows().
    for item_id, vector in enumerate(df["embedding"]):
        index.add_item(item_id, vector)

    index.build(N_TREES)
    index.save(str(INDEX_PATH))

    print("Saving metadata...")
    # Everything except the raw vectors, one dict per row, in row order.
    meta = df.drop(columns=["embedding"]).to_dict(orient="records")

    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f)

    print("Done. Index saved at", INDEX_DIR)
else:
    print("Index already exists at", INDEX_DIR)

Index already exists at ../api/indexes
