In [1]:
import os

import numpy as np
import pandas as pd
from PIL import Image
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
from tqdm import tqdm

2025-12-02 18:01:19.467687: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Setup and cleaning/filtering cell.
# Reads the image catalogue, keeps only the rows we want to work with, and
# adds the on-disk location of every image.

catalog = pd.read_csv("images.csv")

# Labels whose rows are removed from the dataset.
excluded_labels = ["Skip", "Other", "Not sure", "Body", "Blouse", "Top", "Undershirt"]

# A row is kept when `kids` is False, the label is present, and the label is
# not one of the excluded ones.
keep_row = (
    catalog["kids"].eq(False)
    & catalog["label"].notna()
    & ~catalog["label"].isin(excluded_labels)
)

clothes = (
    catalog[keep_row]
    .reset_index(drop=True)
    # File path column: images_compressed/<image>.jpg
    .assign(filepath=lambda df: "images_compressed/" + df["image"] + ".jpg")
)

# Labels that receive a gender tag, listed in the order they appear in gender_map.
gendered_labels = [
    "T-Shirt", "Longsleeve", "Pants", "Shirt", "Shoes", "Dress", "Shorts",
    "Outwear", "Hat", "Skirt", "Polo", "Blazer", "Hoodie",
]

# Only these labels are tagged "female"; every other listed label is "unisex".
female_only_labels = {"Dress", "Skirt"}

gender_map = {
    lbl: ("female" if lbl in female_only_labels else "unisex")
    for lbl in gendered_labels
}

# Gender column derived from the label; a label absent from gender_map maps to NaN.
clothes["gender"] = clothes["label"].map(gender_map)

# Feature engineering: one-hot encode the label and gender columns.
# Encoding both columns in a single call gives the same layout as encoding
# them one after the other: untouched columns first, then the label_* dummies,
# then the gender_* dummies.
clothes = pd.get_dummies(clothes, columns=["label", "gender"])

# Persist the processed table (without the index) for later use.
clothes.to_csv("clothes_processed.csv", index=False)

# Embedding extraction: run every image listed in clothes["filepath"] through
# an ImageNet-pretrained ResNet50 (include_top=False, pooling="avg") and save
# one feature vector per row of `clothes`, in row order, to embeddings.npy.

paths = clothes["filepath"].tolist()
batch_size = 32
target_size = (224, 224)  # every image is resized to this before entering the network

# Fail fast: check every file up front so a missing image is reported before
# the model is loaded and before any batches have been processed.
missing_paths = [p for p in paths if not os.path.isfile(p)]
if missing_paths:
    raise FileNotFoundError(
        f"{len(missing_paths)} image file(s) not found, e.g. {missing_paths[:5]}"
    )

model = ResNet50(weights="imagenet", include_top=False, pooling="avg")

embeddings_list = []

for start in tqdm(range(0, len(paths), batch_size)):
    batch_paths = paths[start:start + batch_size]

    # Load and resize each image, then stack into one (batch, height, width, channels) array.
    batch_imgs = np.stack(
        [image.img_to_array(image.load_img(p, target_size=target_size)) for p in batch_paths]
    )
    batch_imgs = preprocess_input(batch_imgs)

    # Call the model directly instead of model.predict(): Keras documents
    # predict() as not intended for loops over small batches. training=False
    # keeps layers such as BatchNormalization in inference mode.
    batch_embeds = np.asarray(model(batch_imgs, training=False))
    embeddings_list.append(batch_embeds)

# np.vstack raises on an empty list, so an empty input yields an empty
# (0, feature_dim) array instead.
if embeddings_list:
    embeddings_array = np.vstack(embeddings_list)
else:
    embeddings_array = np.empty((0, model.output_shape[-1]), dtype="float32")

# Row i of the embeddings must correspond to row i of `clothes`.
assert len(embeddings_array) == len(paths), "embedding rows do not match image rows"

np.save("embeddings.npy", embeddings_array)

100%|█████████████████████████████████████████| 141/141 [04:39<00:00,  1.98s/it]
