In [2]:
import os
import json
import torch
import numpy as np
import pandas as pd
import multiprocessing
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import torchvision.transforms as T
from transformers import AutoFeatureExtractor, AutoModel
from datasets import Dataset, DatasetDict, Image, Features, ClassLabel, Value, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_ckpt = 'vit-base-fashion'
extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model.eval()

Some weights of ViTModel were not initialized from the model checkpoint at vit-base-fashion and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTSdpaAttention(
          (attention): ViTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [4]:
df = pd.read_csv('fashion-dataset\styles.csv', on_bad_lines='skip')
freq = df['subCategory'].value_counts()
top_freq = {category: freq.loc[category] for category in list(freq.index) if freq.loc[category] >= 100}
sub_df = df[df['subCategory'].isin(list(top_freq.keys()))]
image_dir = 'fashion-dataset\images'
sub_df['image_path'] = sub_df['id'].apply(lambda x: os.path.join(image_dir, str(x) + '.jpg'))
exact_df = sub_df[['image_path', 'subCategory', 'id']]
exact_df['id'] = exact_df['id'].apply(lambda x : str(x))
exact_df = exact_df.rename(columns={'image_path': 'image',
                   'subCategory': 'labels'})
ls1 = set(exact_df['id'])
ls2 = set(os.listdir(image_dir))
not_available_imgs = [i for i in ls1 if f"{i}.jpg" not in ls2]
exact_df = exact_df[~exact_df['id'].isin(not_available_imgs)]
exact_df = exact_df.reset_index(drop=True)
features = Features({
    'image': Image(),
    'labels': ClassLabel(names=exact_df['labels'].unique().tolist()),
    'id': Value('string')
})

dataset = Dataset.from_pandas(exact_df, features=features)
# Access the ClassLabel feature
class_label = features['labels']

# Convert integer labels to string labels
string_labels = [class_label.int2str(label) for label in dataset['labels']]
# dataset = dataset.add_column('string_labels', string_labels)
# ds = dataset.train_test_split(test_size = 0.1, stratify_by_column = 'labels', shuffle = True, seed = 42)
# ds_train_val = ds['train'].train_test_split(test_size = 0.1, shuffle = True, stratify_by_column= 'labels', seed = 42)
# ds = DatasetDict({
#     'train': ds_train_val['train'],
#     'val': ds_train_val['test'],
#     'test': ds['test']})

# ds

  df = pd.read_csv('fashion-dataset\styles.csv', on_bad_lines='skip')
  image_dir = 'fashion-dataset\images'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['image_path'] = sub_df['id'].apply(lambda x: os.path.join(image_dir, str(x) + '.jpg'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exact_df['id'] = exact_df['id'].apply(lambda x : str(x))


In [5]:
# dataset.save_to_disk("dataset")

In [6]:
transformation_chain = T.Compose(
    [
        # We first resize the input image to 256x256 and then we take center crop.
        T.Resize(int((256 / 224) * extractor.size["height"])),
        T.CenterCrop(extractor.size["height"]),
        T.ToTensor(),
        T.Normalize(mean=extractor.image_mean, std=extractor.image_std),
    ]
)

In [7]:
def extract_embeddings(model: torch.nn.Module):
    """Utility to compute embeddings."""
    device = model.device

    def pp(batch):
        images = batch["image"]
        image_batch_transformed = torch.stack(
            [transformation_chain(image) for image in images]
        )
        new_batch = {"pixel_values": image_batch_transformed.to(device)}
        with torch.no_grad():
            embeddings = model(**new_batch).last_hidden_state[:, 0].cpu()
            del image_batch_transformed
        return {"embeddings": embeddings}

    return pp


# Here, we map embedding extraction utility on our subset of candidate images.
device = "cuda" if torch.cuda.is_available() else "cpu"
extract_fn = extract_embeddings(model.to(device))
dataset = load_from_disk("dataset")
candidate_subset_emb = dataset.map(extract_fn, batched=True, batch_size=16)

  context_layer = torch.nn.functional.scaled_dot_product_attention(
Map: 100%|██████████| 43974/43974 [3:03:00<00:00,  4.00 examples/s]  


In [8]:
np.save('embeddings.npy', np.array(candidate_subset_emb['embeddings']))

# Save IDs
np.save('ids.npy', np.array(candidate_subset_emb['id']))