In [1]:
import os
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing import image
import random
from datasets import Dataset

In [2]:
# Read the image metadata table and key every row by its image id.
data = (
    pd.read_csv('./data/clothing-dataset/images.csv')
    .set_index('image')
)
data.tail()

Unnamed: 0_level_0,sender_id,label,kids
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dfd4079d-967b-4b3e-8574-fbac11b58103,204,Shorts,False
befa14be-8140-4faf-8061-1039947e329d,204,Body,True
5379356a-40ee-4890-b416-2336a7d84061,310,Shorts,False
65507fb8-3456-4c15-b53e-d1b03bf71a59,204,Shoes,False
32b99302-cec7-4dec-adfa-3d4029674209,204,Skirt,False


# Select the 10 most frequent labels (excluding 'Not sure')

In [3]:
# Count the images per label, most frequent first. After reset_index() the
# count column is named 0, which is why the sort key is the integer 0.
label_counts = (
    data.groupby('label')
    .size()
    .reset_index()
    .sort_values(0, ascending=False)
)

# Drop the 'Not sure' placeholder *before* cutting to ten rows, so exactly ten
# real classes are kept no matter where 'Not sure' ranks. The explicit copy
# makes the column assignment below operate on an independent frame.
top_labels = (
    label_counts.loc[label_counts['label'] != 'Not sure', ['label']]
    .head(10)
    .copy()
)

# Class ids are the positions of the label names in alphabetical order.
top_labels_list = sorted(top_labels['label'])
label_to_num = {name: num for num, name in enumerate(top_labels_list)}
top_labels['label_num'] = top_labels['label'].map(label_to_num)
top_labels

Unnamed: 0,label,label_num
17,T-Shirt,9
6,Longsleeve,2
10,Pants,4
13,Shoes,6
12,Shirt,5
3,Dress,0
9,Outwear,3
14,Shorts,7
4,Hat,1
16,Skirt,8


In [4]:
# Keep only the rows whose label is one of the top labels (the merge joins on
# the shared 'label' column), then keep the original text label as 'label_str'
# and replace 'label' with its numeric class id.
data_filtered = (
    pd.merge(data.reset_index(), top_labels)
    .set_index('image')
    .assign(
        label_str=lambda frame: frame['label'],
        label=lambda frame: frame['label_num'],
    )
)
data_filtered.tail()

Unnamed: 0_level_0,sender_id,label,kids,label_num,label_str
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c87b2c6b-8405-4d0c-81ae-58d46f034cb1,204,1,False,1,Hat
31ae3916-c352-45fe-9866-a8947fc74991,204,1,False,1,Hat
6c2f18d0-30ff-468c-a470-39c65abf4851,204,1,False,1,Hat
3f0be386-9a59-4aeb-93f6-dbe63e3a3002,204,1,False,1,Hat
7b5181d5-253f-4936-9207-60e3368bf9e1,204,1,False,1,Hat


In [7]:
# Build one record per image file that has a row in `data_filtered`:
# {'img': nested pixel lists, 'label': numeric class id, 'index': image id}.
images_dir = './data/clothing-dataset/images'
labeled_data = []
for item in os.listdir(images_dir):
    # The image id is the file name without its extension, whatever its length.
    image_id = os.path.splitext(item)[0]

    # Skip files without a top-label row before doing any decoding work, and
    # do it with an explicit membership test so unrelated errors still surface.
    if image_id not in data_filtered.index:
        continue
    label = data_filtered.loc[image_id, 'label']

    img = image.load_img(os.path.join(images_dir, item), target_size=(32, 32))
    # img_to_array already yields the pixel array for this single image; it is
    # stored as plain nested lists so it can be fed to Dataset.from_dict later.
    images = image.img_to_array(img).tolist()

    labeled_data.append({'img': images, 'label': label, 'index': image_id})

# Images are preprocessed

In [6]:
# Split the loaded records into three parallel lists that share one ordering:
# pixel data, numeric labels, and image ids.
data_img = [record['img'] for record in labeled_data]
data_label = [record['label'] for record in labeled_data]
data_img_ids = [record['index'] for record in labeled_data]

In [10]:
# Wrap the pixel lists and numeric labels in a Dataset. The image ids are not
# part of it; they stay in `data_img_ids`, in the same order as these rows.
data_ds = Dataset.from_dict({'img':data_img,'label':data_label})

In [11]:
from transformers import ViTFeatureExtractor

# Load the preprocessing configuration published with the
# 'google/vit-base-patch16-224-in21k' checkpoint (the same checkpoint name the
# ViT backbone is created from further down).
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

In [12]:
def preprocess_images(examples):
    """Add a 'pixel_values' column to a batch of examples.

    Args:
        examples: batch mapping with an 'img' entry holding one nested pixel
            list per image (channel axis last).

    Returns:
        The same `examples` mapping, updated in place with 'pixel_values' as
        produced by the module-level `feature_extractor`.
    """
    # Turn every image into a uint8 array and move its last axis (channels) to
    # the front before handing the batch to the feature extractor.
    channel_first = [
        np.moveaxis(np.array(pixels, dtype=np.uint8), source=-1, destination=0)
        for pixels in examples['img']
    ]
    encoded = feature_extractor(images=channel_first)
    examples['pixel_values'] = encoded['pixel_values']
    return examples

In [13]:
from datasets import Features, ClassLabel, Array3D

# The features are declared explicitly because both 'img' and 'pixel_values'
# hold 3D arrays.
# 'label' uses the sorted top-label names, so class id i is top_labels_list[i].
# NOTE(review): 'img' is declared as (3, 32, 32), but preprocess_images only
# moves the channel axis on the local arrays it passes to the extractor and
# never writes them back to examples['img'] - confirm the stored layout
# really matches this shape.
features = Features({
    'label': ClassLabel(names = top_labels_list),
    'img': Array3D(dtype="int64", shape=(3,32,32)),
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
})

# Run preprocess_images over the dataset in batches and cast the result to the
# schema declared above.
preprocessed_data_ds = data_ds.map(preprocess_images, batched=True, features=features)

  0%|          | 0/5 [00:00<?, ?ba/s]

# Model is defined and loaded

In [14]:
from transformers import ViTModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn


class ViTForImageClassification(nn.Module):
    """ViT backbone followed by dropout and a single linear classification layer.

    Args:
        num_labels: number of output classes of the final linear layer.
    """

    def __init__(self, num_labels=10):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.dropout = nn.Dropout(0.1)
        self.last_layer = nn.Linear(self.vit.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, pixel_values, labels=None):
        """Classify a batch of images and optionally compute the loss.

        Args:
            pixel_values: batch of preprocessed images, passed to the backbone.
            labels: optional class ids; when given, a cross-entropy loss
                against the logits is returned as well. Defaults to None so
                the model can also be called for pure inference.

        Returns:
            SequenceClassifierOutput with `loss` (None without labels),
            `logits`, and the backbone's `hidden_states` / `attentions`.
        """
        outputs = self.vit(pixel_values=pixel_values)
        # Only the first sequence position of the last hidden state (ViT's
        # classification token) is fed to the classification head.
        output = self.dropout(outputs.last_hidden_state[:,0])
        logits = self.last_layer(output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [15]:
from transformers import TrainingArguments, Trainer

# Name of the metric used to pick the best checkpoint.
metric_name = "accuracy"

args = TrainingArguments(
    output_dir="test-clothing",
    logging_dir='logs',
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
)

In [16]:
from datasets import load_metric
import numpy as np

# Accuracy metric object shared by every call to compute_metrics.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Turn a (predictions, labels) pair into the accuracy metric result.

    Args:
        eval_pred: pair of per-class scores (one row per example) and the
            reference labels.

    Returns:
        Whatever `metric.compute` returns for the arg-max class of every row.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [17]:
import torch
# Build the model with its default of 10 labels, then overwrite its weights
# with the saved state dict, mapped onto the CPU.
model_loaded = ViTForImageClassification()
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
model_loaded.load_state_dict(torch.load('./Model evaluation/model_one_layer',map_location=torch.device('cpu')))
# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
model_loaded.eval()

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_fea

In [18]:
# Wrap the loaded model in a Trainer. No train or eval dataset is passed here;
# the dataset is supplied directly to predict() later in the notebook.
trainer_for_model_loaded = Trainer(
    model_loaded,
    args,
    compute_metrics=compute_metrics,
)

# Extract the input vectors of the final classification layer

In [19]:
class SaveOutput:
    """Forward-hook callable that records the inputs given to the hooked module."""

    def __init__(self):
        # Begin with an empty record list.
        self.clear()

    def __call__(self, module, module_in, module_out):
        """Record `module_in` for this call; `module` and `module_out` are not used."""
        self.outputs += [module_in]

    def clear(self):
        """Drop all records by rebinding `outputs` to a fresh list.

        Lists obtained from `outputs` earlier keep their contents.
        """
        self.outputs = list()
        
# Recorder that collects the inputs of the hooked layer on every forward pass.
save_output = SaveOutput()

# Attach the hook directly to the classification head. Addressing the layer by
# attribute keeps working when the number of labels, the bias setting or the
# printed form of the layer changes, and it can never register zero hooks.
# The handles are kept so the hook can be detached later with handle.remove().
hook_handles = [model_loaded.last_layer.register_forward_hook(save_output)]

# Nothing has been recorded yet: the hook only fires during a forward pass.
len(save_output.outputs)

0

In [20]:
# Run prediction over the preprocessed dataset. Every forward pass of
# model_loaded also fires the registered hook, which fills save_output.outputs.
outputs = trainer_for_model_loaded.predict(preprocessed_data_ds)

The following columns in the test set  don't have a corresponding argument in `ViTForImageClassification.forward` and have been ignored: img.
***** Running Prediction *****
  Num examples = 4514
  Batch size = 4


In [26]:
# Show the metrics reported for the prediction run above.
print(outputs.metrics)

{'test_loss': 0.35946235060691833, 'test_accuracy': 0.9353123615418697, 'test_runtime': 2668.6948, 'test_samples_per_second': 1.691, 'test_steps_per_second': 0.423}


In [21]:
# Alias (not a copy) of the list of recorded hook inputs. SaveOutput.clear()
# rebinds the attribute to a new list, so this name keeps the recorded items.
outputs_vectors = save_output.outputs

In [22]:
# Map every image id to the vector the classification head received for it.
# Each recorded entry is the hook's input tuple for one forward pass, and its
# element 0 is the batch of vectors. Flattening the batches row by row pairs
# rows with ids without assuming any particular batch size.
flat_vectors = [row for batch_inputs in outputs_vectors for row in batch_inputs[0]]

# Fail with a clear message if fewer vectors were recorded than there are ids.
# Rows beyond len(data_img_ids) are ignored by zip().
if len(flat_vectors) < len(data_img_ids):
    raise ValueError(
        f"captured {len(flat_vectors)} vectors for {len(data_img_ids)} image ids"
    )

vectors = {
    image_id: row.tolist()
    for image_id, row in zip(data_img_ids, flat_vectors)
}

In [23]:
import json

# Persist the id -> vector mapping as a single JSON document.
with open('vectors_new.json', 'w') as vectors_file:
    vectors_file.write(json.dumps(vectors))