In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import zipfile

# Source archive on the mounted Drive and the local folder it is unpacked into.
zip_path = '/content/drive/MyDrive/mmls_data.zip'
extract_to = '/content/data/'

# Unpack every member of the archive; the context manager closes the file afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)

print(f"Files extracted to {extract_to}")

Files extracted to /content/data/


In [3]:
import torch
from torchvision.datasets import ImageFolder
from torch.utils.data import random_split
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
from torch import nn
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, fbeta_score
import wandb


In [3]:
# Start a Weights & Biases run under the named project.
WANDB_PROJECT = "fashion_net"
wandb.init(project=WANDB_PROJECT)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpoliakovva0[0m ([33mpoliakovva0-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Build the image dataset from a folder on disk.
# NOTE(review): the path is relative, so it resolves against the current working directory.
IMAGE_ROOT = 'data/downloaded_images'
dataset = ImageFolder(root=IMAGE_ROOT)


In [5]:
# Hold out 20% of the samples for validation; the remaining 80% are used for training.
# A seeded generator makes the split reproducible across kernel restarts, so the class
# weights and every reported metric refer to the same partition on each run.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_subset, val_subset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


In [25]:
# Labels of the training samples, looked up through the subset's indices.
train_labels = [dataset.targets[sample_idx] for sample_idx in train_subset.indices]

# 'balanced' weights from scikit-learn, one per class present in the training labels,
# stored as a float32 tensor for use in the loss.
present_classes = np.unique(train_labels)
balanced_weights = compute_class_weight('balanced', classes=present_classes, y=train_labels)
class_weights = torch.tensor(balanced_weights, dtype=torch.float32)

In [26]:
class WrapDataset(torch.utils.data.Dataset):
    """Adapter that serves ``(image, label)`` samples as ``{'image', 'label'}`` dicts.

    Args:
        subset: Any indexable, sized collection whose items unpack into
            exactly two values (image, label).
    """

    def __init__(self, subset):
        self.subset = subset

    def __len__(self):
        """Return the number of samples in the wrapped collection."""
        return len(self.subset)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with keys 'image' and 'label'."""
        picture, target = self.subset[idx]
        return dict(image=picture, label=target)

In [27]:
# Wrap both splits so each sample is served as an {'image', 'label'} dict.
train_dataset, val_dataset = (
    WrapDataset(split) for split in (train_subset, val_subset)
)

In [28]:
# Image processor loaded from the same pretrained ViT checkpoint name used for the model.
PROCESSOR_CHECKPOINT = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTImageProcessor.from_pretrained(PROCESSOR_CHECKPOINT)


In [29]:
def collate_fn(batch):
    """Turn a list of ``{'image', 'label'}`` samples into one model-ready batch.

    The images are passed through the module-level ``feature_extractor``
    (requesting PyTorch tensors) and the labels are attached to the result
    under the ``'labels'`` key.

    Args:
        batch: Iterable of dicts, each with an ``'image'`` and a ``'label'`` entry.

    Returns:
        The processor output with an added ``'labels'`` tensor.
    """
    images, labels = [], []
    for sample in batch:
        images.append(sample['image'])
        labels.append(sample['label'])

    inputs = feature_extractor(images=images, return_tensors='pt')
    inputs['labels'] = torch.tensor(labels)
    return inputs

In [30]:
# Load the pretrained ViT backbone with a classification head sized for this dataset.
# The id2label/label2id mappings are stored in the model config so that the checkpoint
# written by save_pretrained carries the class names and can be used for inference
# without rebuilding the ImageFolder dataset.
id2label = dict(enumerate(dataset.classes))
label2id = {name: idx for idx, name in id2label.items()}
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224-in21k',
    num_labels=len(dataset.classes),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Training configuration, grouped by concern and merged into one TrainingArguments call.

# Optimisation schedule: epochs, batch sizes, accumulation, learning rate, mixed precision.
schedule_options = dict(
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=True,
)

# Evaluate and checkpoint once per epoch; reload the best checkpoint when training ends.
# NOTE(review): no metric_for_best_model is given, so "best" is whatever the Trainer
# uses by default (documented as the evaluation loss) — confirm this is intended.
checkpoint_options = dict(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Logging cadence and destinations.
reporting_options = dict(
    logging_steps=10,
    logging_dir='./logs',
    report_to="wandb",
)

training_args = TrainingArguments(
    # presumably required so the raw sample fields reach collate_fn untouched — TODO confirm
    remove_unused_columns=False,
    **schedule_options,
    **checkpoint_options,
    **reporting_options,
)

In [32]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run; it must expose ``config.num_labels`` and
                return an object with a ``logits`` attribute.
            inputs: Mapping of model inputs. Its ``"labels"`` entry is removed
                (popped) before the forward pass, so the mapping is mutated.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            **kwargs: Accepted for signature compatibility; not used here.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple if
            ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        scores = outputs.logits

        # Move the class weights onto whichever device the logits live on.
        criterion = nn.CrossEntropyLoss(weight=class_weights.to(scores.device))
        flat_scores = scores.view(-1, model.config.num_labels)
        flat_targets = targets.view(-1)
        loss = criterion(flat_scores, flat_targets)

        if return_outputs:
            return (loss, outputs)
        return loss

In [33]:

def compute_metrics(p):
    """Compute accuracy, macro F1 and macro F-beta (beta=0.5) for an evaluation pass.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``; the predicted
            class is taken as the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'`` and ``'f0.5'``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    beta = 0.5
    macro_f1 = f1_score(labels, predicted, average='macro')
    macro_fbeta = fbeta_score(labels, predicted, beta=beta, average='macro')
    accuracy = np.mean(predicted == labels)

    metrics = {'accuracy': accuracy}
    metrics['f1'] = macro_f1
    metrics[f'f{beta}'] = macro_fbeta
    return metrics

In [34]:
# Assemble the trainer from the model, hyperparameters, batching function,
# both dataset splits and the metric callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = CustomTrainer(**trainer_kwargs)

In [35]:
# Force the trainer's `can_return_loss` flag on before training starts.
# NOTE(review): presumably a workaround so that a validation loss is reported with the
# custom compute_loss — confirm it is still required with the installed transformers version.
trainer.can_return_loss = True
# Run the full training loop; as the cell's last expression, the returned value is displayed.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,F0.5
1,0.8987,0.945055,0.727121,0.731555,0.746833
2,0.4924,0.714329,0.785358,0.787071,0.802285
3,0.0938,0.82036,0.757072,0.755549,0.764196
4,0.0477,0.776787,0.801997,0.806311,0.817295
5,0.0084,0.689705,0.823627,0.825495,0.830967
6,0.0059,0.696174,0.838602,0.841085,0.845353
7,0.0046,0.711266,0.84193,0.843985,0.847769
8,0.004,0.72326,0.843594,0.845957,0.849416
9,0.0036,0.733591,0.84193,0.844371,0.847641


TrainOutput(global_step=1500, training_loss=0.22005493929237127, metrics={'train_runtime': 1071.9141, 'train_samples_per_second': 22.418, 'train_steps_per_second': 1.399, 'total_flos': 1.8508904764361626e+18, 'train_loss': 0.22005493929237127, 'epoch': 9.93687707641196})

In [36]:
# Write the model and its image processor side by side into the same directory,
# so the folder can later be loaded with from_pretrained.
for artifact in (model, feature_extractor):
    artifact.save_pretrained('./style_classifier')

# Close the Weights & Biases run.
wandb.finish()

print("Training complete! Model saved to './style_classifier'")

0,1
eval/accuracy,▁▄▃▅▇█████
eval/f0.5,▁▅▂▆▇█████
eval/f1,▁▄▂▆▇█████
eval/loss,█▂▅▃▁▁▂▂▂▂
eval/runtime,▁▅▁▂█▇▄▂▁▃
eval/samples_per_second,█▄█▇▁▂▄▇█▆
eval/steps_per_second,█▄█▇▁▂▄▇█▆
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇███
train/grad_norm,▃▃▅▂▄▅▂▂▁▂▁▄▃▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.84193
eval/f0.5,0.84764
eval/f1,0.84437
eval/loss,0.73359
eval/runtime,17.0851
eval/samples_per_second,35.177
eval/steps_per_second,4.448
total_flos,1.8508904764361623e+18
train/epoch,9.93688
train/global_step,1500.0


Training complete! Model saved to './style_classifier'


In [37]:
from PIL import Image


# Reload the fine-tuned checkpoint from disk and explicitly cast it to float32.
model = ViTForImageClassification.from_pretrained('./style_classifier').to(torch.float32)
feature_extractor = ViTImageProcessor.from_pretrained('./style_classifier')
model.eval()  # inference only: make sure dropout is disabled

# Class names in index order, taken from the in-memory training dataset.
class_names = dataset.classes

# Convert to three-channel RGB so grayscale / RGBA / CMYK files are handled the same
# way as the training images (ImageFolder's default loader converts to RGB as well).
# The context manager closes the underlying file once the pixels are loaded.
with Image.open('test_casual.jpg') as image_file:
    image = image_file.convert('RGB')

inputs = feature_extractor(images=image, return_tensors='pt')

# No gradients are needed for a prediction; skipping autograd saves memory and time.
with torch.no_grad():
    outputs = model(**inputs)
predicted_class_idx = outputs.logits.argmax(-1).item()

predicted_label = class_names[predicted_class_idx]
print(f"Predicted style: {predicted_label}")

Predicted style: casual


In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
import numpy as np


# Ground-truth labels of both splits, looked up through the subset indices.
y_train = [dataset.targets[i] for i in train_subset.indices]
y_val = [dataset.targets[i] for i in val_subset.indices]

# DummyClassifier ignores feature values, so placeholder arrays with one row per
# sample are sufficient. This avoids passing the validation labels in as "features",
# which reads like label leakage even though the values are never used.
X_train_placeholder = np.zeros((len(y_train), 1))
X_val_placeholder = np.zeros((len(y_val), 1))

# The 'stratified' strategy draws random predictions from the training class
# distribution; a fixed random_state makes the baseline numbers reproducible.
dummy = DummyClassifier(strategy='stratified', random_state=42)
dummy.fit(X_train_placeholder, y_train)

y_pred = dummy.predict(X_val_placeholder)

def print_metrics(y_true, y_pred):
    """Print accuracy, macro F1 and macro F0.5 for the given labels, four decimals each.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    macro_f1 = f1_score(y_true, y_pred, average='macro')
    print(f"Macro F1: {macro_f1:.4f}")

    macro_f_half = fbeta_score(y_true, y_pred, beta=0.5, average='macro')
    print(f"Macro F0.5: {macro_f_half:.4f}")

# Print a header line, then the baseline's scores against the validation labels.
print("Dummy Classifier Performance:")
print_metrics(y_val, y_pred)

Dummy Classifier Performance:
Accuracy: 0.0915
Macro F1: 0.0884
Macro F0.5: 0.0898
