In [None]:
# !pip install -U transformers
# !pip install -U accelerate
# !pip install -U datasets
# !pip install -U bertviz
# !pip install -U umap-learn
# !pip install -U sentencepiece
# !pip install -U urllib3
# !pip install py7zr
# !pip install -U pillow

# Image classification

## Load the Cats vs. Dogs dataset

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell outputs short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — consider a narrower filter.
warnings.filterwarnings("ignore")

In [None]:
from datasets import load_dataset

# `split=['train']` (a list) makes `load_dataset` return a list holding one Dataset,
# which the next cell unpacks with `dataset[0]`.
# `verification_mode="no_checks"` is the replacement for the deprecated
# `ignore_verifications=True` flag, which newer `datasets` releases no longer accept.
dataset = load_dataset("microsoft/cats_vs_dogs", split=['train'],
                       trust_remote_code=True, verification_mode="no_checks")


In [None]:
# Keep only 20% of the dataset (train_size=0.2): the split is used purely to draw a
# seeded random subsample, and the other 80% is discarded.
dataset = dataset[0].train_test_split(train_size=0.2, seed=42)['train']

In [None]:
# Hold out 30% of the subsample for evaluation (seeded for reproducibility).
dataset = dataset.train_test_split(test_size=0.3, seed=42)
# Only the last expression of a cell is rendered, so show the result after the split.
dataset

In [None]:
# Fetch the first training example once and inspect it.
sample_image = dataset["train"][0]['image']
# A bare expression in the middle of a cell is not displayed, so print the size explicitly.
print(sample_image.size)
sample_image

In [None]:
# resize the images to 224x224
def resize_image(example, size=(224, 224)):
    """Replace the example's ``image`` entry with a copy resized to ``size``.

    Args:
        example: A mapping with an ``image`` entry that exposes ``resize(size)``.
        size: Target size handed to ``resize``; defaults to ``(224, 224)``.

    Returns:
        The same ``example`` object, updated in place.
    """
    example['image'] = example['image'].resize(size)
    return example

# Run the resize over every example; `map` returns a new dataset that replaces the original.
dataset = dataset.map(resize_image)

Each example in the dataset has two fields:

- `image`: a PIL image of a cat or a dog
- `labels`: the class label of the image (cat or dog)

To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name
to an integer and vice versa:

In [None]:
# Class names come from the dataset's "labels" feature; build the lookup in both
# directions, keeping the ids as strings.
labels = dataset["train"].features["labels"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}


Now you can convert the label id to a label name:

In [None]:
# Show both lookup tables to check that the ids and class names line up.
print(id2label)
print(label2id)

## Preprocess

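The next step is to load the image processor that matches the chosen checkpoint (`microsoft/resnet-50` in this notebook; the ViT and MobileViT checkpoints are left commented out) to process the image into a tensor: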

In [None]:
from transformers import AutoImageProcessor
import torch

# Model cards for the candidate checkpoints:
# https://huggingface.co/google/vit-base-patch16-224-in21k
# https://huggingface.co/microsoft/resnet-50  (the checkpoint that is active below)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Alternative checkpoints are left commented out; only the last assignment is active.
# checkpoint = "google/vit-base-patch16-224-in21k"
# checkpoint = "apple/mobilevitv2-1.0-imagenet1k-256"
checkpoint = "microsoft/resnet-50"
# Load the image processor published with the selected checkpoint.
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
# model = MobileViTV2ForImageClassification.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")


Now create a batch of examples using [DefaultDataCollator](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DefaultDataCollator). Unlike other data collators in 🤗 Transformers, the `DefaultDataCollator` does not apply additional preprocessing such as padding.

In [None]:
from transformers import DefaultDataCollator

# Default collator: batches the examples as they are, with no extra preprocessing such as padding.
data_collator = DefaultDataCollator()

## Evaluate

In [None]:
import evaluate

# Load the "accuracy" metric; `compute_metrics` below calls `accuracy.compute(...)`.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes and labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training.

## Train

In [None]:
from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ToTensor

# Normalize with the checkpoint's own mean/std. The image processor is saved with the
# model and used by the inference pipeline, so training tensors should be scaled the
# same way the processor scales inputs at inference time.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# The processor's `size` holds either a single "shortest_edge" value or an explicit
# height/width pair; handle both layouts.
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])


def transforms(examples):
    """Add a ``pixel_values`` entry to a batch of examples.

    Args:
        examples: A batch mapping whose ``image`` entry is an iterable of images
            exposing ``convert("RGB")``.

    Returns:
        The same ``examples`` mapping, with ``pixel_values`` holding one
        cropped, tensor-converted and normalized image per input image.
    """
    # Force three channels first so every image yields the same tensor layout.
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    return examples

# Alternative: apply the transform lazily at access time instead of materialising it.
# dataset = dataset.with_transform(transforms)

In [None]:
# Materialise `pixel_values` for every example up front.
# NOTE(review): because this uses `map` rather than `with_transform`, the random crop
# appears to be drawn once per image and then reused every epoch, for the evaluation
# split as well — confirm that this is intended.
dataset = dataset.map(transforms, batched=True)

In [None]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a classification head sized for our labels,
# then move it to the selected device.
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    # The head is sized to `len(labels)` classes, so presumably its weights no longer
    # match the checkpoint's; this flag lets loading proceed despite the size mismatch.
    ignore_mismatched_sizes = True
).to(device)

In [None]:
training_args = TrainingArguments(
    output_dir="dog_cat_classification",
    # Evaluate and checkpoint once per epoch; the two strategies are kept identical
    # because `load_best_model_at_end` picks the best model among saved checkpoints.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    load_best_model_at_end=True,
    # Was 100: with a per-device batch of 64, one optimizer step then consumed
    # 64 * 100 = 6,400 images, which is likely more than the whole training subsample.
    # That leaves roughly one weight update per epoch. A batch of 64 needs no
    # accumulation, so step after every batch.
    gradient_accumulation_steps=1,
    # `compute_metrics` reports "accuracy"; use it to select the best checkpoint.
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # Passing the image processor here saves it alongside the model, so the
    # inference pipeline can load it from the output directory.
    # NOTE(review): newer transformers releases call this argument
    # `processing_class` — confirm against the installed version.
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Evaluate on the `eval_dataset` passed to the Trainer and display the resulting metrics.
trainer.evaluate()

In [None]:
# Save the model to the same directory that was used as `output_dir` for training;
# the inference pipeline at the end of the notebook loads it from this path.
trainer.save_model("dog_cat_classification")

## Inference

Great, now that you've fine-tuned a model, you can use it for inference!

Load an image you'd like to run inference on:

In [None]:
# Download a sample dog photo to run inference on.

import requests
from PIL import Image
from io import BytesIO

url = "https://cdn.pixabay.com/photo/2016/12/13/05/15/puppy-1903313_640.jpg"
# A timeout keeps the cell from hanging forever if the host does not respond.
response = requests.get(url, timeout=30)
# Fail with a clear HTTP error instead of trying to decode an error page as an image.
response.raise_for_status()
image = Image.open(BytesIO(response.content))

# Resize to 224x224, the same size the training images were resized to.
image = image.resize((224, 224))
image

In [None]:
from transformers import pipeline

# Build an image-classification pipeline from the saved "dog_cat_classification"
# directory and classify the image loaded in the previous cell.
classifier = pipeline("image-classification", model="dog_cat_classification", device=device)
classifier(image)