In [1]:
# Use the %pip magic rather than !pip: it installs into the environment of the
# running kernel instead of whatever `pip` happens to be first on the shell PATH.
%pip install datasets
%pip install transformers torch tensorflow

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 480.6/480.6 kB 33.2 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 11.7 MB/s eta 0:00:00
Downloading fsspec-2024.9.0-py3-none-any.whl

In [2]:
import numpy as np
import tensorflow as tf
from transformers import AutoImageProcessor, AutoModelForImageClassification
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from pathlib import Path
import zipfile
import requests
from tqdm import tqdm
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

class RPSDataset(Dataset):
    """Map-style dataset that loads images from disk and preprocesses them lazily.

    Args:
        image_paths: Sequence of image file paths, one per sample.
        labels: Sequence of labels aligned index-for-index with ``image_paths``.
        image_processor: Callable invoked as
            ``image_processor(image, return_tensors="pt")`` whose result exposes
            a ``'pixel_values'`` entry with a leading batch dimension of size 1.
    """

    def __init__(self, image_paths, labels, image_processor):
        self.image_paths, self.labels = image_paths, labels
        self.image_processor = image_processor

    def __len__(self):
        """Number of samples, i.e. the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, resize and preprocess sample ``idx``.

        Returns:
            dict with ``'pixel_values'`` (processor output with the leading
            batch axis squeezed away) and ``'labels'`` (the label as a tensor).
        """
        # Decode to 3-channel RGB and force a fixed 224x224 size before the
        # processor sees the image.
        rgb_image = Image.open(self.image_paths[idx]).convert('RGB').resize((224, 224))

        encoded = self.image_processor(rgb_image, return_tensors="pt")

        sample = {}
        # The processor returns a batch of one; drop that axis so the
        # DataLoader can stack samples itself.
        sample['pixel_values'] = encoded['pixel_values'].squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def download_file(url, filename, timeout=30, chunk_size=1024):
    """Stream ``url`` to ``filename`` with a progress bar.

    The payload is written to ``<filename>.part`` first and only moved to
    ``filename`` once the whole body has been received, so a failed or
    interrupted download never leaves a truncated file at the final path
    (callers use the existence of ``filename`` as a "already downloaded" flag).

    Args:
        url: HTTP(S) URL to fetch.
        filename: Destination path; also used as the progress-bar label.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
        chunk_size: Size in bytes of each streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    partial_path = f"{filename}.part"
    try:
        # Using the response as a context manager releases the connection even
        # if the body is not fully consumed.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this an error page would be saved as if it were the file.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            with open(partial_path, 'wb') as file, tqdm(
                desc=filename,
                # No Content-Length header: let tqdm show an open-ended counter.
                total=total_size or None,
                unit='iB',
                unit_scale=True
            ) as pbar:
                for data in response.iter_content(chunk_size=chunk_size):
                    size = file.write(data)
                    pbar.update(size)
        # Publish the finished download under its real name in one step.
        os.replace(partial_path, filename)
    finally:
        # Only present here if something above failed; drop the fragment.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def download_and_extract_dataset(data_dir='/tmp'):
    """Fetch the two rock-paper-scissors archives and unpack them into ``data_dir``.

    For each of ``rps.zip`` and ``rps-test-set.zip`` the archive is downloaded
    into ``data_dir`` unless a valid ZIP file with that name is already there,
    and is then extracted into ``data_dir``.

    Args:
        data_dir: Directory that holds the archives and receives their
            contents. Created if missing. Defaults to ``'/tmp'``.

    Raises:
        zipfile.BadZipFile: If an archive is still not a valid ZIP after download.
        Any exception raised by ``download_file``.
    """
    os.makedirs(data_dir, exist_ok=True)

    base_url = 'https://storage.googleapis.com/learning-datasets'
    for archive_name in ('rps.zip', 'rps-test-set.zip'):
        archive_path = os.path.join(data_dir, archive_name)

        # is_zipfile() is False both for a missing file and for a truncated or
        # otherwise invalid one, so a broken leftover from an earlier
        # interrupted run is fetched again instead of crashing extraction.
        if not zipfile.is_zipfile(archive_path):
            download_file(f'{base_url}/{archive_name}', archive_path)

        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)

def _collect_split(split_root, class_names):
    """Return (paths, labels) for every ``*.png`` under ``split_root/<class>/``."""
    paths = []
    labels = []
    for label, class_name in enumerate(class_names):
        # glob() yields entries in filesystem order; sort so the sample order
        # is the same on every run and every machine.
        for img_path in sorted((split_root / class_name).glob('*.png')):
            paths.append(str(img_path))
            labels.append(label)
    return paths, labels


def prepare_dataset(data_dir='/tmp'):
    """List the extracted training and test images with integer class labels.

    Looks for PNG files in ``<data_dir>/rps/<class>/`` (training) and
    ``<data_dir>/rps-test-set/<class>/`` (test). A label is the index of the
    class in the returned ``class_names`` list.

    Args:
        data_dir: Directory the archives were extracted into. Defaults to
            ``'/tmp'``.

    Returns:
        ``(train_images, train_labels, test_images, test_labels, class_names)``
        where the image entries are path strings. A class directory that does
        not exist or holds no PNGs simply contributes nothing.
    """
    class_names = ['paper', 'rock', 'scissors']
    root = Path(data_dir)

    train_images, train_labels = _collect_split(root / 'rps', class_names)
    test_images, test_labels = _collect_split(root / 'rps-test-set', class_names)

    return train_images, train_labels, test_images, test_labels, class_names

def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` whose
            output exposes ``.loss`` and ``.logits``.
        dataloader: Iterable of batches with ``'pixel_values'`` and
            ``'labels'`` tensors; must support ``len()``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Accepted for interface compatibility but not used; the loss
            is taken from the model output.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)``: the mean of the per-batch losses and the
        percentage of correctly classified samples seen during the epoch.
    """
    model.train()
    running_loss = 0
    num_correct = 0
    num_seen = 0

    progress = tqdm(dataloader, desc="Training")
    for batch in progress:
        images = batch['pixel_values'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        result = model(pixel_values=images, labels=targets)
        batch_loss = result.loss

        # Book-keeping uses the forward pass made before this step's update.
        running_loss += batch_loss.item()
        is_hit = result.logits.argmax(-1) == targets
        num_correct += is_hit.sum().item()
        num_seen += targets.size(0)

        batch_loss.backward()
        optimizer.step()

    mean_loss = running_loss / len(dataloader)
    accuracy = num_correct / num_seen * 100
    return mean_loss, accuracy

def evaluate(model, dataloader, device):
    """Compute the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Module called as ``model(pixel_values=...)`` whose output
            exposes ``.logits``.
        dataloader: Iterable of batches with ``'pixel_values'`` and
            ``'labels'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Percentage (0-100) of samples whose arg-max logit equals the label.
    """
    model.eval()
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            images = batch['pixel_values'].to(device)
            targets = batch['labels'].to(device)

            predicted = model(pixel_values=images).logits.argmax(-1)
            is_hit = predicted == targets

            num_correct += is_hit.sum().item()
            num_seen += targets.size(0)

    accuracy = num_correct / num_seen * 100
    return accuracy

def main(num_epochs=5, batch_size=32, learning_rate=2e-5, seed=42):
    """Download the data, fine-tune ViT on it and report per-epoch accuracy.

    Steps: fetch and unpack the archives, list the images, load the
    ``google/vit-base-patch16-224`` processor and model with a fresh
    classification head sized to the dataset's classes, train with AdamW, and
    after every epoch evaluate on the test images. The weights of the epoch
    with the highest test accuracy are written to ``best_model.pth``.

    Args:
        num_epochs: Number of passes over the training images.
        batch_size: Batch size for both data loaders.
        learning_rate: AdamW learning rate.
        seed: Value passed to ``torch.manual_seed`` before the model head is
            initialised and the training loader starts shuffling.

    Raises:
        RuntimeError: If no training or no test images were found on disk.
    """
    # Seed before the new classifier head is created and before shuffling.
    torch.manual_seed(seed)

    print("Downloading and preparing dataset...")
    download_and_extract_dataset()
    train_images, train_labels, test_images, test_labels, class_names = prepare_dataset()

    # Fail here with a clear message rather than later inside the sampler or
    # in an accuracy division by zero.
    if not train_images or not test_images:
        raise RuntimeError(
            f"Dataset is empty after extraction: {len(train_images)} training "
            f"and {len(test_images)} test images found."
        )

    print("Loading pre-trained model and processor...")
    image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

    # Derive the head size and label names from the dataset instead of
    # repeating the class count as a literal.
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}
    model = AutoModelForImageClassification.from_pretrained(
        "google/vit-base-patch16-224",
        num_labels=len(class_names),
        id2label=id2label,
        label2id=label2id,
        # The checkpoint ships a 1000-way head; allow it to be replaced.
        ignore_mismatched_sizes=True
    )

    # Prepare datasets and dataloaders
    train_dataset = RPSDataset(train_images, train_labels, image_processor)
    test_dataset = RPSDataset(test_images, test_labels, image_processor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Training setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # train_epoch() reads the loss from the model output; the criterion is
    # only passed because its signature requires the argument.
    criterion = CrossEntropyLoss()

    # Training loop
    best_accuracy = 0

    print("\nStarting fine-tuning...")
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        # Train
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Training Loss: {train_loss:.4f}")
        print(f"Training Accuracy: {train_acc:.2f}%")

        # Evaluate
        test_acc = evaluate(model, test_loader, device)
        print(f"Test Accuracy: {test_acc:.2f}%")

        # Save best model
        # NOTE(review): the checkpoint is selected on the test set, so the
        # reported best accuracy is optimistic; a held-out validation split
        # would be needed for an unbiased figure.
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            torch.save(model.state_dict(), 'best_model.pth')

    print(f"\nBest Test Accuracy: {best_accuracy:.2f}%")

# Run the whole pipeline when this code is executed directly (as it is when the
# notebook cell runs); importing it from another module does not start training.
if __name__ == "__main__":
    main()

Downloading and preparing dataset...


/tmp/rps.zip: 100%|██████████| 201M/201M [00:10<00:00, 19.5MiB/s]
/tmp/rps-test-set.zip: 100%|██████████| 29.5M/29.5M [00:02<00:00, 12.5MiB/s]


Loading pre-trained model and processor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting fine-tuning...

Epoch 1/5


Training: 100%|██████████| 79/79 [01:13<00:00,  1.07it/s]


Training Loss: 0.1384
Training Accuracy: 96.35%


Evaluating: 100%|██████████| 12/12 [00:05<00:00,  2.01it/s]


Test Accuracy: 99.19%

Epoch 2/5


Training: 100%|██████████| 79/79 [01:21<00:00,  1.03s/it]


Training Loss: 0.0021
Training Accuracy: 100.00%


Evaluating: 100%|██████████| 12/12 [00:06<00:00,  1.90it/s]


Test Accuracy: 99.19%

Epoch 3/5


Training: 100%|██████████| 79/79 [01:21<00:00,  1.03s/it]


Training Loss: 0.0010
Training Accuracy: 100.00%


Evaluating: 100%|██████████| 12/12 [00:06<00:00,  1.92it/s]


Test Accuracy: 99.46%

Epoch 4/5


Training: 100%|██████████| 79/79 [01:22<00:00,  1.04s/it]


Training Loss: 0.0006
Training Accuracy: 100.00%


Evaluating: 100%|██████████| 12/12 [00:06<00:00,  1.81it/s]


Test Accuracy: 99.46%

Epoch 5/5


Training: 100%|██████████| 79/79 [01:22<00:00,  1.04s/it]


Training Loss: 0.0004
Training Accuracy: 100.00%


Evaluating: 100%|██████████| 12/12 [00:06<00:00,  1.83it/s]

Test Accuracy: 99.46%

Best Test Accuracy: 99.46%



