In [1]:
# Install required libraries
!pip install transformers 
!pip install datasets 
!pip install evaluate 
!pip install accelerate
!pip install pillow 
!pip install torchvision
!pip install scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [2]:
# Import necessary libraries
import torch
from datasets import load_dataset
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login


In [3]:
# Authenticate and login to Hugging Face
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("huggingface_token")
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the dataset
Data = load_dataset("vishnun0027/BirdsSpecies")
print(Data)
print(Data['train'][0])

Downloading readme:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/571M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/624M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/718M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.40G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/379M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/765M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 7500
    })
})
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1200x1084 at 0x7926302167A0>, 'label': 0}


In [6]:
# Create label-to-id and id-to-label mappings
labels = Data["train"].features["label"].names
label2id, id2label = {}, {}
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
print(label2id)

{'Asian-Green-Bee-Eater': '0', 'Brown-Headed-Barbet': '1', 'Cattle-Egret': '2', 'Common-Kingfisher': '3', 'Common-Myna': '4', 'Common-Rosefinch': '5', 'Common-Tailorbird': '6', 'Coppersmith-Barbet': '7', 'Forest-Wagtail': '8', 'Gray-Wagtail': '9', 'Hoopoe': '10', 'House-Crow': '11', 'Indian-Grey-Hornbill': '12', 'Indian-Peacock': '13', 'Indian-Pitta': '14', 'Indian-Roller': '15', 'Jungle-Babbler': '16', 'Northern-Lapwing': '17', 'Red-Wattled-Lapwing': '18', 'Ruddy-Shelduck': '19', 'Rufous-Treepie': '20', 'Sarus-Crane': '21', 'White-Breasted-Kingfisher': '22', 'White-Breasted-Waterhen': '23', 'White-Wagtail': '24'}


In [7]:
# Preprocessing
from transformers import AutoImageProcessor
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

In [8]:
# Load the ViT image processor
checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [9]:
# Define image transformations: resize, normalize, and convert to tensor
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

# Create a preprocessing function to apply the transforms and return the pixel values
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

In [10]:
# Apply the transformation to the dataset
Data = Data.with_transform(transforms)

In [11]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [12]:
# Evaluate
import evaluate
import numpy as np

# Load the accuracy metric
accuracy = evaluate.load("accuracy")

# Define a function to compute the metrics (accuracy in this case)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
# Training
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

# Load the model and configure it for image classification
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Move model to the GPU
model.to(device)
# Print trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {trainable_params}")

# # Optionally, print the names and shapes of trainable parameters
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"Parameter: {name}, Shape: {param.shape}")

# Define training arguments
training_args = TrainingArguments(
    output_dir="BirdsSpecies_vit_finetune",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    report_to=["tensorboard"],

)

# Set up the Trainer with an EarlyStoppingCallback and logging
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=Data["train"],
    eval_dataset=Data["validation"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Use early stopping
)

# Train the model
trainer.train()

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total trainable parameters: 85817881


Epoch,Training Loss,Validation Loss,Accuracy
0,0.824,0.801167,0.9212
1,0.3097,0.363697,0.929067
2,0.2615,0.265935,0.938267
4,0.1777,0.215606,0.9444
5,0.1927,0.201353,0.945333
6,0.2229,0.189294,0.948533
8,0.1525,0.154758,0.9584
9,0.1377,0.164949,0.954533


TrainOutput(global_step=4680, training_loss=0.4110901544109369, metrics={'train_runtime': 10333.2683, 'train_samples_per_second': 29.032, 'train_steps_per_second': 0.453, 'total_flos': 2.3215185361407836e+19, 'train_loss': 0.4110901544109369, 'epoch': 9.984})

In [14]:
# Push the model to the Hugging Face Hub
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/vishnun0027/BirdsSpecies_vit_finetune/commit/f280a9cb5e68e2c705219759e7f5a48db3b1776d', commit_message='End of training', commit_description='', oid='f280a9cb5e68e2c705219759e7f5a48db3b1776d', pr_url=None, pr_revision=None, pr_num=None)