In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, datasets
from transformers import ViTForImageClassification, ViTFeatureExtractor, TrainingArguments, Trainer
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.metrics import precision_recall_fscore_support ,accuracy_score

2024-07-30 18:12:05.696796: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 18:12:05.696915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 18:12:05.882198: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load and preprocess the dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load the datasets
train_dataset = datasets.ImageFolder(root='/kaggle/input/ferplus/train', transform=transform)
test_dataset = datasets.ImageFolder(root='/kaggle/input/ferplus/test', transform=transform)
val_dataset = datasets.ImageFolder(root='/kaggle/input/ferplus/test', transform=transform)

In [3]:
class CustomImageDataset(Dataset):
    def __init__(self, image_folder_dataset):
        self.image_folder_dataset = image_folder_dataset

    def __len__(self):
        return len(self.image_folder_dataset)

    def __getitem__(self, idx):
        img, label = self.image_folder_dataset[idx]
        return {"pixel_values": img, "labels": label}

# Create custom datasets
train_dataset = CustomImageDataset(train_dataset)
test_dataset = CustomImageDataset(train_dataset)
val_dataset = CustomImageDataset(val_dataset)

In [4]:
# Load the Vision Transformer model
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224-in21k',
    num_labels=8,
#     output_attentions=False,
#     output_hidden_states=False,
#     attention_probs_dropout_prob=0.5,
#     hidden_dropout_prob=0.5,
    force_download=True,
)
USE_CPU = False
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model.to(device)

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [5]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='model/results',
    num_train_epochs=8,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",  # Changed to use epochs
    save_strategy="epoch",        # Changed to use epochs
    logging_dir='model/logs',
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_steps=10,            # You might want to keep this or adjust it
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_safetensors=False,
    report_to=[],
)

In [6]:
# Define a function to compute metrics
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='macro')
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [8]:
# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7423,0.758569,0.7719,0.562649,0.624737,0.545795
2,0.4815,0.626359,0.797649,0.652469,0.670334,0.639089
3,0.3526,0.586596,0.804646,0.648499,0.695241,0.63245
4,0.264,0.567781,0.811083,0.668305,0.703131,0.652273
5,0.2404,0.565279,0.815561,0.664836,0.695355,0.645733
6,0.1794,0.587617,0.813882,0.64731,0.690457,0.629977
7,0.1898,0.605414,0.815001,0.65765,0.696571,0.635386
8,0.1051,0.614007,0.8192,0.664755,0.702288,0.645492


  return self.fget.__get__(instance, owner)()
