In [2]:
!pip install torch torchvision transformers scikit-learn

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
   ---------------------------------------- 0.0/

In [4]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms
from transformers import ViTConfig, ViTForImageClassification
from torch.optim import AdamW

In [5]:
class EmotionDataset(Dataset):
    """Image dataset described by a CSV file with `image` and `emotion` columns.

    Each CSV row names an image file (relative to `img_directory`) and its
    emotion label. Labels are lower-cased and mapped to integer class indices
    in sorted order, so the mapping is deterministic for a given CSV.

    Args:
        csv_file: Path to the CSV legend file.
        img_directory: Directory that contains the image files.
        transform: Optional callable applied to the RGB PIL image.
        skip_missing: When True, rows whose image file does not exist under
            `img_directory` are dropped up front (and recorded in
            `missing_images`) instead of raising `FileNotFoundError` later,
            in the middle of an epoch. Defaults to False (keep every row).

    Attributes:
        label_to_int: dict mapping label string -> class index.
        int_to_label: dict mapping class index -> label string.
        missing_images: image names dropped because of `skip_missing`
            (always an empty list when `skip_missing` is False).
    """

    def __init__(self, csv_file, img_directory, transform=None, skip_missing=False):
        self.data_frame = pd.read_csv(csv_file)
        self.img_directory = img_directory
        self.transform = transform

        self.image_col = 'image'
        self.label_col = 'emotion'
        # normalize labels to lowercase so 'Happy' and 'happy' are one class
        self.data_frame[self.label_col] = self.data_frame[self.label_col].astype(str).str.lower()

        # Create label mappings from the full CSV (before any filtering) so the
        # class indices do not shift when some image files are missing.
        unique_labels = sorted(self.data_frame[self.label_col].unique())
        # string label -> integer classification target, e.g. {'angry': 0, 'happy': 1, ...}
        self.label_to_int = {label: i for i, label in enumerate(unique_labels)}
        # integer -> string label, for interpreting predictions
        self.int_to_label = {i: label for i, label in enumerate(unique_labels)}

        self.missing_images = []
        if skip_missing:
            exists = self.data_frame[self.image_col].map(
                lambda name: os.path.isfile(os.path.join(img_directory, str(name)))
            ).astype(bool)
            self.missing_images = self.data_frame.loc[~exists, self.image_col].tolist()
            self.data_frame = self.data_frame.loc[exists].reset_index(drop=True)
            if self.missing_images:
                print(f"Skipped {len(self.missing_images)} rows with missing image files.")

        print(f"Loaded {len(self.data_frame)} samples from {img_directory}. Classes: {self.label_to_int}")

    def __len__(self):
        """Return the number of samples (CSV rows kept)."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        '''Get image and label at index idx.

        Returns:
            (image, label_idx): the RGB image (after `transform`, if given)
            and its integer class index.
        '''
        # Single positional row lookup instead of one per column.
        row = self.data_frame.iloc[idx]
        image_path = os.path.join(self.img_directory, row[self.image_col])

        # The context manager closes the underlying file as soon as the pixel
        # data has been copied by convert(), rather than leaving the source
        # image to be cleaned up later.
        with Image.open(image_path) as source_image:
            image = source_image.convert('RGB')

        # Labels were already lower-cased in __init__.
        label_idx = self.label_to_int[row[self.label_col]]

        if self.transform:
            image = self.transform(image)

        return image, label_idx

In [9]:
# configuration for model
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
NUM_CLASSES = 8
BATCH_SIZE = 64

# Preprocessing values shared by the training and evaluation pipelines.
_IMAGE_SIZE = (224, 224)
_NORM_MEAN = [0.5, 0.5, 0.5]
_NORM_STD = [0.5, 0.5, 0.5]

# Training pipeline: resize, light augmentation (flip + brightness jitter),
# then tensor conversion and per-channel normalization.
train_transform = transforms.Compose(
    [
        transforms.Resize(_IMAGE_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Evaluation pipeline: same resize/normalization, no random augmentation.
test_transform = transforms.Compose(
    [
        transforms.Resize(_IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

In [None]:
# load datasets
# Training samples come from the CSV legend; images are resolved relative to
# the image directory and passed through the augmenting train transform.
train_dataset = EmotionDataset('../data/legend.csv', '../images', transform=train_transform)

# Shuffle every epoch so batches are not tied to the CSV row order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)

Loaded 13690 samples from data/train. Classes: {'anger': 0, 'contempt': 1, 'disgust': 2, 'fear': 3, 'happiness': 4, 'neutral': 5, 'sadness': 6, 'surprise': 7}


In [12]:
# model setup
# Hyperparameters of a compact ViT; the model is instantiated from this config
# (not via from_pretrained), with NUM_CLASSES output labels.
vit_hyperparams = {
    "image_size": 224,
    "patch_size": 16,
    "num_labels": NUM_CLASSES,
    "hidden_size": 256,
    "num_hidden_layers": 8,
    "num_attention_heads": 4,
    "intermediate_size": 512,
    "hidden_dropout_prob": 0.1,
}
config = ViTConfig(**vit_hyperparams)

model = ViTForImageClassification(config)

class ConvolutionStem(nn.Module):
    """Convolutional replacement for the ViT patch-embedding projection.

    Four 3x3 stride-2 convolutions (each followed by BatchNorm and ReLU)
    downsample the input by a factor of 16 in each spatial dimension, e.g.
    224x224 -> 14x14, while widening channels 3 -> 64 -> 128 -> 256 ->
    `hidden_size`.

    Args:
        hidden_size: Number of output channels (the transformer's embedding
            dimension).
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.conv1 = nn.Conv2d(3,64,kernel_size=3,stride=2,padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64,128,kernel_size=3,stride=2,padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.conv3 = nn.Conv2d(128,256,kernel_size=3,stride=2,padding=1)
        self.bn3 = nn.BatchNorm2d(256)
        self.conv4 = nn.Conv2d(256,hidden_size,kernel_size=3,stride=2,padding=1)
        self.bn4 = nn.BatchNorm2d(hidden_size)
        self.relu = nn.ReLU()

    def forward(self, pixel_values):
        """Map images (B, 3, H, W) to a feature map (B, hidden_size, H/16, W/16).

        The output is intentionally left as a 4-D feature map: this module is
        installed as `ViTPatchEmbeddings.projection`, and that wrapper applies
        `.flatten(2).transpose(1, 2)` to the projection output itself.
        Flattening and transposing here as well would transpose twice and hand
        the encoder a (B, hidden_size, num_patches) tensor, which cannot be
        concatenated with the CLS token.
        """
        x = self.relu(self.bn1(self.conv1(pixel_values)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        return x
    
# Swap the default patch projection for the convolutional stem, then move the
# whole model to the target device (the trailing expression shows the model repr).
patch_embedder = model.vit.embeddings.patch_embeddings
patch_embedder.projection = ConvolutionStem(hidden_size=config.hidden_size)
model.to(DEVICE)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): ConvolutionStem(
          (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv4): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (bn4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU()
        )
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (

In [13]:
# execute
num_epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-4)

print("Starting training...")
for epoch in range(num_epochs):
    model.train()  # enable dropout / batch-norm statistics updates
    running_loss = 0.0

    for images, labels in train_loader:
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(pixel_values=images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure below is a per-sample average even if the last batch is smaller.
        running_loss += images.size(0) * loss.item()

    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")


Starting training...


FileNotFoundError: [Errno 2] No such file or directory: 'data/train\\KatrinaKaif_35.jpg'