In [1]:
import os
import PIL
import pandas as pd
import numpy as np
from tqdm import tqdm

# torch
import torch
from torch import nn
from torchvision import transforms

# plot
import matplotlib.pyplot as plt
import seaborn as sns

# Load ViT
from pytorch_pretrained_vit import ViT

# Prefer the second GPU (index 1), as the notebook originally did, but fall
# back to the first GPU or to the CPU so that the later `.to(device)` calls do
# not fail on machines with fewer devices.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
# Seed NumPy's and PyTorch's global random generators with the same value so
# that operations drawing from them repeat from run to run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f13f453af70>

In [3]:
from sklearn.model_selection import train_test_split

# Read the annotation table, then append one indicator column per distinct
# value of the "label" column to the right of the original columns.
annotations_csv = "/run/media/viper/LSP/Dataset/Affect/labels.csv"
df = pd.read_csv(annotations_csv)
label = pd.get_dummies(df["label"])
df = pd.concat([df, label], axis="columns")

In [4]:
# Preview the first five rows to check the column layout after the concat.
df.head(n=5)

Unnamed: 0,pth,label,anger,contempt,disgust,fear,happy,neutral,sad,surprise
0,anger/image0000006.jpg,surprise,0,0,0,0,0,0,0,1
1,anger/image0000007.jpg,anger,1,0,0,0,0,0,0,0
2,anger/image0000012.jpg,anger,1,0,0,0,0,0,0,0
3,anger/image0000035.jpg,fear,0,0,0,1,0,0,0,0
4,anger/image0000060.jpg,anger,1,0,0,0,0,0,0,0


In [5]:
# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half
# into the test and validation frames.
# An explicit random_state keeps the split identical no matter how much global
# NumPy randomness was consumed before this cell ran (e.g. when cells are
# re-executed out of order).
# NOTE(review): `train` and `test` are rebound to functions in a later cell,
# so the cell that builds the datasets has to run before that one.
train, holdout = train_test_split(df, test_size=0.2, random_state=42)
test, valid = train_test_split(holdout, test_size=0.5, random_state=42)

In [6]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.io import ImageReadMode, read_image

class CustomImageDataset(Dataset):
    """Map-style dataset pairing image files with their label vectors.

    Each row of ``annotations`` describes one sample. Columns are addressed by
    position: column 0 holds the image path relative to ``img_dir`` and every
    column from position 2 onward is read as one entry of the label vector.

    Args:
        annotations: pandas DataFrame laid out as described above.
        img_dir: Directory that the relative image paths are joined onto.
        transform: Optional callable applied to the decoded image tensor.
        target_transform: Optional callable applied to the label tensor.
    """

    def __init__(self, annotations, img_dir, transform=None, target_transform=None):
        self.img_labels = annotations
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            tuple: ``(image, label)`` where ``image`` is the decoded (and
            optionally transformed) image tensor and ``label`` is a 1-D
            integer tensor built from the label columns.
        """
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        # Always decode to three channels so grayscale or RGBA files can be
        # batched together with RGB ones and match a 3-channel model input.
        image = read_image(img_path, mode=ImageReadMode.RGB)
        label = self.img_labels.iloc[idx, 2:].to_numpy().astype(int)
        # Flatten to 1-D without assuming a fixed number of classes.
        label = torch.tensor(label).reshape(-1)
        if self.transform is not None:
            image = self.transform(image)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return image, label

In [7]:
from torchvision.transforms import InterpolationMode

# Training-time preprocessing and augmentation, applied to the image tensor
# returned by the dataset.
myTransform = transforms.Compose([
    # A (height, width) pair forces a square 224x224 output. A bare int would
    # only scale the shorter edge and keep the aspect ratio, so non-square
    # images would come out non-square and could not be stacked into a batch.
    transforms.Resize((224, 224)),
    transforms.ColorJitter(hue=.05, saturation=.05),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20, interpolation=InterpolationMode.BILINEAR),
])
# NOTE(review): the training loop casts the images to float without rescaling
# them. Pretrained weights usually expect normalised inputs - confirm, and if
# so add the same normalisation to both the training and evaluation pipelines.

In [8]:
imgDir = "/run/media/viper/LSP/Dataset/Affect/"

# Evaluation images get only the deterministic resize (no augmentation), so
# they reach the model at 224x224 - the size the training pipeline targets -
# and can be stacked into batches. Without a transform they would be fed at
# their native resolution.
evalTransform = transforms.Resize((224, 224))

trainDataset = CustomImageDataset(train, imgDir, myTransform)

testDataset = CustomImageDataset(test, imgDir, evalTransform)

valDataset = CustomImageDataset(valid, imgDir, evalTransform)

In [9]:
batch_size = 1024
# Only the training loader shuffles; evaluation order stays fixed.
trainDataloader = DataLoader(trainDataset, batch_size=batch_size, shuffle=True, num_workers=0)
# The loaders get their own names (matching `trainDataloader`) instead of
# overwriting `testDataset` / `valDataset`. Rebinding those names made the
# cell fail on a second run, because it would then wrap a DataLoader in
# another DataLoader.
testDataloader = DataLoader(testDataset, batch_size=batch_size, shuffle=False, num_workers=0)
valDataloader = DataLoader(valDataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [10]:
def train(model, criterion, optimizer, train_loader, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to optimise; it is switched to training mode.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: Optimizer that updates the model's parameters.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device that each batch is moved to before the forward pass.

    Returns:
        tuple: ``(model, optimizer, epoch_loss)`` where ``epoch_loss`` is the
        sum of the per-batch loss values.
    """
    model.train()
    running_loss = 0.0

    for batch_inputs, batch_labels in train_loader:
        # Both inputs and labels are cast to float before moving to `device`.
        batch_inputs = batch_inputs.float().to(device)
        batch_labels = batch_labels.float().to(device)

        optimizer.zero_grad()
        # The criterion receives the raw model outputs (no softmax here).
        batch_loss = criterion(model(batch_inputs), batch_labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return model, optimizer, running_loss
    
@torch.no_grad()    
def test(model, criterion, test_loader, device):
    model.eval()
    tar = []
    out = []

    for data in test_loader:
        pass

    return None

In [11]:
# Build the 'B_32' ViT with pretrained weights, then replace its `fc` head
# with a freshly initialised linear layer mapping 768 features to 8 outputs.
model = ViT('B_32', pretrained=True)
model.fc = nn.Linear(in_features=768, out_features=8)

Loaded pretrained weights.


In [12]:
# Turn off gradient tracking for every parameter of the model.
for weight in model.parameters():
    weight.requires_grad_(False)

In [13]:
# Turn gradient tracking back on for the parameters of the `fc` head only.
for head_weight in model.fc.parameters():
    head_weight.requires_grad_(True)

In [14]:
# Move the model's parameters and buffers to the selected device; the call
# returns the module, so the cell displays its structure.
model.to(device=device)

ViT(
  (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
  (positional_embedding): PositionalEmbedding1D()
  (transformer): Transformer(
    (blocks): ModuleList(
      (0): Block(
        (attn): MultiHeadedSelfAttention(
          (proj_q): Linear(in_features=768, out_features=768, bias=True)
          (proj_k): Linear(in_features=768, out_features=768, bias=True)
          (proj_v): Linear(in_features=768, out_features=768, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (pwff): PositionWiseFeedForward(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (drop): Dropout(p=0.1, inplace=False)
      )
      (1): Block(
 

In [15]:
import torch.optim.lr_scheduler as lr_schedular
from torch.optim.lr_scheduler import LambdaLR
from torch import optim
import math

epochs = 100
lr = 1e-3            # initial learning rate passed to SGD
lrf = 0.1            # final learning-rate factor reached at `epochs`
weight_decay = 5e-5
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)


def cosine_lr_factor(epoch):
    """Return the multiplier applied to the initial learning rate.

    The factor follows half a cosine wave, from 1.0 at ``epoch == 0`` down to
    ``lrf`` at ``epoch == epochs``.
    """
    return ((1 + math.cos(epoch * math.pi / epochs)) / 2) * (1 - lrf) + lrf


# A named function is used instead of rebinding `lr` to a lambda, so `lr`
# keeps meaning "the initial learning rate" for the rest of the notebook.
scheduler = LambdaLR(optimizer, lr_lambda=cosine_lr_factor)

criterion = nn.CrossEntropyLoss()

In [16]:
for epoch in tqdm(range(epochs)):
    model, optimizer, epoch_loss = train(model, criterion, optimizer, trainDataloader, device)
    # Advance the learning-rate schedule once per epoch. Without this call
    # the scheduler never changes the optimizer's learning rate.
    scheduler.step()
    print(epoch_loss)

  0%|          | 0/100 [01:05<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# for data in trainDataloader:
#     inputs, labels = data
#     inputs = inputs.float().to(device)
#     labels = labels.to(device)
#     outpus = model(inputs)
#     break
# loss = criterion(outpus, labels.float())