Instructions:

- From the three models, run one of the cell with label CUSTOM MODEL, EfficientNetv2 Transfer Model, or InceptionResNet Transfer Model to instantiate the model. Do not rull the three cells together to have a clue which model you are running.

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn, optim
from torchvision import transforms, datasets, models
from torch.hub import load_state_dict_from_url
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc, r2_score
from sklearn.metrics import r2_score
import random
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm
from timeit import default_timer as timer
import matplotlib.pyplot as plt

# BATCH SIZE and SEED

Set the batch size and set the seed for randomisation. It allows to have reproducible experiments

In [4]:
batch_size = 32

# Seed settings
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

# SET DEVICE

In [5]:
print(torch.__version__)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Running on device: ", device)

2.3.1
Running on device:  cpu


# DATASET CLASS FOR HEAD POSE

In [4]:
class CustomImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        """
        Args:
            root_dir (string): Directory with all the subfolders containing images and annotation files.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        self._load_data()

    def _load_data(self):
        # Traverse through each subfolder in the root directory
        for subfolder in os.listdir(self.root_dir):
            subfolder_path = os.path.join(self.root_dir, subfolder)
            if os.path.isdir(subfolder_path):
                # Locate the annotation file in the subfolder
                annotation_file = os.path.join(subfolder_path, 'angles.csv')
                if os.path.exists(annotation_file):
                    annotations = pd.read_csv(annotation_file)
                    for idx in range(len(annotations)):
                        img_id = str(annotations.iloc[idx, 0])
                        img_name = f'frame_{img_id}.png'  # Add .png extension
                        img_path = os.path.join(subfolder_path, img_name)
                        
                        if os.path.exists(img_path):
                            self.image_paths.append(img_path)
                            yaw = annotations.iloc[idx, 1]
                            roll = annotations.iloc[idx, 2]
                            pitch = annotations.iloc[idx, 3]
                            self.labels.append([yaw, roll, pitch])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert("RGB")
        
        labels = torch.tensor(self.labels[idx], dtype=torch.float32)

        if self.transform:
            image = self.transform(image)

        # sample = {'image': image, 'labels': labels}

        return image, labels

    def get_random_sample(self):
        # Get a random index
        random_idx = random.randint(0, self.__len__() - 1)
        return self.__getitem__(random_idx)



# DATASET CLASS FOR EYEGAZE

In [6]:
# EYE GAZE DATASET
class CustomImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []
        
        self._prepare_dataset()

    def _prepare_dataset(self):
        # Iterate through level 1 folders
        for level1_folder in os.listdir(self.root_dir):
            level1_path = os.path.join(self.root_dir, level1_folder)
            if os.path.isdir(level1_path):
                # Iterate through level 2 folders
                for level2_folder in os.listdir(level1_path):
                    level2_path = os.path.join(level1_path, level2_folder)
                    if os.path.isdir(level2_path):
                        # Load the annotations CSV
                        csv_path = os.path.join(level2_path, 'lookat_points.csv')
                        annotations_df = pd.read_csv(csv_path)
                        
                        # Gather all image paths and corresponding labels
                        for file in os.listdir(level2_path):
                            if file.endswith('.png'):
                                image_id = int(file.split('.')[0])
                                annotation_row = annotations_df[annotations_df['id'] == image_id]
                                
                                if not annotation_row.empty:
                                    x, y = annotation_row.iloc[0]['x'], annotation_row.iloc[0]['y']
                                    relative_path = os.path.join(level1_folder, level2_folder, file).replace('\\', '/')
                                    
                                    self.image_paths.append(relative_path)
                                    self.labels.append((x, y))
        
    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Get image path
        img_path = os.path.join(self.root_dir, self.image_paths[idx])
        
        # Load image
        image = Image.open(img_path).convert('RGB')
        
        # Get the label for this image
        labels = torch.tensor(self.labels[idx], dtype=torch.float32)
        
        if self.transform:
            image = self.transform(image)

        return image, labels

# PREPARE DATA
Prepare the dataset for training. Comment and uncomment the appropriate transform object below based on the task type

In [7]:
# Transformation for head pose
# transform = transforms.Compose([
#     transforms.Resize((299, 299)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4702, 0.3964, 0.3711), (0.2337, 0.2362, 0.2483))
# ])

# Transformation for eye gaze
transform = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize((0.6965, 0.5065, 0.40670), (0.2380, 0.2134, 0.1928))
])

# Initialize the dataset
dataset = CustomImageDataset(root_dir='/kaggle/input/headposeimgs/all_head_pose', transform=transform)

# Define the partition percentages
train_percentage = 0.7
val_percentage = 0.2
test_percentage = 0.1

# Calculate the sizes for each partition
dataset_size = len(dataset)
train_size = int(train_percentage * dataset_size)
val_size = int(val_percentage * dataset_size)
test_size = dataset_size - train_size - val_size  # Ensure the sum equals the dataset size

# Set the random seed for reproducibility
seed = 42
torch.manual_seed(seed)
# Split the dataset

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create DataLoader for each partition
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

print(len(train_loader))
print(len(val_loader))
print(len(test_loader))

# Example usage: Access a random sample from the training set
# random_sample = train_dataset[random.randint(0, len(train_dataset)-1)]
# print(random_sample['image'].shape, random_sample['labels'])

# print(train_dataset[0])

4028
1151
576


# CUSTOM MODEL

Classes for custom model.

In [48]:
class FactorizedConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, padding, stride=1):
        super(FactorizedConv2d, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=(1, kernel_size), stride=stride, padding=(0, padding), bias=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=(kernel_size, 1), stride=stride, padding=(padding, 0), bias=True)
      
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = nn.ReLU(inplace=True)(x)
        x = self.conv2(x)
        x = self.bn2(x)
        return nn.ReLU(inplace=True)(x)


In [49]:
class InceptionResNetBlock(nn.Module):
    def __init__(self, in_channels, scale=1.0):
        super(InceptionResNetBlock, self).__init__()
        self.scale = scale

        self.branch0 = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=1, bias=True),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True)
        )

        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=1, bias=True),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            FactorizedConv2d(32, 48, kernel_size=3, padding=1),
            FactorizedConv2d(48, 64, kernel_size=5, padding=2),
            nn.Conv2d(64, 32, kernel_size=1, bias=True)
        )

        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=1, bias=True),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            FactorizedConv2d(32, 48, kernel_size=3, padding=1),
            FactorizedConv2d(48, 64, kernel_size=7, padding=3),
            nn.Conv2d(64, 32, kernel_size=1, bias=True)
        )

        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=1, bias=True),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            FactorizedConv2d(32, 48, kernel_size=5, padding=2),
            FactorizedConv2d(48, 64, kernel_size=7, padding=3),
            nn.Conv2d(64, 32, kernel_size=1, bias=True)
            
        )

        self.conv = nn.Conv2d(128, in_channels, kernel_size=1, bias=True)
        self.bn = nn.BatchNorm2d(in_channels)

    def forward(self, x):
        branch0 = self.branch0(x)
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)

        mixed = torch.cat([branch0, branch1, branch2, branch3], dim=1)
        up = self.conv(mixed)
        up = self.bn(up)

        x = x + self.scale * up
        return nn.ReLU(inplace=True)(x)


In [50]:
class InceptionResNet(nn.Module):
    def __init__(self, num_classes=3):
        super(InceptionResNet, self).__init__()
        self.stem = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=0, bias=True),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            # nn.Conv2d(32, 32, kernel_size=3, padding=0, bias=True),
            # nn.BatchNorm2d(32),
            # nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=True),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=0),
            nn.Conv2d(64, 80, kernel_size=1, padding=0, bias=True),
            nn.BatchNorm2d(80),
            nn.ReLU(inplace=True),
            nn.Conv2d(80, 192, kernel_size=3, padding=0, bias=True),
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=0)
        )

        self.inception_resnet_a = nn.Sequential(
            InceptionResNetBlock(192, scale=0.2),
            InceptionResNetBlock(192, scale=0.2),
            # InceptionResNetBlock(192, scale=0.2),
            # InceptionResNetBlock(192, scale=0.2)
        )

        self.reduction_a = nn.Sequential(
            nn.Conv2d(192, 384, kernel_size=3, stride=2, padding=0, bias=True),
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True)
        )

        self.inception_resnet_b = nn.Sequential(
            InceptionResNetBlock(384, scale=0.2),
            InceptionResNetBlock(384, scale=0.2),
            # InceptionResNetBlock(384, scale=0.2),
            # InceptionResNetBlock(384, scale=0.2)
        )

        self.reduction_b = nn.Sequential(
            nn.Conv2d(384, 1024, kernel_size=3, stride=2, padding=0, bias=True),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True)
        )

        self.inception_resnet_c = nn.Sequential(
            InceptionResNetBlock(1024, scale=0.2),
            InceptionResNetBlock(1024, scale=0.2),
            # InceptionResNetBlock(1024, scale=0.2),
            # InceptionResNetBlock(1024, scale=0.2)
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = self.stem(x)
        x = self.inception_resnet_a(x)
        x = self.reduction_a(x)
        x = self.inception_resnet_b(x)
        x = self.reduction_b(x)
        x = self.inception_resnet_c(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

Create the model for the custom model. here, InceptionResNet is the customized model for this project

In [51]:
# Define the model, loss function, and optimizer
model = InceptionResNet(num_classes=2)
model = model.to(device)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Example usage:
num_params = count_parameters(model)
print(f'Total number of parameters: {num_params}')

Total number of parameters: 6323762


# EfficientNetv2 Transfer Model

Load and create the EfficientNetv2 pretrained model.

In [None]:
class EfficientNetV2Regression(nn.Module):
    def __init__(self, num_outputs=2):
        super(EfficientNetV2Regression, self).__init__()
        self.model = models.efficientnet_v2_s(weights=None)#weights=models.EfficientNet_V2_S_Weights.IMAGENET1K_V1)
       
        # Replace the classifier layer to output `num_outputs` regression targets
        self.model.classifier[1] = nn.Linear(self.model.classifier[1].in_features, num_outputs)
    def forward(self, x):
        return self.model(x)

# Instantiate the model
model = EfficientNetV2Regression(num_outputs=3)
model = model.to(device)

#  InceptionResNet Transfer Model

Load and create the InceptionResNet pretrained model.

In [None]:
import timm
model = timm.create_model('inception_resnet_v2', pretrained=True)
model = model.to(device)

in_features = model.classif.in_features
model.classif = nn.Linear(in_features, 3)
model.classif = model.classif.to(device)

Define the Mean Absolute Error loss function.

In [2]:
# Define custom loss functions
def l1_loss(outputs, targets):
    return nn.L1Loss()(outputs, targets)

# DECLARE STATISTICS VARIABLES

In [16]:
train_stats = {}
val_stats = {}
checkpoint = {}
start_epoch = 0
output_path = '/kaggle/working/'
resume_training = True

if resume_training:
    # Path to the stat files
    prev_train_stats_file_path = '/kaggle/working/effinet_train_all_stat_dict.pth'
    prev_val_stats_file_path = '/kaggle/working/effinet_val_all_stat_dict.pth'
    prev_checkpoint_path = '/kaggle/working/effinet_checkpoint_at_epoch_3_loss_3_18682163.pth'

    # Read the train stat file
    with open(prev_train_stats_file_path, 'rb') as file:
        train_stats = torch.load(file)

    # Read the val stat file
    with open(prev_val_stats_file_path, 'rb') as file:
        val_stats = torch.load(file)

    print("TRAIN STATS: ",len(train_stats))
    print("VAL STATS: ", len(val_stats))

    print("RESUMING TRAINING FROM PREVIOUS STOPPED MODEL....")
        
#     model.load_state_dict(torch.load(prev_checkpoint_path)['model_state_dict'])

# CHECK NUMBER OF GPUs

In [17]:
if torch.cuda.is_available():
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f'Number of GPUs: {num_gpus}')
else:
    print('No GPUs available.')

# FINE TUNE
# checkpoint = torch.load("checkpoint_at_epoch_26_loss_1_63385426.pth")
# model.load_state_dict(checkpoint['model_state_dict'])

Number of GPUs: 1


# HEAD POSE TRAINING

Training code for head pose

In [None]:
# criterion = nn.L1Loss()  # Mean Absolute Error Loss
initial_lr = 1e-3
optimizer = optim.Adam(model.parameters(), lr=initial_lr)

# Define the training loop
num_epochs = 50
starting_epoch = 0
patience = 1
if resume_training:
    best_val_loss = val_stats[max(val_stats.keys())]['avg_mae']
else:
    best_val_loss = float('inf')
print("CURRENT BEST VAL LOSS IS: ", best_val_loss)
    
early_stop_counter = 0
successive_lr_update_count = 0
max_successive_lr_update_count = 2

best_checkpoint_name = ''
lr = initial_lr
print("INITIAL LEARNING RATE: ", lr)

train_all_stat_dict = train_stats
val_all_stat_dict = val_stats
print("INITIAL TRAIN STATS: ", len(train_all_stat_dict))
print("INITIAL VAL STATS: ", len(val_all_stat_dict))

if torch.cuda.device_count() > 1:
    print(f'Using {torch.cuda.device_count()} GPUs!')
    model = nn.DataParallel(model)

for epoch in range(starting_epoch, num_epochs):
    model.train()
    running_loss = 0.0
    running_yaw_loss = 0.0
    running_roll_loss = 0.0
    running_pitch_loss = 0.0
    num_batches = len(train_loader)
    
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch", mininterval=1)

    for inputs, targets in train_loader_tqdm:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        
        # Compute L1 loss for yaw, roll, and pitch
        yaw_loss = l1_loss(outputs[:, 0], targets[:, 0])
        roll_loss = l1_loss(outputs[:, 1], targets[:, 1])
        pitch_loss = l1_loss(outputs[:, 2], targets[:, 2])
        avg_loss = (yaw_loss + roll_loss + pitch_loss) / 3
        
        avg_loss.backward()
        optimizer.step()
        
        running_loss += avg_loss.item()
        running_yaw_loss += yaw_loss.item()
        running_roll_loss += roll_loss.item()
        running_pitch_loss += pitch_loss.item()
        
        train_loader_tqdm.set_postfix(loss=running_loss / (len(train_loader_tqdm) + 1))

    avg_loss = running_loss / num_batches
    avg_yaw_loss = running_yaw_loss / num_batches
    avg_roll_loss = running_roll_loss / num_batches
    avg_pitch_loss = running_pitch_loss / num_batches
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}, Yaw Loss: {avg_yaw_loss:.4f}, Roll Loss: {avg_roll_loss:.4f}, Pitch Loss: {avg_pitch_loss:.4f}')
    

    train_all_stat_dict[epoch + 1] = {
        'lr': lr,
        'mae': avg_loss,
        'yaw_mae': avg_yaw_loss,
        'roll_mae': avg_roll_loss,
        'pitch_mae': avg_pitch_loss
    }

    # Evaluation on validation set with tqdm progress bar
    model.eval()
    val_loss = 0.0
    val_yaw_loss = 0.0
    val_roll_loss = 0.0
    val_pitch_loss = 0.0
    val_rmse_loss = 0.0
    val_r2_score = 0.0
    num_val_batches = len(val_loader)
    
    val_loader_tqdm = tqdm(val_loader, desc="Validation", unit="batch", mininterval=1)

    with torch.no_grad():
        for inputs, targets in val_loader_tqdm:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            
            # Compute L1 loss for yaw, roll, and pitch
            yaw_loss = l1_loss(outputs[:, 0], targets[:, 0])
            roll_loss = l1_loss(outputs[:, 1], targets[:, 1])
            pitch_loss = l1_loss(outputs[:, 2], targets[:, 2])
            avg_loss = (yaw_loss + roll_loss + pitch_loss) / 3
            
            # Compute RMSE and R2 loss
            rmse = rmse_loss(outputs, targets)
            r2 = r2_score(outputs, targets)
            
            val_loss += avg_loss.item()
            val_yaw_loss += yaw_loss.item()
            val_roll_loss += roll_loss.item()
            val_pitch_loss += pitch_loss.item()
            val_rmse_loss += rmse.item()
            val_r2_score += r2 #.item()
            
            val_loader_tqdm.set_postfix(loss=val_loss / (len(val_loader_tqdm) + 1))

    avg_val_loss = val_loss / num_val_batches
    avg_val_yaw_loss = val_yaw_loss / num_val_batches
    avg_val_roll_loss = val_roll_loss / num_val_batches
    avg_val_pitch_loss = val_pitch_loss / num_val_batches
    avg_val_rmse_loss = val_rmse_loss / num_val_batches
    avg_val_r2_score = val_r2_score / num_val_batches
    
    print(f'Validation Avg Loss: {avg_val_loss:.4f}, Yaw Loss: {avg_val_yaw_loss:.4f}, Roll Loss: {avg_val_roll_loss:.4f}, Pitch Loss: {avg_val_pitch_loss:.4f}, RMSE: {avg_val_rmse_loss:.4f}, R2: {avg_val_r2_score:.4f}')
    
    val_all_stat_dict[epoch + 1] = {
        'lr': lr,
        'avg_mae': avg_val_loss,
        'yaw_mae': avg_val_yaw_loss,
        'roll_mae': avg_val_roll_loss,
        'pitch_mae': avg_val_pitch_loss,
        'rmse': avg_val_rmse_loss,
        'r2_score': avg_val_r2_score
    }
    
    # Save model if validation loss has improved
    if avg_val_loss < best_val_loss:
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'avg_mae': avg_val_loss,
            'yaw_mae': avg_val_yaw_loss,
            'roll_mae': avg_val_roll_loss,
            'pitch_mae': avg_val_pitch_loss,
            'rmse': avg_val_rmse_loss,
            'r2_score': avg_val_r2_score,
        }
        best_val_loss = avg_val_loss
        
        loss = best_val_loss
        loss = str(round(loss,8)).replace('.', '_')
        
        best_checkpoint_name = f'effinet_checkpoint_at_epoch_{epoch}_loss_{loss}'
        torch.save(checkpoint, f'{output_path}/{best_checkpoint_name}.pth')
        
        early_stop_counter = 0
        successive_lr_update_count = 0
    else:
        early_stop_counter += 1
    
    # Early stopping and learning rate adjustment
    if early_stop_counter >= patience:
        
        for g in optimizer.param_groups:
            lr = g['lr'] * 0.1
            g['lr'] = lr
        successive_lr_update_count += 1
        print(f'Validation loss did not improve for {patience} epochs. Reducing learning rate to {lr} and loading best model.')
        if best_checkpoint_name == '':
            model.load_state_dict(torch.load(prev_checkpoint_path)['model_state_dict'])
        else:
            checkpoint = torch.load(f'{output_path}/{best_checkpoint_name}.pth')
            model.load_state_dict(checkpoint['model_state_dict'])
        early_stop_counter = 0
    
    torch.save(train_all_stat_dict, f'{output_path}/effinet_train_all_stat_dict.pth')
    torch.save(val_all_stat_dict, f'{output_path}/effinet_val_all_stat_dict.pth')

    if successive_lr_update_count > max_successive_lr_update_count:
        break

print('Training complete')

CURRENT BEST VAL LOSS IS:  inf
INITIAL LEARNING RATE:  0.001
INITIAL TRAIN STATS:  0
INITIAL VAL STATS:  0


Epoch 1/50: 100%|██████████| 4028/4028 [42:08<00:00,  1.59batch/s, loss=7.79]  


Epoch [1/50], Avg Loss: 7.7884, Yaw Loss: 8.8732, Roll Loss: 7.0448, Pitch Loss: 7.4472


Validation: 100%|██████████| 1151/1151 [07:39<00:00,  2.51batch/s, loss=5.13]


Validation Avg Loss: 5.1381, Yaw Loss: 6.4810, Roll Loss: 4.3687, Pitch Loss: 4.5646, RMSE: 8.3053, R2: 0.7749


Epoch 2/50: 100%|██████████| 4028/4028 [36:33<00:00,  1.84batch/s, loss=5.08]


Epoch [2/50], Avg Loss: 5.0797, Yaw Loss: 6.3782, Roll Loss: 4.1660, Pitch Loss: 4.6950


Validation: 100%|██████████| 1151/1151 [04:56<00:00,  3.88batch/s, loss=4.57]


Validation Avg Loss: 4.5727, Yaw Loss: 5.8986, Roll Loss: 3.7424, Pitch Loss: 4.0771, RMSE: 7.3029, R2: 0.8257


Epoch 3/50: 100%|██████████| 4028/4028 [38:52<00:00,  1.73batch/s, loss=4.2] 


Epoch [3/50], Avg Loss: 4.1971, Yaw Loss: 5.5364, Roll Loss: 3.3240, Pitch Loss: 3.7309


Validation: 100%|██████████| 1151/1151 [05:06<00:00,  3.75batch/s, loss=3.65]


Validation Avg Loss: 3.6536, Yaw Loss: 5.0174, Roll Loss: 2.5220, Pitch Loss: 3.4214, RMSE: 5.9587, R2: 0.8804


Epoch 4/50:  44%|████▎     | 1758/4028 [16:01<20:53,  1.81batch/s, loss=1.67] 

## Fine-tuning of head pose to eyegaze

Fine-tuning of the eye gaze from head pose model

In [30]:
model = InceptionResNet(num_classes=2)
checkpoint = "/kaggle/working/eyegaze_checkpoint_at_epoch_8_loss_0_34554856.pth"
model.load_state_dict(torch.load(checkpoint)['model_state_dict'])
# model.fc = nn.Linear(model.fc.in_features, 2)
model = model.to(device)

# EYEGAZE TRAINING

Eye gaze training code

In [31]:
# criterion = nn.L1Loss()  # Mean Absolute Error Loss
initial_lr = 1e-4
optimizer = optim.Adam(model.parameters(), lr=initial_lr)

# Define the training loop
num_epochs = 50
starting_epoch = 9
patience = 1
if resume_training:
    best_val_loss = val_stats[max(val_stats.keys())]['avg_mae']
else:
    best_val_loss = float('inf')
print("CURRENT BEST VAL LOSS IS: ", best_val_loss)
    
early_stop_counter = 0
successive_lr_update_count = 0
max_successive_lr_update_count = 2

best_checkpoint_name = ''
lr = initial_lr
print("INITIAL LEARNING RATE: ", lr)

train_all_stat_dict = train_stats
val_all_stat_dict = val_stats
print("INITIAL TRAIN STATS: ", len(train_all_stat_dict))
print("INITIAL VAL STATS: ", len(val_all_stat_dict))

if torch.cuda.device_count() > 1:
    print(f'Using {torch.cuda.device_count()} GPUs!')
    model = nn.DataParallel(model)

for epoch in range(starting_epoch, num_epochs):
    model.train()
    running_loss = 0.0
    running_x_loss = 0.0
    running_y_loss = 0.0
    num_batches = len(train_loader)
    
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch", mininterval=1)

    for inputs, targets in train_loader_tqdm:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        
        # Compute L1 loss for yaw, roll, and pitch
        x_loss = l1_loss(outputs[:, 0], targets[:, 0])
        y_loss = l1_loss(outputs[:, 1], targets[:, 1])
        avg_loss = (x_loss + y_loss) / 2
        
        avg_loss.backward()
        optimizer.step()
        
        running_loss += avg_loss.item()
        running_x_loss += x_loss.item()
        running_y_loss += y_loss.item()
        
        train_loader_tqdm.set_postfix(loss=running_loss / (len(train_loader_tqdm) + 1))

    avg_loss = running_loss / num_batches
    avg_x_loss = running_x_loss / num_batches
    avg_y_loss = running_y_loss / num_batches
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}, X Loss: {avg_x_loss:.4f}, Y Loss: {avg_y_loss:.4f}')
    

    train_all_stat_dict[epoch + 1] = {
        'lr': lr,
        'mae': avg_loss,
        'x_mae': avg_x_loss,
        'y_mae': avg_y_loss
    }

    # Evaluation on validation set with tqdm progress bar
    model.eval()
    val_loss = 0.0
    val_x_loss = 0.0
    val_y_loss = 0.0
    val_rmse_loss = 0.0
    val_r2_score = 0.0
    num_val_batches = len(val_loader)
    
    val_loader_tqdm = tqdm(val_loader, desc="Validation", unit="batch", mininterval=1)

    with torch.no_grad():
        for inputs, targets in val_loader_tqdm:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            
            # Compute L1 loss for yaw, roll, and pitch
            x_loss = l1_loss(outputs[:, 0], targets[:, 0])
            y_loss = l1_loss(outputs[:, 1], targets[:, 1])
            avg_loss = (x_loss + y_loss) / 2
            
            # Compute RMSE and R2 loss
            rmse = rmse_loss(outputs, targets)
            r2 = r2_score(outputs, targets)
            
            val_loss += avg_loss.item()
            val_x_loss += x_loss.item()
            val_y_loss += y_loss.item()
            val_rmse_loss += rmse.item()
            val_r2_score += r2 #.item()
            
            val_loader_tqdm.set_postfix(loss=val_loss / (len(val_loader_tqdm) + 1))

    avg_val_loss = val_loss / num_val_batches
    avg_val_x_loss = val_x_loss / num_val_batches
    avg_val_y_loss = val_y_loss / num_val_batches
    avg_val_rmse_loss = val_rmse_loss / num_val_batches
    avg_val_r2_score = val_r2_score / num_val_batches
    
    print(f'Validation Avg Loss: {avg_val_loss:.4f}, X Loss: {avg_val_x_loss:.4f}, Y Loss: {avg_val_y_loss:.4f}, RMSE: {avg_val_rmse_loss:.4f}, R2: {avg_val_r2_score:.4f}')
    
    val_all_stat_dict[epoch + 1] = {
        'lr': lr,
        'avg_mae': avg_val_loss,
        'x_mae': avg_val_x_loss,
        'y_mae': avg_val_y_loss,
        'rmse': avg_val_rmse_loss,
        'r2_score': avg_val_r2_score
    }
    
    # Save model if validation loss has improved
    if avg_val_loss < best_val_loss:
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'avg_mae': avg_val_loss,
            'x_mae': avg_val_x_loss,
            'y_mae': avg_val_y_loss,
            'rmse': avg_val_rmse_loss,
            'r2_score': avg_val_r2_score,
        }
        best_val_loss = avg_val_loss
        
        loss = best_val_loss
        loss = str(round(loss,8)).replace('.', '_')
        
        best_checkpoint_name = f'eyegaze_checkpoint_at_epoch_{epoch}_loss_{loss}'
        torch.save(checkpoint, f'{output_path}/{best_checkpoint_name}.pth')
        
        early_stop_counter = 0
        successive_lr_update_count = 0
    else:
        early_stop_counter += 1
    
    # Early stopping and learning rate adjustment
    if early_stop_counter >= patience:
        
        for g in optimizer.param_groups:
            g['lr'] = g['lr'] * 0.1
            lr = g['lr']
            successive_lr_update_count += 1
        print(f'Validation loss did not improve for {patience} epochs. Reducing learning rate to {lr} and loading best model.')
        if best_checkpoint_name == '':
            model.load_state_dict(torch.load(prev_checkpoint_path)['model_state_dict'])
        else:
            checkpoint = torch.load(f'{output_path}/{best_checkpoint_name}.pth')
            model.load_state_dict(checkpoint['model_state_dict'])
        early_stop_counter = 0
    
    torch.save(train_all_stat_dict, f'{output_path}/train_all_stat_dict.pth')
    torch.save(val_all_stat_dict, f'{output_path}/val_all_stat_dict.pth')
    
    if successive_lr_update_count > max_successive_lr_update_count:
        break

print('Training complete')


CURRENT BEST VAL LOSS IS:  0.34554855867923245
INITIAL LEARNING RATE:  0.0001
INITIAL TRAIN STATS:  9
INITIAL VAL STATS:  9


Epoch 10/50: 100%|██████████| 3336/3336 [25:11<00:00,  2.21batch/s, loss=0.403]


Epoch [10/50], Avg Loss: 0.4035, X Loss: 0.4185, Y Loss: 0.3885


Validation: 100%|██████████| 953/953 [06:28<00:00,  2.45batch/s, loss=0.339] 


Validation Avg Loss: 0.3393, X Loss: 0.3536, Y Loss: 0.3250, RMSE: 0.4448, R2: 0.9960


Epoch 11/50: 100%|██████████| 3336/3336 [19:19<00:00,  2.88batch/s, loss=0.398]


Epoch [11/50], Avg Loss: 0.3982, X Loss: 0.4139, Y Loss: 0.3824


Validation: 100%|██████████| 953/953 [03:22<00:00,  4.70batch/s, loss=0.348] 


Validation Avg Loss: 0.3481, X Loss: 0.3545, Y Loss: 0.3417, RMSE: 0.4578, R2: 0.9957
Validation loss did not improve for 1 epochs. Reducing learning rate to 1e-05 and loading best model.


Epoch 12/50: 100%|██████████| 3336/3336 [19:45<00:00,  2.81batch/s, loss=0.384]


Epoch [12/50], Avg Loss: 0.3845, X Loss: 0.3991, Y Loss: 0.3699


Validation: 100%|██████████| 953/953 [03:34<00:00,  4.44batch/s, loss=0.328] 


Validation Avg Loss: 0.3285, X Loss: 0.3368, Y Loss: 0.3201, RMSE: 0.4338, R2: 0.9961


Epoch 13/50: 100%|██████████| 3336/3336 [20:12<00:00,  2.75batch/s, loss=0.38] 


Epoch [13/50], Avg Loss: 0.3798, X Loss: 0.3945, Y Loss: 0.3652


Validation: 100%|██████████| 953/953 [03:52<00:00,  4.10batch/s, loss=0.322] 


Validation Avg Loss: 0.3220, X Loss: 0.3388, Y Loss: 0.3052, RMSE: 0.4248, R2: 0.9963


Epoch 14/50: 100%|██████████| 3336/3336 [21:49<00:00,  2.55batch/s, loss=0.379]


Epoch [14/50], Avg Loss: 0.3793, X Loss: 0.3924, Y Loss: 0.3661


Validation: 100%|██████████| 953/953 [04:03<00:00,  3.91batch/s, loss=0.319] 


Validation Avg Loss: 0.3190, X Loss: 0.3347, Y Loss: 0.3034, RMSE: 0.4216, R2: 0.9964


Epoch 15/50: 100%|██████████| 3336/3336 [20:31<00:00,  2.71batch/s, loss=0.377]


Epoch [15/50], Avg Loss: 0.3776, X Loss: 0.3913, Y Loss: 0.3639


Validation: 100%|██████████| 953/953 [03:43<00:00,  4.26batch/s, loss=0.322] 


Validation Avg Loss: 0.3219, X Loss: 0.3354, Y Loss: 0.3084, RMSE: 0.4232, R2: 0.9964
Validation loss did not improve for 1 epochs. Reducing learning rate to 1.0000000000000002e-06 and loading best model.


Epoch 16/50: 100%|██████████| 3336/3336 [20:37<00:00,  2.70batch/s, loss=0.377]


Epoch [16/50], Avg Loss: 0.3769, X Loss: 0.3904, Y Loss: 0.3633


Validation: 100%|██████████| 953/953 [03:41<00:00,  4.30batch/s, loss=0.327] 


Validation Avg Loss: 0.3278, X Loss: 0.3486, Y Loss: 0.3070, RMSE: 0.4315, R2: 0.9962
Validation loss did not improve for 1 epochs. Reducing learning rate to 1.0000000000000002e-07 and loading best model.


Epoch 17/50: 100%|██████████| 3336/3336 [20:52<00:00,  2.66batch/s, loss=0.376]


Epoch [17/50], Avg Loss: 0.3765, X Loss: 0.3911, Y Loss: 0.3619


Validation: 100%|██████████| 953/953 [05:21<00:00,  2.97batch/s, loss=0.327] 


Validation Avg Loss: 0.3272, X Loss: 0.3478, Y Loss: 0.3066, RMSE: 0.4315, R2: 0.9962
Validation loss did not improve for 1 epochs. Reducing learning rate to 1.0000000000000004e-08 and loading best model.
Training complete


# HEAD POSE TEST

Testing code for head pose

In [None]:
checkpoint = torch.load("/kaggle/working/eyegaze_checkpoint_at_epoch_13_loss_0_31901939.pth")
model.load_state_dict(checkpoint['model_state_dict'])

model.eval()
test_loss = 0.0
test_yaw_loss = 0.0
test_roll_loss = 0.0
test_pitch_loss = 0.0
test_rmse_loss = 0.0
test_r2_score = 0.0
num_test_batches = len(test_loader)

test_loader_tqdm = tqdm(test_loader, desc="TEST", unit="batch", mininterval=1)

with torch.no_grad():
    for inputs, targets in test_loader_tqdm:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)

        # Compute L1 loss for yaw, roll, and pitch
        yaw_loss = l1_loss(outputs[:, 0], targets[:, 0])
        roll_loss = l1_loss(outputs[:, 1], targets[:, 1])
        pitch_loss = l1_loss(outputs[:, 2], targets[:, 2])
        avg_loss = (yaw_loss + roll_loss + pitch_loss) / 3

        # Compute RMSE and R2 loss
        rmse = rmse_loss(outputs, targets)
        r2 = r2_score(outputs, targets)

        test_loss += avg_loss.item()
        test_yaw_loss += yaw_loss.item()
        test_roll_loss += roll_loss.item()
        test_pitch_loss += pitch_loss.item()
        test_rmse_loss += rmse.item()
        test_r2_score += r2 #.item()

        test_loader_tqdm.set_postfix(loss=test_loss / (len(test_loader_tqdm) + 1))

avg_test_loss = test_loss / num_test_batches
avg_test_yaw_loss = test_yaw_loss / num_test_batches
avg_test_roll_loss = test_roll_loss / num_test_batches
avg_test_pitch_loss = test_pitch_loss / num_test_batches
avg_test_rmse_loss = test_rmse_loss / num_test_batches
avg_test_r2_score = test_r2_score / num_test_batches

print(f'Validation Avg Loss: {avg_test_loss:.4f}, Yaw Loss: {avg_test_yaw_loss:.4f}, Roll Loss: {avg_test_roll_loss:.4f}, Pitch Loss: {avg_test_pitch_loss:.4f}, RMSE: {avg_test_rmse_loss:.4f}, R2: {avg_test_r2_score:.4f}')

In [65]:
class EfficientNetV2Regression(nn.Module):
    def __init__(self, num_outputs=2):
        super(EfficientNetV2Regression, self).__init__()
        self.model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.IMAGENET1K_V1)
       
        # Replace the classifier layer to output `num_outputs` regression targets
        self.model.classifier[1] = nn.Linear(self.model.classifier[1].in_features, num_outputs)
    def forward(self, x):
        return self.model(x)

# Instantiate the model
model = EfficientNetV2Regression(num_outputs=2)
model = model.to(device)

In [58]:
import timm
model = timm.create_model('inception_resnet_v2', pretrained=True)
model = model.to(device)

in_features = model.classif.in_features
model.classif = nn.Linear(in_features, 2)
model.classif = model.classif.to(device)

# EYEGAZE TEST CODE

Testing code for eye gaze model

In [67]:
checkpoint = torch.load("/kaggle/input/eyegaze-effinet-model-and-data/eyegaze_checkpoint_at_epoch_7_loss_0_27638153.pth")
model.load_state_dict(checkpoint['model_state_dict'])

model.eval()
test_loss = 0.0
test_x_loss = 0.0
test_y_loss = 0.0
test_rmse_loss = 0.0
test_r2_score = 0.0
num_test_batches = len(test_loader)

test_loader_tqdm = tqdm(test_loader, desc="TEST", unit="batch", mininterval=1)

with torch.no_grad():
    for inputs, targets in test_loader_tqdm:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)

        # Compute L1 loss for x, y, and pitch
        x_loss = l1_loss(outputs[:, 0], targets[:, 0])
        y_loss = l1_loss(outputs[:, 1], targets[:, 1])
        avg_loss = (x_loss + y_loss) / 2

        # Compute RMSE and R2 loss
        rmse = rmse_loss(outputs, targets)
        r2 = r2_score(outputs, targets)

        test_loss += avg_loss.item()
        test_x_loss += x_loss.item()
        test_y_loss += y_loss.item()
        test_rmse_loss += rmse.item()
        test_r2_score += r2 #.item()

        test_loader_tqdm.set_postfix(loss=test_loss / (len(test_loader_tqdm) + 1))

avg_test_loss = test_loss / num_test_batches
avg_test_x_loss = test_x_loss / num_test_batches
avg_test_y_loss = test_y_loss / num_test_batches
avg_test_rmse_loss = test_rmse_loss / num_test_batches
avg_test_r2_score = test_r2_score / num_test_batches

print(f'Validation Avg Loss: {avg_test_loss:.4f}, x Loss: {avg_test_x_loss:.4f}, y Loss: {avg_test_y_loss:.4f}, RMSE: {avg_test_rmse_loss:.4f}, R2: {avg_test_r2_score:.4f}')

TEST: 100%|██████████| 477/477 [01:58<00:00,  4.03batch/s, loss=0.256] 

Validation Avg Loss: 0.2567, x Loss: 0.2893, y Loss: 0.2242, RMSE: 0.3406, R2: 0.9976



