In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Load

In [None]:
import os
import numpy as np

def load_images_from_batches(batch_folder):
    """
    Load processed images from batch files and store them in a dictionary.

    Every file in ``batch_folder`` whose name ends with ``.npz`` is opened and
    its ``images`` and ``filenames`` arrays are read. Entry ``i`` of
    ``filenames`` is converted with ``int(...)`` and used as the key for
    ``images[i]``.

    Files are visited in sorted name order so that, if the same image ID occurs
    in more than one batch, the surviving entry is deterministic (the one from
    the last file in sorted order). ``os.listdir`` alone gives no ordering
    guarantee.

    Args:
        batch_folder (str): Path to the folder containing the saved batch files.

    Returns:
        image_dict (dict): Dictionary mapping image ID (integer) to preprocessed image data.

    Raises:
        KeyError: If a batch file lacks the ``images`` or ``filenames`` array.
        ValueError: If a filename entry cannot be converted to ``int``.
        IndexError: If ``filenames`` has more entries than ``images``.
    """
    image_dict = {}

    # Loop through all files in the batch folder (sorted for reproducibility)
    for batch_file in sorted(os.listdir(batch_folder)):
        if not batch_file.endswith('.npz'):
            continue

        batch_path = os.path.join(batch_folder, batch_file)

        # np.load returns an NpzFile that keeps the archive open; the context
        # manager closes it once both arrays have been read into memory.
        with np.load(batch_path) as batch_data:
            images = batch_data['images']
            filenames = batch_data['filenames']

        # Store each image under its integer ID (int() also drops leading zeros)
        for i, image_id in enumerate(filenames):
            image_dict[int(image_id)] = images[i]

    return image_dict

# Google Drive folder holding the preprocessed training-image batches (.npz files)
batch_folder = '/content/drive/MyDrive/Colab Notebooks/Deep Learning Project/Processed Images Train'

# Build the {integer image ID -> image array} lookup used by the dataset classes below
image_dict = load_images_from_batches(batch_folder)

# Sanity check: report how many distinct image IDs were loaded
print(f"Total images loaded: {len(image_dict)}")


Total images loaded: 20000


In [None]:
import pickle
import numpy as np


# Load the target data (train_y and test_y) from pickle files; these objects are
# later passed as `data_y`, whose 'label' column the dataset classes read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
with open('/content/drive/MyDrive/Colab Notebooks/Deep Learning Project/tokenized_train_y.pkl', 'rb') as f:
    train_y = pickle.load(f)

with open('/content/drive/MyDrive/Colab Notebooks/Deep Learning Project/tokenized_test_y.pkl', 'rb') as f:
    test_y = pickle.load(f)

# Load the tokenized inputs (train_x and test_x) from pickle files; these are
# later passed as `data_x` ('input_ids', 'attention_mask', 'image_id' columns).
with open('/content/drive/MyDrive/Colab Notebooks/Deep Learning Project/tokenized_train_x.pkl', 'rb') as f:
    train_x = pickle.load(f)

with open('/content/drive/MyDrive/Colab Notebooks/Deep Learning Project/tokenized_test_x.pkl', 'rb') as f:
    test_x = pickle.load(f)

# Load the per-example confidence scores, stored in separate pickle files
with open('/content/drive/MyDrive/Colab Notebooks/Deep Learning Project/processed_train_y_conf.pkl', 'rb') as f:
    train_y_cof = pickle.load(f)

with open('/content/drive/MyDrive/Colab Notebooks/Deep Learning Project/processed_test_y_conf.pkl', 'rb') as f:
    test_y_cof = pickle.load(f)

# Attach the confidence scores to the target data under 'confidence_scores'.
# NOTE(review): presumably both objects are row-aligned (same order/index) — confirm.
train_y['confidence_scores']=train_y_cof['confidence_scores']
test_y['confidence_scores']=test_y_cof['confidence_scores']


### OLD

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from torchvision import transforms, models
from torchvision.models import VGG16_Weights

###########################################
# Custom Dataset Class for Loading VQA Data
###########################################

class VQADataset(Dataset):
    """Dataset yielding (input_ids, attention_mask, image, label, confidence) tuples.

    Args:
        data_x: Indexable by column name; the 'input_ids', 'attention_mask' and
            'image_id' columns are read positionally via ``.iloc``.
        data_y: Indexable by column name; 'label' is always read and
            'confidence_scores' is read when ``is_training`` is true.
        image_dict: Mapping from image ID to image array.
            Assumes channel-last arrays (the code moves axis 2 to the front) — TODO confirm.
        augment: Enables random augmentation of the image tensor.
        augment_prob: Probability of augmenting a sample when ``augment`` is on.
        is_training: When false, the confidence vector is a one-hot vector at
            the label position instead of the stored scores.
    """

    def __init__(self, data_x, data_y, image_dict, augment=False, augment_prob=0.3, is_training=True):
        self.data_x, self.data_y = data_x, data_y
        self.image_dict = image_dict
        self.augment, self.augment_prob = augment, augment_prob
        self.is_training = is_training

        # Mild augmentation pipeline, applied only when requested in __getitem__.
        augmentation_steps = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
            transforms.RandomResizedCrop(224, scale=(0.95, 1.0)),
        ]
        self.train_transform = transforms.Compose(augmentation_steps)

    def __len__(self):
        return len(self.data_x)

    def __getitem__(self, idx):
        # Text side: token ids and attention mask for this example.
        raw_input_ids = self.data_x['input_ids'].iloc[idx]
        input_ids = torch.tensor(raw_input_ids, dtype=torch.long)
        attention_mask = torch.tensor(self.data_x['attention_mask'].iloc[idx], dtype=torch.long)

        # Image side: look the array up by ID and move the last axis first.
        pixels = self.image_dict[self.data_x['image_id'].iloc[idx]]
        image = torch.tensor(pixels, dtype=torch.float32).permute(2, 0, 1)

        # The random draw only happens when augmentation is switched on.
        if self.augment:
            if torch.rand(1).item() < self.augment_prob:
                image = self.train_transform(image)

        answer_idx = self.data_y['label'].iloc[idx]
        label = torch.tensor(answer_idx, dtype=torch.long)

        if self.is_training:
            confidence = torch.tensor(self.data_y['confidence_scores'].iloc[idx], dtype=torch.float32)
        else:
            # One slot per entry of the input_ids row, with 1.0 at the label position.
            confidence = torch.zeros(len(raw_input_ids), dtype=torch.float32)
            confidence[answer_idx] = 1.0

        return input_ids, attention_mask, image, label, confidence

###########################################
# BERT-based Text Model with Freezing up to Layer 8
###########################################

class TextModel(nn.Module):
    """BERT encoder followed by a linear projection of the first-token embedding.

    The first 8 encoder layers of BERT are always frozen; the remaining BERT
    parameters keep their default ``requires_grad`` state.

    Args:
        bert_model_name: Name passed to ``BertModel.from_pretrained``.
        fine_tune: Accepted for interface compatibility.
            NOTE(review): this flag is not read anywhere in this class.
        reduced_dim: Output size of the projection applied to the 768-d feature.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune=False, reduced_dim=512):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Freeze encoder layers 0-7 in one call on the sliced ModuleList.
        self.bert.encoder.layer[:8].requires_grad_(False)

        # Project the 768-d BERT feature down to `reduced_dim`.
        self.fc_reduce = nn.Linear(768, reduced_dim)

    def forward(self, input_ids, attention_mask):
        """Encode every choice and return features shaped (batch, num_choices, reduced_dim).

        ``input_ids`` must be 3-D (it is unpacked into batch, choices, sequence
        length); ``attention_mask`` is flattened the same way.
        """
        n_batch, n_choices, n_tokens = input_ids.shape

        # Fold the choice axis into the batch axis so BERT sees 2-D inputs.
        flat_ids = input_ids.view(-1, n_tokens)
        flat_mask = attention_mask.view(-1, n_tokens)

        # Keep only the hidden state at sequence position 0.
        first_token = self.bert(input_ids=flat_ids, attention_mask=flat_mask).last_hidden_state[:, 0, :]

        return self.fc_reduce(first_token).view(n_batch, n_choices, -1)

###########################################
# Image Model using VGG-16 with Customization
###########################################

class ImageModel(nn.Module):
    """VGG-16 convolutional features followed by a linear projection.

    Args:
        fine_tune_layers: When true, the last 5 parameter tensors of
            ``vgg16.features`` are left trainable; everything else in VGG-16
            is frozen.
        reduced_dim: Output size of the projection applied to the flattened
            25088-element feature map.
    """

    def __init__(self, fine_tune_layers=True, reduced_dim=512):
        super().__init__()
        self.vgg16 = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1)

        # Start from a fully frozen backbone.
        self.vgg16.requires_grad_(False)

        # Re-enable gradients for the trailing parameter tensors only.
        if fine_tune_layers:
            feature_params = list(self.vgg16.features.parameters())
            for tensor in feature_params[-5:]:
                tensor.requires_grad_(True)

        self.fc_reduce = nn.Linear(25088, reduced_dim)

    def forward(self, image_tensor):
        """Return one ``reduced_dim`` feature vector per input image."""
        conv_maps = self.vgg16.features(image_tensor)
        flat_maps = torch.flatten(conv_maps, start_dim=1)
        return self.fc_reduce(flat_maps)

###########################################
# Fusion Model combining Text and Image Features
###########################################

class FusionModel(nn.Module):
    """Concatenate per-choice text features with image features and fuse them to 64-d.

    Args:
        bert_model_name: Forwarded to ``TextModel``.
        fine_tune_bert: Forwarded to ``TextModel`` as ``fine_tune``.
        fine_tune_layers: Forwarded to ``ImageModel``.
        dropout_prob: Dropout probability applied after batch normalisation.
        reduced_dim: Feature size produced by BOTH sub-models. It is forwarded
            to the text model as well as the image model so that the fusion
            layer's ``2 * reduced_dim`` input width always matches.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super(FusionModel, self).__init__()

        # Both encoders must emit `reduced_dim` features; previously the text
        # model silently fell back to its own default of 512.
        self.text_model = TextModel(bert_model_name=bert_model_name, fine_tune=fine_tune_bert, reduced_dim=reduced_dim)
        self.image_model = ImageModel(fine_tune_layers=fine_tune_layers, reduced_dim=reduced_dim)

        # Fusion layer
        self.fusion_layer = nn.Linear(reduced_dim + reduced_dim, 64)
        self.batch_norm = nn.BatchNorm1d(64)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids, attention_mask, image_tensor):
        """Return fused features shaped (batch, num_choices, 64)."""
        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        image_features = self.image_model(image_tensor=image_tensor)

        # Repeat the single image vector once per answer choice.
        batch_size, num_choices, _ = text_features.size()
        image_features = image_features.unsqueeze(1).expand(batch_size, num_choices, -1)

        combined_features = torch.cat((text_features, image_features), dim=2)

        # BatchNorm1d expects 2-D input, so fold choices into the batch axis.
        combined_features = combined_features.view(-1, combined_features.size(2))
        fused_features = self.fusion_layer(combined_features)
        fused_features = self.batch_norm(fused_features)
        fused_features = self.dropout(fused_features)
        fused_features = fused_features.view(batch_size, num_choices, -1)

        return fused_features

###########################################
# VQA Model for Final Prediction
###########################################

class VQA_Model(nn.Module):
    """Score every answer choice from the fused text/image features.

    All constructor arguments are forwarded unchanged to ``FusionModel``; a
    single linear unit then maps each 64-d fused vector to one logit.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super().__init__()
        fusion_kwargs = dict(
            bert_model_name=bert_model_name,
            fine_tune_bert=fine_tune_bert,
            fine_tune_layers=fine_tune_layers,
            dropout_prob=dropout_prob,
            reduced_dim=reduced_dim,
        )
        self.fusion_model = FusionModel(**fusion_kwargs)
        self.classifier = nn.Linear(64, 1)

    def forward(self, input_ids, attention_mask, image_tensor, confidence_scores=None):
        """Return logits shaped (batch, num_choices).

        When ``confidence_scores`` is given, each logit is multiplied by the
        matching confidence value before being returned.
        """
        fused = self.fusion_model(input_ids=input_ids, attention_mask=attention_mask, image_tensor=image_tensor)

        # One scalar per choice; drop the trailing singleton axis.
        choice_logits = self.classifier(fused).squeeze(-1)

        if confidence_scores is not None:
            choice_logits = choice_logits * confidence_scores

        return choice_logits.view(input_ids.size(0), fused.size(1))

###########################################
# Training Function
###########################################

def train_model(model, train_dataloader, val_dataloader, num_epochs=10, optimizer=None, checkpoint_path='model_checkpoint.pt'):
    """Train ``model``, checkpoint it after every epoch and validate every third epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask, image_tensor,
            confidence_scores=...)`` and returning per-choice logits.
        train_dataloader: Iterable of 5-tuples
            ``(input_ids, attention_mask, image_tensor, labels, confidence_scores)``.
        val_dataloader: Iterable of the same 5-tuples; confidence scores are
            not passed to the model during validation.
        num_epochs: Number of passes over ``train_dataloader``.
        optimizer: Optimizer already bound to the model's parameters (required).
        checkpoint_path: Prefix for the per-epoch checkpoint files, which are
            written as ``f"{checkpoint_path}_epoch_{n}.pt"``.

    Raises:
        ValueError: If ``optimizer`` is ``None``.
    """
    if optimizer is None:
        raise ValueError("Optimizer must be provided")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    # Built once: the criterion is stateless, and validation no longer depends
    # on a variable leaked out of the training loop.
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)

    torch.backends.cudnn.benchmark = True  # Enable cuDNN autotuning

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            input_ids, attention_mask, image_tensor, labels, confidence_scores = (
                tensor.to(device) for tensor in batch
            )

            optimizer.zero_grad()

            logits = model(input_ids, attention_mask, image_tensor, confidence_scores=confidence_scores)
            loss = loss_fn(logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        avg_loss = total_loss / len(train_dataloader)
        accuracy = 100 * correct / total
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

        # Save model after each epoch
        checkpoint_file = f"{checkpoint_path}_epoch_{epoch + 1}.pt"
        torch.save(model.state_dict(), checkpoint_file)
        print(f"Model saved to {checkpoint_file}")

        # Only run validation every 3 epochs
        if (epoch + 1) % 3 == 0:
            model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids, attention_mask, image_tensor, labels, confidence_scores = (
                        tensor.to(device) for tensor in batch
                    )
                    # Confidence scores are deliberately not fed to the model here.
                    logits = model(input_ids, attention_mask, image_tensor)
                    val_loss += loss_fn(logits, labels).item()

                    _, predicted = torch.max(logits, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()

            avg_val_loss = val_loss / len(val_dataloader)
            val_accuracy = 100 * val_correct / val_total
            print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

        # StepLR counts scheduler.step() calls as epochs, so it must be stepped
        # once per epoch. Stepping it only on validation epochs stretched the
        # step_size=5 decay to every 15 epochs.
        scheduler.step()

    print("Training complete.")



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision import transforms, models
from torchvision.models import VGG16_Weights
from torch.cuda.amp import autocast, GradScaler  # For mixed precision training
import torch.autograd.profiler as profiler  # For profiling

###########################################
# Custom Dataset Class for Loading VQA Data
###########################################

class VQADataset(Dataset):
    """Dataset of tokenized question/answer choices paired with an image.

    Training mode yields 5-tuples
    ``(input_ids, attention_mask, image, label, confidence_scores)``; with
    ``is_training=False`` the confidence scores are omitted and 4-tuples are
    returned.

    Args:
        data_x: Indexable by column name; 'input_ids', 'attention_mask' and
            'image_id' are read positionally via ``.iloc``.
        data_y: Indexable by column name; 'label' is always read,
            'confidence_scores' only in training mode.
        image_dict: Mapping from image ID to image array.
            Assumes channel-last arrays (axis 2 is moved to the front) — TODO confirm.
        augment: Enables random augmentation of the image tensor.
        augment_prob: Probability of augmenting a sample when ``augment`` is on.
        is_training: Selects the 5-tuple (true) or 4-tuple (false) output.
    """

    def __init__(self, data_x, data_y, image_dict, augment=False, augment_prob=0.3, is_training=True):
        self.data_x, self.data_y = data_x, data_y
        self.image_dict = image_dict
        self.augment, self.augment_prob = augment, augment_prob
        self.is_training = is_training

        # Mild augmentation pipeline, applied only when requested in __getitem__.
        augmentation_steps = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
            transforms.RandomResizedCrop(224, scale=(0.95, 1.0)),
        ]
        self.train_transform = transforms.Compose(augmentation_steps)

    def __len__(self):
        return len(self.data_x)

    def __getitem__(self, idx):
        # Text side
        input_ids = torch.tensor(self.data_x['input_ids'].iloc[idx], dtype=torch.long)
        attention_mask = torch.tensor(self.data_x['attention_mask'].iloc[idx], dtype=torch.long)

        # Image side: look the array up by ID and move the last axis first.
        pixels = self.image_dict[self.data_x['image_id'].iloc[idx]]
        image = torch.tensor(pixels, dtype=torch.float32).permute(2, 0, 1)

        # The random draw only happens when augmentation is switched on.
        if self.augment:
            if torch.rand(1).item() < self.augment_prob:
                image = self.train_transform(image)

        label = torch.tensor(self.data_y['label'].iloc[idx], dtype=torch.long)

        # Evaluation samples carry no confidence scores.
        if not self.is_training:
            return input_ids, attention_mask, image, label

        confidence = torch.tensor(self.data_y['confidence_scores'].iloc[idx], dtype=torch.float32)
        return input_ids, attention_mask, image, label, confidence

###########################################
# BERT-based Text Model with Dimensionality Reduction
###########################################

class TextModel(nn.Module):
    """BERT encoder followed by a linear projection of the first-token embedding.

    Args:
        bert_model_name: Name passed to ``BertModel.from_pretrained``.
        fine_tune: When false, every BERT parameter is frozen and BERT runs
            without gradient tracking. When true, BERT parameters stay
            trainable AND gradients flow through BERT in ``forward``.
        reduced_dim: Output size of the projection applied to the 768-d feature.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune=False, reduced_dim=512):
        super(TextModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Remembered so forward() can decide whether BERT needs gradients.
        self.fine_tune = fine_tune

        # Optionally freeze BERT weights
        if not fine_tune:
            for param in self.bert.parameters():
                param.requires_grad = False

        # Dimensionality reduction after BERT
        self.fc_reduce = nn.Linear(768, reduced_dim)

    def forward(self, input_ids, attention_mask):
        """Encode every choice and return features shaped (batch, num_choices, reduced_dim).

        ``input_ids`` must be 3-D (it is unpacked into batch, choices, sequence
        length); ``attention_mask`` is flattened the same way.
        """
        batch_size, num_choices, seq_len = input_ids.size()
        input_ids = input_ids.view(-1, seq_len)
        attention_mask = attention_mask.view(-1, seq_len)

        # The previous unconditional torch.no_grad() blocked all gradients into
        # BERT, so fine_tune=True had no effect. Gradients are now tracked only
        # when fine-tuning was requested and the caller has not disabled
        # autograd (e.g. during validation).
        # NOTE(review): fine-tuning BERT stores activations for
        # batch * num_choices sequences; expect much higher memory use.
        track_bert_grad = self.fine_tune and torch.is_grad_enabled()
        with torch.set_grad_enabled(track_bert_grad):
            # Extract the hidden state at sequence position 0
            text_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            text_features = text_output.last_hidden_state[:, 0, :]

        reduced_text_features = self.fc_reduce(text_features)
        reduced_text_features = reduced_text_features.view(batch_size, num_choices, -1)
        return reduced_text_features

###########################################
# Image Model using VGG-16 with Customization
###########################################

class ImageModel(nn.Module):
    """VGG-16 convolutional features followed by a linear projection.

    Args:
        fine_tune_layers: When true, the last 9 parameter tensors of
            ``vgg16.features`` are left trainable; everything else in VGG-16
            is frozen.
        reduced_dim: Output size of the projection applied to the flattened
            25088-element feature map.
    """

    def __init__(self, fine_tune_layers=True, reduced_dim=512):
        super().__init__()
        self.vgg16 = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1)

        # Start from a fully frozen backbone.
        self.vgg16.requires_grad_(False)

        # Re-enable gradients for the trailing parameter tensors only.
        if fine_tune_layers:
            feature_params = list(self.vgg16.features.parameters())
            for tensor in feature_params[-9:]:
                tensor.requires_grad_(True)

        self.fc_reduce = nn.Linear(25088, reduced_dim)

    def forward(self, image_tensor):
        """Return one ``reduced_dim`` feature vector per input image."""
        conv_maps = self.vgg16.features(image_tensor)
        flat_maps = torch.flatten(conv_maps, start_dim=1)
        return self.fc_reduce(flat_maps)

###########################################
# Fusion Model combining Text and Image Features
###########################################

class FusionModel(nn.Module):
    """Concatenate per-choice text features with image features and fuse them to 64-d.

    Args:
        bert_model_name: Forwarded to ``TextModel``.
        fine_tune_bert: Forwarded to ``TextModel`` as ``fine_tune``.
        fine_tune_layers: Forwarded to ``ImageModel``.
        dropout_prob: Dropout probability applied after batch normalisation.
        reduced_dim: Feature size produced by BOTH sub-models. It is forwarded
            to the text model as well as the image model so that the fusion
            layer's ``2 * reduced_dim`` input width always matches.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super(FusionModel, self).__init__()

        # Both encoders must emit `reduced_dim` features; previously the text
        # model silently fell back to its own default of 512.
        self.text_model = TextModel(bert_model_name=bert_model_name, fine_tune=fine_tune_bert, reduced_dim=reduced_dim)
        self.image_model = ImageModel(fine_tune_layers=fine_tune_layers, reduced_dim=reduced_dim)

        # Fusion layer
        self.fusion_layer = nn.Linear(reduced_dim + reduced_dim, 64)
        self.batch_norm = nn.BatchNorm1d(64)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids, attention_mask, image_tensor):
        """Return fused features shaped (batch, num_choices, 64)."""
        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        image_features = self.image_model(image_tensor=image_tensor)

        # Repeat the single image vector once per answer choice.
        batch_size, num_choices, _ = text_features.size()
        image_features = image_features.unsqueeze(1).expand(batch_size, num_choices, -1)

        combined_features = torch.cat((text_features, image_features), dim=2)

        # BatchNorm1d expects 2-D input, so fold choices into the batch axis.
        combined_features = combined_features.view(-1, combined_features.size(2))
        fused_features = self.fusion_layer(combined_features)
        fused_features = self.batch_norm(fused_features)
        fused_features = self.dropout(fused_features)
        fused_features = fused_features.view(batch_size, num_choices, -1)

        return fused_features

###########################################
# VQA Model for Final Prediction
###########################################

class VQA_Model(nn.Module):
    """Score every answer choice from the fused text/image features.

    All constructor arguments are forwarded unchanged to ``FusionModel``; a
    single linear unit then maps each 64-d fused vector to one logit.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super().__init__()
        fusion_kwargs = dict(
            bert_model_name=bert_model_name,
            fine_tune_bert=fine_tune_bert,
            fine_tune_layers=fine_tune_layers,
            dropout_prob=dropout_prob,
            reduced_dim=reduced_dim,
        )
        self.fusion_model = FusionModel(**fusion_kwargs)
        self.classifier = nn.Linear(64, 1)

    def forward(self, input_ids, attention_mask, image_tensor, confidence_scores=None):
        """Return logits shaped (batch, num_choices).

        When ``confidence_scores`` is given, each logit is multiplied by the
        matching confidence value before being returned.
        """
        fused = self.fusion_model(input_ids=input_ids, attention_mask=attention_mask, image_tensor=image_tensor)

        # One scalar per choice; drop the trailing singleton axis.
        choice_logits = self.classifier(fused).squeeze(-1)

        if confidence_scores is not None:
            choice_logits = choice_logits * confidence_scores

        return choice_logits.view(input_ids.size(0), fused.size(1))

###########################################
# Early Stopping Class
###########################################

class EarlyStopping:
    """Track validation loss, checkpoint the best model and signal when to stop.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``early_stop`` becomes ``True``.
        delta: Minimum decrease in validation loss that counts as an
            improvement. A new loss must satisfy
            ``val_loss <= best_loss - delta``; with the default of 0 any loss
            not worse than the best counts.
        path: File the best model's ``state_dict`` is saved to.

    Attributes set while running: ``counter`` (current streak of non-improving
    calls), ``best_loss`` (best loss seen so far, ``None`` before the first
    call) and ``early_stop``.
    """

    def __init__(self, patience=5, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation result; checkpoint ``model`` if it improved."""
        # The threshold is best_loss - delta. The earlier `best_loss + delta`
        # check treated losses up to `delta` WORSE than the best as an
        # improvement and overwrote the best checkpoint with a worse model.
        improved = self.best_loss is None or val_loss <= self.best_loss - self.delta

        if improved:
            self.best_loss = val_loss
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decreases.'''
        torch.save(model.state_dict(), self.path)

###########################################
# Training Function with Accuracy and Mixed Precision
###########################################

def train_model(model, train_dataloader, val_dataloader, num_epochs=10, lr=1e-5, checkpoint_path='model_checkpoint.pt', early_stopping_patience=3):
    """Train ``model`` with mixed precision, LR-on-plateau scheduling and early stopping.

    An AdamW optimizer (weight decay 1e-3) is created internally. After every
    epoch the model is validated, the scheduler is stepped on the mean
    validation loss and ``EarlyStopping`` decides whether to checkpoint or stop.
    The whole run is wrapped in the autograd profiler and its table is printed
    at the end.

    Args:
        model: Module called as ``model(input_ids, attention_mask, image_tensor,
            confidence_scores=...)`` and returning per-choice logits.
        train_dataloader: Iterable of 5-tuples
            ``(input_ids, attention_mask, image_tensor, labels, confidence_scores)``.
        val_dataloader: Iterable of tuples whose first four entries are
            ``(input_ids, attention_mask, image_tensor, labels)``; a trailing
            confidence tensor, if present, is ignored.
        num_epochs: Maximum number of epochs.
        lr: Learning rate for AdamW.
        checkpoint_path: File path handed to ``EarlyStopping`` for the best model.
        early_stopping_patience: Patience handed to ``EarlyStopping``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Mixed precision scaler
    scaler = GradScaler()

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.05, patience=2)

    early_stopping = EarlyStopping(patience=early_stopping_patience, path=checkpoint_path)

    # Built once: the criterion is stateless, and validation no longer depends
    # on a variable leaked out of the training loop.
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)

    torch.backends.cudnn.benchmark = True  # Enable cuDNN autotuning for optimization

    # NOTE(review): the profiler records every op of the whole run; for long
    # trainings this can consume a lot of memory — consider profiling a few
    # batches only.
    with profiler.profile(use_cuda=torch.cuda.is_available()) as prof:  # Enable profiling
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            correct = 0
            total = 0

            for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
                input_ids, attention_mask, image_tensor, labels, confidence_scores = (
                    tensor.to(device) for tensor in batch
                )

                optimizer.zero_grad()

                # Mixed precision forward pass
                with autocast():
                    logits = model(input_ids, attention_mask, image_tensor, confidence_scores=confidence_scores)
                    loss = loss_fn(logits, labels)

                # Backward pass with gradient scaling
                scaler.scale(loss).backward()

                # Gradients are still multiplied by the loss scale here; they
                # must be unscaled before clipping, otherwise max_norm is
                # compared against scaled norms.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                # Optimizer step with mixed precision (skipped by the scaler
                # if the gradients contain inf/NaN)
                scaler.step(optimizer)
                scaler.update()

                total_loss += loss.item()

                _, predicted = torch.max(logits, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

            avg_loss = total_loss / len(train_dataloader)
            accuracy = 100 * correct / total
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

            # Validation Loop
            model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    # Only the first four entries are needed; this also accepts
                    # the 4-tuples produced by VQADataset(is_training=False).
                    input_ids, attention_mask, image_tensor, labels = (
                        tensor.to(device) for tensor in batch[:4]
                    )
                    logits = model(input_ids, attention_mask, image_tensor)
                    val_loss += loss_fn(logits, labels).item()

                    _, predicted = torch.max(logits, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()

            avg_val_loss = val_loss / len(val_dataloader)
            val_accuracy = 100 * val_correct / val_total
            print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")
            scheduler.step(avg_val_loss)

            early_stopping(avg_val_loss, model)
            if early_stopping.early_stop:
                print("Early stopping triggered")
                break

    print("Training complete.")
    print(prof.key_averages().table(sort_by="cuda_time_total"))  # Print profiling results


In [None]:

###########################################
# Example Usage
###########################################

# Initialize the dataset for training (augmentation off, is_training=True by default)
train_dataset = VQADataset(data_x=train_x, data_y=train_y, image_dict=image_dict)

# Initialize the dataset for validation. It keeps the default is_training=True,
# so each sample also carries the stored confidence scores (5-tuple).
val_dataset = VQADataset(data_x=test_x, data_y=test_y, image_dict=image_dict)

# Define Dataset and DataLoader with optimizations
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

# Define the model
model = VQA_Model(
    bert_model_name='bert-base-uncased',
    fine_tune_bert=True,
    fine_tune_layers=True,
    dropout_prob=0.5,
    reduced_dim=512
)

# Train for up to 1000 epochs; training ends earlier once early stopping
# (patience 10) triggers. The checkpoint path is handed to train_model.
train_model(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    num_epochs=1000,
    lr=2e-5,
    checkpoint_path='vqa_model_checkpoint_89.pt',
    early_stopping_patience=10
)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision import transforms, models
from torchvision.models import VGG16_Weights

###########################################
# Custom Dataset Class for Loading VQA Data
###########################################

class VQADataset(Dataset):
    """Dataset of tokenized question/answer choices paired with an image.

    Training mode yields 5-tuples
    ``(input_ids, attention_mask, image, label, confidence_scores)``; with
    ``is_training=False`` the confidence scores are omitted and 4-tuples are
    returned.

    Args:
        data_x: Indexable by column name; 'input_ids', 'attention_mask' and
            'image_id' are read positionally via ``.iloc``.
        data_y: Indexable by column name; 'label' is always read,
            'confidence_scores' only in training mode.
        image_dict: Mapping from image ID to image array.
            Assumes channel-last arrays (axis 2 is moved to the front) — TODO confirm.
        augment: Enables random augmentation of the image tensor.
        augment_prob: Probability of augmenting a sample when ``augment`` is on.
        is_training: Selects the 5-tuple (true) or 4-tuple (false) output.
    """

    def __init__(self, data_x, data_y, image_dict, augment=False, augment_prob=0.3, is_training=True):
        self.data_x, self.data_y = data_x, data_y
        self.image_dict = image_dict
        self.augment, self.augment_prob = augment, augment_prob
        self.is_training = is_training

        # Deliberately gentle augmentation: small rotation, light colour
        # jitter and a crop that keeps at least 95% of the image area.
        augmentation_steps = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
            transforms.RandomResizedCrop(224, scale=(0.95, 1.0)),
        ]
        self.train_transform = transforms.Compose(augmentation_steps)

    def __len__(self):
        return len(self.data_x)

    def __getitem__(self, idx):
        # Text side
        input_ids = torch.tensor(self.data_x['input_ids'].iloc[idx], dtype=torch.long)
        attention_mask = torch.tensor(self.data_x['attention_mask'].iloc[idx], dtype=torch.long)

        # Image side: look the array up by ID and move the last axis first.
        pixels = self.image_dict[self.data_x['image_id'].iloc[idx]]
        image = torch.tensor(pixels, dtype=torch.float32).permute(2, 0, 1)

        # The random draw only happens when augmentation is switched on.
        if self.augment:
            if torch.rand(1).item() < self.augment_prob:
                image = self.train_transform(image)

        label = torch.tensor(self.data_y['label'].iloc[idx], dtype=torch.long)

        # Evaluation samples carry no confidence scores.
        if not self.is_training:
            return input_ids, attention_mask, image, label

        confidence = torch.tensor(self.data_y['confidence_scores'].iloc[idx], dtype=torch.float32)
        return input_ids, attention_mask, image, label, confidence

###########################################
# BERT-based Text Model with Dimensionality Reduction
###########################################

class TextModel(nn.Module):
    """BERT encoder followed by a linear projection of the first-token embedding.

    Args:
        bert_model_name: Name passed to ``BertModel.from_pretrained``.
        fine_tune: When false, every BERT parameter is frozen and BERT runs
            without gradient tracking. When true, BERT parameters stay
            trainable AND gradients flow through BERT in ``forward``.
        reduced_dim: Output size of the projection applied to the 768-d feature.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune=False, reduced_dim=512):
        super(TextModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Remembered so forward() can decide whether BERT needs gradients.
        self.fine_tune = fine_tune

        # Optionally freeze BERT weights
        if not fine_tune:
            for param in self.bert.parameters():
                param.requires_grad = False

        # Dimensionality reduction after BERT
        self.fc_reduce = nn.Linear(768, reduced_dim)

    def forward(self, input_ids, attention_mask):
        """Encode every choice and return features shaped (batch, num_choices, reduced_dim).

        ``input_ids`` must be 3-D (it is unpacked into batch, choices, sequence
        length); ``attention_mask`` is flattened the same way.
        """
        batch_size, num_choices, seq_len = input_ids.size()
        input_ids = input_ids.view(-1, seq_len)
        attention_mask = attention_mask.view(-1, seq_len)

        # The previous unconditional torch.no_grad() blocked all gradients into
        # BERT, so fine_tune=True had no effect. Gradients are now tracked only
        # when fine-tuning was requested and the caller has not disabled
        # autograd (e.g. during validation).
        # NOTE(review): fine-tuning BERT stores activations for
        # batch * num_choices sequences; expect much higher memory use.
        track_bert_grad = self.fine_tune and torch.is_grad_enabled()
        with torch.set_grad_enabled(track_bert_grad):
            # Extract the hidden state at sequence position 0
            text_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            text_features = text_output.last_hidden_state[:, 0, :]

        reduced_text_features = self.fc_reduce(text_features)
        reduced_text_features = reduced_text_features.view(batch_size, num_choices, -1)
        return reduced_text_features

###########################################
# Image Model using VGG-16 with Customization
###########################################

class ImageModel(nn.Module):
    """VGG-16 convolutional features followed by a linear projection.

    Args:
        fine_tune_layers: When true, the last 9 parameter tensors of
            ``vgg16.features`` are left trainable; everything else in VGG-16
            is frozen.
        reduced_dim: Output size of the projection applied to the flattened
            25088-element feature map.
    """

    def __init__(self, fine_tune_layers=True, reduced_dim=512):
        super().__init__()
        self.vgg16 = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1)

        # Start from a fully frozen backbone.
        self.vgg16.requires_grad_(False)

        # Re-enable gradients for the trailing parameter tensors only.
        if fine_tune_layers:
            feature_params = list(self.vgg16.features.parameters())
            for tensor in feature_params[-9:]:
                tensor.requires_grad_(True)

        self.fc_reduce = nn.Linear(25088, reduced_dim)

    def forward(self, image_tensor):
        """Return one ``reduced_dim`` feature vector per input image."""
        conv_maps = self.vgg16.features(image_tensor)
        flat_maps = torch.flatten(conv_maps, start_dim=1)
        return self.fc_reduce(flat_maps)

###########################################
# Fusion Model combining Text and Image Features
###########################################

class FusionModel(nn.Module):
    """Concatenate per-choice text features with image features and fuse them.

    Args:
        bert_model_name (str): Forwarded to ``TextModel``.
        fine_tune_bert (bool): Forwarded to ``TextModel`` as ``fine_tune``.
        fine_tune_layers (bool): Forwarded to ``ImageModel``.
        dropout_prob (float): Dropout probability applied after batch norm.
        reduced_dim (int): Feature size per modality assumed by the fusion layer.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super().__init__()

        # NOTE(review): reduced_dim is forwarded to ImageModel only; TextModel
        # uses its own default. The 2 * reduced_dim fusion input only matches
        # when both agree - confirm.
        self.text_model = TextModel(bert_model_name=bert_model_name, fine_tune=fine_tune_bert)
        self.image_model = ImageModel(fine_tune_layers=fine_tune_layers, reduced_dim=reduced_dim)

        # Fusion head: linear -> batch norm -> dropout, ending at 64 features.
        self.fusion_layer = nn.Linear(2 * reduced_dim, 64)
        self.batch_norm = nn.BatchNorm1d(64)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids, attention_mask, image_tensor):
        """Return fused features of shape (batch_size, num_choices, 64)."""
        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        image_features = self.image_model(image_tensor=image_tensor)

        batch_size, num_choices, _text_dim = text_features.size()
        # Each answer choice is paired with the same image vector.
        image_per_choice = image_features.unsqueeze(1).expand(batch_size, num_choices, -1)

        joint = torch.cat((text_features, image_per_choice), dim=2)
        # BatchNorm1d works on 2-D input, so flatten (batch, choice) first.
        joint_flat = joint.view(-1, joint.size(2))
        fused = self.dropout(self.batch_norm(self.fusion_layer(joint_flat)))
        return fused.view(batch_size, num_choices, -1)

###########################################
# VQA Model for Final Prediction
###########################################

class VQA_Model(nn.Module):
    """Score each answer choice from the fused text and image features.

    Args:
        bert_model_name (str): Forwarded to ``FusionModel``.
        fine_tune_bert (bool): Forwarded to ``FusionModel``.
        fine_tune_layers (bool): Forwarded to ``FusionModel``.
        dropout_prob (float): Forwarded to ``FusionModel``.
        reduced_dim (int): Forwarded to ``FusionModel``.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super().__init__()
        self.fusion_model = FusionModel(
            bert_model_name=bert_model_name,
            fine_tune_bert=fine_tune_bert,
            fine_tune_layers=fine_tune_layers,
            dropout_prob=dropout_prob,
            reduced_dim=reduced_dim,
        )
        # One scalar score per answer choice.
        self.classifier = nn.Linear(64, 1)

    def forward(self, input_ids, attention_mask, image_tensor, confidence_scores=None):
        """Return logits of shape (batch_size, num_choices).

        When ``confidence_scores`` is given, the logits are multiplied by it
        element-wise before being returned.
        """
        fused_features = self.fusion_model(
            input_ids=input_ids, attention_mask=attention_mask, image_tensor=image_tensor
        )
        choice_scores = self.classifier(fused_features).squeeze(-1)

        if confidence_scores is not None:
            choice_scores = choice_scores * confidence_scores

        return choice_scores.view(input_ids.size(0), fused_features.size(1))

###########################################
# Training Function with Accuracy
###########################################

def train_model(model, train_dataloader, val_dataloader, num_epochs=10, lr=1e-5, checkpoint_path='model_checkpoint.pt', early_stopping_patience=3):
    """Train ``model`` with AdamW, validating after every epoch.

    Training batches pass ``confidence_scores`` to the model; validation
    batches do not. After each epoch the mean validation loss drives both the
    ``ReduceLROnPlateau`` scheduler and the ``EarlyStopping`` helper.

    Args:
        model (nn.Module): Called as
            ``model(input_ids, attention_mask, image_tensor, confidence_scores=...)``.
        train_dataloader: Iterable of 5-tuples
            ``(input_ids, attention_mask, image_tensor, labels, confidence_scores)``
            that also supports ``len()``.
        val_dataloader: Same batch layout, used for validation.
        num_epochs (int): Maximum number of epochs.
        lr (float): AdamW learning rate.
        checkpoint_path (str): Passed to ``EarlyStopping`` as ``path``.
        early_stopping_patience (int): Passed to ``EarlyStopping`` as ``patience``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

    early_stopping = EarlyStopping(patience=early_stopping_patience, path=checkpoint_path)

    # Build the loss once. It used to be re-created on every training batch,
    # and the validation loop relied on that loop-local name leaking out.
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            input_ids, attention_mask, image_tensor, labels, confidence_scores = batch
            input_ids, attention_mask, image_tensor, labels, confidence_scores = input_ids.to(device), attention_mask.to(device), image_tensor.to(device), labels.to(device), confidence_scores.to(device)
            optimizer.zero_grad()

            logits = model(input_ids, attention_mask, image_tensor, confidence_scores=confidence_scores)
            loss = loss_fn(logits, labels)
            loss.backward()
            # Clip the global gradient norm before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()

            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        avg_loss = total_loss / len(train_dataloader)
        accuracy = 100 * correct / total
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

        # Validation loop (confidence scores are unpacked but not fed to the model).
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids, attention_mask, image_tensor, labels, confidence_scores = batch
                input_ids, attention_mask, image_tensor, labels, confidence_scores = input_ids.to(device), attention_mask.to(device), image_tensor.to(device), labels.to(device), confidence_scores.to(device)
                logits = model(input_ids, attention_mask, image_tensor)
                val_loss += loss_fn(logits, labels).item()

                _, predicted = torch.max(logits, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        avg_val_loss = val_loss / len(val_dataloader)
        val_accuracy = 100 * val_correct / val_total
        print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")
        scheduler.step(avg_val_loss)

        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break

    print("Training complete.")


### New version

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from torchvision import transforms, models
from torchvision.models import VGG16_Weights

###########################################
# Custom Dataset Class for Loading VQA Data
###########################################

class VQADataset(Dataset):
    """Dataset yielding ``(input_ids, attention_mask, image, label)`` per question.

    Args:
        data_x: Table read positionally through ``.iloc`` with ``input_ids``,
            ``attention_mask`` and ``image_id`` columns.
        data_y: Table with a ``label`` column, read at the same row position.
        image_dict (dict): Maps an image id to its image array. The array is
            permuted with (2, 0, 1), so it is presumably stored channels-last.
        augment (bool): Enables random image augmentation.
        augment_prob (float): Chance that an item is augmented when ``augment`` is on.
        is_training (bool): Stored on the instance; not read by this class.
    """

    def __init__(self, data_x, data_y, image_dict, augment=False, augment_prob=0.3, is_training=True):
        self.data_x, self.data_y, self.image_dict = data_x, data_y, image_dict
        self.augment, self.augment_prob = augment, augment_prob
        self.is_training = is_training

        # Mild augmentation pipeline, applied only when sampled in __getitem__.
        augmentation_steps = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
            transforms.RandomResizedCrop(224, scale=(0.95, 1.0)),
        ]
        self.train_transform = transforms.Compose(augmentation_steps)

    def __len__(self):
        """Number of rows in ``data_x``."""
        return len(self.data_x)

    def __getitem__(self, idx):
        """Build the tensors for the row at position ``idx``."""
        features = self.data_x
        input_ids_list = torch.tensor(features['input_ids'].iloc[idx], dtype=torch.long)
        attention_mask_list = torch.tensor(features['attention_mask'].iloc[idx], dtype=torch.long)

        raw_image = self.image_dict[features['image_id'].iloc[idx]]
        # Move the last axis to the front.
        image_tensor = torch.tensor(raw_image, dtype=torch.float32).permute(2, 0, 1)

        # The random draw only happens when augmentation is switched on.
        should_augment = self.augment and torch.rand(1).item() < self.augment_prob
        if should_augment:
            image_tensor = self.train_transform(image_tensor)

        label = torch.tensor(self.data_y['label'].iloc[idx], dtype=torch.long)
        return input_ids_list, attention_mask_list, image_tensor, label

###########################################
# BERT-based Text Model with Freezing up to Layer 8
###########################################

class TextModel(nn.Module):
    """BERT text encoder with frozen lower layers and a linear projection.

    Args:
        bert_model_name (str): Name passed to ``BertModel.from_pretrained``.
        fine_tune (bool): If False, every BERT parameter is frozen. If True,
            only the first ``num_frozen_layers`` encoder layers are frozen and
            all other BERT parameters stay trainable. This flag used to be
            accepted but ignored.
        reduced_dim (int): Output size of the projection applied to the
            [CLS] vector.
        num_frozen_layers (int): Number of leading encoder layers frozen when
            ``fine_tune`` is True.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune=False, reduced_dim=512, num_frozen_layers=8):
        super(TextModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        if fine_tune:
            # Partial fine-tuning: freeze only the leading encoder layers.
            frozen_params = self.bert.encoder.layer[:num_frozen_layers].parameters()
        else:
            # Honour fine_tune=False by freezing the whole encoder.
            frozen_params = self.bert.parameters()
        for param in frozen_params:
            param.requires_grad = False

        # Dimensionality reduction after BERT (768 = BERT hidden size assumed here).
        self.fc_reduce = nn.Linear(768, reduced_dim)

    def forward(self, input_ids, attention_mask):
        """Return projected [CLS] features of shape (batch_size, num_choices, reduced_dim).

        Args:
            input_ids: Token ids, unpacked as (batch_size, num_choices, seq_len).
            attention_mask: Attention mask, reshaped the same way as ``input_ids``.
        """
        batch_size, num_choices, seq_len = input_ids.size()
        # Fold the choice axis into the batch axis so BERT sees 2-D inputs.
        input_ids = input_ids.view(-1, seq_len)
        attention_mask = attention_mask.view(-1, seq_len)

        text_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is the [CLS] token.
        text_features = text_output.last_hidden_state[:, 0, :]

        reduced_text_features = self.fc_reduce(text_features)
        reduced_text_features = reduced_text_features.view(batch_size, num_choices, -1)
        return reduced_text_features

###########################################
# Image Model using VGG-16 with Customization
###########################################

class ImageModel(nn.Module):
    """Pretrained VGG-16 feature extractor with a linear projection on top.

    Every VGG-16 parameter is frozen after loading. When ``fine_tune_layers``
    is set, the last 5 parameter tensors of ``vgg16.features`` are re-enabled
    for training.

    Args:
        fine_tune_layers (bool): Re-enable gradients for the last 5 parameter
            tensors of the convolutional stack.
        reduced_dim (int): Output size of the projection layer.
    """

    def __init__(self, fine_tune_layers=True, reduced_dim=512):
        super().__init__()
        self.vgg16 = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1)

        # Freeze the full backbone first.
        self.vgg16.requires_grad_(False)

        # NOTE(review): the count below is in parameter tensors, not layers;
        # an odd count may leave one layer half-trainable - confirm intent.
        tail_size = 5 if fine_tune_layers else 0
        if tail_size:
            for tensor in list(self.vgg16.features.parameters())[-tail_size:]:
                tensor.requires_grad = True

        # 25088 = 512 * 7 * 7, the flattened size expected from vgg16.features.
        self.fc_reduce = nn.Linear(25088, reduced_dim)

    def forward(self, image_tensor):
        """Return projected features of shape (batch_size, reduced_dim)."""
        flat_features = torch.flatten(self.vgg16.features(image_tensor), start_dim=1)
        return self.fc_reduce(flat_features)

###########################################
# Fusion Model combining Text and Image Features
###########################################

class FusionModel(nn.Module):
    """Concatenate per-choice text features with image features and fuse them.

    Args:
        bert_model_name (str): Forwarded to ``TextModel``.
        fine_tune_bert (bool): Forwarded to ``TextModel`` as ``fine_tune``.
        fine_tune_layers (bool): Forwarded to ``ImageModel``.
        dropout_prob (float): Dropout probability applied after batch norm.
        reduced_dim (int): Feature size produced by each modality; the fusion
            layer consumes ``2 * reduced_dim`` inputs.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super(FusionModel, self).__init__()

        # Both encoders must emit reduced_dim features. Previously only
        # ImageModel received it, so any value other than TextModel's default
        # made the concatenated width disagree with fusion_layer.
        self.text_model = TextModel(bert_model_name=bert_model_name, fine_tune=fine_tune_bert, reduced_dim=reduced_dim)
        self.image_model = ImageModel(fine_tune_layers=fine_tune_layers, reduced_dim=reduced_dim)

        # Fusion head: linear -> batch norm -> dropout, ending at 64 features.
        self.fusion_layer = nn.Linear(reduced_dim + reduced_dim, 64)
        self.batch_norm = nn.BatchNorm1d(64)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids, attention_mask, image_tensor):
        """Return fused features of shape (batch_size, num_choices, 64)."""
        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        image_features = self.image_model(image_tensor=image_tensor)

        batch_size, num_choices, _ = text_features.size()
        # Each answer choice is paired with the same image vector.
        image_features = image_features.unsqueeze(1).expand(batch_size, num_choices, -1)

        combined_features = torch.cat((text_features, image_features), dim=2)
        # BatchNorm1d works on 2-D input, so flatten (batch, choice) first.
        combined_features = combined_features.view(-1, combined_features.size(2))
        fused_features = self.fusion_layer(combined_features)
        fused_features = self.batch_norm(fused_features)
        fused_features = self.dropout(fused_features)
        fused_features = fused_features.view(batch_size, num_choices, -1)

        return fused_features

###########################################
# VQA Model for Final Prediction
###########################################

class VQA_Model(nn.Module):
    """Score each answer choice from the fused text and image features.

    Args:
        bert_model_name (str): Forwarded to ``FusionModel``.
        fine_tune_bert (bool): Forwarded to ``FusionModel``.
        fine_tune_layers (bool): Forwarded to ``FusionModel``.
        dropout_prob (float): Forwarded to ``FusionModel``.
        reduced_dim (int): Forwarded to ``FusionModel``.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super().__init__()
        self.fusion_model = FusionModel(
            bert_model_name=bert_model_name,
            fine_tune_bert=fine_tune_bert,
            fine_tune_layers=fine_tune_layers,
            dropout_prob=dropout_prob,
            reduced_dim=reduced_dim,
        )
        # One scalar score per answer choice.
        self.classifier = nn.Linear(64, 1)

    def forward(self, input_ids, attention_mask, image_tensor):
        """Return logits of shape (batch_size, num_choices)."""
        fused_features = self.fusion_model(
            input_ids=input_ids, attention_mask=attention_mask, image_tensor=image_tensor
        )
        per_choice = self.classifier(fused_features).squeeze(-1)
        return per_choice.view(input_ids.size(0), fused_features.size(1))

###########################################
# Training Function
###########################################


def train_model(model, train_dataloader, val_dataloader, num_epochs=10, optimizer=None, checkpoint_path='model_checkpoint.pt', summary_path='training_summary.txt'):
    """Train ``model``, checkpoint every epoch, and validate every third epoch.

    The learning rate follows ``StepLR(step_size=5, gamma=0.1)`` and the
    scheduler is stepped once per epoch. It used to be stepped only inside the
    every-third-epoch validation branch, which stretched the decay interval
    to 15 epochs.

    Args:
        model (nn.Module): Called as ``model(input_ids, attention_mask, image_tensor)``;
            its output is fed to cross-entropy against ``labels``.
        train_dataloader: Iterable of
            ``(input_ids, attention_mask, image_tensor, labels)`` batches that
            also supports ``len()``.
        val_dataloader: Same batch layout, used for validation.
        num_epochs (int): Number of passes over ``train_dataloader``.
        optimizer (torch.optim.Optimizer): Required optimizer for the model's parameters.
        checkpoint_path (str): Prefix of the checkpoint files; epoch and metric
            suffixes are appended to it. Its directory is created if missing.
        summary_path (str): Text file (overwritten) receiving the logged metrics.

    Raises:
        ValueError: If ``optimizer`` is None.
    """
    if optimizer is None:
        raise ValueError("Optimizer must be provided")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    # Build the loss once. It used to be re-created on every training batch,
    # and the validation loop relied on that loop-local name leaking out.
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Create the checkpoint folder if needed. A bare file name (such as the
    # default) has an empty dirname, which os.makedirs rejects, so skip it.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    torch.backends.cudnn.benchmark = True  # Enable cuDNN autotuning

    # Open summary file
    with open(summary_path, 'w') as summary_file:
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            correct = 0
            total = 0

            for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
                input_ids, attention_mask, image_tensor, labels = batch
                input_ids, attention_mask, image_tensor, labels = (
                    input_ids.to(device), attention_mask.to(device), image_tensor.to(device), labels.to(device)
                )

                optimizer.zero_grad()

                logits = model(input_ids, attention_mask, image_tensor)
                loss = loss_fn(logits, labels)

                loss.backward()
                # Clip the global gradient norm before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

                total_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

            avg_loss = total_loss / len(train_dataloader)
            accuracy = 100 * correct / total
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

            # Save model with training loss & accuracy
            checkpoint_file = f"{checkpoint_path}_epoch_{epoch + 1}_trainloss_{avg_loss:.4f}_trainacc_{accuracy:.2f}.pt"
            torch.save(model.state_dict(), checkpoint_file)
            print(f"Model saved to {checkpoint_file}")

            # Write to summary file
            summary_file.write(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%\n")

            # Only run validation every 3 epochs
            if (epoch + 1) % 3 == 0:
                model.eval()
                val_loss = 0
                val_correct = 0
                val_total = 0
                with torch.no_grad():
                    for batch in val_dataloader:
                        input_ids, attention_mask, image_tensor, labels = batch
                        input_ids, attention_mask, image_tensor, labels = (
                            input_ids.to(device), attention_mask.to(device), image_tensor.to(device), labels.to(device)
                        )
                        logits = model(input_ids, attention_mask, image_tensor)
                        val_loss += loss_fn(logits, labels).item()

                        _, predicted = torch.max(logits, 1)
                        val_total += labels.size(0)
                        val_correct += (predicted == labels).sum().item()

                avg_val_loss = val_loss / len(val_dataloader)
                val_accuracy = 100 * val_correct / val_total
                print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

                # Write validation scores to summary file
                summary_file.write(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%\n")

                # Save model with validation score
                checkpoint_file = f"{checkpoint_path}_epoch_{epoch + 1}_trainloss_{avg_loss:.4f}_trainacc_{accuracy:.2f}_valloss_{avg_val_loss:.4f}_valacc_{val_accuracy:.2f}.pt"
                torch.save(model.state_dict(), checkpoint_file)
                print(f"Model saved to {checkpoint_file} with validation scores")

            # StepLR counts epochs, so advance it on every epoch.
            scheduler.step()

            # Push the epoch's lines to disk so an interrupted session keeps them.
            summary_file.flush()

    print("Training complete.")


In [None]:
###########################################
# Example Usage
###########################################

import os

# Initialize the dataset for training
train_dataset = VQADataset(data_x=train_x, data_y=train_y, image_dict=image_dict)

# Initialize the dataset for validation (uses the test split)
val_dataset = VQADataset(data_x=test_x, data_y=test_y, image_dict=image_dict)

# Define Dataset and DataLoader with optimizations
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

# Define the model
model = VQA_Model(
    bert_model_name='bert-base-uncased',
    fine_tune_bert=True,
    fine_tune_layers=True,
    dropout_prob=0.5,
    reduced_dim=512
)

# Optimizer setup with per-module learning rates.
# Every trainable sub-module needs its own entry: parameters that are not
# listed here are never updated. The text projection (text_model.fc_reduce)
# and the fusion batch norm were previously missing, so they stayed at their
# initial values for the whole run.
optimizer = optim.AdamW([
    {'params': model.fusion_model.text_model.bert.parameters(), 'lr': 1e-6},       # Lowest learning rate for BERT
    {'params': model.fusion_model.image_model.parameters(), 'lr': 5e-5},          # VGG layers and image projection
    {'params': model.fusion_model.fusion_layer.parameters(), 'lr': 1e-4},         # Learning rate for the fusion layer
    {'params': model.classifier.parameters(), 'lr': 1e-4},                        # Learning rate for classifier
    {'params': model.fusion_model.text_model.fc_reduce.parameters(), 'lr': 1e-4}, # Text projection after BERT
    {'params': model.fusion_model.batch_norm.parameters(), 'lr': 1e-4}            # Batch norm in the fusion head
], weight_decay=1e-4)

# Path to save checkpoints and training summaries on Google Drive
save_dir = '/content/drive/MyDrive/Colab Notebooks/Deep Learning Project'
checkpoint_path = f'{save_dir}/vqa_model_checkpoint_with_confidence.pt'
summary_path = f'{save_dir}/training_summary.txt'

# Train the model with the new optimizer and checkpoint saving configuration
train_model(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    num_epochs=1000,
    optimizer=optimizer,
    checkpoint_path=checkpoint_path,  # Save checkpoints to Google Drive
    summary_path=summary_path  # Save the training and validation summary to Google Drive
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:02<00:00, 217MB/s]
Epoch 1/1000: 100%|██████████| 3000/3000 [45:36<00:00,  1.10it/s]


Epoch 1/1000, Loss: 2.0465, Accuracy: 33.18%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_1_trainloss_2.0465_trainacc_33.18.pt


Epoch 2/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 2/1000, Loss: 1.7838, Accuracy: 39.74%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_2_trainloss_1.7838_trainacc_39.74.pt


Epoch 3/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 3/1000, Loss: 1.7205, Accuracy: 42.06%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_3_trainloss_1.7205_trainacc_42.06.pt
Validation Loss: 1.6016, Validation Accuracy: 48.41%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_3_trainloss_1.7205_trainacc_42.06_valloss_1.6016_valacc_48.41.pt with validation scores


Epoch 4/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 4/1000, Loss: 1.6795, Accuracy: 43.92%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_4_trainloss_1.6795_trainacc_43.92.pt


Epoch 5/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 5/1000, Loss: 1.6498, Accuracy: 45.56%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_5_trainloss_1.6498_trainacc_45.56.pt


Epoch 6/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 6/1000, Loss: 1.6248, Accuracy: 46.77%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_6_trainloss_1.6248_trainacc_46.77.pt
Validation Loss: 1.5240, Validation Accuracy: 52.78%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_6_trainloss_1.6248_trainacc_46.77_valloss_1.5240_valacc_52.78.pt with validation scores


Epoch 7/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 7/1000, Loss: 1.5969, Accuracy: 48.45%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_7_trainloss_1.5969_trainacc_48.45.pt


Epoch 8/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 8/1000, Loss: 1.5802, Accuracy: 49.25%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_8_trainloss_1.5802_trainacc_49.25.pt


Epoch 9/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 9/1000, Loss: 1.5592, Accuracy: 50.34%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_9_trainloss_1.5592_trainacc_50.34.pt
Validation Loss: 1.4765, Validation Accuracy: 54.86%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_9_trainloss_1.5592_trainacc_50.34_valloss_1.4765_valacc_54.86.pt with validation scores


Epoch 10/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 10/1000, Loss: 1.5420, Accuracy: 51.21%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_10_trainloss_1.5420_trainacc_51.21.pt


Epoch 11/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 11/1000, Loss: 1.5296, Accuracy: 51.81%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_11_trainloss_1.5296_trainacc_51.81.pt


Epoch 12/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 12/1000, Loss: 1.5185, Accuracy: 52.62%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_12_trainloss_1.5185_trainacc_52.62.pt
Validation Loss: 1.4418, Validation Accuracy: 56.48%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_12_trainloss_1.5185_trainacc_52.62_valloss_1.4418_valacc_56.48.pt with validation scores


Epoch 13/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 13/1000, Loss: 1.5054, Accuracy: 52.89%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_13_trainloss_1.5054_trainacc_52.89.pt


Epoch 14/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 14/1000, Loss: 1.4962, Accuracy: 53.37%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_14_trainloss_1.4962_trainacc_53.37.pt


Epoch 15/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 15/1000, Loss: 1.4834, Accuracy: 53.94%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_15_trainloss_1.4834_trainacc_53.94.pt
Validation Loss: 1.4282, Validation Accuracy: 57.52%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_15_trainloss_1.4834_trainacc_53.94_valloss_1.4282_valacc_57.52.pt with validation scores


Epoch 16/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 16/1000, Loss: 1.4702, Accuracy: 55.09%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_16_trainloss_1.4702_trainacc_55.09.pt


Epoch 17/1000: 100%|██████████| 3000/3000 [45:34<00:00,  1.10it/s]


Epoch 17/1000, Loss: 1.4714, Accuracy: 54.52%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_17_trainloss_1.4714_trainacc_54.52.pt


Epoch 18/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 18/1000, Loss: 1.4706, Accuracy: 54.73%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_18_trainloss_1.4706_trainacc_54.73.pt
Validation Loss: 1.4160, Validation Accuracy: 58.58%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_18_trainloss_1.4706_trainacc_54.73_valloss_1.4160_valacc_58.58.pt with validation scores


Epoch 19/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 19/1000, Loss: 1.4694, Accuracy: 54.88%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_19_trainloss_1.4694_trainacc_54.88.pt


Epoch 20/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 20/1000, Loss: 1.4655, Accuracy: 55.05%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_20_trainloss_1.4655_trainacc_55.05.pt


Epoch 21/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 21/1000, Loss: 1.4656, Accuracy: 55.19%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_21_trainloss_1.4656_trainacc_55.19.pt
Validation Loss: 1.4171, Validation Accuracy: 58.64%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_21_trainloss_1.4656_trainacc_55.19_valloss_1.4171_valacc_58.64.pt with validation scores


Epoch 22/1000: 100%|██████████| 3000/3000 [45:33<00:00,  1.10it/s]


Epoch 22/1000, Loss: 1.4679, Accuracy: 54.92%
Model saved to /content/drive/MyDrive/Colab Notebooks/Deep Learning Project/vqa_model_checkpoint_with_confidence.pt_epoch_22_trainloss_1.4679_trainacc_54.92.pt


Epoch 23/1000:  66%|██████▋   | 1993/3000 [30:16<15:17,  1.10it/s]

### Older version

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision import transforms, models
from torchvision.models import VGG16_Weights

###########################################
# Custom Dataset Class for Loading VQA Data
###########################################

class VQADataset(Dataset):
    """Dataset yielding ``(input_ids, attention_mask, image, label)`` per question.

    Args:
        data_x: Table read positionally through ``.iloc`` with ``input_ids``,
            ``attention_mask`` and ``image_id`` columns.
        data_y: Table with a ``label`` column, read at the same row position.
        image_dict (dict): Maps an image id to its image array. The array is
            permuted with (2, 0, 1), so it is presumably stored channels-last.
        augment (bool): Enables random image augmentation.
        augment_prob (float): Probability of augmenting an item when
            ``augment`` is on.
    """

    def __init__(self, data_x, data_y, image_dict, augment=False, augment_prob=0.3):
        self.data_x, self.data_y = data_x, data_y
        self.image_dict = image_dict
        self.augment, self.augment_prob = augment, augment_prob

        # Deliberately mild augmentation: small rotation, light colour jitter,
        # and a crop that keeps at least 95% of the image area.
        mild_augmentations = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
            transforms.RandomResizedCrop(224, scale=(0.95, 1.0)),
        ]
        self.train_transform = transforms.Compose(mild_augmentations)

    def __len__(self):
        """Number of rows in ``data_x``."""
        return len(self.data_x)

    def __getitem__(self, idx):
        """Build the tensors for the row at position ``idx``."""
        table = self.data_x

        # Token ids and masks covering all answer options of this question.
        input_ids_list = torch.tensor(table['input_ids'].iloc[idx], dtype=torch.long)
        attention_mask_list = torch.tensor(table['attention_mask'].iloc[idx], dtype=torch.long)

        # Look the image up by id and move the last axis to the front.
        stored_image = self.image_dict[table['image_id'].iloc[idx]]
        image_tensor = torch.tensor(stored_image, dtype=torch.float32).permute(2, 0, 1)

        # The random draw only happens when augmentation is switched on.
        if self.augment:
            if torch.rand(1).item() < self.augment_prob:
                image_tensor = self.train_transform(image_tensor)

        # Index of the correct answer.
        label = torch.tensor(self.data_y['label'].iloc[idx], dtype=torch.long)
        return input_ids_list, attention_mask_list, image_tensor, label

###########################################
# BERT Model for Text Processing (with Fine-Tuning)
###########################################

class TextModel(nn.Module):
    """BERT text encoder with an all-or-nothing fine-tuning switch.

    Args:
        bert_model_name (str): Name passed to ``BertModel.from_pretrained``.
        fine_tune (bool): If truthy, every BERT parameter is trainable;
            otherwise every BERT parameter is frozen.
        reduced_dim (int): Output size of the projection applied to the
            [CLS] vector (default 512).
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune=True, reduced_dim=512):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        # One switch for the whole encoder: trainable exactly when fine_tune is truthy.
        self.bert.requires_grad_(bool(fine_tune))

        # Project BERT's 768-wide hidden state down to reduced_dim.
        self.fc_reduce = nn.Linear(768, reduced_dim)

    def forward(self, input_ids, attention_mask):
        """Return projected [CLS] features of shape (batch_size, num_choices, reduced_dim).

        Args:
            input_ids: Token ids, unpacked as (batch_size, num_choices, seq_len).
            attention_mask: Attention mask, reshaped the same way as ``input_ids``.
        """
        batch_size, num_choices, seq_len = input_ids.size()

        # Fold the choice axis into the batch axis so BERT sees 2-D inputs.
        flat_ids = input_ids.view(-1, seq_len)
        flat_mask = attention_mask.view(-1, seq_len)

        encoded = self.bert(input_ids=flat_ids, attention_mask=flat_mask)
        # Position 0 of the last hidden state is the [CLS] token.
        cls_vectors = encoded.last_hidden_state[:, 0, :]

        # Project, then restore the (batch, choice) layout.
        return self.fc_reduce(cls_vectors).view(batch_size, num_choices, -1)




###########################################
# Image Model (VGG-16) with Freezing and Unfreezing Layers
###########################################

class ImageModel(nn.Module):
    """VGG-16 convolutional feature extractor followed by a linear projection.

    Args:
        fine_tune_layers (bool): If True, the last nine parameter tensors of
            ``vgg16.features`` are made trainable; all other VGG-16
            parameters stay frozen.
        reduced_dim (int): Output width of the projection layer.
    """

    def __init__(self, fine_tune_layers=True, reduced_dim=512):
        super().__init__()

        # ImageNet-pretrained backbone, frozen in full to begin with.
        self.vgg16 = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
        self.vgg16.requires_grad_(False)

        if fine_tune_layers:
            # NOTE(review): nine tensors is an odd count; if each conv layer
            # contributes a weight and a bias, one layer ends up with only its
            # bias trainable - confirm this is intended.
            trainable_tail = list(self.vgg16.features.parameters())[-9:]
            for tensor in trainable_tail:
                tensor.requires_grad_(True)

        self.reduced_dim = reduced_dim
        # 25088 = 512 * 7 * 7; assumes inputs sized so that `features` yields
        # 512 x 7 x 7 maps (e.g. 224 x 224 images) - TODO confirm.
        self.fc_reduce = nn.Linear(25088, reduced_dim)

    def forward(self, image_tensor):
        """Return projected image features of shape (batch_size, reduced_dim)."""
        conv_maps = self.vgg16.features(image_tensor)
        # Keep the batch axis and collapse everything else into one vector.
        flat_maps = torch.flatten(conv_maps, start_dim=1)
        return self.fc_reduce(flat_maps)

###########################################
# Fusion Model (Text + Image)
###########################################

class FusionModel(nn.Module):
    """Fuse per-choice text features with per-image features.

    Text features come from ``TextModel`` and image features from
    ``ImageModel``; both are projected to ``reduced_dim``. The image vector is
    repeated for every answer choice, concatenated with that choice's text
    vector, and passed through Linear -> BatchNorm1d -> Dropout to give a
    64-dimensional fused vector per choice.

    Args:
        bert_model_name (str): Pretrained BERT name forwarded to ``TextModel``.
        fine_tune_bert (bool): Forwarded to ``TextModel`` as ``fine_tune``.
        fine_tune_layers (bool): Forwarded to ``ImageModel``.
        dropout_prob (float): Dropout probability applied after batch norm.
        reduced_dim (int): Feature width used by BOTH sub-models.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super(FusionModel, self).__init__()

        # Both encoders must project to the same `reduced_dim`; previously the
        # text model silently fell back to its own default, so any value other
        # than that default made the concatenated width disagree with
        # `fusion_layer` and crashed in forward().
        self.text_model = TextModel(bert_model_name=bert_model_name, fine_tune=fine_tune_bert, reduced_dim=reduced_dim)
        self.image_model = ImageModel(fine_tune_layers=fine_tune_layers, reduced_dim=reduced_dim)

        # Fusion layers for combining text and image features
        fusion_input_size = reduced_dim + reduced_dim  # text width + image width
        self.fusion_layer = nn.Linear(fusion_input_size, 64)
        self.batch_norm = nn.BatchNorm1d(64)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids, attention_mask, image_tensor):
        """Return fused features of shape (batch_size, num_choices, 64).

        Args:
            input_ids: Token ids, (batch_size, num_choices, seq_len).
            attention_mask: Attention mask passed to the text model alongside
                ``input_ids``.
            image_tensor: Image batch passed to the image model (one image per
                question).
        """
        # Text features for all choices: (batch_size, num_choices, reduced_dim)
        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask)

        # Image features: (batch_size, reduced_dim)
        image_features = self.image_model(image_tensor=image_tensor)

        # Repeat the image vector for every choice: (batch_size, num_choices, reduced_dim)
        batch_size, num_choices, _ = text_features.size()
        image_features = image_features.unsqueeze(1).expand(batch_size, num_choices, -1)

        # Concatenate along the feature axis: (batch_size, num_choices, 2 * reduced_dim)
        combined_features = torch.cat((text_features, image_features), dim=2)

        # Linear / BatchNorm1d operate on 2-D input, so fold choices into the batch axis
        combined_features = combined_features.view(-1, combined_features.size(2))
        fused_features = self.fusion_layer(combined_features)
        fused_features = self.batch_norm(fused_features)
        fused_features = self.dropout(fused_features)

        # Unfold back to (batch_size, num_choices, 64)
        fused_features = fused_features.view(batch_size, num_choices, -1)

        return fused_features

###########################################
# VQA Model
###########################################

class VQA_Model(nn.Module):
    """Multiple-choice VQA scorer producing one logit per answer choice.

    A ``FusionModel`` supplies a 64-dimensional fused text+image vector for
    each choice; a single linear unit turns each vector into a scalar score.
    """

    def __init__(self, bert_model_name='bert-base-uncased', fine_tune_bert=True, fine_tune_layers=True, dropout_prob=0.3, reduced_dim=512):
        super().__init__()

        # All constructor options are handed straight to the fusion sub-model.
        fusion_options = dict(
            bert_model_name=bert_model_name,
            fine_tune_bert=fine_tune_bert,
            fine_tune_layers=fine_tune_layers,
            dropout_prob=dropout_prob,
            reduced_dim=reduced_dim,
        )
        self.fusion_model = FusionModel(**fusion_options)

        # Scoring head: 64 fused features -> 1 logit.
        self.classifier = nn.Linear(64, 1)

    def forward(self, input_ids, attention_mask, image_tensor):
        """Return logits shaped (batch_size, num_choices)."""
        fused = self.fusion_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_tensor=image_tensor,
        )

        # The head emits a trailing axis of size 1; drop it.
        per_choice = self.classifier(fused).squeeze(-1)

        # Explicit (batch_size, num_choices) layout so choices can be compared per question.
        return per_choice.view(input_ids.size(0), fused.size(1))

###########################################
# Early Stopping Class
###########################################

class EarlyStopping:
    """Track validation loss, checkpoint the best model, and signal when to stop.

    Call the instance once per epoch with the epoch's validation loss. After
    ``patience`` consecutive calls whose loss is worse than the best loss by
    more than ``delta`` (or is NaN), ``early_stop`` becomes True.

    Args:
        patience (int): Consecutive non-tolerated epochs allowed before stopping.
        delta (float): Tolerance; a loss up to ``best_loss + delta`` resets the
            patience counter.
        path (str): File the best model's ``state_dict`` is written to.

    Attributes set here: ``counter`` (current run of bad epochs),
    ``best_loss`` (lowest loss seen, None before the first valid call) and
    ``early_stop`` (stop flag).
    """

    def __init__(self, patience=5, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stop state.

        Args:
            val_loss: Validation loss for the epoch just finished.
            model: Model whose ``state_dict`` is saved when the loss is a new best.
        """
        # NaN is the only value that is not equal to itself.
        if val_loss != val_loss:
            # A diverged epoch must neither reset patience nor replace the
            # checkpoint; every comparison against NaN is False, which
            # previously made NaN look like an improvement.
            tolerated = improved = False
        elif self.best_loss is None:
            tolerated = improved = True
        else:
            tolerated = val_loss <= self.best_loss + self.delta
            # Only a loss that is genuinely no worse than the best may replace
            # it. Accepting anything inside the tolerance band let the
            # baseline drift upward and overwrote the checkpoint with a worse
            # model whenever delta > 0.
            improved = tolerated and val_loss <= self.best_loss

        if improved:
            self.best_loss = val_loss
            self.save_checkpoint(val_loss, model)

        if tolerated:
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        """Write ``model.state_dict()`` to ``self.path`` (``val_loss`` is not stored)."""
        torch.save(model.state_dict(), self.path)

###########################################
# Training Function
###########################################

def train_model(model, train_dataloader, val_dataloader, num_epochs=10, lr=1e-5, checkpoint_path='model_checkpoint.pt', early_stopping_patience=3):
    """Train ``model`` with AdamW, LR-on-plateau scheduling and early stopping.

    Each epoch runs one pass over ``train_dataloader`` (cross-entropy with
    label smoothing 0.1, gradient-norm clipping at 1.0), then evaluates the
    same loss on ``val_dataloader``. The mean validation loss drives both the
    ``ReduceLROnPlateau`` scheduler and the ``EarlyStopping`` helper, which
    writes the best ``state_dict`` to ``checkpoint_path``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, image_tensor)``
            and returning logits.
        train_dataloader: Yields ``(input_ids, attention_mask, image_tensor, labels)``.
        val_dataloader: Same batch layout, used for validation.
        num_epochs (int): Maximum number of epochs.
        lr (float): AdamW learning rate.
        checkpoint_path (str): Where the best weights are saved.
        early_stopping_patience (int): Patience passed to ``EarlyStopping``.

    Returns:
        None. The model is trained in place; the best weights are on disk at
        ``checkpoint_path`` and are not reloaded into ``model`` here.

    Raises:
        ValueError: If either dataloader has no batches.
    """
    # Fail early with a clear message instead of a ZeroDivisionError when the
    # per-epoch averages are computed.
    if len(train_dataloader) == 0 or len(val_dataloader) == 0:
        raise ValueError("train_dataloader and val_dataloader must each contain at least one batch")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Optimizer and Learning Rate Scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

    # Early stopping
    early_stopping = EarlyStopping(patience=early_stopping_patience, path=checkpoint_path)

    # Built once up front: the validation loop uses it too, so it must not
    # depend on the training loop having executed at least one batch.
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        # Training Loop
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            input_ids, attention_mask, image_tensor, labels = batch
            input_ids, attention_mask, image_tensor, labels = input_ids.to(device), attention_mask.to(device), image_tensor.to(device), labels.to(device)
            optimizer.zero_grad()

            logits = model(input_ids, attention_mask, image_tensor)
            loss = loss_fn(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

        # Validation Loop
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids, attention_mask, image_tensor, labels = batch
                input_ids, attention_mask, image_tensor, labels = input_ids.to(device), attention_mask.to(device), image_tensor.to(device), labels.to(device)
                logits = model(input_ids, attention_mask, image_tensor)
                val_loss += loss_fn(logits, labels).item()

        avg_val_loss = val_loss / len(val_dataloader)
        print(f"Validation Loss: {avg_val_loss:.4f}")
        scheduler.step(avg_val_loss)

        # Early stopping check (also checkpoints the best weights)
        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break

    print("Training complete.")


In [None]:
# Draw fixed-size random subsets of the train and test data (9600 / 2400 rows).
# NOTE(review): inputs and labels are sampled independently with the same
# seed; they stay paired only if each x/y pair has the same length and row
# order - confirm, or select the labels by the sampled inputs' index instead.
sampled_train_x, sampled_train_y = (
    frame.sample(n=9600, random_state=42) for frame in (train_x, train_y)
)
sampled_test_x, sampled_test_y = (
    frame.sample(n=2400, random_state=42) for frame in (test_x, test_y)
)

In [None]:
###########################################
# Example Usage: train on the sampled subset
###########################################

# Wrap the sampled splits in datasets; the sampled test split acts as validation data
train_dataset = VQADataset(data_x=sampled_train_x, data_y=sampled_train_y, image_dict=image_dict)
val_dataset = VQADataset(data_x=sampled_test_x, data_y=sampled_test_y, image_dict=image_dict)

# Batches of 64; only the training batches are shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=64)

# BERT fully trainable, tail of the image model trainable, heavy dropout
model = VQA_Model(
    bert_model_name='bert-base-uncased',
    fine_tune_bert=True,
    fine_tune_layers=True,
    dropout_prob=0.5,
    reduced_dim=512,
)

# Up to 500 epochs; early stopping (patience 10) ends the run sooner and
# keeps the best weights at the checkpoint path
train_model(
    model,
    train_dataloader,
    val_dataloader,
    num_epochs=500,
    lr=1e-5,
    checkpoint_path='vqa_model_checkpoint_v6.pt',
    early_stopping_patience=10,
)


Epoch 1/500: 100%|██████████| 600/600 [11:42<00:00,  1.17s/it]


Epoch 1/500, Loss: 1.5121
Validation Loss: 1.1904


Epoch 2/500:  64%|██████▍   | 387/600 [07:34<04:09,  1.17s/it]


KeyboardInterrupt: 

In [None]:
import torch

# Initialize the dataset for training
train_dataset = VQADataset(data_x=train_x, data_y=train_y, image_dict=image_dict)

# Initialize the dataset for validation (the test split is used as validation data)
val_dataset = VQADataset(data_x=test_x, data_y=test_y, image_dict=image_dict)

# Define Dataset and DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the model; its layer shapes must match the checkpoint being loaded
model = VQA_Model(
    bert_model_name='bert-base-uncased',  # Pretrained BERT model
    fine_tune_bert=False,                 # Freeze BERT layers
    fine_tune_layers=True,                # Fine-tune layers of the image model
    dropout_prob=0.2,                     # Dropout for regularization
    reduced_dim=512                       # Dimension reduction in both text and image models
)

# Load the checkpoint (use the appropriate device, CPU or GPU)
checkpoint_path = 'vqa_model_checkpoint.pt'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds; it avoids executing arbitrary pickled code
# and silences the FutureWarning that torch.load emits without it.
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)

# Load the model weights from the checkpoint
model.load_state_dict(checkpoint)

# Move the model to the correct device
model = model.to(device)

# NOTE: train_model() builds its own AdamW optimizer from its `lr` argument,
# so this optimizer is NOT used by the call below. It is kept because later
# cells reference the global `optimizer`.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Checkpoints written by EarlyStopping.save_checkpoint are bare state_dicts
# with no 'epoch' entry, so this evaluates to 0 for them.
epoch = checkpoint.get('epoch', 0)

# Continue training from the loaded weights (optimizer/scheduler state starts fresh)
train_model(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    num_epochs=5000,  # Upper bound; early stopping normally ends the run
    lr=1e-4,
    checkpoint_path='vqa_model_checkpoint_v8.pt',
    early_stopping_patience=10
)


  checkpoint = torch.load(checkpoint_path, map_location=device)
Epoch 1/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 1/5000, Loss: 0.9122
Validation Loss: 0.9351


Epoch 2/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 2/5000, Loss: 0.9046
Validation Loss: 0.9351


Epoch 3/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 3/5000, Loss: 0.9013
Validation Loss: 0.9291


Epoch 4/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 4/5000, Loss: 0.9006
Validation Loss: 0.9282


Epoch 5/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 5/5000, Loss: 0.8969
Validation Loss: 0.9279


Epoch 6/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 6/5000, Loss: 0.8988
Validation Loss: 0.9241


Epoch 7/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 7/5000, Loss: 0.8993
Validation Loss: 0.9306


Epoch 8/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 8/5000, Loss: 0.8974
Validation Loss: 0.9228


Epoch 9/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 9/5000, Loss: 0.8950
Validation Loss: 0.9301


Epoch 10/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 10/5000, Loss: 0.8962
Validation Loss: 0.9241


Epoch 11/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 11/5000, Loss: 0.8948
Validation Loss: 0.9269


Epoch 12/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 12/5000, Loss: 0.8963
Validation Loss: 0.9262


Epoch 13/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 13/5000, Loss: 0.8950
Validation Loss: 0.9263


Epoch 14/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 14/5000, Loss: 0.8943
Validation Loss: 0.9261


Epoch 15/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 15/5000, Loss: 0.8923
Validation Loss: 0.9262


Epoch 16/5000: 100%|██████████| 750/750 [19:17<00:00,  1.54s/it]


Epoch 16/5000, Loss: 0.8915
Validation Loss: 0.9264


Epoch 17/5000:   0%|          | 3/750 [00:06<25:35,  2.06s/it]


KeyboardInterrupt: 

In [None]:
import time
from IPython.display import clear_output

# `num_epochs` was never defined in this cell (NameError); set it explicitly.
num_epochs = 10       # adjust as needed
REFRESH_SECONDS = 2   # minimum time between progress refreshes

# Global variables to store training progress
# (the accuracy figures restart at the beginning of every epoch)
global_accuracy = 0
global_num_correct = 0
global_total_samples = 0
global_batch_index = 0

# Built once instead of once per batch
loss_fn = nn.CrossEntropyLoss()
last_refresh = time.monotonic()

# Training Loop
# Relies on `model`, `optimizer`, `device` and `train_dataloader` from earlier cells.
for epoch in range(num_epochs):
    model.train()
    correct_predictions = 0
    total_samples = 0

    for batch in train_dataloader:
        input_ids, attention_mask, image_tensor, labels = batch
        input_ids, attention_mask, image_tensor, labels = input_ids.to(device), attention_mask.to(device), image_tensor.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, image_tensor)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Calculate accuracy for this batch
        preds = torch.argmax(logits, dim=1)
        num_correct = (preds == labels).sum().item()
        total_samples += labels.size(0)
        correct_predictions += num_correct

        # Update global accuracy
        global_num_correct = correct_predictions
        global_total_samples = total_samples
        global_accuracy = correct_predictions / total_samples
        global_batch_index += 1

        # Report progress from inside the loop. A separate polling loop placed
        # after training can only start once training has finished, so it
        # would either never run or spin forever on values that no longer
        # change.
        now = time.monotonic()
        if now - last_refresh >= REFRESH_SECONDS:
            clear_output(wait=True)
            print(f"Training Progress - Batch {global_batch_index}:")
            print(f"Accuracy: {global_accuracy * 100:.2f}%")
            last_refresh = now


NameError: name 'num_epochs' is not defined

In [None]:
###########################################
# Example Usage: train on the full dataset
###########################################

# Full train split for training, full test split as validation data
train_dataset = VQADataset(image_dict=image_dict, data_x=train_x, data_y=train_y)
val_dataset = VQADataset(image_dict=image_dict, data_x=test_x, data_y=test_y)

# Batches of 64; shuffle the training data only
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=64)

# BERT stays frozen here; the tail of the image model is still fine-tuned
model = VQA_Model(
    bert_model_name='bert-base-uncased',
    fine_tune_bert=False,
    fine_tune_layers=True,
    dropout_prob=0.3,
    reduced_dim=512,
)

# Up to 5000 epochs; early stopping (patience 10) ends the run sooner and
# keeps the best weights at the checkpoint path
train_model(
    model,
    train_dataloader,
    val_dataloader,
    num_epochs=5000,
    lr=1e-4,
    checkpoint_path='vqa_model_checkpoint_v7_fullset.pt',
    early_stopping_patience=10,
)


Epoch 1/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 1/5000, Loss: 2.2676
Validation Loss: 1.8259


Epoch 2/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 2/5000, Loss: 2.2025
Validation Loss: 1.8054


Epoch 3/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 3/5000, Loss: 2.1833
Validation Loss: 1.7972


Epoch 4/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 4/5000, Loss: 2.1868
Validation Loss: 1.8011


Epoch 5/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 5/5000, Loss: 2.1771
Validation Loss: 1.8048


Epoch 6/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 6/5000, Loss: 2.1710
Validation Loss: 1.8053


Epoch 7/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 7/5000, Loss: 2.1740
Validation Loss: 1.7970


Epoch 8/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 8/5000, Loss: 2.1609
Validation Loss: 1.7850


Epoch 9/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 9/5000, Loss: 2.1641
Validation Loss: 1.7874


Epoch 10/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 10/5000, Loss: 2.1492
Validation Loss: 1.7861


Epoch 11/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 11/5000, Loss: 2.1664
Validation Loss: 1.7914


Epoch 12/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 12/5000, Loss: 2.1561
Validation Loss: 1.7865


Epoch 13/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 13/5000, Loss: 2.1591
Validation Loss: 1.7894


Epoch 14/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 14/5000, Loss: 2.1576
Validation Loss: 1.7877


Epoch 15/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 15/5000, Loss: 2.1644
Validation Loss: 1.7873


Epoch 16/5000: 100%|██████████| 750/750 [19:16<00:00,  1.54s/it]


Epoch 16/5000, Loss: 2.1608
Validation Loss: 1.7891


Epoch 17/5000:  53%|█████▎    | 397/750 [10:13<09:05,  1.55s/it]


KeyboardInterrupt: 

### Memory Cleanup

In [None]:
import multiprocessing

# Ask the runtime how many CPUs it reports and print the count
num_cores = multiprocessing.cpu_count()
print(f"Number of CPU cores: {num_cores}")


Number of CPU cores: 8


In [None]:
# List every running process with its CPU and memory usage
# (IPython shell escape; the output is a snapshot taken when the cell ran)
!ps -aux

USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root           1  0.0  0.0   1076     8 ?        Ss   01:27   0:00 /sbin/docker-init -- /datalab/run
root           7 11.6  0.0 953464 81772 ?        Sl   01:27   6:25 /tools/node/bin/node /datalab/web
root           9  0.0  0.0   7376  3488 ?        S    01:27   0:01 /bin/bash -e /usr/local/colab/bin
root          11  0.0  0.0   7376  1940 ?        S    01:27   0:00 /bin/bash -e /datalab/run.sh
root          12  0.0  0.0 1238116 15280 ?       Sl   01:27   0:01 /usr/colab/bin/kernel_manager_pro
root          31  0.0  0.0   5808  1048 ?        Ss   01:27   0:00 tail -n +0 -F /root/.config/Googl
root          37  0.0  0.0   5808  1012 ?        Ss   01:27   0:00 tail -n +0 -F /root/.config/Googl
root          89  0.2  0.0      0     0 ?        Z    01:27   0:06 [python3] <defunct>
root          90  0.0  0.0  67740 52384 ?        S    01:27   0:00 python3 /usr/local/bin/colab-file
root         139  0.2  0.2 1065900 

In [None]:
import gc

import torch

# Drop the notebook-level references that keep training objects (and the GPU
# tensors they own) alive. The target is `model`, not `train_model`: deleting
# the training function frees nothing and breaks later training cells.
# `checkpoint` is included because it was loaded with map_location=device.
# pop(..., None) keeps the cell re-runnable when a name is already gone.
for _name in ("model", "optimizer", "checkpoint", "train_dataloader", "val_dataloader"):
    globals().pop(_name, None)

# Collect unreachable objects first so their tensors are actually released,
# then return the freed cached blocks to the GPU driver.
gc.collect()
torch.cuda.empty_cache()



In [None]:
# Show the GPU's current memory usage and utilization (IPython shell escape)
!nvidia-smi


Fri Oct 11 17:23:51 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   57C    P0              29W /  72W |  21945MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import gc
# Run a garbage-collection pass first so unreachable objects are destroyed...
gc.collect()
# ...then ask PyTorch to release its cached, currently unused GPU memory.
# Relies on `torch` having been imported by an earlier cell.
torch.cuda.empty_cache()