In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Libraries and Define Dependencies
This section imports necessary libraries for data processing, model creation, and training. We use libraries like `pandas` for data manipulation, `torch` for model training, and `transformers` for BERT integration.


In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import models, transforms
from PIL import Image
from transformers import BertModel, BertTokenizer

# Load Data
Here, we load the training data and target data for our multimodal ranking task.


In [20]:
train_data = pd.read_csv('/kaggle/input/semval-dataset/train/subtask_a_train.csv')
target_data = pd.read_csv('/kaggle/input/target-column-dataset/target_t.csv')

# Define Image Transformations
We define a series of image transformations to preprocess images before feeding them into the ResNet model. These include resizing, normalization, and tensor conversion.


In [3]:
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


# Define the Dataset Class
This custom dataset class handles the loading of images and text data. It also tokenizes text inputs with BERT's tokenizer and retrieves image features for each idiom.


In [4]:
class IdiomImageDataset(Dataset):
    def __init__(self, dataframe, target_df, image_dir):
        self.dataframe = dataframe
        self.target_df = target_df
        self.image_dir = image_dir
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        target_row = self.target_df.iloc[index]
        sentence = row['sentence']
        idiom_name = row['compound'].replace("'", "_")
        image_names = [row[f'image{i}_name'] for i in range(1, 6)]

        expected_order = eval(target_row['target'])
        expected_order = [x - 1 for x in expected_order]

        inputs = self.tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        images = []
        for img_name in image_names:
            img_path = os.path.join(self.image_dir, idiom_name, img_name)
            img = Image.open(img_path).convert('RGB')
            img = image_transforms(img)
            images.append(img)
        images_tensor = torch.stack(images)
        expected_order_tensor = torch.tensor(expected_order, dtype=torch.long)
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0), images_tensor, expected_order_tensor


# Define the Model Class
We define a multimodal ranking model that combines text features from BERT and image features from ResNet. These features are merged, passed through fully connected layers, and used to predict the ranking of images.


In [5]:
class MultimodalRankingModel(nn.Module):
    def __init__(self):
        super(MultimodalRankingModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.resnet = models.resnet50(weights='DEFAULT')
        self.resnet.fc = nn.Identity()
        self.fc1 = nn.Linear(768 + 2048, 512)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, 5)

    def forward(self, input_ids, attention_mask, images):
        text_features = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        text_features = text_features.mean(dim=1)
        
        batch_size, num_images, channels, height, width = images.size()
        images = images.view(batch_size * num_images, channels, height, width)
        image_features = self.resnet(images)
        image_features = image_features.view(batch_size, num_images, -1)
        
        combined_features = torch.cat((text_features.unsqueeze(1).expand(-1, num_images, -1), image_features), dim=2)
        x = torch.relu(self.fc1(combined_features))
        x = self.dropout(x)
        rankings = self.fc2(x).squeeze(-1)  # Shape: [batch_size, num_images]
        
        return rankings


# Prepare Data and Split into Train and Test Sets
We initialize the dataset and split it into training and testing sets. The data loaders help with batch loading of data during training.


In [21]:
import torch

# Set a seed for reproducibility
torch.manual_seed(42)

# Define image folder and initialize dataset
image_folder = '/kaggle/input/semval-dataset/train'
dataset = IdiomImageDataset(train_data, target_data, image_folder)

# Define static train-test split
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Initialize data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


# Initialize Model, Optimizer, and Loss Function
We initialize our multimodal ranking model, set up an optimizer (Adam), and specify cross-entropy loss as the loss function for ranking.


In [8]:
model = MultimodalRankingModel()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 148MB/s] 


# Training Function
This function performs model training over a specified number of epochs, iterating through the data loader and updating model weights based on the calculated loss.


In [9]:
def train_model(model, data_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for input_ids, attention_mask, images, expected_order in data_loader:
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, images)
            loss = 0
            for i in range(outputs.size(1)):  # Loop over each image's rank
                loss += criterion(outputs[:, i], expected_order[:, i])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(data_loader)}')


# Run Training
We now run the training process and save the trained model weights.


In [22]:
train_model(model, train_loader, criterion, optimizer, epochs=10)
torch.save(model.state_dict(), 'multimodal_ranking_model.pth')


Epoch 1/10, Loss: 4.664624571800232
Epoch 2/10, Loss: 3.915795624256134
Epoch 3/10, Loss: 3.0234537720680237
Epoch 4/10, Loss: 2.3471586108207703
Epoch 5/10, Loss: 1.7941093742847443
Epoch 6/10, Loss: 1.205925703048706
Epoch 7/10, Loss: 0.859320729970932
Epoch 8/10, Loss: 0.5839527547359467
Epoch 9/10, Loss: 0.41342754662036896
Epoch 10/10, Loss: 0.32215698063373566


# Model Evaluation
The evaluation function uses the test dataset to generate rankings and compares predicted rankings against the true rankings.


In [23]:
def evaluate_model(model, data_loader):
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask, images, _ in data_loader:
            outputs = model(input_ids, attention_mask, images)
            rankings = torch.argsort(outputs, dim=1)
            all_predictions.extend(rankings.cpu().numpy())
    return all_predictions


# Calculate Ranking Accuracy
Here, we convert the predicted ranking matrix into ordered arrays and calculate metrics such as Mean Reciprocal Rank (MRR) and Kendall Tau score for ranking accuracy evaluation.


In [24]:
import numpy as np
predicted_rankings = evaluate_model(model, test_loader)

# Convert each (5x5) prediction array to a rank order
final_predicted_rankings = []
for prediction_matrix in predicted_rankings:
    image_scores = prediction_matrix.sum(axis=1)
    ranked_order = np.argsort(image_scores)[::-1] + 1
    final_predicted_rankings.append(ranked_order.tolist())

# Get the test indices
test_indices = test_dataset.indices
print("Test Indices:", test_indices)

# True rankings for comparison
true_test_rankings = [eval(target_data.iloc[idx]['target']) for idx in test_indices]


Test Indices: [46, 1, 35, 4, 40, 11, 8, 44, 34, 52, 21, 48, 53, 67]


In [25]:
final_predicted_rankings

[[1, 4, 5, 3, 2],
 [2, 3, 4, 1, 5],
 [4, 1, 5, 3, 2],
 [2, 3, 5, 4, 1],
 [4, 3, 2, 5, 1],
 [4, 5, 3, 1, 2],
 [3, 4, 5, 2, 1],
 [4, 5, 2, 1, 3],
 [4, 2, 3, 5, 1],
 [5, 3, 4, 2, 1],
 [1, 4, 5, 2, 3],
 [1, 5, 4, 2, 3],
 [1, 4, 5, 3, 2],
 [3, 5, 4, 1, 2]]

In [26]:
true_test_rankings 

[[1, 2, 5, 4, 3],
 [2, 4, 3, 1, 5],
 [3, 1, 5, 2, 4],
 [4, 3, 1, 2, 5],
 [1, 4, 5, 3, 2],
 [3, 1, 2, 5, 4],
 [2, 3, 4, 5, 1],
 [2, 4, 1, 3, 5],
 [3, 4, 1, 5, 2],
 [1, 3, 2, 4, 5],
 [1, 5, 2, 4, 3],
 [1, 3, 5, 2, 4],
 [2, 4, 1, 5, 3],
 [3, 2, 5, 1, 4]]

In [27]:
def mean_reciprocal_rank(true_rankings, predicted_rankings):
    reciprocal_ranks = []
    for true, pred in zip(true_rankings, predicted_rankings):
        for i, p in enumerate(pred):
            if p == true[i]:
                reciprocal_ranks.append(1 / (i + 1))
                break
        else:
            reciprocal_ranks.append(0)
    return np.mean(reciprocal_ranks)
print(mean_reciprocal_rank(true_test_rankings,final_predicted_rankings))

0.5321428571428571


In [30]:
import numpy as np

def calculate_ranking_accuracy(predicted_rankings, true_rankings):
    """
    Calculate the percentage of images ranked correctly.
    
    Parameters:
    predicted_rankings (list of list of int): The predicted ranking for each image set.
    true_rankings (list of list of int): The true ranking for each image set.
    
    Returns:
    float: The average percentage of images ranked correctly.
    """
    assert len(predicted_rankings) == len(true_rankings), "Predicted and true rankings must have the same length."
    
    total_correct = 0
    total_images = 0
    
    for pred_ranking, true_ranking in zip(predicted_rankings, true_rankings):
        # Count correctly ranked images
        correct = sum(1 for p, t in zip(pred_ranking, true_ranking) if p == t)
        total_correct += correct
        total_images += len(true_ranking)
    
    accuracy = (total_correct / total_images) * 100
    return accuracy

accuracy = calculate_ranking_accuracy(true_test_rankings,final_predicted_rankings)
print(f"Ranking Accuracy: {accuracy:.2f}%")


Ranking Accuracy: 25.71%
