In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import models, transforms
from PIL import Image
from transformers import BertModel, BertTokenizer

# Load data
train_data = pd.read_csv('/kaggle/input/ml-project-dataset/train/subtask_a_train.csv')
target_data = pd.read_csv('/kaggle/input/target-data-files/target_t.csv')

# Image transformations
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Dataset class
class IdiomImageDataset(Dataset):
    def __init__(self, dataframe, target_df, image_dir):
        self.dataframe = dataframe
        self.target_df = target_df
        self.image_dir = image_dir
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        target_row = self.target_df.iloc[index]
        sentence = row['sentence']
        idiom_name = row['compound'].replace("'", "_")
        image_names = [row[f'image{i}_name'] for i in range(1, 6)]

        expected_order = eval(target_row['target'])
        expected_order = [x - 1 for x in expected_order]

        inputs = self.tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        images = []
        for img_name in image_names:
            img_path = os.path.join(self.image_dir, idiom_name, img_name)
            img = Image.open(img_path).convert('RGB')
            img = image_transforms(img)
            images.append(img)
        images_tensor = torch.stack(images)
        expected_order_tensor = torch.tensor(expected_order, dtype=torch.long)
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0), images_tensor, expected_order_tensor

# Model class with dropout layers and cross-entropy for ranking
class MultimodalRankingModel(nn.Module):
    def __init__(self):
        super(MultimodalRankingModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.resnet = models.resnet50(weights='DEFAULT')
        self.resnet.fc = nn.Identity()
        self.fc1 = nn.Linear(768 + 2048, 512)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, 5)

    def forward(self, input_ids, attention_mask, images):
        text_features = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        text_features = text_features.mean(dim=1)
        
        batch_size, num_images, channels, height, width = images.size()
        images = images.view(batch_size * num_images, channels, height, width)
        image_features = self.resnet(images)
        image_features = image_features.view(batch_size, num_images, -1)
        
        combined_features = torch.cat((text_features.unsqueeze(1).expand(-1, num_images, -1), image_features), dim=2)
        x = torch.relu(self.fc1(combined_features))
        x = self.dropout(x)
        rankings = self.fc2(x).squeeze(-1)  # Shape: [batch_size, num_images]
        
        return rankings

# Data preparation
image_folder = '/kaggle/input/ml-project-dataset/train'
dataset = IdiomImageDataset(train_data, target_data, image_folder)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Training setup
model = MultimodalRankingModel()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Training function
def train_model(model, data_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for input_ids, attention_mask, images, expected_order in data_loader:
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, images)
            loss = 0
            for i in range(outputs.size(1)):  # Loop over each image's rank
                loss += criterion(outputs[:, i], expected_order[:, i])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(data_loader)}')

# Run training
train_model(model, train_loader, criterion, optimizer, epochs=10)
torch.save(model.state_dict(), 'multimodal_ranking_model.pth')

# Evaluation function
def evaluate_model(model, data_loader):
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask, images, _ in data_loader:
            outputs = model(input_ids, attention_mask, images)
            rankings = torch.argsort(outputs, dim=1)
            all_predictions.extend(rankings.cpu().numpy())
    return all_predictions

# Calculate ranking accuracy
predicted_rankings = evaluate_model(model, test_loader)

Epoch 1/10, Loss: 8.043292045593262
Epoch 2/10, Loss: 7.724140167236328
Epoch 3/10, Loss: 7.4763606786727905
Epoch 4/10, Loss: 7.143829822540283
Epoch 5/10, Loss: 6.789019227027893
Epoch 6/10, Loss: 6.301056861877441
Epoch 7/10, Loss: 5.747622489929199
Epoch 8/10, Loss: 5.074761748313904
Epoch 9/10, Loss: 4.441108584403992
Epoch 10/10, Loss: 3.8312588334083557


In [3]:
predicted_rankings

[array([[1, 4, 0, 0, 0],
        [2, 2, 4, 3, 3],
        [4, 1, 3, 1, 2],
        [3, 3, 2, 2, 4],
        [0, 0, 1, 4, 1]]),
 array([[1, 4, 2, 3, 3],
        [4, 2, 0, 0, 0],
        [2, 0, 3, 1, 4],
        [0, 1, 4, 2, 1],
        [3, 3, 1, 4, 2]]),
 array([[0, 0, 3, 1, 2],
        [2, 4, 4, 3, 1],
        [1, 2, 0, 4, 3],
        [4, 3, 2, 2, 4],
        [3, 1, 1, 0, 0]]),
 array([[3, 0, 4, 4, 4],
        [2, 1, 1, 2, 2],
        [1, 3, 0, 1, 3],
        [0, 2, 3, 0, 0],
        [4, 4, 2, 3, 1]]),
 array([[2, 2, 1, 4, 4],
        [3, 1, 3, 1, 0],
        [0, 3, 0, 0, 3],
        [1, 0, 2, 3, 1],
        [4, 4, 4, 2, 2]]),
 array([[4, 4, 3, 1, 3],
        [0, 0, 4, 3, 1],
        [2, 2, 1, 0, 2],
        [1, 1, 0, 2, 0],
        [3, 3, 2, 4, 4]]),
 array([[2, 3, 0, 0, 0],
        [3, 1, 1, 1, 2],
        [0, 2, 4, 4, 4],
        [4, 4, 3, 2, 3],
        [1, 0, 2, 3, 1]]),
 array([[2, 4, 1, 1, 0],
        [3, 3, 2, 0, 1],
        [0, 2, 0, 3, 2],
        [4, 1, 4, 4, 3],
        [1,

In [5]:
import numpy as np
# Convert each (5x5) prediction array to a rank order
final_predicted_rankings = []

for prediction_matrix in predicted_rankings:
    # Sum across rows to get a single score for each image
    image_scores = prediction_matrix.sum(axis=1)
    # Get the ranking order based on scores (higher score means higher rank)
    ranked_order = np.argsort(image_scores)[::-1] + 1
    final_predicted_rankings.append(ranked_order.tolist())


In [6]:
final_predicted_rankings

[[4, 2, 3, 5, 1],
 [5, 1, 3, 4, 2],
 [4, 2, 3, 1, 5],
 [1, 5, 3, 2, 4],
 [5, 1, 2, 4, 3],
 [5, 1, 2, 3, 4],
 [4, 3, 2, 5, 1],
 [4, 5, 2, 1, 3],
 [1, 5, 4, 3, 2],
 [3, 4, 5, 1, 2],
 [5, 1, 4, 2, 3],
 [2, 5, 4, 3, 1],
 [5, 2, 4, 1, 3],
 [4, 2, 5, 3, 1]]

In [8]:
# Get the indices of the test data points
test_indices = test_dataset.indices

# Display the indices
print("Test Indices:", test_indices)

# Optionally, print the actual test data points (e.g., true rankings or other features)
true_test_rankings = [eval(target_data.iloc[idx]['target']) for idx in test_indices]
true_test_rankings

Test Indices: [40, 30, 8, 53, 21, 37, 18, 68, 61, 46, 11, 15, 12, 25]


[[1, 4, 5, 3, 2],
 [4, 1, 2, 5, 3],
 [2, 3, 4, 5, 1],
 [2, 4, 1, 5, 3],
 [1, 5, 2, 4, 3],
 [4, 2, 3, 1, 5],
 [2, 4, 3, 5, 1],
 [2, 5, 3, 1, 4],
 [3, 5, 1, 4, 2],
 [1, 2, 5, 4, 3],
 [3, 1, 2, 5, 4],
 [1, 2, 3, 4, 5],
 [5, 3, 4, 1, 2],
 [2, 3, 4, 5, 1]]

In [9]:
def mean_reciprocal_rank(true_rankings, predicted_rankings):
    reciprocal_ranks = []
    for true, pred in zip(true_rankings, predicted_rankings):
        for i, p in enumerate(pred):
            if p == true[i]:
                reciprocal_ranks.append(1 / (i + 1))
                break
        else:
            reciprocal_ranks.append(0)
    return np.mean(reciprocal_ranks)
print(mean_reciprocal_rank(true_test_rankings,final_predicted_rankings))

0.294047619047619


In [12]:
from scipy.stats import kendalltau

def kendall_tau_score(true_rankings, predicted_rankings):
    scores = [kendalltau(true, pred).correlation for true, pred in zip(true_rankings, predicted_rankings)]
    return np.mean([s for s in scores if s is not None])
print(kendall_tau_score(true_test_rankings,final_predicted_rankings))

0.1571428571428571


In [13]:
from scipy.stats import spearmanr

# Spearman correlation function
def spearman_correlation(true_rankings, predicted_rankings):
    correlations = []
    for true, pred in zip(true_rankings, predicted_rankings):
        correlation, _ = spearmanr(true, pred)
        correlations.append(correlation if not pd.isnull(correlation) else 0)  # Handle NaN cases
    average_correlation = sum(correlations) / len(correlations)
    return average_correlation
print(spearman_correlation(true_test_rankings,final_predicted_rankings))

0.20714285714285713
