## Download data: large dataset (optional)

In [None]:
# !gdown --id "1W29UzDy0KUK4jpMPeSes9LN9QPRcvj3p" -O /content/your_data.zip

# !unzip /content/your_data.zip -d /content/extracted_data

## Download data: small dataset (optional)

In [None]:
# !gdown --id "1Vh9rV1ctXVCrg3Zw0nVbUsvjtt5Re5qK" -O /content/your_data.zip

# !unzip /content/your_data.zip -d /content/extracted_data

## Download data: test dataset (optional)

In [None]:
# !gdown --id "1DAYgTgOFli8rC-i-LM4k72D88waI7cJh" -O /content/your_data.zip

# !unzip /content/your_data.zip -d /content/extracted_data

## Import

In [None]:
!pip install rouge-score



In [None]:
import os
import glob
import pandas as pd
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from torchvision.models import resnet152
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from rouge_score import rouge_scorer

## Read data

In [None]:
# Load one dataset split into `data`; the small training split is active and the
# other two reads are kept commented out as alternatives.
# data = pd.read_csv('./train.csv')
data = pd.read_csv('./train_small.csv')
# data = pd.read_csv('./test.csv')

## load model

In [None]:
# Text side: DistilBERT tokenizer and encoder loaded from the
# 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
# Image side: torchvision ResNet-152 requested with pretrained weights.
# NOTE(review): the network is used as-is, so calling resnet_model(x) presumably
# returns the classifier output rather than pooled backbone features — confirm
# this is what downstream code expects.
resnet_model = resnet152(pretrained=True)
# Switch the ResNet to evaluation mode before using it for inference.
resnet_model.eval()

## Preprocessing

In [None]:
def preprocess_image(image_path):
    """Load an image file and convert it into a tensor for the pretrained ResNet.

    The image is converted to RGB, resized to 224x224, scaled to [0, 1] by
    ``ToTensor`` and then normalized with the ImageNet channel statistics that
    torchvision's pretrained classification weights were trained with.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        A float tensor of shape (3, 224, 224).

    Raises:
        FileNotFoundError / OSError: if the file is missing or cannot be decoded.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        # Pretrained torchvision models expect inputs normalized this way;
        # feeding raw [0, 1] pixels shifts every activation in the network.
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    # The context manager closes the underlying file handle; convert() returns
    # an independent, fully loaded copy that stays valid afterwards.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')
    return transform(image)

def encode_text(text):
    """Encode a piece of text with the DistilBERT encoder.

    Args:
        text: The string to encode. Inputs longer than 512 tokens are truncated.

    Returns:
        The encoder's last hidden state with the batch dimension squeezed out,
        i.e. one embedding row per input token: (num_tokens, hidden_size).
        The tensor is detached from autograd, so it can be converted with
        ``.numpy()`` directly.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # The encoder is only used as a frozen feature extractor. Running it under
    # no_grad avoids building an autograd graph (saving memory) and prevents
    # "can't call numpy() on a tensor that requires grad" errors in callers.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.squeeze(0)

## Feature extraction


In [None]:
def load_and_process_images(file_name, image_folder):
    """Return one averaged 2048-d ResNet feature vector for an article's images.

    Images are looked up as ``<image_folder>/<file_name>_*.jpg``. Each image is
    passed through the ResNet backbone (everything except the final
    classification layer) and the resulting feature vectors are averaged.

    Args:
        file_name: Article identifier used as the image filename prefix.
        image_folder: Directory that contains the images.

    Returns:
        A tensor of shape (1, 2048). If no image matches, a zero tensor of the
        same shape is returned so callers always get a consistent width.
    """
    pattern = os.path.join(image_folder, f"{file_name}_*.jpg")
    # Sorted so the batch order (and therefore float rounding) is reproducible.
    image_paths = sorted(glob.glob(pattern))
    if not image_paths:
        return torch.zeros((1, 2048))

    images_tensor = torch.stack([preprocess_image(path) for path in image_paths])

    # Drop the last child module (the classification layer) so the output is
    # the pooled backbone feature. This keeps the width at 2048, matching the
    # zero fallback above, instead of the width of the classifier output.
    backbone = nn.Sequential(*list(resnet_model.children())[:-1])
    with torch.no_grad():
        features = torch.flatten(backbone(images_tensor), start_dim=1)
    return features.mean(dim=0, keepdim=True)

## Define Model

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that maps a feature vector (or sequence) to per-step logits.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Hidden size of the LSTM.
        output_dim: Number of output classes per time step.
        num_layers: Number of stacked LSTM layers.
        seq_len: Number of time steps generated when a single vector is given.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, seq_len=20):
        super(Decoder, self).__init__()
        self.seq_len = seq_len
        # Kept on the instance because forward() needs it to flatten the
        # LSTM output before the linear layer.
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, combined_features, hidden=None):
        """Run the decoder.

        Args:
            combined_features: Either (batch, input_dim) — repeated ``seq_len``
                times along a new time axis — or (batch, steps, input_dim).
            hidden: Optional initial LSTM state.

        Returns:
            Logits of shape (batch * steps, output_dim).
        """
        if combined_features.dim() == 2:
            # Feed the same vector at every time step.
            combined_features = combined_features.unsqueeze(1).repeat(1, self.seq_len, 1)
        output, hidden = self.lstm(combined_features, hidden)
        # Flatten batch and time so the linear layer scores every step at once.
        output = self.fc(output.reshape(-1, self.hidden_dim))
        return output

class SummaryModel(nn.Module):
    """Concatenate text and image feature vectors and decode them into logits.

    Args:
        text_input_dim: Width of the text feature vector.
        image_input_dim: Width of the image feature vector.
        decoder_hidden_dim: Hidden size of the decoder LSTM.
        vocab_size: Number of output classes per generated step.
        seq_len: Number of steps the decoder generates.
    """

    def __init__(self, text_input_dim, image_input_dim, decoder_hidden_dim, vocab_size, seq_len=20):
        super(SummaryModel, self).__init__()
        self.decoder_input_dim = text_input_dim + image_input_dim
        self.decoder = Decoder(self.decoder_input_dim, decoder_hidden_dim, vocab_size, seq_len=seq_len)

    def forward(self, text_features, image_features):
        """Decode the concatenated features.

        Args:
            text_features: (text_input_dim,) or (batch, text_input_dim).
            image_features: (image_input_dim,) or (batch, image_input_dim).

        Returns:
            Whatever the decoder returns for the (batch, text + image) input.
        """
        # Promote single vectors to a batch of one so the feature axis is dim 1.
        if text_features.dim() == 1:
            text_features = text_features.unsqueeze(0)
        if image_features.dim() == 1:
            image_features = image_features.unsqueeze(0)
        combined_features = torch.cat([text_features, image_features], dim=1)
        outputs = self.decoder(combined_features)
        return outputs


## Train

In [None]:
# Number of output classes per generated step. The predicted ids are decoded
# with `tokenizer`, so this presumably mirrors that tokenizer's vocabulary size.
vocab_size = 30522
# Pass the variable instead of repeating the literal so the two cannot drift.
# 768 and 2048 are the widths of the text and image feature vectors.
model = SummaryModel(text_input_dim=768, image_input_dim=2048, decoder_hidden_dim=512, vocab_size=vocab_size, seq_len=20)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Inputs for the cells below: the small training split and the folder holding
# its images. `image_folder` is also read as a global by find_most_relevant_image.
data = pd.read_csv('./train_small.csv')
image_folder = '/content/extracted_data/img_small/'

In [None]:
def generate_summary(model, text, file_name, image_folder):
    """Generate a summary string for one article from its text and images.

    Args:
        model: Callable taking (text_features, image_features) and returning
            per-step logits over the tokenizer vocabulary.
        text: Article text.
        file_name: Article identifier used to locate its images.
        image_folder: Directory that contains the images.

    Returns:
        The decoded string of the highest-scoring token id at every step,
        with special tokens removed.
    """
    text_feat = encode_text(text).detach()
    # encode_text returns one embedding per token, while the model concatenates
    # a single text vector with a single image vector along dim 1. Mean-pool
    # over tokens so the text side is (1, hidden) rather than a 3-D tensor
    # that cannot be concatenated with the 2-D image features.
    if text_feat.dim() > 1:
        text_feat = text_feat.mean(dim=0)
    text_feat = text_feat.unsqueeze(0)

    image_feat = load_and_process_images(file_name, image_folder)
    if image_feat.dim() == 1:
        image_feat = image_feat.unsqueeze(0)

    with torch.no_grad():
        output = model(text_feat, image_feat)
    summary_ids = output.argmax(dim=-1)
    return tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)

def find_most_relevant_image(file_name, summary):
    """Pick the article image whose ResNet output is most similar to the summary.

    Images are probed as ``<image_folder>/<file_name>_1.jpg``, ``_2.jpg``, ...
    until the first missing index. ``image_folder`` is read from the notebook's
    global namespace.

    Args:
        file_name: Article identifier used as the image filename prefix.
        summary: Text to compare the images against.

    Returns:
        Path of the best-scoring image, or None if no image exists.

    NOTE(review): cosine_similarity requires both vectors to have the same
    length. The text embedding and the ResNet output come from unrelated
    models, so confirm they are projected to a shared dimension; otherwise
    scikit-learn raises ValueError here.
    """
    base_path = f'{image_folder}/{file_name}_'

    # Detach before .numpy() (tensors that require grad cannot be converted)
    # and mean-pool the per-token embeddings into one 1-D vector, because
    # cosine_similarity expects 2-D (n_samples, n_features) inputs.
    text_tensor = encode_text(summary).detach()
    if text_tensor.dim() > 1:
        text_tensor = text_tensor.mean(dim=0)
    text_features = text_tensor.numpy()

    best_image = None
    # -inf instead of -1 so an image is still selected when its score is
    # exactly -1, the lowest value cosine similarity can take.
    best_similarity = float('-inf')
    i = 1
    while os.path.exists(f'{base_path}{i}.jpg'):
        img_path = f'{base_path}{i}.jpg'
        image_batch = preprocess_image(img_path).unsqueeze(0)
        with torch.no_grad():
            image_features = resnet_model(image_batch).squeeze().numpy()
        similarity = cosine_similarity([text_features], [image_features])[0][0]
        if similarity > best_similarity:
            best_similarity = similarity
            best_image = img_path
        i += 1
    return best_image

## Rouge

In [None]:
def _summarize_row(row):
    """Generate the summary for one DataFrame row."""
    return generate_summary(model, row['article'], row['fileName'], image_folder)


def _pick_image_for_row(row):
    """Select the image that best matches the row's generated summary."""
    return find_most_relevant_image(row['fileName'], row['generated_summary'])


# Generate summaries first; image selection depends on them.
data['generated_summary'] = data.apply(_summarize_row, axis=1)
data['most_relevant_image'] = data.apply(_pick_image_for_row, axis=1)

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)


def _rouge_for_row(row):
    """Score the generated summary against the reference summary."""
    return scorer.score(row['summary'], row['generated_summary'])


data['rouge_scores'] = data.apply(_rouge_for_row, axis=1)

print(data[['generated_summary', 'most_relevant_image', 'rouge_scores']])

## Cosine similarity & Euclidean distance

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity_and_distance(row):
    """Compare a row's generated summary with its selected image.

    Args:
        row: Mapping with the keys 'generated_summary' (text) and
            'most_relevant_image' (image path, or None/NaN when the article
            has no image).

    Returns:
        A pandas Series indexed by 'cosine_similarity' and
        'euclidean_distance'. Both values are NaN when the row has no image.

    NOTE(review): both metrics require the text and image vectors to have the
    same length. They come from unrelated models here, so confirm they are
    projected to a shared dimension; otherwise this raises ValueError.
    """
    labels = ['cosine_similarity', 'euclidean_distance']

    image_path = row['most_relevant_image']
    # No image was found for this article: there is nothing to compare with.
    if pd.isna(image_path):
        return pd.Series([np.nan, np.nan], index=labels)

    # Mean-pool the per-token embeddings into one 1-D vector; both metrics
    # below operate on a single vector per modality.
    text_tensor = encode_text(row['generated_summary']).detach()
    if text_tensor.dim() > 1:
        text_tensor = text_tensor.mean(dim=0)
    text_features = text_tensor.numpy()

    image = preprocess_image(image_path).unsqueeze(0)
    with torch.no_grad():
        image_features = resnet_model(image).squeeze(0).numpy()

    cosine_sim = cosine_similarity([text_features], [image_features])[0][0]

    # The original referenced an undefined name (text_cent_features) here.
    euclidean_dist = np.linalg.norm(text_features - image_features)

    return pd.Series([cosine_sim, euclidean_dist], index=labels)

# Compute both metrics row by row and store them as two new columns on `data`.
data[['cosine_similarity', 'euclidean_distance']] = data.apply(calculate_similarity_and_distance, axis=1)

# Show the metrics next to the generated summary and the selected image.
print(data[['generated_summary', 'most_relevant_image', 'cosine_similarity', 'euclidean_distance']])