In [None]:
import streamlit as st
import requests
import os
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipConfig
import torch.nn as nn

In [None]:
# Checkpoint location handed to `from_pretrained`; the model weights and the
# processor files are both resolved from this single location.
MODEL_DIRECTORY = "blip-image-captioning-base"
MODEL_PATH = PROCESSOR_PATH = MODEL_DIRECTORY

In [None]:
# Processor used by `generate_caption` to turn an image into model inputs and
# to decode generated token ids back into text.
processor = BlipProcessor.from_pretrained(PROCESSOR_PATH)
# Model configuration read from the same checkpoint location.
# NOTE(review): `config` is not referenced by any other visible cell (the model
# below is loaded straight from MODEL_PATH) -- confirm whether it is still needed.
config = BlipConfig.from_pretrained(MODEL_PATH)


# Define a custom model architecture with two additional layers
class CustomBlipForConditionalGeneration(BlipForConditionalGeneration):
    """BLIP captioning model subclass with two extra 768 -> 768 linear layers.

    The extra layers are applied only inside ``forward``, where they
    post-process the first element of the parent's forward output
    (Linear -> ReLU -> Linear).

    NOTE(review): ``generate_caption`` in this notebook calls
    ``model.generate`` rather than ``forward``. Confirm whether the parent's
    ``generate`` routes through this ``forward``; if it does not, the extra
    layers have no effect on the generated captions.
    """

    def __init__(self, config):
        """Build the parent model from ``config`` and attach the two extra layers."""
        super().__init__(config)
        # Define additional layers
        # NOTE(review): 768 is hard-coded; presumably it is meant to match a
        # hidden size of the loaded checkpoint -- confirm against ``config``.
        self.additional_layer1 = nn.Linear(768, 768)  # Adjust the size as needed
        self.additional_layer2 = nn.Linear(768, 768)  # Adjust the size as needed
        # Initialize additional layers
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        """Run the parent forward pass, then apply the two extra layers.

        Every named argument and everything in ``**kwargs`` is passed through
        unchanged, by keyword, to the parent's ``forward``.

        NOTE(review): confirm that the parent ``forward`` declares every
        keyword sent here (``decoder_input_ids``, ``encoder_outputs``,
        ``past_key_values``, ``inputs_embeds``, ``use_cache``, ...); any
        keyword it does not accept would raise ``TypeError`` on the first
        call. No image argument is named in this signature, so image inputs
        can presumably only arrive through ``**kwargs`` -- verify.

        Returns:
            A 2-tuple ``(sequence_output, rest)`` where ``sequence_output`` is
            ``outputs[0]`` after Linear -> ReLU -> Linear and ``rest`` is
            ``outputs[1:]``. Note that this is a plain tuple, not the parent's
            output object.
        """
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )
        # Apply additional layers
        # NOTE(review): this assumes outputs[0] has a last dimension of 768.
        # If the parent returns a loss or vocabulary-sized logits in position
        # 0, the linear layer below would fail or be applied to the wrong
        # tensor -- confirm what outputs[0] actually is.
        sequence_output = outputs[0]
        sequence_output = self.additional_layer1(sequence_output)
        sequence_output = nn.functional.relu(sequence_output)
        sequence_output = self.additional_layer2(sequence_output)
        return sequence_output, outputs[1:]


# Load the checkpoint at MODEL_PATH into the custom subclass.
# NOTE(review): the checkpoint presumably contains no weights for
# additional_layer1 / additional_layer2, in which case they start untrained --
# confirm before relying on the custom forward().
model = CustomBlipForConditionalGeneration.from_pretrained(MODEL_PATH)

In [None]:
# Sample image used for a quick single-caption check.
# Built with os.path.join so the path resolves on every OS; the previous
# hard-coded backslash separator only worked on Windows.
file_path = os.path.join("Flicker8k_Dataset", "10815824_2997e03d76.jpg")
# Decode into an independent RGB image inside a context manager so the
# underlying file handle is closed as soon as the pixels are loaded.
with Image.open(file_path) as source_image:
    image = source_image.convert("RGB")


# Function to generate unconditional caption
def generate_caption(image, max_new_tokens=50):
    """Generate an unconditional caption for ``image``.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image: Image to caption; handed to ``processor`` as-is (the notebook
            passes PIL images converted to RGB).
        max_new_tokens: Upper bound on the number of generated tokens,
            forwarded to ``model.generate``. Defaults to 50, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor.decode(out[0], skip_special_tokens=True)

In [None]:
# Caption the sample image loaded earlier; the returned string is shown as
# this cell's output.
generate_caption(image)

In [None]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

def load_references(file_path):
    """Read a caption token file into a mapping of image id -> captions.

    Each non-blank line is expected to have the form
    ``<image_name>#<index> <caption words...>`` (fields separated by any
    whitespace). Blank lines are skipped.

    Args:
        file_path: Path to the token file, read as UTF-8.

    Returns:
        A dict mapping each image id (the first field with its trailing
        ``#<index>`` removed) to the list of its captions in file order.
        Runs of whitespace inside a caption are collapsed to single spaces.
    """
    references = {}
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                # Blank lines (e.g. a trailing newline) carry no caption and
                # would otherwise raise IndexError on tokens[0].
                continue
            # Strip the "#<index>" suffix whatever its width; slicing off a
            # fixed two characters corrupts the id once the index has two or
            # more digits.
            image_id = tokens[0].rsplit('#', 1)[0]
            references.setdefault(image_id, []).append(' '.join(tokens[1:]))
    return references

references = load_references('Flickr8k.token.txt')

# Generate captions for each image in the dataset
generated_captions = {}
for image_id in tqdm(references.keys()):
    # os.path.join keeps the path valid on every OS; the previous hard-coded
    # backslash separator only resolved on Windows.
    file_path = os.path.join("Flicker8k_Dataset", image_id)
    # Close each file as soon as it has been decoded so handles do not pile
    # up while iterating over every image listed in the token file.
    with Image.open(file_path) as source_image:
        image = source_image.convert("RGB")
    generated_caption = generate_caption(image)
    generated_captions[image_id] = generated_caption

# Prepare references and hypotheses for computing BLEU score.
# Both lists follow the iteration order of `references`, so entry i of one
# corresponds to entry i of the other.
references_list = [
    [caption.split() for caption in captions] for captions in references.values()
]
hypotheses_list = [
    generated_captions[image_id].split() for image_id in references.keys()
]
# Compute BLEU score
# NOTE(review): tokens are compared exactly as produced by str.split(), with
# no lowercasing or punctuation normalisation -- confirm the reference
# captions and the generated captions share the same casing/tokenisation.
bleu_score = corpus_bleu(references_list, hypotheses_list)
print("BLEU Score:", bleu_score)

In [45]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

def load_references(file_path):
    """Read a caption token file into a mapping of image id -> captions.

    This cell re-declares a function that is already defined in the previous
    evaluation cell; consider keeping a single definition.

    Each non-blank line is expected to have the form
    ``<image_name>#<index> <caption words...>`` (fields separated by any
    whitespace). Blank lines are skipped.

    Args:
        file_path: Path to the token file, read as UTF-8.

    Returns:
        A dict mapping each image id (the first field with its trailing
        ``#<index>`` removed) to the list of its captions in file order.
        Runs of whitespace inside a caption are collapsed to single spaces.
    """
    references = {}
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                # Blank lines (e.g. a trailing newline) carry no caption and
                # would otherwise raise IndexError on tokens[0].
                continue
            # Strip the "#<index>" suffix whatever its width; slicing off a
            # fixed two characters corrupts the id once the index has two or
            # more digits.
            image_id = tokens[0].rsplit('#', 1)[0]
            references.setdefault(image_id, []).append(' '.join(tokens[1:]))
    return references

references = load_references('Flickr8k.token.test.txt')

# Generate captions for each image in the dataset
generated_captions = {}
for image_id in tqdm(references.keys()):
    # os.path.join keeps the path valid on every OS; the previous hard-coded
    # backslash separator only resolved on Windows.
    file_path = os.path.join("Flicker8k_Dataset_test", image_id)
    # Close each file as soon as it has been decoded instead of leaving the
    # handle open until garbage collection.
    with Image.open(file_path) as source_image:
        image = source_image.convert("RGB")
    generated_caption = generate_caption(image)
    generated_captions[image_id] = generated_caption

# Prepare references and hypotheses for computing BLEU score.
# Both lists follow the iteration order of `references`, so entry i of one
# corresponds to entry i of the other.
references_list = [
    [caption.split() for caption in captions] for captions in references.values()
]
hypotheses_list = [
    generated_captions[image_id].split() for image_id in references.keys()
]
# Compute BLEU score
# NOTE(review): tokens are compared exactly as produced by str.split(), with
# no lowercasing or punctuation normalisation -- confirm the reference
# captions and the generated captions share the same casing/tokenisation.
bleu_score = corpus_bleu(references_list, hypotheses_list)
print("BLEU Score:", bleu_score)

100%|██████████| 8/8 [00:37<00:00,  4.72s/it]

BLEU Score: 0.29443704667018933



