# COMP 588 Final Project Experiment

Author: Zhongjie Wu

Acknowledgments: Special thanks to the YaleNLP Lab and Yilun Zhao for their guidance on this project.


In [1]:
import json
import os
import shutil

In [2]:
def read_jsonl_file(path):
  data = []
  with open(path, 'r') as f:
    for line in f:
      data.append(json.loads(line))
  return data

def read_json(filename):
  with open(filename, 'r') as f:
    data = json.load(f)
  return data


path_jsonl = 'questions.jsonl'
qas = read_jsonl_file(path_jsonl)

path_json = 'papers.json'
paper_dict = read_json(path_json)

In [3]:
def search_and_store_img(path, qa_num=None):
    # Extract the required part of the path
    path_components = path.split("/")
    extracted_path = "../" + "/".join(path_components[-2:])

    # Check if qa_num is provided and not None
    if qa_num is not None:
        # Create a new directory name
        dir_name = f"q_{qa_num}"

        # Create the directory if it does not exist
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        # Copy the file to the new directory
        destination_path = os.path.join(dir_name, os.path.basename(path))
        shutil.copy(extracted_path, destination_path)

def get_related_info(paper_dict, qa, qa_num=None):
    q = qa['question']
    a = qa['answer']
    anchor_id = qa['anchor_arxiv_id']
    ref_id = qa['reference_arxiv_id']
    anchor_info = qa['source_anchor']
    ref_info = qa['source_reference']
    
    is_text_anchor = False if anchor_info[0] == '/' else True
    is_text_ref = False if ref_info[0] == '/' else True

    result = "Question: " + q + "\n" + "Text Context: \n"
    anchor_found = None
    ref_found = None
    if is_text_anchor:
        for section in paper_dict[anchor_id]['full_text']:
            if section['section_name'] ==  anchor_info:
                anchor_found = section['paragraphs']
                result += anchor_found

        if anchor_info == 'Abstract_1':
            anchor_found = paper_dict[anchor_id]['abstract']
            result += anchor_found

    if is_text_ref:
        for section in paper_dict[ref_id]['full_text']:
            if section['section_name'] ==  ref_info:
                ref_found = section['paragraphs']
                result += ref_found
        
        if ref_info == 'Abstract_1':
            ref_found = paper_dict[ref_id]['abstract']
            result += ref_found

    if anchor_found == None and qa_num is not None and not is_text_anchor:
        # print(f"extract QA pair - {qa_num} anchor image context... ")
        search_and_store_img(anchor_info, qa_num)
        # print("done!")


    if ref_found == None and qa_num is not None and not is_text_ref:
        # print(f"extract QA pair - {qa_num} ref image context... ")
        search_and_store_img(ref_info, qa_num)
        # print("done!")

    if not ref_found and not anchor_found and qa_num:
        print(f"Error in QA pair {qa_num}")

    return result


In [154]:
full_context = []
full_context_with_prompt = []

prompt = "Answer the following question using both the given text and image context. Remember to limit your response within 50 words and do not use any knowledge not given in the context."

for i in range(len(qas)):
    context = get_related_info(paper_dict, qas[i], i)
    full_context.append(context)
    full_context_with_prompt.append(prompt + '\n' + context)
    # print(f"QA pair {i} extraction complete")

In [156]:
def read_first_image(folder_path):
    # Iterate over files in the specified folder
    for file in os.listdir(folder_path):
        # Assuming the image files are in JPG, JPEG, or PNG format
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(folder_path, file)

            # Read and return the image data
            with open(image_path, 'rb') as img_file:
                return img_file.read()

    # Return None if no image file is found
    return None

## Multi-modal BARD

In [162]:
from bardapi import Bard
from bardapi import BardCookies
import os

# Replace with your own cookie id.
cookie_dict = {
    "__Secure-1PSID": "##########################",
    "__Secure-1PSIDTS": "##########################"
}

bard = BardCookies(cookie_dict=cookie_dict)



In [158]:
def query(question, qid, bard):
    path = f"./q_{qid}"
    image_data = read_first_image(path)
    answer = bard.ask_about_image(question, image_data)['content']
    return answer

In [None]:
bard_answers = []
for i in range(len(full_context_with_prompt)):
    if len(full_context_with_prompt[i]) > 4096:
        truncate = full_context_with_prompt[i][:4096]
        answer = query(truncate, i, bard)
    else:
        answer = query(full_context_with_prompt[i], i, bard)
    pair = {
        'id': i,
        'prompt': full_context_with_prompt[i],
        'image_folder': i,
        'answer': answer  
    }
    bard_answers.append(pair)
    print(f"Bard answered QA pair {i+1} successfully.")

In [166]:
with open("bard_answer.jsonl", "w") as file:
    # Iterate over each item in the list
    for item in bard_answers:
        # Convert the item to a JSON string
        json_string = json.dumps(item)
        # Write the JSON string to the file with a newline
        file.write(json_string + "\n")

In [173]:
# Calculate how many examples are truncated due to bard token limit
l = [len(i) for i in full_context_with_prompt]
count = 0
for i in l:
    if i <= 4096:
        count += 1
count

24

In [170]:
for i in range(len(full_context_with_prompt)):
    if l[i] <= 4096:
        bard_answers[i]['truncated'] = False
    else:
        bard_answers[i]['truncated'] = True

In [189]:
with open("bard_answers_with_bertscore.jsonl", "w") as file:
    # Iterate over each item in the list
    for item in bard_answers:
        # Convert the item to a JSON string
        json_string = json.dumps(item)
        # Write the JSON string to the file with a newline
        file.write(json_string + "\n")

#### Multimodal-Bard BERTScore

In [184]:
from bert_score import score

for i in range(len(qas)):
    reference = [qas[i]['answer']]
    candidate = [bard_answers[i]['answer']]

    P, R, F1 = score(candidate, reference, lang="en", verbose=True)
    bertscores = [P.item(), R.item(), F1.item()]
    bard_answers[i]['bert_scores'] = bertscores

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  3.00it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 309.61it/s]


done in 0.35 seconds, 2.89 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.73it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 263.21it/s]


done in 0.38 seconds, 2.63 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 215.47it/s]


done in 1.13 seconds, 0.89 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.07it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 269.18it/s]


done in 0.94 seconds, 1.06 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  3.55it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 304.31it/s]


done in 0.29 seconds, 3.42 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.70it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 241.02it/s]


done in 0.39 seconds, 2.60 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.19it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 268.59it/s]


done in 0.85 seconds, 1.17 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.81it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 345.27it/s]


done in 0.57 seconds, 1.77 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  3.10it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 247.28it/s]


done in 0.34 seconds, 2.97 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:01<00:00,  1.44s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 214.31it/s]


done in 1.45 seconds, 0.69 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.07it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 230.41it/s]


done in 0.50 seconds, 2.00 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.71it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 234.14it/s]


done in 0.38 seconds, 2.60 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:02<00:00,  2.04s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 130.05it/s]


done in 2.06 seconds, 0.48 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.29it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 319.61it/s]


done in 0.45 seconds, 2.21 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.52it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 307.79it/s]


done in 0.41 seconds, 2.44 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  4.19it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 345.67it/s]


done in 0.25 seconds, 4.04 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.33it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 309.50it/s]


done in 0.77 seconds, 1.31 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  4.12it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 166.47it/s]


done in 0.26 seconds, 3.89 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.10it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 282.86it/s]


done in 0.49 seconds, 2.03 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 347.58it/s]


done in 0.87 seconds, 1.15 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  3.33it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 367.92it/s]


done in 0.31 seconds, 3.24 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.90it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 314.86it/s]


done in 0.54 seconds, 1.85 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.90it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 358.30it/s]


done in 0.36 seconds, 2.81 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  3.68it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 376.07it/s]


done in 0.28 seconds, 3.55 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.05it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 368.60it/s]


done in 0.97 seconds, 1.04 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.64it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 265.31it/s]


done in 0.39 seconds, 2.56 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.49it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 318.23it/s]


done in 0.68 seconds, 1.47 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.87it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 295.44it/s]


done in 0.55 seconds, 1.83 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.38it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 381.96it/s]


done in 0.43 seconds, 2.31 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 274.42it/s]


done in 0.86 seconds, 1.16 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.84it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 320.71it/s]


done in 0.56 seconds, 1.79 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.92it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 270.20it/s]


done in 0.35 seconds, 2.82 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.90it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 368.86it/s]


done in 0.54 seconds, 1.85 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:01<00:00,  1.20s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 378.07it/s]


done in 1.21 seconds, 0.83 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.68it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 260.89it/s]


done in 0.61 seconds, 1.64 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.51it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 346.72it/s]


done in 0.67 seconds, 1.48 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.55it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 333.57it/s]


done in 0.41 seconds, 2.47 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.67it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 294.36it/s]


done in 0.39 seconds, 2.57 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.99it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 323.71it/s]


done in 0.35 seconds, 2.89 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  3.14it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 371.31it/s]


done in 0.33 seconds, 3.04 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.42it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 264.32it/s]


done in 0.43 seconds, 2.33 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.27it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 129.17it/s]


done in 0.80 seconds, 1.25 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.70it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 359.19it/s]


done in 0.60 seconds, 1.66 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.12it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 312.89it/s]


done in 0.90 seconds, 1.11 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.83it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 227.85it/s]

done in 0.56 seconds, 1.79 sentences/sec





#### Multimodal-Bard ROUGE Score

In [192]:
from rouge import Rouge

rouge = Rouge()
# i = 0
for i in range(len(qas)):
    reference = [qas[i]['answer']]
    hypothesis = [bard_answers[i]['answer']]

    scores = rouge.get_scores(hypothesis, reference)
    bard_answers[i]['rouge_scores'] = scores

In [198]:
bard_answers[2]['rouge_scores']

[{'rouge-1': {'r': 0.5294117647058824,
   'p': 0.09782608695652174,
   'f': 0.1651376120461241},
  'rouge-2': {'r': 0.16666666666666666,
   'p': 0.02040816326530612,
   'f': 0.03636363441983481},
  'rouge-l': {'r': 0.47058823529411764,
   'p': 0.08695652173913043,
   'f': 0.14678898819291306}}]

In [199]:
with open("bard_answers_bert_rouge.jsonl", "w") as file:
    # Iterate over each item in the list
    for item in bard_answers:
        # Convert the item to a JSON string
        json_string = json.dumps(item)
        # Write the JSON string to the file with a newline
        file.write(json_string + "\n")

## GPT4v

In [208]:
import openai
import base64
import requests

openai.api_base = "https://api.keya.pw/v1"
# Replace with your gpt api_key.
api_key = "###############"

def encode_image(image_path):
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')
            
def encode_first_image(folder_path):
    # Iterate over files in the specified folder
    for file in os.listdir(folder_path):
        # Assuming the image files are in JPG, JPEG, or PNG format
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(folder_path, file)

            # Read and return the encoded image data
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
    # Return None if no image file is found
    return None

In [224]:
def query_4v(prompt, qid):
    path = f"./q_{qid}"
    image_data = encode_first_image(path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_data}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 50
    }

    response = requests.post("https://api.keya.pw/v1/chat/completions", headers=headers, json=payload)
    result_json = response.json()
    
    return result_json['choices'][0]['message']['content']


In [None]:
gpt4v_answers = []
for i in range(len(full_context_with_prompt)):
    answer = query_4v(full_context_with_prompt[i], i)
    pair = {
        'id': i,
        'prompt': full_context_with_prompt[i],
        'image_folder': i,
        'answer': answer  
    }
    gpt4v_answers.append(pair)
    print(f"GPT4v answered QA pair {i+1} successfully.")

In [236]:
gpt4v_answers[44]

{'id': 44,
 'prompt': 'Answer the following question using both the given text and image context. Remember to limit your response within 50 words and do not use any knowledge not given in the context.\nQuestion: For the method represented by a line with lower perplexity than Sinusoidal with L=2048 across all measured training times in the Validation Perplexity Through Training graph, what are the test perplexity results achieved by this method on the Billion Word benchmark and the WikiText-103 benchmark compared to previously published work?\nText Context: \n Language modeling is a basic task in natural language processing, with many applications such as speech recognition \\citep{arisoy:2012:wfnlm} and statistical machine translation \\citep{schwenk:2012:wfnlm,vaswani:2013:emnlp,baltescu2014pragmatic}. Recently, much progress has been made by neural methods \\citep{bengio:2003:jmlr,mikolov:2010:interspeech} based on LSTMs \\citep{jozefowicz2016lm}, gated convolutional networks \\citep

In [237]:
with open("gpt4v_answer.jsonl", "w") as file:
    # Iterate over each item in the list
    for item in gpt4v_answers:
        # Convert the item to a JSON string
        json_string = json.dumps(item)
        # Write the JSON string to the file with a newline
        file.write(json_string + "\n")

In [None]:
from bert_score import score

for i in range(len(qas)):
    reference = [qas[i]['answer']]
    candidate = [gpt4v_answers[i]['answer']]

    P, R, F1 = score(candidate, reference, lang="en", verbose=True)
    bertscores = [P.item(), R.item(), F1.item()]
    gpt4v_answers[i]['bert_scores'] = bertscores

In [240]:
from rouge import Rouge

rouge = Rouge()
# i = 0
for i in range(len(qas)):
    reference = [qas[i]['answer']]
    hypothesis = [gpt4v_answers[i]['answer']]

    scores = rouge.get_scores(hypothesis, reference)
    gpt4v_answers[i]['rouge_scores'] = scores

In [241]:
with open("gpt4v_answers_bert_rouge.jsonl", "w") as file:
    # Iterate over each item in the list
    for item in gpt4v_answers:
        # Convert the item to a JSON string
        json_string = json.dumps(item)
        # Write the JSON string to the file with a newline
        file.write(json_string + "\n")

## Results Summary

In [247]:
gpt4v_score_bertf1 = 0
mbard_score_bertf1 = 0
for i in range(len(qas)):
    print(f"For QA pair {i}:")
    print(f"MMBard - BERTScore: P:{bard_answers[i]['bert_scores'][0]}, R:{bard_answers[i]['bert_scores'][1]}, F1:{bard_answers[i]['bert_scores'][2]}")
    print(f"GPT4v - BERTScore: P:{gpt4v_answers[i]['bert_scores'][0]}, R:{gpt4v_answers[i]['bert_scores'][1]}, F1:{gpt4v_answers[i]['bert_scores'][2]}")
    if bard_answers[i]['bert_scores'][2] > gpt4v_answers[i]['bert_scores'][2]:
        mbard_score_bertf1 += 1
    else:
        gpt4v_score_bertf1 += 1
    print("-*"*40)

print(f"GPT4v wins {gpt4v_score_bertf1}/45")
print(f"MMBard wins {mbard_score_bertf1}/45")

For QA pair 0:
MMBard - BERTScore: P:0.8914120197296143, R:0.9374126195907593, F1:0.9138337969779968
GPT4v - BERTScore: P:0.8705054521560669, R:0.8865202069282532, F1:0.8784397840499878
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
For QA pair 1:
MMBard - BERTScore: P:0.8289973735809326, R:0.9412931203842163, F1:0.881583571434021
GPT4v - BERTScore: P:0.8312262296676636, R:0.938875675201416, F1:0.8817775845527649
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
For QA pair 2:
MMBard - BERTScore: P:0.8085586428642273, R:0.8702982068061829, F1:0.838293194770813
GPT4v - BERTScore: P:0.8828139901161194, R:0.8905731439590454, F1:0.886676549911499
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
For QA pair 3:
MMBard - BERTScore: P:0.816047728061676, R:0.8842343688011169, F1:0.8487738370895386
GPT4v - BERTScore: P:0.8930950164794922, R:0.9521903395652771, F1:0.9216963648796082
-*-*-*-*-*-*-*-*-*

In [252]:
gpt4v_score_rouge = 0
mbard_score_rouge = 0
for i in range(len(qas)):
    print(f"For QA pair {i}:")
    print(f"MMBard - F1: R-1:{bard_answers[i]['rouge_scores'][0]['rouge-1']['f']}, R-2:{bard_answers[i]['rouge_scores'][0]['rouge-2']['f']}, R-L:{bard_answers[i]['rouge_scores'][0]['rouge-l']['f']}")
    print(f"GPT4v - F1: R-1:{gpt4v_answers[i]['rouge_scores'][0]['rouge-1']['f']}, R-2:{gpt4v_answers[i]['rouge_scores'][0]['rouge-2']['f']}, R-L:{gpt4v_answers[i]['rouge_scores'][0]['rouge-l']['f']}")
    for type in ['rouge-1', 'rouge-2', 'rouge-l']:
        if bard_answers[i]['rouge_scores'][0][type]['f'] > gpt4v_answers[i]['rouge_scores'][0][type]['f']:
            mbard_score_rouge += 1
        else:
            gpt4v_score_rouge += 1
    print("-*"*40)

print(f"GPT4v wins {gpt4v_score_rouge}/135")
print(f"MMBard wins {mbard_score_rouge}/135")

For QA pair 0:
MMBard - F1: R-1:0.4745762667049699, R-2:0.29999999567222224, R-L:0.4067796565354784
GPT4v - F1: R-1:0.31111110617283955, R-2:0.18604650669551123, R-L:0.2666666617283951
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
For QA pair 1:
MMBard - F1: R-1:0.28070175172668516, R-2:0.16666666435555558, R-L:0.28070175172668516
GPT4v - F1: R-1:0.42105262796398896, R-2:0.22222221876543213, R-L:0.3684210490166206
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
For QA pair 2:
MMBard - F1: R-1:0.1651376120461241, R-2:0.03636363441983481, R-L:0.14678898819291306
GPT4v - F1: R-1:0.3999999951125, R-2:0.09090908607438043, R-L:0.29999999511250003
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
For QA pair 3:
MMBard - F1: R-1:0.08080807867768601, R-2:0.0, R-L:0.08080807867768601
GPT4v - F1: R-1:0.41025640599605523, R-2:0.15789473272853197, R-L:0.358974354714004
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*

In [269]:
def calculate_average_scores(data, include_truncation=True, token_size_index=None, truncate_condition=4096):
    # Initialize accumulators for each score
    total_bert_precision, total_bert_recall, total_bert_f1 = 0, 0, 0
    total_rouge_1_f, total_rouge_2_f, total_rouge_l_f = 0, 0, 0
    count = 0

    for i, item in enumerate(data):
        if not include_truncation and (token_size_index[i]>truncate_condition):
            continue
        # Accumulate BERT scores
        total_bert_precision += item['bert_scores'][0]
        total_bert_recall += item['bert_scores'][1]
        total_bert_f1 += item['bert_scores'][2]

        # Accumulate ROUGE scores
        total_rouge_1_f += item['rouge_scores'][0]['rouge-1']['f']
        total_rouge_2_f += item['rouge_scores'][0]['rouge-2']['f']
        total_rouge_l_f += item['rouge_scores'][0]['rouge-l']['f']

        count += 1

    # Calculate averages
    avg_bert_precision = total_bert_precision / count
    avg_bert_recall = total_bert_recall / count
    avg_bert_f1 = total_bert_f1 / count
    avg_rouge_1_f = total_rouge_1_f / count
    avg_rouge_2_f = total_rouge_2_f / count
    avg_rouge_l_f = total_rouge_l_f / count

    return {
        "average_bert_precision": avg_bert_precision,
        "average_bert_recall": avg_bert_recall,
        "average_bert_f1": avg_bert_f1,
        "average_rouge_1_f": avg_rouge_1_f,
        "average_rouge_2_f": avg_rouge_2_f,
        "average_rouge_l_f": avg_rouge_l_f
    }

In [275]:
bard_result_all = calculate_average_scores(bard_answers, True)
print(f"Multimodal-Bard (all datapoints):{bard_result_all}")

bard_result_ex_trunc = calculate_average_scores(bard_answers, False, l)
print(f"Multimodal-Bard (exclude truncated datapoints):{bard_result_ex_trunc}")

gpt4v_result_all = calculate_average_scores(gpt4v_answers, True)
print(f"GPT4v (all datapoints):{gpt4v_result_all}")

gpt4v_result_ex_trunc = calculate_average_scores(gpt4v_answers, False, l)
print(f"GPT4v (exclude truncated datapoints):{gpt4v_result_ex_trunc}")

Multimodal-Bard (all datapoints):{'average_bert_precision': 0.8465603391329447, 'average_bert_recall': 0.8935284455617268, 'average_bert_f1': 0.8690625919236077, 'average_rouge_1_f': 0.2931313522052871, 'average_rouge_2_f': 0.1252911789289194, 'average_rouge_l_f': 0.2611285443246064}
Multimodal-Bard (exclude truncated datapoints):{'average_bert_precision': 0.8581861654917399, 'average_bert_recall': 0.9099420557419459, 'average_bert_f1': 0.8831019798914591, 'average_rouge_1_f': 0.3651664114435156, 'average_rouge_2_f': 0.18760686989924547, 'average_rouge_l_f': 0.3341256398095118}
GPT4v (all datapoints):{'average_bert_precision': 0.8861588305897183, 'average_bert_recall': 0.9116761181089613, 'average_bert_f1': 0.898409370581309, 'average_rouge_1_f': 0.4353121719335574, 'average_rouge_2_f': 0.22958218499428318, 'average_rouge_l_f': 0.39559463647476456}
GPT4v (exclude truncated datapoints):{'average_bert_precision': 0.8936805551250776, 'average_bert_recall': 0.9144758308927218, 'average_ber