# Target experiments

- [1] o4 + Caption + QA + No RAG
- [2] o4 + Image + QA + No RAG
- [3] o4 + Image + Caption + QA + No RAG
- [4] o4 + Caption + QA + Ideal RAG
- [5] o4 + Image + QA + Ideal RAG
- [6] o4 + Image + Caption + QA + Ideal RAG

In [1]:
import pandas as pd

# Legislation corpus: every row id maps to "<title_metadata> | <content>".
df = pd.read_csv("../data/dataset_V3/corpus.csv", index_col="id")
laws = {
    law_id: f"{law['title_metadata']} | {law['content']}"
    for law_id, law in df.iterrows()
}

In [2]:
import pandas as pd

# Indicator corpus: every row id maps to "<category> | <title> |  <content>".
df = pd.read_csv("../data/dataset_V3/corpus_indicators.csv", index_col="id")
indicators = {
    indicator_id: f"{entry['category']} | {entry['title']} |  {entry['content']}"
    for indicator_id, entry in df.iterrows()
}

In [3]:
import pandas as pd

# Captions are stored in one CSV per dataset split.
df1 = pd.read_csv("../results/captions-split-3.csv", index_col="id")
df2 = pd.read_csv("../results/captions-split-4.csv", index_col="id")

# Item id -> caption text; rows from the second split overwrite duplicate ids.
captions = {}
for split_frame in (df1, df2):
    captions.update(
        (item_id, record['captions'])
        for item_id, record in split_frame.iterrows()
    )

In [None]:
from langchain_openai import ChatOpenAI

# Reasoning options handed to the chat model constructor below.
reasoning = {
    "effort": "medium",  # 'low', 'medium', or 'high'
    "summary": "detailed",  # 'detailed', 'auto', or None
}

# No api_key argument on purpose: ChatOpenAI then reads the key from the
# OPENAI_API_KEY environment variable, so the secret never has to be written
# into (or scrubbed out of) the notebook, and a missing key fails at
# construction time instead of on the first request.
llm_o4 = ChatOpenAI(
    model_name="o4-mini",
    reasoning=reasoning,
    output_version="responses/v1",
)

In [5]:

import ast
from tqdm import tqdm
import re
import base64
from langchain_core.messages import HumanMessage
import asyncio
import pandas as pd

# top_k_search = 10
# Upper bound on the number of LLM requests allowed in flight at once.
MAX_CONCURRENT_REQUESTS = 60
# Shared limiter: each strategy coroutine wraps its `llm_o4.ainvoke` call in
# `async with semaphore`.
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

async def mass_vqa_runner(data: pd.DataFrame, strategy):
    """Run ``strategy`` concurrently on every row of ``data`` and collect the outputs.

    Args:
        data: Frame with one quiz item per row; must contain an ``id`` column.
            The four result columns are written onto this frame in place.
        strategy: Coroutine function called with one row; must return the
            4-tuple ``(input_prompt, output_prompt, qa_result, correct_answers)``.

    Returns:
        A frame with the columns ``id``, ``input_prompt``, ``output_prompt``,
        ``qa_result`` and ``correct_answers``, in the row order of ``data``.

    Raises:
        Whatever ``strategy`` raises; the first failure propagates out of
        ``asyncio.gather``.
    """
    result_columns = ('input_prompt', 'output_prompt', 'qa_result', 'correct_answers')
    progress = tqdm(total=len(data))

    async def _tracked(row):
        # Advance the bar when a request finishes (successfully or not), so it
        # reflects completed work rather than how fast coroutines are created.
        try:
            return await strategy(row)
        finally:
            progress.update(1)

    try:
        # gather() preserves submission order, so results line up with the rows.
        results = await asyncio.gather(
            *(_tracked(row) for _, row in data.iterrows())
        )
    finally:
        progress.close()

    # Transpose [(a1, b1, c1, d1), ...] into one sequence per output column;
    # an empty input still yields four (empty) columns.
    per_column = list(zip(*results)) if results else [()] * len(result_columns)
    for column, values in zip(result_columns, per_column):
        data[column] = list(values)

    return data[['id', *result_columns]]

In [6]:
def run_stats(data: pd.DataFrame):
    """Score predicted answer labels against the gold labels and print a summary.

    Each row is scored on the *sets* of labels, so a label repeated in the
    prediction is counted once, consistently with the exact-match test.
    Precision, recall and F1 are computed per row (with a 1e-8 term in each
    denominator to avoid division by zero) and then averaged over rows.

    Args:
        data: Frame with the columns ``qa_result`` (predicted labels) and
            ``correct_answers`` (gold labels), both iterables of hashable
            labels. An ``exact_match`` boolean column is added in place.

    Returns:
        The same ``data`` frame, now carrying the ``exact_match`` column.
    """
    eps = 1e-8
    final_em = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for _, item in data.iterrows():
        given = set(item["qa_result"])
        real = set(item["correct_answers"])
        final_em.append(given == real)

        true_positives = len(given & real)
        false_positives = len(given - real)
        false_negatives = len(real - given)

        precision = true_positives / (true_positives + false_positives + eps)
        recall = true_positives / (true_positives + false_negatives + eps)
        f1 = 2 * precision * recall / (precision + recall + eps)

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    data['exact_match'] = final_em

    total_questions = len(final_em)
    exact_match_count = sum(final_em)
    # With no rows every average is reported as 0 instead of dividing by zero.
    divisor = max(total_questions, 1)
    avg_precision = sum(precision_scores) / divisor
    avg_recall = sum(recall_scores) / divisor
    avg_f1 = sum(f1_scores) / divisor
    exact_match = exact_match_count / divisor

    print(f"\n====== EVALUATION RESULTS ======")
    print(f"Precision:    {avg_precision:.3f}")
    print(f"Recall:       {avg_recall:.3f}")
    print(f"F1 Score:     {avg_f1:.3f}")
    print(f"Exact Match:  {exact_match:.3f} ({exact_match_count}/{total_questions})")

    return data


In [7]:
def load_img(href, images_dir="../data/dataset_V3/images"):
    """Return the base64 text of the local JPEG that corresponds to ``href``.

    The local file name is the last ``/``-separated segment of ``href`` cut at
    its first ``.``, with ``.jpg`` appended, looked up inside ``images_dir``.

    Args:
        href: Image URL or path; only its final path segment is used.
        images_dir: Directory that holds the local ``.jpg`` copies. Defaults
            to the dataset V3 image folder.

    Returns:
        The file content, base64-encoded and decoded to a ``str``.

    Raises:
        FileNotFoundError: If no matching ``.jpg`` exists in ``images_dir``.
    """
    file_name = href.split('/')[-1]
    # Everything from the first dot on is dropped, not only the last extension.
    stem = file_name.split('.')[0]
    local_url = f"{images_dir}/{stem}.jpg"
    with open(local_url, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
    
def load_caption(id):
    """Look up the caption stored for item ``id`` in the ``captions`` mapping.

    Raises:
        KeyError: If ``id`` has no entry in ``captions``.
    """
    caption_text = captions[id]
    return caption_text

In [8]:
def retrieve_ideal_laws(documents):
    """Turn a stringified list of law ids into ``(id, text)`` pairs.

    Args:
        documents: String holding a Python literal list of law ids.

    Returns:
        A list of ``(law_id, laws[law_id])`` tuples in the listed order.

    Raises:
        KeyError: If a listed id is missing from ``laws``.
    """
    resolved = []
    for law_id in ast.literal_eval(documents):
        resolved.append((law_id, laws[law_id]))
    return resolved

def retrieve_ideal_indicators(documents):
    """Turn a stringified list of indicator ids into ``(id, text)`` pairs.

    Args:
        documents: String holding a Python literal list of indicator ids.

    Returns:
        A list of ``(indicator_id, indicators[indicator_id])`` tuples in the
        listed order.

    Raises:
        KeyError: If a listed id is missing from ``indicators``.
    """
    resolved = []
    for indicator_id in ast.literal_eval(documents):
        resolved.append((indicator_id, indicators[indicator_id]))
    return resolved


In [9]:
async def strategy_1(item):
    """Answer one quiz item from its caption only (experiment [1]: Caption + QA, no RAG).

    The request is text-only: the image is neither read nor sent, and no law
    or indicator documents are looked up.

    NOTE(review): the prompt still tells the model that an image accompanies
    the question; the wording is kept verbatim - confirm before editing it.

    Args:
        item: Dataset row. Reads ``question``, ``answers`` (a string holding a
            Python literal list of dicts with ``answer_text`` and
            ``is_correct``) and ``id`` (key into the caption lookup).

    Returns:
        ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt sent, the reasoning summary plus model answer, the labels
        parsed after the last ``corect:`` marker, and the first character of
        every option flagged ``is_correct``.
    """
    question = item['question']
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join(entry['answer_text'] for entry in answer_entries)
    caption = load_caption(item['id'])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceasta este descrierea imaginii:
{caption}
===================
""".format(question = question, answers = answers, caption = caption)

    # Text-only message: the caption stands in for the image in this experiment.
    message = HumanMessage(content=[{"type": "text", "text": full_prompt}])

    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    # The response content is indexed positionally: block 0 carries the
    # reasoning summary parts, block 1 the answer text.
    reasoning = [x['text'] for x in res[0]['summary']]
    output = res[1]['text']

    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning]) + f"\n[OUTPUT]{output}"

    # Labels are whatever follows the last 'corect:' marker, split on commas
    # and stripped of punctuation. If the marker is missing, the whole
    # transcript ends up being split instead.
    final_res = [x.strip() for x in output_full.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, output_full, final_res, correct_answers

In [10]:
async def strategy_2(item):
    """Answer one quiz item from its image only (experiment [2]: Image + QA, no RAG).

    The request carries the prompt and the base64-encoded image; no caption
    and no law or indicator documents are looked up.

    Args:
        item: Dataset row. Reads ``question``, ``answers`` (a string holding a
            Python literal list of dicts with ``answer_text`` and
            ``is_correct``) and ``image`` (passed to ``load_img``).

    Returns:
        ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt sent, the reasoning summary plus model answer, the labels
        parsed after the last ``corect:`` marker, and the first character of
        every option flagged ``is_correct``.
    """
    question = item['question']
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join(entry['answer_text'] for entry in answer_entries)
    image = load_img(item['image'])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}
===================
""".format(question = question, answers = answers)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": "high"},
            }
        ]
    )

    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    # The response content is indexed positionally: block 0 carries the
    # reasoning summary parts, block 1 the answer text.
    reasoning = [x['text'] for x in res[0]['summary']]
    output = res[1]['text']

    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning]) + f"\n[OUTPUT]{output}"

    # Labels are whatever follows the last 'corect:' marker, split on commas
    # and stripped of punctuation. If the marker is missing, the whole
    # transcript ends up being split instead.
    final_res = [x.strip() for x in output_full.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, output_full, final_res, correct_answers

In [11]:
async def strategy_3(item):
    """Answer one quiz item from image and caption (experiment [3]: Image + Caption + QA, no RAG).

    The request carries the prompt (which embeds the caption) and the
    base64-encoded image; no law or indicator documents are looked up.

    Args:
        item: Dataset row. Reads ``question``, ``answers`` (a string holding a
            Python literal list of dicts with ``answer_text`` and
            ``is_correct``), ``image`` (passed to ``load_img``) and ``id``
            (key into the caption lookup).

    Returns:
        ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt sent, the reasoning summary plus model answer, the labels
        parsed after the last ``corect:`` marker, and the first character of
        every option flagged ``is_correct``.
    """
    question = item['question']
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join(entry['answer_text'] for entry in answer_entries)
    image = load_img(item['image'])
    caption = load_caption(item['id'])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceasta este descrierea imaginii:
{caption}
===================
""".format(question = question, answers = answers, caption=caption)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": "high"},
            }
        ]
    )

    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    # The response content is indexed positionally: block 0 carries the
    # reasoning summary parts, block 1 the answer text.
    reasoning = [x['text'] for x in res[0]['summary']]
    output = res[1]['text']

    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning]) + f"\n[OUTPUT]{output}"

    # Labels are whatever follows the last 'corect:' marker, split on commas
    # and stripped of punctuation. If the marker is missing, the whole
    # transcript ends up being split instead.
    final_res = [x.strip() for x in output_full.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, output_full, final_res, correct_answers

In [12]:
async def strategy_4(item):
    """Answer one quiz item from caption plus gold documents (experiment [4]: Caption + QA + Ideal RAG).

    The request is text-only: the prompt embeds the caption and the laws and
    indicators listed on the row itself; the image is neither read nor sent.

    NOTE(review): the prompt still tells the model that an image accompanies
    the question; the wording is kept verbatim - confirm before editing it.

    Args:
        item: Dataset row. Reads ``question``, ``answers`` (a string holding a
            Python literal list of dicts with ``answer_text`` and
            ``is_correct``), ``legislation`` and ``indicators`` (stringified
            id lists) and ``id`` (key into the caption lookup).

    Returns:
        ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt sent, the reasoning summary plus model answer, the labels
        parsed after the last ``corect:`` marker, and the first character of
        every option flagged ``is_correct``.
    """
    question = item['question']
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join(entry['answer_text'] for entry in answer_entries)

    ideal_docs = retrieve_ideal_laws(item['legislation'])
    ideal_indicators = retrieve_ideal_indicators(item['indicators'])

    formated_docs = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in ideal_docs])
    formated_indicators = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in ideal_indicators])
    caption = load_caption(item['id'])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceasta este descrierea imaginii:
{caption}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents_laws}


{documents_indicators}
===================
""".format(question = question, answers = answers, caption = caption, documents_laws=formated_docs, documents_indicators = formated_indicators)

    # Text-only message: the caption stands in for the image in this experiment.
    message = HumanMessage(content=[{"type": "text", "text": full_prompt}])

    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    # The response content is indexed positionally: block 0 carries the
    # reasoning summary parts, block 1 the answer text.
    reasoning = [x['text'] for x in res[0]['summary']]
    output = res[1]['text']

    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning]) + f"\n[OUTPUT]{output}"

    # Labels are whatever follows the last 'corect:' marker, split on commas
    # and stripped of punctuation. If the marker is missing, the whole
    # transcript ends up being split instead.
    final_res = [x.strip() for x in output_full.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, output_full, final_res, correct_answers

In [13]:
async def strategy_5(item):
    """Answer one quiz item from image plus gold documents (experiment [5]: Image + QA + Ideal RAG).

    Args:
        item: Dataset row. Reads ``question``, ``answers`` (a string holding a
            Python literal list of dicts with ``answer_text`` and
            ``is_correct``), ``image`` (passed to ``load_img``) and
            ``legislation`` / ``indicators`` (stringified id lists).

    Returns:
        ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt sent, the reasoning summary plus model answer, the labels
        parsed after the last ``corect:`` marker, and the first character of
        every option flagged ``is_correct``.
    """
    question = item['question']
    choices = ast.literal_eval(item['answers'])
    answers = " | ".join(choice['answer_text'] for choice in choices)
    image = load_img(item['image'])

    # Gold laws and indicators named on the row, rendered as "[id]: text".
    law_pairs = retrieve_ideal_laws(item['legislation'])
    indicator_pairs = retrieve_ideal_indicators(item['indicators'])
    formated_docs = "\n\n".join(f"[{doc_id}]: {text}" for doc_id, text in law_pairs)
    formated_indicators = "\n\n".join(f"[{doc_id}]: {text}" for doc_id, text in indicator_pairs)

    prompt_fields = {
        "question": question,
        "answers": answers,
        "documents_laws": formated_docs,
        "documents_indicators": formated_indicators,
    }
    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents_laws}


{documents_indicators}
===================
""".format(**prompt_fields)

    text_part = {"type": "text", "text": full_prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": "high"},
    }
    message = HumanMessage(content=[text_part, image_part])

    async with semaphore:
        reply = await llm_o4.ainvoke([message])
    blocks = reply.content

    # Block 0 carries the reasoning summary parts, block 1 the answer text.
    tagged_reasoning = [f"[REASONING]{part['text']}" for part in blocks[0]['summary']]
    answer_text = blocks[1]['text']
    output_full = "\n".join(tagged_reasoning) + f"\n[OUTPUT]{answer_text}"

    # Keep what follows the last 'corect:' marker, one label per comma.
    tail = output_full.split('corect:')[-1]
    final_res = []
    for piece in tail.split(','):
        final_res.append(re.sub(r'[^\w\s]', '', piece.strip()).strip())

    # Gold label = first character of each option flagged as correct.
    correct_answers = []
    for choice in choices:
        if choice['is_correct']:
            correct_answers.append(choice['answer_text'][0])

    return full_prompt, output_full, final_res, correct_answers

In [14]:
async def strategy_6(item):
    """Answer one quiz item from image, caption and gold documents (experiment [6]: Image + Caption + QA + Ideal RAG).

    Args:
        item: Dataset row. Reads ``question``, ``answers`` (a string holding a
            Python literal list of dicts with ``answer_text`` and
            ``is_correct``), ``image`` (passed to ``load_img``),
            ``legislation`` / ``indicators`` (stringified id lists) and ``id``
            (key into the caption lookup).

    Returns:
        ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt sent, the reasoning summary plus model answer, the labels
        parsed after the last ``corect:`` marker, and the first character of
        every option flagged ``is_correct``.
    """
    question = item['question']
    choices = ast.literal_eval(item['answers'])
    answers = " | ".join(choice['answer_text'] for choice in choices)
    image = load_img(item['image'])

    # Gold laws and indicators named on the row, rendered as "[id]: text".
    law_pairs = retrieve_ideal_laws(item['legislation'])
    indicator_pairs = retrieve_ideal_indicators(item['indicators'])
    formated_docs = "\n\n".join(f"[{doc_id}]: {text}" for doc_id, text in law_pairs)
    formated_indicators = "\n\n".join(f"[{doc_id}]: {text}" for doc_id, text in indicator_pairs)
    caption = load_caption(item['id'])

    prompt_fields = {
        "question": question,
        "answers": answers,
        "caption": caption,
        "documents_laws": formated_docs,
        "documents_indicators": formated_indicators,
    }
    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceasta este descrierea imaginii:
{caption}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents_laws}


{documents_indicators}
===================
""".format(**prompt_fields)

    text_part = {"type": "text", "text": full_prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": "high"},
    }
    message = HumanMessage(content=[text_part, image_part])

    async with semaphore:
        reply = await llm_o4.ainvoke([message])
    blocks = reply.content

    # Block 0 carries the reasoning summary parts, block 1 the answer text.
    tagged_reasoning = [f"[REASONING]{part['text']}" for part in blocks[0]['summary']]
    answer_text = blocks[1]['text']
    output_full = "\n".join(tagged_reasoning) + f"\n[OUTPUT]{answer_text}"

    # Keep what follows the last 'corect:' marker, one label per comma.
    tail = output_full.split('corect:')[-1]
    final_res = []
    for piece in tail.split(','):
        final_res.append(re.sub(r'[^\w\s]', '', piece.strip()).strip())

    # Gold label = first character of each option flagged as correct.
    correct_answers = []
    for choice in choices:
        if choice['is_correct']:
            correct_answers.append(choice['answer_text'][0])

    return full_prompt, output_full, final_res, correct_answers

In [15]:
# Law retrieval results (strat 5), stored in one CSV per dataset split.
laws1 = pd.read_csv('../results/vir/vir_strat_5_split_3_laws.csv', index_col="id")
laws2 = pd.read_csv('../results/vir/vir_strat_5_split_4_laws.csv', index_col="id")

# Item id -> value of the 'retrieved_documents' column; rows from the second
# split overwrite duplicate ids.
laws_dict = {}
for split_frame in (laws1, laws2):
    laws_dict.update(
        (item_id, record['retrieved_documents'])
        for item_id, record in split_frame.iterrows()
    )
print(laws_dict)
def retrieve_laws(id):
    """Return the retrieved-law entry stored in ``laws_dict`` for item ``id``.

    Raises:
        KeyError: If ``id`` has no entry in ``laws_dict``.
    """
    retrieved = laws_dict[id]
    return retrieved


# Indicator retrieval results, stored in one CSV per dataset split:
# 1. '../results/vir/vir_strat_8_split_3_ind.csv'
# 2. '../results/vir/vir_strat_8_split_4_ind.csv'
# indicator retrieval - strat 2*
ind1 = pd.read_csv('../results/vir/vir_strat_8_split_3_ind.csv', index_col="id")
ind2 = pd.read_csv('../results/vir/vir_strat_8_split_4_ind.csv', index_col="id")

# Item id -> value of the 'retrieved_documents' column; rows from the second
# split overwrite duplicate ids.
ind_dict = {}
for split_frame in (ind1, ind2):
    ind_dict.update(
        (item_id, record['retrieved_documents'])
        for item_id, record in split_frame.iterrows()
    )

print(ind_dict)
def retrieve_indicators(id):
    """Return the retrieved-indicator entry stored in ``ind_dict`` for item ``id``.

    Raises:
        KeyError: If ``id`` has no entry in ``ind_dict``.
    """
    retrieved = ind_dict[id]
    return retrieved

{'9316e9f5': "['Regulament-143', 'OUG-63', 'Regulament-142', 'Regulament-141', 'Regulament-115', 'Regulament-66', 'Regulament-151', 'Regulament-6', 'Regulament-126', 'Regulament-131']", '74db76ba': "['Regulament-64', 'OUG-31', 'Regulament-3', 'Regulament-135', 'Regulament-66', 'OUG-133', 'Regulament-129', 'Regulament-65', 'Regulament-123', 'Regulament-111']", '1b4d81f6': "['OUG-6', 'OUG-71', 'Regulament-64', 'OUG-31', 'Regulament-20', 'Regulament-58', 'OUG-30', 'OUG-29', 'OUG-72', 'Regulament-3']", 'c0a6cff3': "['Regulament-64', 'Regulament-123', 'Regulament-3', 'OUG-29', 'Regulament-121', 'Regulament-124', 'Regulament-8', 'OUG-30', 'OUG-53', 'Regulament-46']", 'aa276ef6': "['OUG-31', 'OUG-59', 'Regulament-49', 'Regulament-129', 'Regulament-135', 'Regulament-51', 'OUG-29', 'OUG-43', 'Regulament-64', 'Regulament-83']", '76896b09': "['Regulament-129', 'OUG-59', 'OUG-56', 'Regulament-135', 'OUG-57', 'OUG-62', 'Regulament-134', 'Regulament-64', 'Regulament-133', 'OUG-31']", '6981b907': "['

In [16]:
async def strategy_7(item):
    """Answer one quiz item from caption plus retrieved documents (Caption + QA + retrieved RAG).

    Same prompt layout as the caption + ideal-RAG strategy, but the law and
    indicator ids come from the precomputed retrieval results
    (``retrieve_laws`` / ``retrieve_indicators``) instead of the row's gold
    lists. The request is text-only: the image is neither read nor sent.

    NOTE(review): the prompt still tells the model that an image accompanies
    the question; the wording is kept verbatim - confirm before editing it.

    Args:
        item: Dataset row. Reads ``question``, ``answers`` (a string holding a
            Python literal list of dicts with ``answer_text`` and
            ``is_correct``) and ``id`` (key into the caption and retrieval
            lookups).

    Returns:
        ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt sent, the reasoning summary plus model answer, the labels
        parsed after the last ``corect:`` marker, and the first character of
        every option flagged ``is_correct``.
    """
    question = item['question']
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join(entry['answer_text'] for entry in answer_entries)

    # The retrieval lookups return stringified id lists, which the
    # retrieve_ideal_* helpers parse and resolve to (id, text) pairs.
    ideal_docs = retrieve_ideal_laws(retrieve_laws(item['id']))
    ideal_indicators = retrieve_ideal_indicators(retrieve_indicators(item['id']))

    formated_docs = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in ideal_docs])
    formated_indicators = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in ideal_indicators])
    caption = load_caption(item['id'])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceasta este descrierea imaginii:
{caption}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents_laws}


{documents_indicators}
===================
""".format(question = question, answers = answers, caption = caption, documents_laws=formated_docs, documents_indicators = formated_indicators)

    # Text-only message: the caption stands in for the image in this experiment.
    message = HumanMessage(content=[{"type": "text", "text": full_prompt}])

    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    # The response content is indexed positionally: block 0 carries the
    # reasoning summary parts, block 1 the answer text.
    reasoning = [x['text'] for x in res[0]['summary']]
    output = res[1]['text']

    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning]) + f"\n[OUTPUT]{output}"

    # Labels are whatever follows the last 'corect:' marker, split on commas
    # and stripped of punctuation. If the marker is missing, the whole
    # transcript ends up being split instead.
    final_res = [x.strip() for x in output_full.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, output_full, final_res, correct_answers

In [17]:
async def strategy_8(item):
    """Answer one quiz item from image plus retrieved documents (Image + QA + retrieved RAG).

    The law and indicator ids come from the precomputed retrieval results
    (``retrieve_laws`` / ``retrieve_indicators``) rather than from gold lists
    on the row.

    Args:
        item: Dataset row. Reads ``question``, ``answers`` (a string holding a
            Python literal list of dicts with ``answer_text`` and
            ``is_correct``), ``image`` (passed to ``load_img``) and ``id``
            (key into the retrieval lookups).

    Returns:
        ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt sent, the reasoning summary plus model answer, the labels
        parsed after the last ``corect:`` marker, and the first character of
        every option flagged ``is_correct``.
    """
    question = item['question']
    choices = ast.literal_eval(item['answers'])
    answers = " | ".join(choice['answer_text'] for choice in choices)
    image = load_img(item['image'])

    # Retrieved ids are stored as stringified lists; the retrieve_ideal_*
    # helpers parse them and resolve each id to its text.
    law_pairs = retrieve_ideal_laws(retrieve_laws(item['id']))
    indicator_pairs = retrieve_ideal_indicators(retrieve_indicators(item['id']))
    formated_docs = "\n\n".join(f"[{doc_id}]: {text}" for doc_id, text in law_pairs)
    formated_indicators = "\n\n".join(f"[{doc_id}]: {text}" for doc_id, text in indicator_pairs)

    prompt_fields = {
        "question": question,
        "answers": answers,
        "documents_laws": formated_docs,
        "documents_indicators": formated_indicators,
    }
    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents_laws}


{documents_indicators}
===================
""".format(**prompt_fields)

    text_part = {"type": "text", "text": full_prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": "high"},
    }
    message = HumanMessage(content=[text_part, image_part])

    async with semaphore:
        reply = await llm_o4.ainvoke([message])
    blocks = reply.content

    # Block 0 carries the reasoning summary parts, block 1 the answer text.
    tagged_reasoning = [f"[REASONING]{part['text']}" for part in blocks[0]['summary']]
    answer_text = blocks[1]['text']
    output_full = "\n".join(tagged_reasoning) + f"\n[OUTPUT]{answer_text}"

    # Keep what follows the last 'corect:' marker, one label per comma.
    tail = output_full.split('corect:')[-1]
    final_res = []
    for piece in tail.split(','):
        final_res.append(re.sub(r'[^\w\s]', '', piece.strip()).strip())

    # Gold label = first character of each option flagged as correct.
    correct_answers = []
    for choice in choices:
        if choice['is_correct']:
            correct_answers.append(choice['answer_text'][0])

    return full_prompt, output_full, final_res, correct_answers

In [18]:
def _split_o4_content(content):
    """Separate reasoning summaries from the answer text of a model reply.

    Blocks are selected by their ``type`` field rather than by position, so a
    reply that has no reasoning block (or lists its blocks in another order)
    does not raise and abort the whole batch.

    Args:
        content: The ``content`` of the message returned by ``llm_o4``.
            Assumed to be a list of typed dict blocks ("reasoning" blocks
            with a ``summary`` list, "text" blocks with a ``text`` field),
            which is the shape the positional indexing previously relied on.
            A plain string is also accepted and treated as the answer text.

    Returns:
        A ``(reasoning, output)`` tuple: the list of reasoning summary
        strings and the answer text ("" when no text block is present).
    """
    if isinstance(content, str):
        return [], content

    reasoning = []
    answer_parts = []
    for block in content:
        if isinstance(block, str):
            answer_parts.append(block)
            continue
        block_type = block.get('type')
        if block_type == 'reasoning':
            # A reasoning block may carry an empty or missing summary.
            reasoning.extend(
                part['text'] for part in (block.get('summary') or []) if 'text' in part
            )
        elif block_type == 'text':
            answer_parts.append(block.get('text', ''))

    return reasoning, "\n".join(answer_parts)


async def strategy_9(item):
    """Answer one quiz item using image + caption + ideal laws/indicators.

    Builds the Romanian prompt from the question, the answer options, the
    stored caption and the "ideal" retrieved documents, sends it together
    with the base64 image to ``llm_o4`` (throttled by the shared
    ``semaphore``), and extracts the answer letters that follow the last
    ``corect:`` marker in the reply.

    Args:
        item: A dataset row exposing ``question``, ``answers`` (a Python
            literal string holding a list of dicts with ``answer_text`` and
            ``is_correct``), ``image`` and ``id``.

    Returns:
        A tuple ``(full_prompt, output_full, final_res, correct_answers)``:
        the prompt text, the reply with ``[REASONING]``/``[OUTPUT]`` tags,
        the parsed predicted letters, and the first character of every
        answer text flagged as correct.
    """
    question = item['question']
    # Parse the serialized answers once; reused for the prompt and the gold labels.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])
    image = load_img(item['image'])

    ideal_docs = retrieve_ideal_laws(retrieve_laws(item['id']))
    ideal_indicators = retrieve_ideal_indicators(retrieve_indicators(item['id']))

    formated_docs = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in ideal_docs])
    formated_indicators = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in ideal_indicators])
    caption = load_caption(item['id'])

    # NOTE: the prompt text is kept verbatim so results stay comparable
    # with the other strategies' runs.
    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.
Vei primi o intrebare alaturi de o imagine, intrebarea avand stransa legatura cu intrebarea.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceasta este descrierea imaginii:
{caption}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents_laws}


{documents_indicators}
===================
""".format(question=question, answers=answers, caption=caption, documents_laws=formated_docs, documents_indicators=formated_indicators)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": "high"},
            }
        ]
    )

    # Only the network call is throttled; parsing happens outside the semaphore.
    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    reasoning, output = _split_o4_content(res)

    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning]) + f"\n[OUTPUT]{output}"

    # Everything after the last "corect:" is a comma-separated list of letters;
    # punctuation (quotes, trailing dots) is dropped before trimming.
    final_res = [
        re.sub(r'[^\w\s]', '', x).strip()
        for x in output_full.split('corect:')[-1].split(',')
    ]

    # The gold label is the leading letter of each answer marked correct.
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, output_full, final_res, correct_answers

In [19]:
# split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
# split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

# result_df = await mass_vqa_runner(split_3, strategy_1)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_1_split_3.csv', index=False)

# result_df = await mass_vqa_runner(split_4, strategy_1)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_1_split_4.csv', index=False)

In [20]:
# split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
# split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

# result_df = await mass_vqa_runner(split_3, strategy_2)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_2_split_3.csv', index=False)

# result_df = await mass_vqa_runner(split_4, strategy_2)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_2_split_4.csv', index=False)

In [21]:
# split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
# split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

# result_df = await mass_vqa_runner(split_3, strategy_3)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_3_split_3.csv', index=False)

# result_df = await mass_vqa_runner(split_4, strategy_3)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_3_split_4.csv', index=False)

In [22]:
# split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
# split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

# result_df = await mass_vqa_runner(split_3, strategy_4)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_4_split_3.csv', index=False)

# result_df = await mass_vqa_runner(split_4, strategy_4)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_4_split_4.csv', index=False)

In [23]:
# split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
# split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

# result_df = await mass_vqa_runner(split_3, strategy_5)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_5_split_3.csv', index=False)

# result_df = await mass_vqa_runner(split_4, strategy_5)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_5_split_4.csv', index=False)

In [24]:
# split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
# split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

# result_df = await mass_vqa_runner(split_3, strategy_6)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_6_split_3.csv', index=False)

# result_df = await mass_vqa_runner(split_4, strategy_6)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_6_split_4.csv', index=False)

In [None]:
# split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
# split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

# result_df = await mass_vqa_runner(split_3, strategy_7)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_7_split_3.csv', index=False)

# result_df = await mass_vqa_runner(split_4, strategy_7)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/vqa/vqa_strat_7_split_4.csv', index=False)

316it [00:00, 61273.18it/s]
316it [00:00, 39328.21it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['exact_match'] = final_em



Precision:    0.685
Recall:       0.696
F1 Score:     0.689
Exact Match:  0.674 (213/316)


71it [00:00, 68934.16it/s]
71it [00:00, 23485.46it/s]


Precision:    0.761
Recall:       0.761
F1 Score:     0.761
Exact Match:  0.761 (54/71)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['exact_match'] = final_em


In [29]:
split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

result_df = await mass_vqa_runner(split_3, strategy_8)
result_df = run_stats(result_df)
result_df.to_csv('../results/vqa/vqa_strat_8_split_3.csv', index=False)

result_df = await mass_vqa_runner(split_4, strategy_8)
result_df = run_stats(result_df)
result_df.to_csv('../results/vqa/vqa_strat_8_split_4.csv', index=False)

316it [00:00, 31047.81it/s]
316it [00:00, 39939.73it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['exact_match'] = final_em



Precision:    0.763
Recall:       0.769
F1 Score:     0.765
Exact Match:  0.756 (239/316)


71it [00:00, 53744.01it/s]
71it [00:00, 22370.46it/s]


Precision:    0.901
Recall:       0.901
F1 Score:     0.901
Exact Match:  0.901 (64/71)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['exact_match'] = final_em


In [30]:
split_3 = pd.read_csv('../data/dataset_V3/split_3.csv')
split_4 = pd.read_csv('../data/dataset_V3/split_4.csv')

result_df = await mass_vqa_runner(split_3, strategy_9)
result_df = run_stats(result_df)
result_df.to_csv('../results/vqa/vqa_strat_9_split_3.csv', index=False)

result_df = await mass_vqa_runner(split_4, strategy_9)
result_df = run_stats(result_df)
result_df.to_csv('../results/vqa/vqa_strat_9_split_4.csv', index=False)

316it [00:00, 42054.83it/s]
316it [00:00, 42760.36it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['exact_match'] = final_em



Precision:    0.698
Recall:       0.703
F1 Score:     0.699
Exact Match:  0.693 (219/316)


71it [00:00, 77429.95it/s]
71it [00:00, 29013.60it/s]


Precision:    0.782
Recall:       0.789
F1 Score:     0.784
Exact Match:  0.775 (55/71)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['exact_match'] = final_em
