# Target experiments

- [ ] 4o + CoT + RAG
- [ ] 4o + CoT + No-RAG  
- [ ] 4o + CoT + Ideal RAG
- [ ] 4o + CoT + RAG + better prompt
- [ ] 4o + RAG + better prompt (without CoT)
- [ ] o4 + CoT + RAG + better prompt
- [ ] o4 + CoT + No-RAG + better prompt


In [None]:
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder

class VectorStore:
    """In-memory dense retriever with an optional cross-encoder re-ranking step.

    Each document is a dict with an ``'id'`` and a text ``'payload'``. Payloads
    are embedded with ``embedding_model`` (normalized, so a dot product equals
    cosine similarity) and kept both per-document and as one stacked matrix.
    """

    def __init__(self, embedding_model: "SentenceTransformer", rerank_model: "CrossEncoder | None"):
        """Store the models and start with an empty index.

        Args:
            embedding_model: Model whose ``encode`` produces the embeddings.
            rerank_model: Model whose ``rank`` is used by :meth:`rerank`;
                may be ``None`` when re-ranking is not needed.
        """
        self.embedding_model = embedding_model
        self.rerank_model = rerank_model
        self.vector_store = []
        self.embedding_matrix = None

    def index_documents(self, documents):
        """Embed ``documents`` and replace the current index with them.

        Args:
            documents: Sequence of dicts with ``'id'`` and ``'payload'`` keys.
                An empty sequence clears the index.
        """
        documents = list(documents)
        if not documents:
            # torch.stack() rejects an empty list, so an empty corpus simply
            # resets the store instead of crashing.
            self.vector_store = []
            self.embedding_matrix = None
            return

        # Encode and normalize all document embeddings
        embeddings = self.embedding_model.encode(
            [doc['payload'] for doc in documents],
            convert_to_tensor=True,
            normalize_embeddings=True
        )

        # Store each document along with its embedding (kept as a torch tensor)
        self.vector_store = [
            {'id': doc['id'], 'payload': doc['payload'], 'vector': embedding}
            for doc, embedding in zip(documents, embeddings)
        ]

        # Stack all embeddings into a single tensor for search
        self.embedding_matrix = torch.stack([entry['vector'] for entry in self.vector_store])

    def search(self, query, top_k=6):
        """Return the ``top_k`` most similar documents and their scores.

        Args:
            query: Query text.
            top_k: Maximum number of hits; clamped to the number of indexed
                documents.

        Returns:
            ``(documents, scores)`` — the stored document dicts, best first,
            and the matching similarity scores as floats. Both lists are empty
            when nothing is indexed or ``top_k`` is not positive.
        """
        if self.embedding_matrix is None or top_k <= 0:
            return [], []

        # Encode and normalize the query
        query_embedding = self.embedding_model.encode(
            [query],
            convert_to_tensor=True,
            normalize_embeddings=True,
            show_progress_bar=False
        )  # shape: (1, dim)

        # Compute cosine similarity (dot product of normalized vectors)
        scores = torch.matmul(query_embedding, self.embedding_matrix.T)  # shape: (1, num_docs)

        # torch.topk raises if k exceeds the number of candidates, so clamp it.
        k = min(top_k, scores.shape[1])
        top_k_scores, top_k_indices = torch.topk(scores, k=k, dim=1)

        # Retrieve top_k documents
        top_k_documents = [self.vector_store[idx] for idx in top_k_indices[0].tolist()]

        return top_k_documents, top_k_scores[0].tolist()

    def rerank(self, query, top_k_documents, top_k=6):
        """Reorder ``top_k_documents`` with the re-rank model and keep ``top_k``.

        Requires ``rerank_model`` to be set. Documents are returned in the
        order produced by ``rerank_model.rank`` (presumably best first —
        confirm against the library), truncated to ``top_k`` entries.
        """
        scores = self.rerank_model.rank(
            query=query,
            documents=[doc['payload'] for doc in top_k_documents]
        )
        return [top_k_documents[idx['corpus_id']] for idx in scores][:top_k]


In [None]:
import pandas as pd

# Load the law corpus and turn every row into an indexable document whose
# payload is "<title_metadata> | <content>".
df = pd.read_csv("../data/dataset_V3/corpus.csv", index_col="id")
documents = [
    {"id": idx, "payload": f"{row['title_metadata']} | {row['content']}"}
    for idx, row in df.iterrows()
]

In [None]:
import pandas as pd

# Map each corpus id to its "<title_metadata> | <content>" text for prompt building.
df = pd.read_csv("../data/dataset_V3/corpus.csv", index_col="id")
laws = {
    idx: f"{row['title_metadata']} | {row['content']}"
    for idx, row in df.iterrows()
}

In [None]:
# Build the retriever from the fine-tuned embedding model and index the corpus.
# No re-rank model is supplied (None), so only `search` is usable on this instance.
vector_store_finetuned = VectorStore(SentenceTransformer('../models/data_trained_V2') , None)
vector_store_finetuned.index_documents(documents)

In [None]:
def retrieve_strategy_7(question, possible_answer, id, top_k=10):
    """Retrieve corpus ids for a question using "question | answers" as the query.

    Args:
        question: Question text.
        possible_answer: Answer options, already joined into a single string.
        id: Unused; kept so existing positional callers keep working.
        top_k: Number of documents to retrieve (defaults to the previous
            hard-coded value of 10).

    Returns:
        List of document ids in the order returned by the vector store search.
    """
    top_k_documents, _ = vector_store_finetuned.search(
        f"{question} | {possible_answer}", top_k=top_k
    )
    return [doc['id'] for doc in top_k_documents]

In [None]:

import ast
from tqdm import tqdm
import re
import base64
from langchain_core.messages import HumanMessage
import asyncio
from langchain_openai import ChatOpenAI
import pandas as pd

# Chat clients used by the strategies below.
# NOTE(review): api_key is an empty string here — presumably redacted; a real key
# must be supplied before these cells can call the API.
llm_4o = ChatOpenAI(model_name="gpt-4o-mini", api_key="", seed=25, temperature=0)
# This llm_o4 binding is replaced by later cells that rebuild it with a reasoning config.
llm_o4 = ChatOpenAI(model_name="o4-mini", api_key="", seed=25, temperature=0, output_version="responses/v1")

# top_k_search = 10
# Upper bound on simultaneous LLM calls; the strategy coroutines acquire this
# semaphore around each `ainvoke`.
MAX_CONCURRENT_REQUESTS = 15
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

async def mass_qa_runner(data: pd.DataFrame, strategy):
    """Run ``strategy`` on every row of ``data`` concurrently and collect the outputs.

    Args:
        data: Question frame; must contain an ``'id'`` column plus whatever
            columns ``strategy`` reads. The four result columns are also
            written onto this frame, as before.
        strategy: Coroutine function taking one row and returning a 4-tuple
            ``(input_prompt, output_prompt, qa_result, correct_answers)``.

    Returns:
        A new frame with columns ``id``, ``input_prompt``, ``output_prompt``,
        ``qa_result`` and ``correct_answers``, in the row order of ``data``.
    """
    # One coroutine per row. asyncio.gather preserves submission order, so the
    # results line up with the rows without a second pass over the frame.
    tasks = [strategy(item) for _, item in tqdm(data.iterrows(), total=len(data))]
    results = await asyncio.gather(*tasks)

    result_columns = ['input_prompt', 'output_prompt', 'qa_result', 'correct_answers']
    for position, column in enumerate(result_columns):
        data[column] = [result[position] for result in results]

    # Return an independent copy so that callers adding columns (e.g. metrics)
    # do not write into a slice of `data`.
    return data[['id', *result_columns]].copy()

In [None]:
# Reasoning settings passed to the o4-mini client below.
reasoning = {
    "effort": "medium",  # 'low', 'medium', or 'high'
    "summary": "detailed",  # 'detailed', 'auto', or None
}

# Rebinds llm_o4 (defined earlier with seed/temperature) to a client configured with `reasoning`.
# NOTE(review): api_key is an empty string — presumably redacted; supply a real key before running.
llm_o4 = ChatOpenAI(model_name="o4-mini", api_key="", reasoning=reasoning, output_version="responses/v1")

In [None]:
import pandas as pd

def run_stats(data: pd.DataFrame):
    """Print precision / recall / F1 / exact-match for a QA result frame.

    Works both on frames produced in memory (list-valued cells) and on frames
    reloaded from CSV, where ``qa_result`` and ``correct_answers`` come back as
    strings such as ``"['A', 'B']"``. Strings are parsed into lists first;
    comparing the raw strings would score individual characters (brackets,
    quotes, commas) and inflate precision and recall.

    Args:
        data: Frame with ``qa_result`` (predicted labels) and
            ``correct_answers`` (gold labels) columns.

    Returns:
        The same frame, with a boolean ``exact_match`` column added.
    """
    import ast

    try:
        from tqdm import tqdm as progress
    except ImportError:
        # The progress bar is cosmetic; keep the metrics usable without tqdm.
        def progress(iterable, **_kwargs):
            return iterable

    eps = 1e-8  # avoids division by zero when a row has no predictions or no gold labels

    def as_labels(value):
        """Normalize a cell to a list of labels."""
        if isinstance(value, str):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: treat it as a comma-separated label list.
                return [part.strip() for part in value.split(',')]
        if isinstance(value, (list, tuple, set, frozenset)):
            return list(value)
        if value is None or (isinstance(value, float) and value != value):
            return []  # missing cell (None / NaN)
        return [value]

    def mean(values):
        return sum(values) / len(values) if values else 0.0

    final_em = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    for _, item in progress(data.iterrows(), total=len(data)):
        given = as_labels(item["qa_result"])
        real = as_labels(item["correct_answers"])

        final_em.append(set(given) == set(real))

        true_positives = len([x for x in given if x in real])
        false_positives = len(given) - true_positives
        false_negatives = len([x for x in real if x not in given])

        precision = true_positives / (true_positives + false_positives + eps)
        recall = true_positives / (true_positives + false_negatives + eps)
        f1 = 2 * precision * recall / (precision + recall + eps)

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    data['exact_match'] = final_em

    total_questions = len(final_em)
    exact_match_count = sum(final_em)
    avg_precision = mean(precision_scores)
    avg_recall = mean(recall_scores)
    avg_f1 = mean(f1_scores)
    exact_match = exact_match_count / total_questions if total_questions else 0.0

    print(f"\n====== EVALUATION RESULTS ======")
    print(f"Precision:    {avg_precision:.3f}")
    print(f"Recall:       {avg_recall:.3f}")
    print(f"F1 Score:     {avg_f1:.3f}")
    print(f"Exact Match:  {exact_match:.3f} ({exact_match_count}/{total_questions})")

    return data


In [None]:
async def strategy_1(item):
    """Answer one quiz item with gpt-4o-mini using the baseline CoT prompt and retrieved laws.

    Appears to correspond to the "4o + CoT + RAG" experiment listed at the top
    of the notebook.

    Args:
        item: Row with ``'question'`` and ``'answers'``; ``'answers'`` is the
            string form of a list of dicts with ``'answer_text'`` and
            ``'is_correct'``.

    Returns:
        ``(full_prompt, raw_model_output, predicted_labels, correct_labels)``.
    """
    question = item['question']
    # Parse the serialized answer options once; reused for the prompt and the gold labels.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])

    # Retrieved law ids paired with their text from the module-level `laws` mapping.
    docs = [(doc, laws[doc]) for doc in retrieve_strategy_7(question, answers, None)]

    formated_docs = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in docs])

    full_prompt = """Esti un politist rutier. Vorbesti doar Limba romana.
Trebuie sa rezolvi o grila de la un test auto. Aceasta grila poate avea unul sau mai multe raspunsuri corecte.
Gandestete care e raspunsul corect si raspunde la intrebare. La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu

"Raspuns corect: A"
sau
"Raspuns corect: A,B"

Acesta este modul in care trebuie sa gandesti:
1. Citeste atent intrebare si variantele de raspuns.
2. Identifica ce informatii din legislatie ar putea fi relevante. (Legislatia Romaniei)
3. Daca ai mai multe raspunsuri corecte, argumenteaza fiecare alegere.

Aceasta este intrebarea:
{question}

Aceastea sunt variantele de raspuns:
{answers}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents}
===================
""".format(question = question, answers = answers, documents = formated_docs)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt}
        ]
    )

    # The semaphore caps the number of in-flight LLM requests.
    async with semaphore:
        res = await llm_4o.ainvoke([message])
        res = res.content

    # Everything after the last "corect:" is the answer list: split on commas,
    # then strip punctuation and surrounding whitespace from each piece.
    final_res = [x.strip() for x in res.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    # First character of each correct option's text (presumably the option letter).
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, res, final_res, correct_answers

In [None]:
async def strategy_2(item):
    """Answer one quiz item with gpt-4o-mini using the baseline CoT prompt and no laws.

    Appears to correspond to the "4o + CoT + No-RAG" experiment listed at the
    top of the notebook. No retrieval is performed: the prompt contains only
    the question and the answer options.

    Args:
        item: Row with ``'question'`` and ``'answers'``; ``'answers'`` is the
            string form of a list of dicts with ``'answer_text'`` and
            ``'is_correct'``.

    Returns:
        ``(full_prompt, raw_model_output, predicted_labels, correct_labels)``.
    """
    question = item['question']
    # Parse the serialized answer options once; reused for the prompt and the gold labels.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])

    full_prompt = """Esti un politist rutier. Vorbesti doar Limba romana.
Trebuie sa rezolvi o grila de la un test auto. Aceasta grila poate avea unul sau mai multe raspunsuri corecte.
Gandestete care e raspunsul corect si raspunde la intrebare. La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu

"Raspuns corect: A"
sau
"Raspuns corect: A,B"

Acesta este modul in care trebuie sa gandesti:
1. Citeste atent intrebare si variantele de raspuns.
2. Identifica ce informatii din legislatie ar putea fi relevante. (Legislatia Romaniei)
3. Daca ai mai multe raspunsuri corecte, argumenteaza fiecare alegere.

Aceasta este intrebarea:
{question}

Aceastea sunt variantele de raspuns:
{answers}
===================
""".format(question = question, answers = answers)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt}
        ]
    )

    # The semaphore caps the number of in-flight LLM requests.
    async with semaphore:
        res = await llm_4o.ainvoke([message])
        res = res.content

    # Everything after the last "corect:" is the answer list: split on commas,
    # then strip punctuation and surrounding whitespace from each piece.
    final_res = [x.strip() for x in res.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    # First character of each correct option's text (presumably the option letter).
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, res, final_res, correct_answers

In [None]:
async def strategy_3(item):
    """Answer one quiz item with gpt-4o-mini using the baseline CoT prompt and the gold laws.

    Appears to correspond to the "4o + CoT + Ideal RAG" experiment listed at
    the top of the notebook: the prompt is given the laws referenced by the
    item's ``'legislation'`` field instead of retrieved ones, so no dense
    retrieval is run.

    Args:
        item: Row with ``'question'``, ``'answers'`` and ``'legislation'``.
            ``'answers'`` is the string form of a list of dicts with
            ``'answer_text'`` and ``'is_correct'``; ``'legislation'`` is the
            string form of a list of keys into the module-level ``laws``.

    Returns:
        ``(full_prompt, raw_model_output, predicted_labels, correct_labels)``.
    """
    question = item['question']
    # Parse the serialized answer options once; reused for the prompt and the gold labels.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])

    # Ground-truth law ids paired with their text.
    docs = [(ideal, laws[ideal]) for ideal in ast.literal_eval(item['legislation'])]

    formated_docs = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in docs])

    full_prompt = """Esti un politist rutier. Vorbesti doar Limba romana.
Trebuie sa rezolvi o grila de la un test auto. Aceasta grila poate avea unul sau mai multe raspunsuri corecte.
Gandestete care e raspunsul corect si raspunde la intrebare. La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu

"Raspuns corect: A"
sau
"Raspuns corect: A,B"

Acesta este modul in care trebuie sa gandesti:
1. Citeste atent intrebare si variantele de raspuns.
2. Identifica ce informatii din legislatie ar putea fi relevante. (Legislatia Romaniei)
3. Daca ai mai multe raspunsuri corecte, argumenteaza fiecare alegere.

Aceasta este intrebarea:
{question}

Aceastea sunt variantele de raspuns:
{answers}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents}
===================
""".format(question = question, answers = answers, documents = formated_docs)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt}
        ]
    )

    # The semaphore caps the number of in-flight LLM requests.
    async with semaphore:
        res = await llm_4o.ainvoke([message])
        res = res.content

    # Everything after the last "corect:" is the answer list: split on commas,
    # then strip punctuation and surrounding whitespace from each piece.
    final_res = [x.strip() for x in res.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    # First character of each correct option's text (presumably the option letter).
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, res, final_res, correct_answers

In [None]:
async def strategy_4(item):
    """Answer one quiz item with gpt-4o-mini using the stricter rule-based prompt and retrieved laws.

    Appears to correspond to the "4o + CoT + RAG + better prompt" experiment
    listed at the top of the notebook.

    Args:
        item: Row with ``'question'`` and ``'answers'``; ``'answers'`` is the
            string form of a list of dicts with ``'answer_text'`` and
            ``'is_correct'``.

    Returns:
        ``(full_prompt, raw_model_output, predicted_labels, correct_labels)``.
    """
    question = item['question']
    # Parse the serialized answer options once; reused for the prompt and the gold labels.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])

    # Retrieved law ids paired with their text from the module-level `laws` mapping.
    docs = [(doc, laws[doc]) for doc in retrieve_strategy_7(question, answers, None)]

    formated_docs = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in docs])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents}
===================
""".format(question = question, answers = answers, documents = formated_docs)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt}
        ]
    )

    # The semaphore caps the number of in-flight LLM requests.
    async with semaphore:
        res = await llm_4o.ainvoke([message])
        res = res.content

    # Everything after the last "corect:" is the answer list: split on commas,
    # then strip punctuation and surrounding whitespace from each piece.
    final_res = [x.strip() for x in res.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    # First character of each correct option's text (presumably the option letter).
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, res, final_res, correct_answers

In [None]:
async def strategy_5(item):
    """Answer one quiz item with gpt-4o-mini using the stricter prompt, retrieved laws and a direct answer.

    Appears to correspond to the "4o + RAG + better prompt (without CoT)"
    experiment listed at the top of the notebook: the prompt asks for the
    correct options directly instead of an argued answer.

    Args:
        item: Row with ``'question'`` and ``'answers'``; ``'answers'`` is the
            string form of a list of dicts with ``'answer_text'`` and
            ``'is_correct'``.

    Returns:
        ``(full_prompt, raw_model_output, predicted_labels, correct_labels)``.
    """
    question = item['question']
    # Parse the serialized answer options once; reused for the prompt and the gold labels.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])

    # Retrieved law ids paired with their text from the module-level `laws` mapping.
    docs = [(doc, laws[doc]) for doc in retrieve_strategy_7(question, answers, None)]

    formated_docs = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in docs])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


Raspunde direct cu variantale corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents}
===================
""".format(question = question, answers = answers, documents = formated_docs)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt}
        ]
    )

    # The semaphore caps the number of in-flight LLM requests.
    async with semaphore:
        res = await llm_4o.ainvoke([message])
        res = res.content

    # Everything after the last "corect:" is the answer list: split on commas,
    # then strip punctuation and surrounding whitespace from each piece.
    final_res = [x.strip() for x in res.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    # First character of each correct option's text (presumably the option letter).
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, res, final_res, correct_answers

In [None]:
# Reasoning settings for the o4-mini client; repeats the configuration from the earlier cell
# so this cell can be run on its own.
reasoning = {
    "effort": "medium",  # 'low', 'medium', or 'high'
    "summary": "detailed",  # 'detailed', 'auto', or None
}

# NOTE(review): api_key is an empty string — presumably redacted; supply a real key before running.
llm_o4 = ChatOpenAI(model_name="o4-mini", api_key="", reasoning=reasoning, output_version="responses/v1")

async def strategy_6(item):
    """Answer one quiz item with o4-mini using the stricter rule-based prompt and retrieved laws.

    Appears to correspond to the "o4 + CoT + RAG + better prompt" experiment
    listed at the top of the notebook.

    Args:
        item: Row with ``'question'`` and ``'answers'``; ``'answers'`` is the
            string form of a list of dicts with ``'answer_text'`` and
            ``'is_correct'``.

    Returns:
        ``(full_prompt, output_full, predicted_labels, correct_labels)`` where
        ``output_full`` is the reasoning summaries (each prefixed with
        ``[REASONING]``) followed by the answer text (prefixed with
        ``[OUTPUT]``).
    """
    question = item['question']
    # Parse the serialized answer options once; reused for the prompt and the gold labels.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])

    # Retrieved law ids paired with their text from the module-level `laws` mapping.
    docs = [(doc, laws[doc]) for doc in retrieve_strategy_7(question, answers, None)]

    formated_docs = "\n\n".join([f"[{entry[0]}]: {entry[1]}" for entry in docs])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}

Aceastea sunt legile relevante, dar nu neaparat toate sunt relevante:
{documents}
===================
""".format(question = question, answers = answers, documents = formated_docs)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt}
        ]
    )

    # The semaphore caps the number of in-flight LLM requests.
    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    # The content is expected to be a list of blocks: one carrying reasoning
    # summaries ('summary') and one carrying the answer ('text'). Pick blocks by
    # the keys they carry rather than by position, so a missing or reordered
    # reasoning block does not raise.
    reasoning_parts = []
    output_parts = []
    if isinstance(res, str):
        output_parts.append(res)
    else:
        for block in res:
            if 'summary' in block:
                reasoning_parts.extend(part['text'] for part in block['summary'])
            elif 'text' in block:
                output_parts.append(block['text'])
    output = "".join(output_parts)

    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning_parts]) + f"\n[OUTPUT]{output}"

    # Everything after the last "corect:" is the answer list: split on commas,
    # then strip punctuation and surrounding whitespace from each piece.
    final_res = [x.strip() for x in output_full.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    # First character of each correct option's text (presumably the option letter).
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, output_full, final_res, correct_answers

In [None]:
async def strategy_7(item):
    """Answer one quiz item with o4-mini using the stricter rule-based prompt and no laws.

    Appears to correspond to the "o4 + CoT + No-RAG + better prompt"
    experiment listed at the top of the notebook. No retrieval is performed:
    the prompt contains only the question and the answer options.

    Args:
        item: Row with ``'question'`` and ``'answers'``; ``'answers'`` is the
            string form of a list of dicts with ``'answer_text'`` and
            ``'is_correct'``.

    Returns:
        ``(full_prompt, output_full, predicted_labels, correct_labels)`` where
        ``output_full`` is the reasoning summaries (each prefixed with
        ``[REASONING]``) followed by the answer text (prefixed with
        ``[OUTPUT]``).
    """
    question = item['question']
    # Parse the serialized answer options once; reused for the prompt and the gold labels.
    answer_entries = ast.literal_eval(item['answers'])
    answers = " | ".join([entry['answer_text'] for entry in answer_entries])

    full_prompt = """Esti un politist rutier. Vorbesti doar in limba romana.
Trebuie sa rezolvi o grila de la un test auto. Grila poate avea unul sau mai multe raspunsuri corecte. Vei folosi strict legile din Romania.

Gandeste logic, dar nu extrapola peste informatiile oferite. Judeca doar momentul descris, nu presupune alte situatii.

Reguli de gandire:
1. Citeste cu maxima atentie intrebarea si variantele de raspuns.
2. Identifica strict ce prevederi din legislatia rutiera din Romania se aplica situatiei date.
3. Daca raspunsul pare "mai sigur" dar este contrar legislatiei, urmeaza legea, nu instinctul de precautie.
4. Alege DOAR raspunsurile care sunt complet corecte conform textului legii — nu ghici, nu completa informatii lipsa.
5. Daca un raspuns corect este mai bun decat altul dat ca si corect, include mai multe situatii specifice sau exceptii, atunci trebuie ales doar acela.
6. Argumenteaza clar de ce ai ales fiecare raspuns corect. Daca exista mai multe raspunsuri corecte, explica fiecare alegere separat.
7. Fii atent la mici detalii care pot schimba sensul intrebarii sau al raspunsurilor (exista intrebari-capcana).


La final, ultima parte din raspuns trebuie sa fie litera sau literele corecte.
De exemplu, raspunsul tau se va incheia cu:

"Raspuns corect: A"  
sau  
"Raspuns corect: A,B"

Aceasta este intrebarea:
{question}

Acestea sunt variantele de raspuns:
{answers}
===================
""".format(question = question, answers = answers)

    message = HumanMessage(
        content=[
            {"type": "text", "text": full_prompt}
        ]
    )

    # The semaphore caps the number of in-flight LLM requests.
    async with semaphore:
        res = await llm_o4.ainvoke([message])
        res = res.content

    # The content is expected to be a list of blocks: one carrying reasoning
    # summaries ('summary') and one carrying the answer ('text'). Pick blocks by
    # the keys they carry rather than by position, so a missing or reordered
    # reasoning block does not raise.
    reasoning_parts = []
    output_parts = []
    if isinstance(res, str):
        output_parts.append(res)
    else:
        for block in res:
            if 'summary' in block:
                reasoning_parts.extend(part['text'] for part in block['summary'])
            elif 'text' in block:
                output_parts.append(block['text'])
    output = "".join(output_parts)

    output_full = "\n".join([f"[REASONING]{x}" for x in reasoning_parts]) + f"\n[OUTPUT]{output}"

    # Everything after the last "corect:" is the answer list: split on commas,
    # then strip punctuation and surrounding whitespace from each piece.
    final_res = [x.strip() for x in output_full.split('corect:')[-1].split(',')]
    final_res = [re.sub(r'[^\w\s]', '', x).strip() for x in final_res]

    # First character of each correct option's text (presumably the option letter).
    correct_answers = [entry['answer_text'][0] for entry in answer_entries if entry['is_correct']]

    return full_prompt, output_full, final_res, correct_answers

# Experiments

In [None]:
split_1_train = pd.read_csv('../data/dataset_V3/split_1_train.csv')
split_1_test = pd.read_csv('../data/dataset_V3/split_1_test.csv')
split_2 = pd.read_csv('../data/dataset_V3/split_2.csv')

result_df = await mass_qa_runner(split_1_train, strategy_1)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_1_split_1_train.csv', index=False)

result_df = await mass_qa_runner(split_1_test, strategy_1)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_1_split_1_test.csv', index=False)

result_df = await mass_qa_runner(split_2, strategy_1)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_1_split_2.csv', index=False)

In [None]:
split_1_train = pd.read_csv('../data/dataset_V3/split_1_train.csv')
split_1_test = pd.read_csv('../data/dataset_V3/split_1_test.csv')
split_2 = pd.read_csv('../data/dataset_V3/split_2.csv')

result_df = await mass_qa_runner(split_1_train, strategy_2)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_2_split_1_train.csv', index=False)

result_df = await mass_qa_runner(split_1_test, strategy_2)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_2_split_1_test.csv', index=False)

result_df = await mass_qa_runner(split_2, strategy_2)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_2_split_2.csv', index=False)

In [None]:
# split_1_train = pd.read_csv('../data/dataset_V3/split_1_train.csv')
# split_1_test = pd.read_csv('../data/dataset_V3/split_1_test.csv')
# split_2 = pd.read_csv('../data/dataset_V3/split_2.csv')

# result_df = await mass_qa_runner(split_1_train, strategy_3)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/qa/qa_strat_3_split_1_train.csv', index=False)

# result_df = await mass_qa_runner(split_1_test, strategy_3)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/qa/qa_strat_3_split_1_test.csv', index=False)

# result_df = await mass_qa_runner(split_2, strategy_3)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/qa/qa_strat_3_split_2.csv', index=False)

In [None]:
split_1_train = pd.read_csv('../data/dataset_V3/split_1_train.csv')
split_1_test = pd.read_csv('../data/dataset_V3/split_1_test.csv')
split_2 = pd.read_csv('../data/dataset_V3/split_2.csv')

result_df = await mass_qa_runner(split_1_train, strategy_4)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_4_split_1_train.csv', index=False)

result_df = await mass_qa_runner(split_1_test, strategy_4)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_4_split_1_test.csv', index=False)

result_df = await mass_qa_runner(split_2, strategy_4)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_4_split_2.csv', index=False)

In [None]:
# Strategy 5 run: all three splits are loaded, but only split_2 is evaluated and saved;
# the split_1 train/test runs below are commented out.
split_1_train = pd.read_csv('../data/dataset_V3/split_1_train.csv')
split_1_test = pd.read_csv('../data/dataset_V3/split_1_test.csv')
split_2 = pd.read_csv('../data/dataset_V3/split_2.csv')

# result_df = await mass_qa_runner(split_1_train, strategy_5)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/qa/qa_strat_5_split_1_train.csv', index=False)

# result_df = await mass_qa_runner(split_1_test, strategy_5)
# result_df = run_stats(result_df)
# result_df.to_csv('../results/qa/qa_strat_5_split_1_test.csv', index=False)

result_df = await mass_qa_runner(split_2, strategy_5)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_5_split_2.csv', index=False)

In [None]:
split_1_train = pd.read_csv('../data/dataset_V3/split_1_train.csv')
split_1_test = pd.read_csv('../data/dataset_V3/split_1_test.csv')
split_2 = pd.read_csv('../data/dataset_V3/split_2.csv')

result_df = await mass_qa_runner(split_1_train, strategy_6)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_6_split_1_train.csv', index=False)

result_df = await mass_qa_runner(split_1_test, strategy_6)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_6_split_1_test.csv', index=False)

result_df = await mass_qa_runner(split_2, strategy_6)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_6_split_2.csv', index=False)

In [None]:
split_1_train = pd.read_csv('../data/dataset_V3/split_1_train.csv')
split_1_test = pd.read_csv('../data/dataset_V3/split_1_test.csv')
split_2 = pd.read_csv('../data/dataset_V3/split_2.csv')

result_df = await mass_qa_runner(split_1_train, strategy_7)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_7_split_1_train.csv', index=False)

result_df = await mass_qa_runner(split_1_test, strategy_7)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_7_split_1_test.csv', index=False)

result_df = await mass_qa_runner(split_2, strategy_7)
result_df = run_stats(result_df)
result_df.to_csv('../results/qa/qa_strat_7_split_2.csv', index=False)

In [None]:
# Print metrics for the saved strategy-1 results in qa_vllm, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm/qa_strat_1_{split_name}_vllm.csv')
    _ = run_stats(sp1)


In [None]:
# Print metrics for the saved strategy-2 results in qa_vllm, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm/qa_strat_2_{split_name}_vllm.csv')
    _ = run_stats(sp1)


In [None]:
# Print metrics for the saved strategy-3 results in qa_vllm, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm/qa_strat_3_{split_name}_vllm.csv')
    _ = run_stats(sp1)


In [None]:
# Print metrics for the saved strategy-4 results in qa_vllm, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm/qa_strat_4_{split_name}_vllm.csv')
    _ = run_stats(sp1)


In [None]:
# Print metrics for the saved strategy-5 results in qa_vllm, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm/qa_strat_5_{split_name}_vllm.csv')
    _ = run_stats(sp1)


gemma

In [None]:
# Print metrics for the saved strategy-1 results in qa_vllm1, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm1/qa_strat_1_{split_name}_vllm.csv')
    _ = run_stats(sp1)


In [None]:
# Print metrics for the saved strategy-2 results in qa_vllm1, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm1/qa_strat_2_{split_name}_vllm.csv')
    _ = run_stats(sp1)


In [None]:
# Print metrics for the saved strategy-3 results in qa_vllm1, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm1/qa_strat_3_{split_name}_vllm.csv')
    _ = run_stats(sp1)


In [None]:
# Print metrics for the saved strategy-4 results in qa_vllm1, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm1/qa_strat_4_{split_name}_vllm.csv')
    _ = run_stats(sp1)


In [None]:
from tqdm import tqdm

# Print metrics for the saved strategy-5 results in qa_vllm1, one split at a time.
for split_name in ('split_1_train', 'split_1_test', 'split_2'):
    sp1 = pd.read_csv(f'../results/qa_vllm1/qa_strat_5_{split_name}_vllm.csv')
    _ = run_stats(sp1)
