In [None]:
import os
# If the kernel was started in the parent workspace directory, switch into the
# project checkout directory.
# NOTE(review): presumably needed so the `src.*` imports in the following
# cells resolve; the absolute path is machine-specific — confirm.
if os.getcwd() == '/home/user/code':
    os.chdir('/home/user/code/nlp2024_ClefTask4SOTA')

In [None]:
from TexSoup import TexSoup
import re
from src.dataset import PATH, UNANSWERABLE
from src.models import Model
import json
from src.content_extraction import all_sections

def convert_tdms_to_tuple(model_output_parsed):
    """Convert parsed model output into (Task, Dataset, Metric, Score) tuples.

    Args:
        model_output_parsed: Iterable of decoded items. Each item is expected
            to support lookup of the keys "Task", "Dataset", "Metric" and
            "Score".

    Returns:
        A list of 4-tuples ``(task, dataset, metric, score)``, one per
        well-formed item, in input order. Items that lack one of the four keys
        or that cannot be indexed by a string key (strings, numbers, lists,
        ``None``) are skipped.
    """
    tuples = []
    for item in model_output_parsed:
        try:
            tuples.append(
                (item["Task"], item["Dataset"], item["Metric"], item["Score"])
            )
        except (KeyError, TypeError):
            # Malformed instance (missing key or not a mapping): ignore it.
            # Only these two errors are caught so that KeyboardInterrupt and
            # SystemExit still stop a long-running extraction.
            continue
    return tuples

def format_tdms(tuples):
    """Deduplicate (Task, Dataset, Metric, Score) tuples and format them as a string.

    Duplicates are dropped while keeping the first occurrence of each tuple,
    so the output order is deterministic (it follows the input order) rather
    than depending on set iteration order.

    Args:
        tuples: Iterable of 4-element tuples ``(task, dataset, metric, score)``.

    Returns:
        ``str()`` of a list of ``{"LEADERBOARD": {...}}`` dicts, one per unique
        tuple. An empty input yields ``"[]"``.
    """
    seen = set()
    unique = []
    for entry in tuples:
        try:
            hash(entry)
            key = entry
        except TypeError:
            # A field is unhashable (e.g. the model emitted a list as Score):
            # fall back to the textual representation for duplicate detection
            # instead of failing on the whole document.
            key = repr(entry)
        if key not in seen:
            seen.add(key)
            unique.append(entry)

    dicts = [
        {
            "LEADERBOARD": {
                "Task": t,
                "Dataset": d,
                "Metric": m,
                "Score": s,
            }
        }
        for t, d, m, s in unique
    ]
    return str(dicts)


def section_wise_extraction(model: Model, prompt_template, tex):
    """Query the model once per section of a TeX document and merge the results.

    Args:
        model: Object whose ``generate(prompt)`` returns the response as a
            string.
        prompt_template: Callable that turns one section's text into a prompt.
        tex: The TeX document; it is converted with ``str()`` before being
            split into sections by ``all_sections``.

    Returns:
        ``UNANSWERABLE`` when no tuple could be extracted from any section,
        otherwise the deduplicated annotation string built by ``format_tdms``.
    """
    found_tdms = []
    for _section_name, section_text in all_sections(str(tex)):
        response = model.generate(prompt_template(section_text))
        # Keep only the text between the first '[' and the last ']' and wrap
        # it in brackets again, so chatter around the JSON array is dropped
        # and a bracket-less answer is still tried as a one-element array.
        candidate = "[" + response.split('[', 1)[-1].rsplit(']', 1)[0] + "]"
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            # Not valid JSON (json.JSONDecodeError is a ValueError) or
            # pathologically nested output: no tuples found in this section.
            continue
        # extend() appends in place instead of rebuilding the list per section.
        found_tdms.extend(convert_tdms_to_tuple(parsed))

    if not found_tdms:
        # Nothing extracted from any section -> unanswerable.
        return UNANSWERABLE
    # Dedupe and format.
    return format_tdms(found_tdms)

In [None]:
from src.dataset import PATH
from src.experiment_runner import Experiment, run
from src.models import LLama

from src.prompt_templates import few_shot_template_initial

# Build the model wrapper for the "llama3:8b" tag and bundle it with the
# few-shot prompt template and the section-wise extraction function.
# NOTE(review): the last argument looks like the experiment's name/label —
# confirm against src.experiment_runner.Experiment.
llama3_8b = LLama("llama3:8b")
exp = Experiment(llama3_8b, few_shot_template_initial, section_wise_extraction, "llama3_8b_fewshot_initial")

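# # Run on Test (legacy calls, kept for reference only).
# # NOTE: these pass (tex, template, model) and use an undefined name `model`;
# # section_wise_extraction now takes (model, prompt_template, tex), so they
# # must be updated before being re-enabled.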
# df_test = run(lambda tex: section_wise_extraction(tex, zero_shot_template_initial, model), PATH.TEST, "llama3_70b_zeroshot_initial")
# df_test = run(lambda tex: section_wise_extraction(tex, few_shot_template_initial, model), PATH.TEST, "llama3_70b_fewshot_initial")
# df_test = run(lambda tex: section_wise_extraction(tex, zero_shot_template_optimized01, model), PATH.TEST, "llama3_70b_zeroshot_optimized01")
# df_test = run(lambda tex: section_wise_extraction(tex, few_shot_template_optimized01, model), PATH.TEST, "llama3_70b_fewshot_optimized01")
# df_test = run(lambda tex: section_wise_extraction(tex, zero_shot_template_optimized02, model), PATH.TEST, "llama3_70b_zeroshot_optimized02")
# df_test = run(lambda tex: section_wise_extraction(tex, few_shot_template_optimized02, model), PATH.TEST, "llama3_70b_fewshot_optimized02")


In [5]:
# Execute the configured experiment against PATH.TEST and keep the result in `df`.
# NOTE(review): the meaning of the trailing `1` is not visible here — confirm
# against src.experiment_runner.run.
df = run(exp, PATH.TEST, 1)