In [1]:
import os
# When the kernel is started from the parent code directory, move into the
# project folder so that relative imports/paths used by later cells resolve.
# NOTE(review): both paths are machine-specific absolute paths.
launched_from_code_root = os.getcwd() == '/home/user/code'
if launched_from_code_root:
    os.chdir('/home/user/code/nlp2024_ClefTask4SOTA')

In [2]:
from TexSoup import TexSoup
import re

def find_sections(tex):
    r"""Return the titles of all ``\section{...}`` commands in ``tex``, in document order.

    The title is read by balancing braces starting at the opening ``{``, so
    nested groups such as ``\section{The \textbf{Foo} method}`` are returned
    whole, and anything following the heading on the same line (``\label{..}``,
    another ``\section``) is not swallowed into the title.

    Escaped characters (``\{``, ``\}``) do not count towards brace depth.
    A heading whose braces are not closed before the next blank line (or the
    end of the text) is ignored.

    returns: List of title strings, exactly as written between the braces.
    """
    titles = []
    for opener in re.finditer(r"\\section\{", tex):
        start = opener.end()
        # A paragraph break cannot be part of a heading; it bounds the scan so
        # an unbalanced (e.g. commented-out) heading cannot swallow the document.
        limit = tex.find("\n\n", start)
        if limit == -1:
            limit = len(tex)

        depth = 1
        pos = start
        while pos < limit:
            char = tex[pos]
            if char == "\\":
                pos += 2  # skip the escaped character (e.g. \{ or \})
                continue
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    break
            pos += 1

        if depth == 0:
            titles.append(tex[start:pos])
    return titles


def extract_content(node, name):
    """Return ``(name, first_content_item)`` for a parsed node, or ``None``.

    node: object exposing a ``contents`` iterable (here: the result of
        ``soup.title`` / ``soup.abstract``); may be ``None`` or falsy when the
        element is absent.
    name: label stored as the first element of the returned tuple.

    returns: ``(name, first item of node.contents)``; ``None`` when ``node``
        is falsy or has no contents. Only the first content item is used.
    """
    if not node:
        return None
    # list() accepts both list- and generator-style ``contents``.
    contents = list(node.contents)
    if not contents:
        # An empty element would otherwise raise on [0].
        return None
    return (name, contents[0])


def section_split(tex):
    """Split a LaTeX document into named chunks.

    Two passes are made over ``tex``:

    1. A best-effort TexSoup parse that contributes, when present, a
       ``"title"`` entry, an ``"abstract"`` entry and a single ``"tables"``
       entry holding all ``table`` nodes joined by newlines. Any parsing
       error is ignored and this pass then contributes whatever was collected
       before the error.
    2. A plain-text split at every ``\\section{...}`` heading reported by
       ``find_sections``. Text before the first heading is labelled ``"pre"``;
       each following chunk is labelled with its heading title.

    returns: List of tuples (section_name, section_text). The value of the
        title/abstract entries is whatever ``extract_content`` returns for
        them (not necessarily a ``str``).
    """
    sections = []

    # parsing with TexSoup
    # TODO: get rid of this and use regex only (brittle)
    try:
        soup = TexSoup(tex, tolerance=1)
        if title := extract_content(soup.title, "title"):
            sections.append(title)
        if abstract := extract_content(soup.abstract, "abstract"):
            sections.append(abstract)
        tables = [str(node) for node in soup.find_all("table")]
        if tables:
            sections.append(("tables", "\n".join(tables)))
    except Exception:
        # Could not parse tex; fall back to the text split below.
        # ``Exception`` (not a bare except) so Ctrl-C / kernel interrupt still works.
        pass

    # extract latex sections and corresponding text
    remaining = tex
    prev_section = "pre"
    for section in find_sections(tex):
        marker = f"\\section{{{section}}}"
        section_text, found, rest = remaining.partition(marker)
        if not found:
            # Heading not present in the remaining text: keep it inside the
            # current chunk instead of failing on the unpack.
            continue
        sections.append((prev_section, section_text))
        prev_section, remaining = section, rest
    sections.append((prev_section, remaining))
    return sections
    


In [3]:
import ollama
def pass_to_ollama(prompt, model):
    """Generate a completion for ``prompt`` with the given ollama ``model``.

    The request is made with temperature 0. On success the ``"response"``
    field of the result is returned. Any exception is printed and reported
    back to the caller as the string ``"ollama error: <exception>"`` instead
    of being raised.
    """
    generation_options = {"temperature": 0}
    try:
        return ollama.generate(model=model, prompt=prompt, options=generation_options)["response"]
    except Exception as err:
        print(err)
        return f"ollama error: {err}"

In [None]:
import json
import tqdm
from tqdm import tqdm
from src.dataset import TDMSDataset, PATH, UNANSWERABLE, LogResult
from datetime import datetime


def extract_tdms(tex):
    """Build the prompt asking the model for (Task, Dataset, Metric, Score) entries.

    tex: text (e.g. one section of a paper) that is embedded verbatim in the prompt.

    returns: the prompt string. It asks for a JSON array only and contains a
        two-element example array, which must itself be valid JSON so the
        model is not shown a malformed template.

    Note: the backslash line continuations keep each following line's leading
    spaces inside the string, so the indentation below is part of the prompt.
    """
    return f'If the text reports benchmark leaderboard results, extract the reported Tasks, Datasets, Metrics and corresponding Scores.\
     \n \
    Text: \n {tex} \n \
    Return the tasks, datasets, metrics and scores as reported in the text in a JSON array, \
    for example [{{"Task": "example Task 1", "Dataset": "example Dataset 1", "Metric": "example metric 1", "Score": "score"}}, {{"Task": "example Task 1","Dataset": "example Dataset 2", "Metric": "example metric 2", "Score": "score"}}] and provide the JSON Array only. \n \
    Do not include precision information in the reported score. \
    Entries: '

def convert_tdms_to_tuple(model_output_parsed):
    """Convert parsed model output into ``(Task, Dataset, Metric, Score)`` tuples.

    model_output_parsed: the decoded JSON value. Expected to be a list of
        dicts with the keys "Task", "Dataset", "Metric" and "Score". A single
        dict is treated as a one-element list; a non-iterable value (e.g.
        ``None`` or a number) yields an empty result.

    returns: list of 4-tuples, in input order. Items that are not mappings or
        that lack one of the four keys are skipped.
    """
    if isinstance(model_output_parsed, dict):
        # The model returned one object instead of an array.
        model_output_parsed = [model_output_parsed]

    try:
        items = iter(model_output_parsed)
    except TypeError:
        return []

    tuples = []
    for item in items:
        try:
            entry = (item["Task"], item["Dataset"], item["Metric"], item["Score"])
        except (KeyError, TypeError, IndexError):
            # parse error (missing key or item is not a mapping), ignore instance
            continue
        tuples.append(entry)
    return tuples

def format_tdms(tuples):
    """Deduplicate ``(Task, Dataset, Metric, Score)`` tuples and format them as a string.

    tuples: iterable of 4-tuples.

    returns: ``str()`` of a list of ``{"LEADERBOARD": {"Task": ..., "Dataset": ...,
        "Metric": ..., "Score": ...}}`` dicts, one per distinct tuple, in
        first-seen order (so the output is the same on every run).

    Tuples containing unhashable values (e.g. a list given as the score) are
    deduplicated by their ``repr`` instead of raising.
    """
    unique = {}
    for entry in tuples:
        try:
            hash(entry)
            key = entry
        except TypeError:
            key = repr(entry)
        unique.setdefault(key, entry)

    dicts = [
        {"LEADERBOARD": {"Task": t, "Dataset": d, "Metric": m, "Score": s}}
        for t, d, m, s in unique.values()
    ]
    return str(dicts)



# Run on validation data
valid_dataset = TDMSDataset(PATH.VAL)
model = "llama3:70b"

run_id = f"baseline-tdms-valid_{model.replace(':', '_')}-{datetime.now().strftime('%m%d%Y-%H%M%S')}"


def llama3_fn(prompt):
    """Send ``prompt`` to the ollama model named by the global ``model``."""
    return pass_to_ollama(prompt, model)


logger = LogResult(run_id, do_write=True)
indexes = len(valid_dataset)

for i in tqdm(range(indexes)):
    f, tex, ground_truth = valid_dataset[i]
    found_tdms = []
    sections = section_split(str(tex))
    # Each chunk of the paper is prompted separately; results are pooled per paper.
    for section_name, section_text in sections:
        response = llama3_fn(extract_tdms(section_text))

        try:
            response = json.loads(response)
            parsed = convert_tdms_to_tuple(response)
        except Exception:
            # Response is not usable JSON -> no tuples found in this section.
            # ``Exception`` rather than a bare except so an interrupt still stops the run.
            continue
        found_tdms.extend(parsed)

    # print(found_tdms)
    if not found_tdms:
        annotation = UNANSWERABLE # found_tdms are empty -> unanswerable
    else:
        # dedupe and format
        annotation = format_tdms(found_tdms)
    # log
    logger.log(f, annotation)

df = logger.save()


df

  0%|                                                                                                  | 0/100 [00:00<?, ?it/s]

In [5]:
tex

In [None]:
parsed

In [None]:
import pandas as pd
# One row per validation item: first element -> "f", third element -> "ground_truth".
df_ground_truth = pd.DataFrame(
    [
        {"f": item[0], "ground_truth": item[2]}
        for item in map(valid_dataset.__getitem__, range(indexes))
    ]
)
df_ground_truth

In [None]:
response

In [None]:
# Print the third element (the annotation) of the first few validation items.
# Uses its own names so the loop index is not rebound inside the loop and the
# `tex` left over from the validation run is not overwritten.
for peek_idx in range(min(10, len(valid_dataset))):
    _peek_file, _peek_tex, jsn = valid_dataset[peek_idx]
    print(jsn)

In [None]:
# Distinct (Task, Dataset, Metric, Score) tuples among the entries of `result`.
# NOTE(review): `result` is not assigned in any visible cell - confirm where it comes from.
{(r["Task"], r["Dataset"], r["Metric"], r["Score"]) for r in result}

In [None]:
ground_truth

# Evaluation

In [None]:
# pct of success

# Row-wise exact match between label and prediction, then the share of matching rows.
# NOTE(review): assumes `df` has "ground_truth" and "pred" columns - confirm which run produced it.
df["success"] = df["ground_truth"].eq(df["pred"])
len(df.loc[df["success"]]) / len(df)

# .76

In [None]:
# confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of labels vs. predictions, rendered with scikit-learn's display helper.
y_true, y_pred = df["ground_truth"], df["pred"]
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
# random trial
import random

# Share of rows whose label is True (assumes "ground_truth" is a boolean column).
num_tot = len(df)
num_true = len(df[df["ground_truth"]])
pct_true = num_true / num_tot

# Random baseline that predicts True with probability pct_true.
# One independent draw per row: assigning a single random.uniform() value
# would broadcast the same number (and hence the same prediction) to every row.
# Fixed seed so the reported number is reproducible.
RANDOM_BASELINE_SEED = 0
rng = random.Random(RANDOM_BASELINE_SEED)
df["random"] = [rng.uniform(0, 1) for _ in range(num_tot)]
df["random_success"] = df["ground_truth"] == (df["random"] <= pct_true)
len(df[df["random_success"]]) / len(df)

# duh..
# .634 (random with distribution), .758 (baseline llama)
# codalab: .53 (llama 2 baseline), .83 (top score)

# Run on Test

In [None]:
# run on test data and write the annotation files for submission
from tqdm import tqdm
import pandas as pd
from src.dataset import BinaryTDMSDataset, PATH, write_annotation_file, UNANSWERABLE
from datetime import datetime


test_dataset = BinaryTDMSDataset(PATH.TEST)
model = "llama3:70b"

timestamp = datetime.now().strftime('%m%d%Y-%H%M%S')
run_id = f"baseline-test_{model.replace(':', '_')}-{timestamp}"


def llama3_fn(prompt):
    """Send ``prompt`` to the ollama model named by the global ``model``."""
    return pass_to_ollama(prompt, model)


# NOTE(review): `section_wise_detection` is not defined in any visible cell -
# confirm it is defined/imported before this cell runs.
results = []
for i in tqdm(range(len(test_dataset))):
    f, tex, _ = test_dataset.__getitem__(i)
    pred = section_wise_detection(tex, llama3_fn)
    # One annotation file per paper: a placeholder when something was detected,
    # the UNANSWERABLE marker otherwise.
    annotation = "Something was found" if pred else UNANSWERABLE
    write_annotation_file(run_id, f, annotation)
    results.append((f, pred))


# Keep the raw per-file predictions next to the annotation files.
df = pd.DataFrame(results)
df.columns = ["file", "pred"]
df.to_feather(f"results/{run_id}/df.feather")

In [None]:
# # TODO: Test set evaluation

# from src.dataset import BinaryTDMSDataset, PATH

# test_dataset = BinaryTDMSDataset(PATH.TEST)

# def get_index(folder):
#     return [i for i, t, j in test_dataset.all_paths].index(folder)

# i, tex, jsn = test_dataset.__getitem__(get_index("0706.0014"))


In [None]:
# Feather file holding a previously saved results frame (path is relative to
# the current working directory).
res_path = "first_baseline_70b.feather"

import pandas as pd

# NOTE: rebinds `df`, replacing whatever frame an earlier cell left in it.
df = pd.read_feather(res_path)



In [None]:
len(df)

In [None]:
len(df[df["pred"]])