In [1]:
import os
if os.getcwd() == '/home/user/code':
    os.chdir('/home/user/code/nlp2024_ClefTask4SOTA')

In [2]:
from TexSoup import TexSoup

def section_split(tex):
    """returns: List of tuples (section_name, section_text)"""
    soup = TexSoup(tex)

    doc_text = tex

    sections = []

    sections.append(("title", soup.title.contents[0])) if soup.title else None
    sections.append(("abstract", soup.abstract.contents[0])) if soup.abstract else None
    sections.append(("tables", "\n".join([str(node) for node in soup.find_all("table")]))) if soup.find_all("table") else None


    # extract latex sections and corresponding text
    prev_section = "pre"

    for node in soup.find_all("section"):
        section_text, doc_text = doc_text.split(str(node), 1)
        sections.append((prev_section, section_text))
        prev_section = node.contents[0]
    sections.append((prev_section, doc_text))
    return sections

In [3]:
# TODO: Define Prompt function
import ollama

def contains_qradruple(tex):
    return f"Decide if if the following text mentions a result on a benchmark leaderboard, including the task, dataset, metric and the score. \
    Ignore the content of the text, only output yes, if it contains a leaderboard mention or no, if no such information is included. \n \
    Text: \n {tex} \n \
    Answer with yes or no only without providing any other text. "

def pass_to_ollama(prompt, model):
    try:
        res = ollama.generate(model=model, prompt=prompt)
        return res["response"]
    except Exception as ex:
        print(ex)
        return ""

def section_wise_detection(tex, llama_fn):
    answers = []
    try:
        sections = section_split(tex)
    except:
        sections = [("full", tex)] # in the case the structure cannot be parsed:
    # print(f"{len(sections)} sections")
    for section_name, section_text in sections:
        response = llama_fn(contains_qradruple(section_text))
        answers.append(str(response).lower())

    return any([answer == 'yes' for answer in answers])



In [10]:
# run on train data to evaluate method
from tqdm import tqdm
import pandas as pd
from src.dataset import BinaryTDMSDataset, PATH, UNANSWERABLE, LogResult
from datetime import datetime


train_dataset = BinaryTDMSDataset(PATH.TRAIN)
model = "llama3:70b"

run_id = f"baseline-train_{model.replace(':', '_')}-{datetime.now().strftime('%m%d%Y-%H%M%S')}"
llama3_fn = lambda prompt: pass_to_ollama(prompt, model)


# indexes= range(len(train_dataset))
indexes= 1

df_ground_truth = pd.DataFrame([{"f":f, "ground_truth":gt} for f, _, gt in [train_dataset.__getitem__(i) for i in range(indexes)]])

logger = LogResult(run_id, do_write = False)
for i in tqdm(range(indexes)):
    f, tex, ground_truth = train_dataset.__getitem__(i)
    pred = section_wise_detection(tex, llama3_fn)
    logger.log(f, pred)

df = logger.save()
df = df_ground_truth.merge(df, on="f")


100%|██████████| 1/1 [00:01<00:00,  1.89s/it]

model 'llama3:70b' not found, try pulling it first





In [12]:
df

Unnamed: 0,f,ground_truth,run,annotation
0,704.1707,False,baseline-train_llama3_70b-05202024-145041,False


# Evaluation

In [6]:
import pandas as pd

df = pd.read_feather("results\zs_train_05182024-132645.feather")

  df = pd.read_feather("results\zs_train_05182024-132645.feather")


In [7]:
# pct of success

df["success"] =df["ground_truth"]==df["pred"]
len(df[df["success"]]) / len(df)

# .76

0.7583333333333333

In [8]:
# Using offical scoring prog:
from scoring_program.evaluation import Metrics

dummy_jsn = "{'Task': 'Causal Inference', 'Dataset': 'Jobs', 'Metric': 'Average Treatment Effect on the Treated Error', 'Score': '0.08'}"

labels_list = list(df["ground_truth"])
preds_list = list(df["pred"].apply(lambda c: dummy_jsn if c else "unanswerable"))

Metrics.general_accuracy_text_based(labels_list, preds_list)

KeyboardInterrupt: 

In [None]:
# confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(df["ground_truth"], df["pred"])
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
# random trial
import random

num_true = len(df[df["ground_truth"]])
num_tot = len(df)
pct_true = num_true / num_tot


df["random"] = random.uniform(0, 1)
df["random_success"] = df["ground_truth"] == (df["random"] <= pct_true)
len(df[df["random_success"]]) / len(df)

# duh..
# .634 (random with distribution), .758 (baseline llama)
# codalab: .53 (llama 2 baseline), .83 (top score)

# Run on Test

In [None]:
# run on train data to evaluate method
from tqdm import tqdm
import pandas as pd
from src.dataset import BinaryTDMSDataset, PATH, write_annotation_file, UNANSWERABLE
from datetime import datetime


test_dataset = BinaryTDMSDataset(PATH.TEST)
model = "llama3:70b"

run_id = f"baseline-test_{model.replace(':', '_')}-{datetime.now().strftime('%m%d%Y-%H%M%S')}"
llama3_fn = lambda prompt: pass_to_ollama(prompt, model)

results = []
for i in tqdm(range(len(test_dataset))):
# for i in tqdm(range(1)):
    f, tex, _ = test_dataset.__getitem__(i)
    pred = section_wise_detection(tex, llama3_fn)
    write_annotation_file(run_id, f, UNANSWERABLE if not pred else "Something was found")
    # print(f"{f}: {pred} ({ground_truth})")
    results.append((f, pred))


df = pd.DataFrame(results)
df.columns = ["file", "pred"]
df.to_feather(f"results/{run_id}/df.feather")

 14%|████████████                                                                          | 111/789 [46:19<6:42:14, 35.60s/it]

: 

In [None]:
# # TODO: Test set evalutation

# from src.dataset import BinaryTDMSDataset, PATH

# test_dataset = BinaryTDMSDataset(PATH.TEST)

# def get_index(folder):
#     return [i for i, t, j in test_dataset.all_paths].index(folder)

# i, tex, jsn = test_dataset.__getitem__(get_index("0706.0014"))


In [None]:
res_path = "first_baseline_70b.feather"

import pandas as pd

df = pd.read_feather(res_path)



In [None]:
len(df)

789

In [None]:
len(df[df["pred"]])

302