In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('experiments/extractive-qa/task2report.csv')
df.head()

Unnamed: 0,Model,Dataset,Language,Task,Exact Match,F1
0,gemma2,SQUAD1,en,extractive-qa,75.73,87.19
1,llama3.1,SQUAD1,en,extractive-qa,70.39,82.64
2,mistral-nemo,SQUAD1,en,extractive-qa,67.48,83.35
3,gemma2,SQUAD2,en,extractive-qa,63.85,69.8
4,llama3.1,SQUAD2,en,extractive-qa,48.82,56.25


In [4]:
# drop columns Dataset and Language
df = df.drop(columns=['Dataset', 'Language'])
df.head()

Unnamed: 0,Model,Task,Exact Match,F1
0,gemma2,extractive-qa,75.73,87.19
1,llama3.1,extractive-qa,70.39,82.64
2,mistral-nemo,extractive-qa,67.48,83.35
3,gemma2,extractive-qa,63.85,69.8
4,llama3.1,extractive-qa,48.82,56.25


In [6]:
df2 = df.groupby(['Task', 'Model']).mean().reset_index().round(2)
df2

Unnamed: 0,Task,Model,Exact Match,F1
0,extractive-qa,gemma2,66.53,79.41
1,extractive-qa,llama3.1,56.04,70.64
2,extractive-qa,mistral-nemo,55.21,72.39
3,template-filling,gemma2,36.69,36.7
4,template-filling,llama3.1,40.62,40.63
5,template-filling,mistral,23.86,23.87


In [7]:
df2.to_csv('experiments/extractive-qa/task2report_aggregated.csv', index=False)

In [2]:
from datasets import load_dataset

In [7]:
a = "Tim"
b = "im"

b in a

True

In [20]:
squad2 = load_dataset("squad_v2")

Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [30]:
squad2["validation"].to_pandas()[5:9]

Unnamed: 0,id,title,context,question,answers
5,5ad39d53604f3c001a3fe8d1,Normans,The Normans (Norman: Nourmands; French: Norman...,Who gave their name to Normandy in the 1000's ...,"{'text': [], 'answer_start': []}"
6,5ad39d53604f3c001a3fe8d2,Normans,The Normans (Norman: Nourmands; French: Norman...,What is France a region of?,"{'text': [], 'answer_start': []}"
7,5ad39d53604f3c001a3fe8d3,Normans,The Normans (Norman: Nourmands; French: Norman...,Who did King Charles III swear fealty to?,"{'text': [], 'answer_start': []}"
8,5ad39d53604f3c001a3fe8d4,Normans,The Normans (Norman: Nourmands; French: Norman...,When did the Frankish identity emerge?,"{'text': [], 'answer_start': []}"


In [3]:
squad = load_dataset("squad")

In [6]:
type(squad["validation"])

datasets.arrow_dataset.Dataset

In [None]:
squad = load_dataset("squad", split="train[:5000]")

In [None]:
from ollama import Client
client = Client(host='http://localhost:19290')
client.list()

In [None]:
user_prompt = """
Please extract the answer to the following question from the text below:

Context: {}
Question: {}

Please answer in this format. You are not required to provide any reasoning for your answer.
Answer: <answer>
Reasoning: <reasoning>

Remember, the answer must be verbatim from the text. You have to extract the answer from the text, not generate it.
"""

In [None]:
import re
from typing import Optional, Tuple


answer_word = "Answer"
reasoning_word = "Reasoning"

def parse_response(response: str) -> Tuple[Optional[str], str]:
    # Check if the response is in the correct format.
    if not response.startswith(f"{answer_word}: "):
        return None, "Answer has not been provided"
    
    if "\n" in response:
        # reasoning has been provided
        components = re.split(r"\n+", response)

        # extract the answer
        answer = components[0].split(": ")[1].strip()

        # exctract the reasoning
        if not components[1].startswith(f"{reasoning_word}:"):
            return answer, f"The reasoning has to start with '{reasoning_word}:'."

        reasoning = components[1].split(": ")[1]
        return answer, reasoning

    else:
        # reasoning has not been provided
        answer = response.split(": ")[1].strip()
        return answer, "Reasoning has not been provided"


In [None]:
def compute_f1(prediction: str, gold: str) -> float:
    prediction_tokens = prediction.lower().split()
    true_tokens = gold.lower().split()
    common_tokens = set(prediction_tokens) & set(true_tokens)
    if len(common_tokens) == 0:
        return 0
    precision = len(common_tokens) / len(prediction_tokens)
    recall = len(common_tokens) / len(true_tokens)
    return 2 * (precision * recall) / (precision + recall)

In [None]:
def compute_em(prediction: str, gold: str) -> float:
    return float(prediction.lower() == gold.lower())

In [None]:
from typing import List, Tuple
import numpy as np


def eval(predictions: List[str], golds: List[str]) -> Tuple[float, float]:
    f1_scores = []
    em_scores = []
    for prediction, gold in zip(predictions, golds):
        f1_scores.append(compute_f1(prediction, gold))
        em_scores.append(compute_em(prediction, gold))
    return np.mean(f1_scores), np.mean(em_scores)

In [None]:
from tqdm import tqdm


questions = squad["question"][:10]
contexts = squad["context"][:10]
answers = [answer["text"][0] for answer in squad["answers"][:10]]
predictions = []
for i in tqdm(range(10), desc="Evaluating"):
    response = client.chat(model='gemma2', messages=[
      {
          'role': 'system',
          'content': 'You are a system to support the analysis of large amounts of text. You will assist the user by answering all questions correctly.',
      },
      {
        'role': 'user',
        'content': user_prompt.format(contexts[i], questions[i]).strip(),
      },
    ])
    prediction, reasoning = parse_response(response["message"]["content"])
    predictions.append(prediction)

    print(f"Question: {questions[i]}")
    print(f"Prediction: {prediction}")
    print(f"Answer: {answers[i]}")
    print(f"Reasoning: {reasoning}")
    print()

In [None]:
print("Evaluation:")
f1, em = eval(predictions, answers)
print(f"F1: {f1}")
print(f"EM: {em}")

In [None]:
from datasets import load_dataset

In [None]:
germanquad = load_dataset("deepset/germanquad")

In [None]:
germanquad

In [161]:
df = germanquad["test"].to_pandas()
df[100:150]

Unnamed: 0,id,context,question,answers
100,41225,Gletscher\n\n=== Gleichgewichtslinie ===\nDie ...,Was ist der Fachbegriff für die Gleichgewichts...,"{'text': ['Equilibrium Line Altitude', ' ''Equ..."
101,41226,Gletscher\n\n=== Gleichgewichtslinie ===\nDie ...,Was wird als Gleichgewichtslinie eines Gletsch...,"{'text': ['Höhengrenze', ' eine Höhengrenze de..."
102,41228,Gletscher\n\n=== Gleichgewichtslinie ===\nDie ...,Wie ist das Zehrgebiet des Gletschers definiert?,{'text': ['im sogenannten Zehrgebiet (Ablation...
103,41229,Gletscher\n\n=== Gleichgewichtslinie ===\nDie ...,Wie heißt der Bereich oberhalb der Gleichgewic...,"{'text': ['Nährgebiet', 'Akkumulationsgebiet',..."
104,41227,Gletscher\n\n=== Gleichgewichtslinie ===\nDie ...,Wie heißt der Bereich unterhalb der Gleichgewi...,"{'text': ['Zehrgebiet ', 'Ablationsgebiet', 'Z..."
105,41230,Gletscher\n\n=== Gleichgewichtslinie ===\nDie ...,Wie ist das Akkumulationsgebiet des Gletschers...,{'text': ['mehr Gletschereis gebildet als durc...
106,41279,Osmanisches_Reich\n\n=== Reform des Millet-Sys...,In welchem Edikt wurde im osmanischen Reich di...,"{'text': ['''Hatt-ı Şerif'' von Gülhane ', ' '..."
107,41280,Osmanisches_Reich\n\n=== Reform des Millet-Sys...,Wann wurde das Edikt von Gülhane erlassen?,"{'text': ['1839', '1839', '1839'], 'answer_sta..."
108,41282,Osmanisches_Reich\n\n=== Reform des Millet-Sys...,Wann wurde der Hatt-i Hümayun erlassen?,"{'text': ['1856', '1856', '1856 '], 'answer_st..."
109,41283,Osmanisches_Reich\n\n=== Reform des Millet-Sys...,Wann wurde die griechische Bevölkerung aus dem...,"{'text': ['1914–1923', '1914–1923', '1914–1923..."


In [147]:
for sample in germanquad["test"]:
    print(sample["question"])
    print(sample["context"])
    print(sample["answers"])
    print()
    break

Was kann den Verschleiß des seillosen Aufzuges minimieren?
Aufzugsanlage

=== Seilloser Aufzug ===
An der RWTH Aachen im Institut für Elektrische Maschinen wurde ein seilloser Aufzug entwickelt und ein Prototyp aufgebaut. Die Kabine wird hierbei durch zwei elektromagnetische Synchron-Linearmotoren angetrieben und somit nur durch ein vertikal bewegliches Magnetfeld gehalten bzw. bewegt. Diese Arbeit soll der Entwicklung von Aufzugsanlagen für sehr hohe Gebäude dienen. Ein Ziel ist der Einsatz mehrerer Kabinen pro Schacht, die sich unabhängig voneinander steuern lassen. Bei Auswahl des Fahrtziels vor Fahrtantritt (d. h. noch außerhalb des Aufzug) wird ein bestimmter Fahrkorb in einem der Aufzugsschächte für die Fahrt ausgewählt, mit der sich der geplante Transport am schnellsten durchführen lässt. Der Platzbedarf für die gesamte Aufzugsanlage könnte somit um ein oder mehrere Schächte reduziert werden. Da die Kabinen seillos betrieben werden, ist ein Schachtwechsel ebenfalls denkbar. Hier

In [27]:
import evaluate

metric = evaluate.load("squad_v2")

In [None]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in germanquad["test"].to_list()[:10]
]

In [None]:
theoretical_answers

In [None]:
predicted_answers = [
    {"id": ex["id"], "prediction_text": "tim"} for ex in theoretical_answers
]

In [None]:
germanquad["test"].to_list()[:2]

In [151]:
germanquad["test"]["context"].

AttributeError: 'list' object has no attribute 'to_list'

In [None]:


# squad_v2_metric = evaluate.load("squad_v2")
# results = squad_v2_metric.compute(predictions=predictions, references=references)
# print(results)

In [None]:
print(metric.inputs_description)

In [29]:
predictions = [{'prediction_text': '', 'id': 'asdf', 'no_answer_probability': 1.}]
references = [{'answers': {'answer_start': [], 'text': []}, 'id': 'asdf'}]
squad_v2_metric = evaluate.load("squad_v2")
results = squad_v2_metric.compute(predictions=predictions, references=references)
print(results)

{'exact': 100.0, 'f1': 100.0, 'total': 1, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 1, 'best_exact': 100.0, 'best_exact_thresh': 0.0, 'best_f1': 100.0, 'best_f1_thresh': 0.0}


In [None]:
references

In [None]:
references = [{"answers": ex["answers"], "id": str(ex["id"])} for ex in germanquad["test"].to_list()[:1]]
predictions = [
    {"id": ex["id"], "prediction_text": "elektromagnetischer Linearführungen", "no_answer_probability": 0.0} for ex in references
]

In [None]:
references

In [None]:
predictions

In [None]:
squad_v2_metric = evaluate.load("squad_v2")
results = squad_v2_metric.compute(predictions=predictions, references=references)
print(results)

In [34]:
import pandas as pd

df = pd.read_parquet("experiments/extractive-qa/extractive_qa_squad2.parquet")

In [35]:
df

Unnamed: 0,Answer,Prediction,Reason,Message
0,"{'answers': {'answer_start': [], 'text': []}, ...","{'id': '5ad39d53604f3c001a3fe8d1', 'no_answer_...",The Normans (Norman,Answer: The Normans \nReasoning: The Normans ...
1,"{'answers': {'answer_start': [], 'text': []}, ...","{'id': '5ad39d53604f3c001a3fe8d2', 'no_answer_...",The Normans in the 10th and 11th centuries gav...,Answer: France\nReasoning: The Normans in the...
2,"{'answers': {'answer_start': [], 'text': []}, ...","{'id': '5ad39d53604f3c001a3fe8d3', 'no_answer_...",Through generations of assimilation and mixing...,Answer: Rollo \nReasoning: Through generation...
3,"{'answers': {'answer_start': [], 'text': []}, ...","{'id': '5ad39d53604f3c001a3fe8d4', 'no_answer_...",The provided text does not mention when the Fr...,Answer: Not answerable \nReasoning: The provid...


In [19]:
df[1:3]

Unnamed: 0,Answer,Prediction,Reason,Message
1,Carolina Panthers,Carolina Panthers,The American Football Conference (AFC) champio...,Answer: Carolina Panthers \nReasoning: The Am...
2,"Santa Clara, California",Levi's Stadium in the San Francisco Bay Area a...,Not answerable.,Answer: Levi's Stadium in the San Francisco Ba...


In [11]:
df["Answer"] = df["Answer"].apply(lambda x: x["answers"]["text"][0])

In [13]:
df["Prediction"] = df["Prediction"].apply(lambda x: x["prediction_text"])