# import libraries

In [1]:
import os
import logging
import json
import pickle
from tqdm import tqdm
import glob
import re
from itertools import chain
import numpy as np
import pandas as pd
import collections

# setup directories and prepare primary datasets

In [9]:
def create_directories(directories):
    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory)
            print(f"Created directory: {directory}")


def extract_pmid(links):
    return [link.split("/")[-1] for link in links]

def dataset_info(dataset, dataset_name):
    datasets_info = []

    # check if dataset is a string
    if isinstance(dataset, str):
        # load json and create dataframe
        for json_file in glob.glob(os.path.join(dataset, "*.json")):
            # dataset_name = json_file.split("/")[-1].split(".")[0]
            with open(json_file) as fp:
                json_data = "".join(fp)
            data = json.loads(json_data)
            data = data["questions"]
            df = pd.DataFrame(data)
            df.reset_index(drop=True, inplace=True)
            
            # rename 'body' column to 'query'
            df.rename(columns={"body": "query"}, inplace=True)
    # check if dataset is a dataframe
    elif isinstance(dataset, pd.DataFrame):
        # check if these columns ['query', 'type', 'documents', 'exact_answer'] exist
        if not set(["query", "type", "documents", "exact_answer"]).issubset(
            set(dataset.columns)
        ):
            raise ValueError(
                "dataset must contain these columns ['query', 'type', 'documents', 'exact_answer']"
            )
        else:
            df = dataset.copy()
    else:
        raise ValueError("dataset must be a string or dataframe")


    # extract pmids
    df["pmids"] = df["documents"].apply(extract_pmid)
    pmids = list(chain.from_iterable(df["pmids"].to_list()))

    # query by type
    queries_by_type = df.groupby("type").size().to_dict()
    info = {
        "dataset_name": dataset_name,
        "num_queries": df["query"].shape[0],
        "num_docs": len(set(pmids)),
    }
    info.update(queries_by_type)
    datasets_info.append(info)
    
    # create dataframe from list of dictionaries
    datasets_info_df = pd.DataFrame(datasets_info)
    datasets_info_df.sort_values(by="dataset_name", inplace=True)
    return datasets_info_df

def load_dataset(dataset_name, dataset_dir, logger):
    # load json and create dataframe
    dfs = []
    for json_file in glob.glob(os.path.join(dataset_dir, "*.json")):
        with open(json_file) as fp:
            json_data = "".join(fp)
        data = json.loads(json_data)
        data = data["questions"]
        dfs.append(pd.DataFrame(data))
    df = pd.concat(dfs)
    df.reset_index(drop=True, inplace=True)
    
    # rename 'body' column to 'query'
    df.rename(columns={"body": "query"}, inplace=True)
    print(f"{dataset_name} - query: {df['query'].shape[0]}")
    logger.info(
        "%s - query: %d", dataset_name, df["query"].shape[0]
    )

    # extract pmids and add to dataframe
    df["pmids"] = df["documents"].apply(extract_pmid)
    pmids = list(chain.from_iterable(df["pmids"].to_list()))
    print(
        f"{dataset_name} - unique docs: {len(set(pmids))}"
    )
    logger.info(
        "%s - unique docs: %d",
        dataset_name,
        len(set(pmids)),
    )

    # show number of query by type
    print(f"{dataset_name} - queries by type: {df.groupby('type').size()}")
    logger.info(
        "%s - queries by type: %s",
        dataset_name,
        df.groupby("type").size(),
    )
    return df, pmids


def prepare_dataset(dataset_name, dataset_dir, doc_df, logger):
    # load dataset
    df, pmids= load_dataset(dataset_name, dataset_dir, logger)
    
    # filter out queries with no docs in corpus  
    corpus_df = doc_df[doc_df["pmid"].isin(pmids)]
    print(f"num of docs found in corpus:{corpus_df.shape[0]}")
    logger.info("num of docs found in corpus: %d", corpus_df.shape[0])

    def filter_pmid(pmids):
        filtered_pmids = [
            pmid for pmid in pmids if pmid in doc_df["pmid"].to_list()
        ]
        return filtered_pmids

    df["pmids_found"] = df["pmids"].apply(filter_pmid)

    filtered_df = df[df["pmids_found"].apply(len) > 0]
    # average number of docs per query
    total_num_docs = sum(filtered_df["pmids_found"].apply(len))
    total_num_queries = filtered_df["query"].shape[0]
    avg_num_docs_per_query = total_num_docs / total_num_queries
    print(f"{dataset_name} - docs per query: {avg_num_docs_per_query}")
    logger.info(
        "%s - docs per query: %d",
        dataset_name,
        avg_num_docs_per_query,
    )
    return filtered_df

# function to filter factoid questions from the dataset
def filter_factoid_questions(df):
    return df[df["type"] == "factoid"]


# function to filter list questions from the dataset
def filter_list_questions(df):
    return df[df["type"] == "list"]

In [None]:

try:
    # load dir_dict from json file in home directory
    home_dir = os.path.expanduser("~")
    with open(f"{home_dir}/.biomedqa_dir.json", encoding="utf-8") as fp:
        dir_dict = json.load(fp)
except Exception as exc:
    print("Error: unable to load directory dictionary. Please run setup.py")
    raise exc

# set directories
BASE_DIR = dir_dict["base_dir"]
DATA_DIR = dir_dict["data_dir"]
MODEL_DIR = dir_dict["model_dir"]
LOG_DIR = dir_dict["log_dir"]
RESULTS_DIR = dir_dict["results_dir"]

DATASET = "bioasq"
YEAR = "2022"
TRAIN_DATASET_NAME = "BioASQ-training10b"
TEST_DATASET_NAME = "Task10BGoldenEnriched"
__file__ = "notebooks/answer_extraction.ipynb"

TRAIN_DATASET_DIR = f"{DATA_DIR}/raw/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}"
TRAIN_DOC_DIR = (
    f"{DATA_DIR}/processed/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}_documents/"
)
print(f"train dataset name:{TRAIN_DATASET_NAME}")
print(f"train dataset dir:{TRAIN_DATASET_DIR}")
print(f"train doc dir:{TRAIN_DOC_DIR}")

TEST_DATASET_DIR = f"{DATA_DIR}/raw/{DATASET}/{YEAR}/{TEST_DATASET_NAME}"
TEST_DOC_DIR = (
    f"{DATA_DIR}/processed/{DATASET}/{YEAR}/{TEST_DATASET_NAME}_documents/"
)
print(f"test dataset name:{TEST_DATASET_NAME}")
print(f"test dataset dir:{TEST_DATASET_DIR}")
print(f"test doc dir:{TEST_DOC_DIR}")

# get file directory
# FILE_DIR = os.path.dirname(os.path.relpath(__file__))
FILE_DIR = "biomed_qa/answer_extraction/transformer/minilm_ft"

# set log dir directory according to current file directory
LOG_DIR = f"{LOG_DIR}/{FILE_DIR}"
print(f"log dir:{LOG_DIR}")

# set model directory according to current file directory
MODEL_DIR = f"{MODEL_DIR}/{FILE_DIR}/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}/"
print(f"model dir:{MODEL_DIR}")

# set results directory according to current file directory
RESULTS_DIR = f"{RESULTS_DIR}/{FILE_DIR}/{DATASET}/{YEAR}/{TEST_DATASET_NAME}/"
print(f"results dir:{RESULTS_DIR}")

# create directories
create_directories([LOG_DIR, MODEL_DIR, RESULTS_DIR])

# set log file name
log_file = os.path.join(
    LOG_DIR, os.path.basename(__file__).split(".")[0] + ".log"
)
print(f"LOG_FILE: {log_file}")

# initialize logger

logging.basicConfig(
    filename=log_file,
    format="%(process)d\t%(asctime)s\t%(levelname)s\t%(message)s",
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)
logger.info("Logger initialized")

# load documents
logger.info("loading documents")

train_doc_df = pd.read_pickle(
    f"{TRAIN_DOC_DIR}{TRAIN_DATASET_NAME}_documents_df.pkl"
)


test_doc_df = pd.read_pickle(
    f"{TEST_DOC_DIR}{TEST_DATASET_NAME}_documents_df.pkl"
)

# load datasets
logger.info("loading datasets")
train_df, _ = load_dataset(TRAIN_DATASET_NAME, TRAIN_DATASET_DIR, logger)
test_df, _ = load_dataset(TEST_DATASET_NAME, TEST_DATASET_DIR, logger)

# prepare datasets
train_df = prepare_dataset(
    TRAIN_DATASET_NAME, TRAIN_DATASET_DIR, train_doc_df, logger
)
test_df = prepare_dataset(
    TEST_DATASET_NAME, TEST_DATASET_DIR, test_doc_df, logger
)

# filter factoid questions from train_filtered_df
train_factoid_df = filter_factoid_questions(train_df)

# filter list questions from train_filtered_df
train_list_df = filter_list_questions(train_df)

# filter factoid questions from test_filtered_df
test_factoid_df = filter_factoid_questions(test_df)

# filter list questions from test_filtered_df
test_list_df = filter_list_questions(test_df)

In [None]:
display(dataset_info(train_factoid_df, TRAIN_DATASET_NAME+"_factoid"))
display(dataset_info(train_list_df, TRAIN_DATASET_NAME+"_list"))
display(dataset_info(test_factoid_df, TEST_DATASET_NAME+"_factoid"))
display(dataset_info(test_list_df, TEST_DATASET_NAME+"_list"))

# create squad formatted datasets from primary datasets

In [49]:
import nltk
from nltk.corpus import wordnet
from tqdm import tqdm

nltk.download('wordnet')

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace('_', ' '))
    return list(synonyms)

def find_synonym_in_text(word, text):
    start_idx = text.lower().find(word.lower())
    if start_idx != -1:
        return start_idx, word

    synonyms = get_synonyms(word)
    for synonym in synonyms:
        idx = text.lower().find(synonym.lower())
        if idx != -1:
            return idx, synonym
    return -1, word  # Return original word if not found


def create_squad_dataset(doc_df, df):
    squad_data = []
    for index, row in tqdm(df.iterrows(), total=len(df)):
        query = row["query"]
        pmids = row["pmids_found"]
        exact_answers = row["exact_answer"]

        for pmid in pmids:
            abstract_text = doc_df.loc[
                doc_df["pmid"] == pmid, "abstractText"
            ].iloc[0]
            title = doc_df.loc[doc_df["pmid"] == pmid, "title"].iloc[0]

            data = {"context": abstract_text, "id": f"{pmid}_{row['id']}", "question": query, "title": title}

            for answer in exact_answers:
                if isinstance(answer, list):
                    for ans in answer:
                        if ans == "":
                            continue
                        start_idx, found_synonym = find_synonym_in_text(ans, abstract_text)
                        if start_idx != -1:
                            data["answers"] = {"text": [found_synonym], "answer_start": [start_idx]}
                            squad_data.append(data)
                else:
                    if answer == "":
                        continue
                    start_idx, found_synonym = find_synonym_in_text(answer, abstract_text)
                    if start_idx != -1:
                        data["answers"] = {"text": [found_synonym], "answer_start": [start_idx]}
                        squad_data.append(data)
    return squad_data

train_factoid_squad = create_squad_dataset(train_doc_df, train_factoid_df)
test_factoid_squad = create_squad_dataset(test_doc_df, test_factoid_df)

train_list_squad = create_squad_dataset(train_doc_df, train_list_df)
test_list_squad = create_squad_dataset(test_doc_df, test_list_df)

[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 1252/1252 [00:37<00:00, 33.28it/s]
100%|██████████| 166/166 [00:00<00:00, 232.82it/s]
100%|██████████| 816/816 [00:29<00:00, 27.87it/s]
100%|██████████| 85/85 [00:00<00:00, 139.87it/s]


In [50]:
# save to json
with open("data/train_factoid_squad.json", "w") as f:
    json.dump({"data": train_factoid_squad}, f)

with open("data/test_factoid_squad.json", "w") as f:
    json.dump({"data": test_factoid_squad}, f)

with open("data/train_list_squad.json", "w") as f:
    json.dump({"data": train_list_squad}, f)

with open("data/test_list_squad.json", "w") as f:
    json.dump({"data": test_list_squad}, f)

# preprocessing datasets for fine-tuning

In [None]:
import transformers
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric, Dataset

squad_v2 = False
# model_checkpoint = "distilbert-base-uncased"
# model_checkpoint = "deepset/minilm-uncased-squad2"
model_checkpoint = "test-factoid-trained"
batch_size = 16
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
pad_on_right = tokenizer.padding_side == "right"

# Load the dataset
datasets = load_dataset("json", data_files={"train": "data/train_factoid_squad.json", "validation": "data/test_factoid_squad.json"}, field="data")
datasets

In [8]:
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


In [None]:
tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)

# Fine-tuning

In [10]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [12]:
from transformers import default_data_collator

data_collator = default_data_collator

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [13]:
trainer.train()
trainer.save_model("test-factoid-trained")



Epoch,Training Loss,Validation Loss
1,2.2441,2.671231
2,1.065,2.552712
3,0.7431,2.802917


TrainOutput(global_step=1557, training_loss=1.3230432719853573, metrics={'train_runtime': 5584.5782, 'train_samples_per_second': 4.454, 'train_steps_per_second': 0.279, 'total_flos': 2437300917821952.0, 'train_loss': 1.3230432719853573, 'epoch': 3.0})

# Evaluation(squad)

In [None]:
import torch

def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

validation_features = datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)

raw_predictions = trainer.predict(validation_features)

validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [None]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = []


    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions.append({"id":example["id"],"prediction_text": best_answer["text"]})        
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

final_predictions = postprocess_qa_predictions(datasets["validation"], validation_features, raw_predictions.predictions)

references = [{"id": ex["id"], "answers": ex["answers"]} for ex in datasets["validation"]]

metric = load_metric("squad_v2" if squad_v2 else "squad")
metric.compute(predictions=final_predictions, references=references)

# Evaluation(BioAsq)

In [None]:
from haystack import Document

def convert_docs_to_haystack_docs(docs):
    hs_docs = []
    for doc in docs:
        document = Document(content=doc)
        hs_docs.append(document)
    return hs_docs

def predict(reader,query,k):
    minimal_answers = []
    docs = run_query_and_get_documents(query,10)
    docs = convert_docs_to_haystack_docs(docs)
    answers = reader.predict(query=query, documents=docs, top_k=k)
    [ minimal_answers.append([ans.answer]) for ans in answers['answers'] if [ans.answer] not in minimal_answers]
    return minimal_answers

# function for evaluating factoid questions
def evaluate_factoid(DATASET_NAME, DATA_DIR, MODEL_DIR, RESULTS_DIR):
    from haystack.nodes import FARMReader
    reader = FARMReader(model_name_or_path=MODEL_DIR, use_gpu=True)
    scores = []
    for json_file in glob.glob(os.path.join(DATA_DIR, '*.json')):
        print(json_file)
        with open(json_file) as fp:
            json_data = ''.join(fp)

        data = json.loads(json_data)
        data = data["questions"]
        df = pd.DataFrame(data)
        factoid_df = df[df['type'] == 'factoid']
        factoid_df['predicted_answers'] = factoid_df.apply( lambda x: predict(reader, x.body, 5), axis = 1 )

        # Calculate BioAsq metrics
        strict_accuracy = calculate_strict_accuracy(factoid_df)
        lenient_accuracy_5 = calculate_lenient_accuracy_5(factoid_df)
        mrr = calculate_mrr(factoid_df)

        # create score dict
        score = {
            'dataset': DATASET_NAME,
            'model': MODEL_DIR,
            'batch': json_file.replace(DATA_DIR, '').replace('.json', ''),
            'strict_accuracy': strict_accuracy,
            'lenient_accuracy_5': lenient_accuracy_5,
            'mrr': mrr
        }
        scores.append(score)

    # save scores
    scores_df = pd.DataFrame(scores)
    scores_df.to_csv(os.path.join(RESULTS_DIR, 'scores.csv'), index=False)


# function for evaluating list questions
def evaluate_list(DATASET_NAME, DATA_DIR, MODEL_DIR, RESULTS_DIR):
    from haystack.nodes import FARMReader
    reader = FARMReader(model_name_or_path=MODEL_DIR, use_gpu=True)
    scores = []
    for json_file in glob.glob(os.path.join(DATA_DIR, '*.json')):
        print(json_file)
        with open(json_file) as fp:
            json_data = ''.join(fp)

        data = json.loads(json_data)
        data = data["questions"]
        df = pd.DataFrame(data)
        list_df = df[df['type'] == 'factoid']
        list_df['predicted_answers'] = list_df.apply( lambda x: predict(reader, x.body, 10), axis = 1 )

        # Calculate BioAsq metrics precision, recall and f-measure
        precision = calculate_precision(list_df)
        recall = calculate_recall(list_df)
        f_measure = calculate_f_measure(list_df)

        # create score dict
        score = {
            'dataset': DATASET_NAME,
            'model': MODEL_DIR,
            'batch': json_file.replace(DATA_DIR, '').replace('.json', ''),
            'precision': precision,
            'recall': recall,
            'f_measure': f_measure
        }
        scores.append(score)

    # save scores
    scores_df = pd.DataFrame(scores)
    scores_df.to_csv(os.path.join(RESULTS_DIR, 'scores.csv'), index=False)

# Haystack

In [13]:
import nltk
from nltk.corpus import wordnet
from tqdm import tqdm

nltk.download('wordnet')

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace('_', ' '))
    return list(synonyms)

def find_synonym_in_text(word, text):
    start_idx = text.lower().find(word.lower())
    if start_idx != -1:
        return start_idx, word

    synonyms = get_synonyms(word)
    for synonym in synonyms:
        idx = text.lower().find(synonym.lower())
        if idx != -1:
            return idx, synonym
    return -1, word  # Return original word if not found

def create_squad_dataset(doc_df, df):
    squad_data = {"data": []}

    for index, row in tqdm(df.iterrows(), total=len(df)):
        query = row["query"]
        pmids = row["pmids_found"]
        exact_answers = row["exact_answer"]

        squad_paragraphs = []

        for pmid in pmids:
            abstract_text = doc_df.loc[
                doc_df["pmid"] == pmid, "abstractText"
            ].iloc[0]

            paragraph = {"context": abstract_text, "qas": []}

            for answer in exact_answers:
                qas = {
                    "question": query,
                    "id": f"{pmid}_{row['id']}",
                    "answers": [],
                }

                if isinstance(answer, list):
                    for ans in answer:
                        if ans == "":
                            continue
                        start_idx, found_synonym = find_synonym_in_text(ans, abstract_text)
                        if start_idx != -1:
                            qas["answers"].append(
                                {"text": found_synonym, "answer_start": start_idx}
                            )
                else:
                    if answer == "":
                        continue
                    start_idx, found_synonym = find_synonym_in_text(answer, abstract_text)
                    if start_idx != -1:
                        qas["answers"].append(
                            {"text": found_synonym, "answer_start": start_idx}
                        )

                if len(qas["answers"]) > 0:
                    paragraph["qas"].append(qas)

            if len(paragraph["qas"]) > 0:
                squad_paragraphs.append(paragraph)

        if len(squad_paragraphs) > 0:
            squad_data["data"].append({"title": query, "paragraphs": squad_paragraphs})

    return squad_data

[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# train_factoid_squad = create_squad_dataset(train_doc_df, train_factoid_df)
test_factoid_squad = create_squad_dataset(test_doc_df, test_factoid_df)

# train_list_squad = create_squad_dataset(train_doc_df, train_list_df)
test_list_squad = create_squad_dataset(test_doc_df, test_list_df)

100%|██████████| 150/150 [00:02<00:00, 73.36it/s] 
100%|██████████| 87/87 [00:00<00:00, 130.80it/s]


In [6]:
# save to json
# with open("data_hs/train_factoid_squad.json", "w") as f:
#     json.dump(train_factoid_squad, f)

with open("data_hs/test_factoid_squad.json", "w") as f:
    json.dump(test_factoid_squad, f)

# with open("data_hs/train_list_squad.json", "w") as f:
#     json.dump(train_list_squad, f)

with open("data_hs/test_list_squad.json", "w") as f:
    json.dump(test_list_squad, f)

# init base model

In [7]:
from haystack.nodes import FARMReader
reader = FARMReader(model_name_or_path='deepset/minilm-uncased-squad2', use_gpu=True)

# Evaluate base model

In [8]:
factoid_score = reader.eval_on_file(data_dir="data_hs",test_filename="test_factoid_squad.json")
print(factoid_score)

Preprocessing dataset:   0%|          | 0/2 [00:00<?, ? Dicts/s]

Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

{'EM': 26.730310262529834, 'f1': 35.687203276702085, 'top_n_accuracy': 69.92840095465394, 'top_n': 4, 'EM_text_answer': 26.730310262529834, 'f1_text_answer': 35.687203276702085, 'top_n_accuracy_text_answer': 69.92840095465394, 'top_n_EM_text_answer': 43.43675417661098, 'top_n_f1_text_answer': 58.45608750180259, 'Total_text_answer': 419, 'EM_no_answer': 0, 'f1_no_answer': nan, 'top_n_accuracy_no_answer': nan, 'Total_no_answer': 0}


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [9]:
list_score = reader.eval_on_file(data_dir="data_hs",test_filename="test_list_squad.json")
print(list_score)

Preprocessing dataset:   0%|          | 0/2 [00:00<?, ? Dicts/s]

Evaluating:   0%|          | 0/47 [00:00<?, ?it/s]

{'EM': 2.095808383233533, 'f1': 13.926738140182563, 'top_n_accuracy': 68.76247504990019, 'top_n': 4, 'EM_text_answer': 2.095808383233533, 'f1_text_answer': 13.926738140182563, 'top_n_accuracy_text_answer': 68.76247504990019, 'top_n_EM_text_answer': 6.686626746506986, 'top_n_f1_text_answer': 27.198530583176417, 'Total_text_answer': 1002, 'EM_no_answer': 0, 'f1_no_answer': nan, 'top_n_accuracy_no_answer': nan, 'Total_no_answer': 0}


# Finetuning

## factoid

In [2]:
from haystack.nodes import FARMReader
reader = FARMReader(model_name_or_path='factoid_model_10', use_gpu=True)
score_list = []
model_no = 1

  return self.fget.__get__(instance, owner)()
Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: {"training": false, "num_labels": 2, "ph_output_type": "per_token_squad", "model_type": "span_classification", "label_tensor_name": "question_answering_label_ids", "label_list": ["start_token", "end_token"], "metric": "squad", "name": "QuestionAnsweringHead"}


In [None]:
for i in range(3):
    model_name = f"factoid_model_{model_no}"
    print(model_name)

    reader.train(
        data_dir="data_hs",
        train_filename="train_factoid_squad.json",
        # dev_filename="test_factoid_squad.json",
        # test_filename="test_factoid.json",
        use_gpu=True,
        n_epochs=1,
        batch_size=12,
        save_dir=model_name
        )
    factoid_reader = FARMReader(model_name_or_path=model_name, use_gpu=True)
    score_dict = factoid_reader.eval_on_file(data_dir="data_hs",test_filename="test_factoid_squad.json")
    score_dict["model_name"] = model_name
    print(score_dict)
    score_list.append(score_dict)
    model_no += 1

In [7]:
import pandas as pd
score_df = pd.DataFrame(score_list)
score_df = score_df.sort_values(by="EM", ascending=False)
score_df

Unnamed: 0,EM,f1,top_n_accuracy,top_n,EM_text_answer,f1_text_answer,top_n_accuracy_text_answer,top_n_EM_text_answer,top_n_f1_text_answer,Total_text_answer,EM_no_answer,f1_no_answer,top_n_accuracy_no_answer,Total_no_answer,model_name
4,51.145038,62.358833,79.389313,4,51.145038,62.358833,79.389313,57.506361,70.333415,393,0,,,0,factoid_model_5
3,50.381679,62.989113,79.898219,4,50.381679,62.989113,79.898219,58.015267,71.406307,393,0,,,0,factoid_model_4
5,49.872774,61.152394,79.389313,4,49.872774,61.152394,79.389313,56.997455,70.391484,393,0,,,0,factoid_model_6
1,49.109415,58.633348,80.407125,4,49.109415,58.633348,80.407125,58.524173,72.106945,393,0,,,0,factoid_model_2
6,49.109415,59.372556,77.608142,4,49.109415,59.372556,77.608142,54.961832,68.085139,393,0,,,0,factoid_model_7
2,48.854962,61.06265,80.661578,4,48.854962,61.06265,80.661578,58.26972,72.023872,393,0,,,0,factoid_model_3
12,48.346056,59.105611,77.35369,4,48.346056,59.105611,77.35369,54.452926,67.651636,393,0,,,0,factoid_model_13
7,48.091603,59.068378,77.862595,4,48.091603,59.068378,77.862595,54.198473,68.19383,393,0,,,0,factoid_model_8
8,47.83715,59.218506,78.117048,4,47.83715,59.218506,78.117048,53.689567,67.92119,393,0,,,0,factoid_model_9
10,47.073791,58.151068,78.625954,4,47.073791,58.151068,78.625954,52.926209,66.926344,393,0,,,0,factoid_model_11


## List

In [None]:
from haystack.nodes import FARMReader
reader = FARMReader(model_name_or_path='list_model_7', use_gpu=True)
score_list = []
model_no = 1

In [None]:
for i in range(1):
    model_name = f"list_model_{model_no}"
    print(model_name)

    reader.train(
        data_dir="data_hs",
        train_filename="train_list_squad.json",
        # dev_filename="test_factoid_squad.json",
        # test_filename="test_factoid.json",
        use_gpu=True,
        n_epochs=1,
        batch_size=48,
        save_dir=model_name
        )
    list_reader = FARMReader(model_name_or_path=model_name, use_gpu=True)
    score_dict = list_reader.eval_on_file(data_dir="data_hs",test_filename="test_list_squad.json")
    score_dict["model_name"] = model_name
    print(score_dict)
    score_list.append(score_dict)
    model_no += 1

In [18]:
import pandas as pd
score_df = pd.DataFrame(score_list)
score_df = score_df.sort_values(by="EM", ascending=False)
score_df

Unnamed: 0,EM,f1,top_n_accuracy,top_n,EM_text_answer,f1_text_answer,top_n_accuracy_text_answer,top_n_EM_text_answer,top_n_f1_text_answer,Total_text_answer,EM_no_answer,f1_no_answer,top_n_accuracy_no_answer,Total_no_answer,model_name
14,20.681551,24.421407,38.072855,4,20.681551,24.421407,38.072855,26.556992,32.183946,851,0,,,0,list_model_15
11,20.564042,25.290357,39.012926,4,20.564042,25.290357,39.012926,25.499412,32.40503,851,0,,,0,list_model_12
12,19.858989,24.674174,39.717979,4,19.858989,24.674174,39.717979,25.616921,32.685919,851,0,,,0,list_model_13
13,19.741481,24.372501,39.365452,4,19.741481,24.372501,39.365452,25.381904,32.272345,851,0,,,0,list_model_14
10,19.623972,24.185435,40.423032,4,19.623972,24.185435,40.423032,24.911868,32.279198,851,0,,,0,list_model_11
9,18.80141,22.764281,40.658049,4,18.80141,22.764281,40.658049,25.73443,32.692414,851,0,,,0,list_model_10
8,17.743831,21.803746,40.423032,4,17.743831,21.803746,40.423032,25.969448,33.361955,851,0,,,0,list_model_9
7,16.686251,19.977085,41.363102,4,16.686251,19.977085,41.363102,26.439483,33.731517,851,0,,,0,list_model_8
6,15.393655,18.703338,41.010576,4,15.393655,18.703338,41.010576,26.792009,34.253663,851,0,,,0,list_model_7
5,13.160987,15.332819,41.010576,4,13.160987,15.332819,41.010576,26.556992,33.863,851,0,,,0,list_model_6


# Evaluation (BioAsq)

In [6]:
from haystack import Document

def convert_docs_to_haystack_docs(docs):
    hs_docs = []
    for doc in docs:
        document = Document(content=doc)
        hs_docs.append(document)
    return hs_docs

def run_query_and_get_documents(query):
    pmids = test_df[test_df['query']==query]['pmid'].values
    docs = test_doc_df[test_doc_df['pmid'].isin(pmids)]['abstract'].values
    return docs

def predict(reader,query,k):
    minimal_answers = []
    docs = run_query_and_get_documents(query)
    docs = convert_docs_to_haystack_docs(docs)
    answers = reader.predict(query=query, documents=docs, top_k=k)
    [ minimal_answers.append([ans.answer]) for ans in answers['answers'] if [ans.answer] not in minimal_answers]
    return minimal_answers