In [2]:
# import libraries
import os
import logging
import json
import pickle
import glob
import re
from itertools import chain
import numpy as np
import pandas as pd

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    average_precision_score,
)
from scipy.stats import gmean
from sklearn.preprocessing import MultiLabelBinarizer
# import gensim libraries
from gensim import corpora
from gensim.models import LdaModel

# from gensim.models.ldamulticore import LdaMulticore
from gensim.similarities import MatrixSimilarity
from gensim.parsing.preprocessing import preprocess_documents

- include load_dataset and load_document inside prepare_dataset

In [8]:
def create_directories(directories):
    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory)
            print(f"Created directory: {directory}")


def extract_pmid(links):
    return [link.split("/")[-1] for link in links]

def dataset_info(dataset_dir):
    datasets_info = []
    # load json and create dataframe
    for json_file in glob.glob(os.path.join(dataset_dir, "*.json")):
        dataset_name = json_file.split("/")[-1].split(".")[0]
        with open(json_file) as fp:
            json_data = "".join(fp)
        data = json.loads(json_data)
        data = data["questions"]
        df = pd.DataFrame(data)
        df.reset_index(drop=True, inplace=True)
        
        # rename 'body' column to 'query'
        df.rename(columns={"body": "query"}, inplace=True)

        # extract pmids
        df["pmids"] = df["documents"].apply(extract_pmid)
        pmids = list(chain.from_iterable(df["pmids"].to_list()))

        # query by type
        queries_by_type = df.groupby("type").size().to_dict()
        info = {
            "dataset_name": dataset_name,
            "num_queries": df["query"].shape[0],
            "num_docs": len(set(pmids)),
        }
        info.update(queries_by_type)
        datasets_info.append(info)
    
    # create dataframe from list of dictionaries
    datasets_info_df = pd.DataFrame(datasets_info)
    datasets_info_df.sort_values(by="dataset_name", inplace=True)
    return datasets_info_df

def load_dataset(dataset_name, dataset_dir, logger):
    # load json and create dataframe
    dfs = []
    for json_file in glob.glob(os.path.join(dataset_dir, "*.json")):
        with open(json_file) as fp:
            json_data = "".join(fp)
        data = json.loads(json_data)
        data = data["questions"]
        dfs.append(pd.DataFrame(data))
    df = pd.concat(dfs)
    df.reset_index(drop=True, inplace=True)
    
    # rename 'body' column to 'query'
    df.rename(columns={"body": "query"}, inplace=True)
    print(f"{dataset_name} - query: {df['query'].shape[0]}")
    logger.info(
        "%s - query: %d", dataset_name, df["query"].shape[0]
    )

    # extract pmids and add to dataframe
    df["pmids"] = df["documents"].apply(extract_pmid)
    pmids = list(chain.from_iterable(df["pmids"].to_list()))
    print(
        f"{dataset_name} - unique docs: {len(set(pmids))}"
    )
    logger.info(
        "%s - unique docs: %d",
        dataset_name,
        len(set(pmids)),
    )

    # show number of query by type
    print(f"{dataset_name} - queries by type: {df.groupby('type').size()}")
    logger.info(
        "%s - queries by type: %s",
        dataset_name,
        df.groupby("type").size(),
    )
    return df, pmids


def prepare_dataset(dataset_name, dataset_dir, doc_df, logger):
    # load dataset
    df, pmids= load_dataset(dataset_name, dataset_dir, logger)
    
    # filter out queries with no docs in corpus  
    corpus_df = doc_df[doc_df["pmid"].isin(pmids)]
    print(f"num of docs found in corpus:{corpus_df.shape[0]}")
    logger.info("num of docs found in corpus: %d", corpus_df.shape[0])

    def filter_pmid(pmids):
        filtered_pmids = [
            pmid for pmid in pmids if pmid in doc_df["pmid"].to_list()
        ]
        return filtered_pmids

    df["pmids_found"] = df["pmids"].apply(filter_pmid)

    filtered_df = df[df["pmids_found"].apply(len) > 0]
    # average number of docs per query
    total_num_docs = sum(filtered_df["pmids_found"].apply(len))
    total_num_queries = filtered_df["query"].shape[0]
    avg_num_docs_per_query = total_num_docs / total_num_queries
    print(f"{dataset_name} - docs per query: {avg_num_docs_per_query}")
    logger.info(
        "%s - docs per query: %d",
        dataset_name,
        avg_num_docs_per_query,
    )
    return filtered_df


# retrieve the top N similar documents for a given document or query
def retrieve_doc_indexes(query, lda_model, sim_matrix, dictionary, topn=10):
    vec_bow = dictionary.doc2bow(query)
    vec_lda = lda_model[vec_bow]
    sims = sim_matrix[vec_lda]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    return sims[:topn]


# get pmids from doc indexes
def get_pmids_from_doc_indexes(doc_indexes, doc_df):
    return [doc_df["pmid"].iloc[doc_idx[0]] for doc_idx in doc_indexes]

# get abstractText from doc indexes
def get_text_from_doc_indexes(doc_indexes, doc_df):
    return [doc_df["abstractText"].iloc[doc_idx[0]] for doc_idx in doc_indexes]

def calculate_metrics(df, true_col, pred_col):
    # Calculate precision, recall, f1, and average precision for each row
    df["precision"] = 0
    df["recall"] = 0
    df["f1"] = 0
    df["avg_precision"] = 0

    for i in range(len(df)):
        # Fit MultiLabelBinarizer on each row separately
        mlb = MultiLabelBinarizer()
        mlb.fit(
            [df[true_col].iloc[i] + df[pred_col].iloc[i]]
        )  # Combining true and predicted labels

        # Transform true and predicted columns separately
        X_true = mlb.transform([df[true_col].iloc[i]])
        X_pred = mlb.transform([df[pred_col].iloc[i]])

        # Calculate precision, recall, f1, and average precision for the current row
        df.at[i, "precision"] = precision_score(
            X_true[0], X_pred[0], zero_division=0
        )
        df.at[i, "recall"] = recall_score(X_true[0], X_pred[0], zero_division=0)
        df.at[i, "f1"] = f1_score(X_true[0], X_pred[0], zero_division=0)
        df.at[i, "avg_precision"] = average_precision_score(
            X_true[0], X_pred[0]
        )

    # Calculate mean precision, mean recall, and mean f1
    mean_precision = df["precision"].mean()
    mean_recall = df["recall"].mean()
    mean_f1 = df["f1"].mean()

    # Calculate MAP and GMAP
    map_score = df["avg_precision"].mean()
    gmap_score = gmean(df["avg_precision"])

    # Create a new dataframe to store the mean scores
    mean_scores_df = pd.DataFrame(
        {
            "mean_precision": mean_precision,
            "mean_recall": mean_recall,
            "mean_f1": mean_f1,
            "MAP": map_score,
            "GMAP": gmap_score,
        },
        index=[0],
    )

    # Return both dataframes
    return mean_scores_df


def evaluate(
    lda_model, test_corpus, test_df, test_doc_df, dictionary, metric, logger
):
    # Create a similarity matrix using the trained LDA model
    logger.info("creating similarity matrix")
    sim_matrix = MatrixSimilarity(
        lda_model[test_corpus], num_features=len(dictionary)
    )

    # get top 10 similar documents for each question
    logger.info("retrieving top similar documents for each question")
    test_df = test_df.copy()
    test_df["top10_docs"] = test_df["query_preprocessed"].apply(
        retrieve_doc_indexes, args=(lda_model, sim_matrix, dictionary)
    )

    test_df["top10_pmids"] = test_df["top10_docs"].apply(
        get_pmids_from_doc_indexes, args=(test_doc_df,)
    )

    # calculate metrics
    logger.info("calculating metrics")
    eval_df_summary = calculate_metrics(test_df, "pmids_found", "top10_pmids")
    # Return the mean mean_f1 score
    score = eval_df_summary[metric].iloc[0]
    logger.info("%s: %s", metric, score)
    return score


def get_max(logs, metric):
    df = pd.concat(logs)
    print(f"max {metric}:")
    df = df.sort_values(by=[metric], ascending=False)
    # return df[df[metric] == df[metric].max()]
    return df.head()

# function to filter factoid questions from the dataset
def filter_factoid_questions(df):
    return df[df["type"] == "factoid"]


# function to filter list questions from the dataset
def filter_list_questions(df):
    return df[df["type"] == "list"]

In [9]:

try:
    # load dir_dict from json file in home directory
    home_dir = os.path.expanduser("~")
    with open(f"{home_dir}/.biomedqa_dir.json", encoding="utf-8") as fp:
        dir_dict = json.load(fp)
except Exception as exc:
    print("Error: unable to load directory dictionary. Please run setup.py")
    raise exc

# set directories
BASE_DIR = dir_dict["base_dir"]
DATA_DIR = dir_dict["data_dir"]
MODEL_DIR = dir_dict["model_dir"]
LOG_DIR = dir_dict["log_dir"]
RESULTS_DIR = dir_dict["results_dir"]

DATASET = "bioasq"
YEAR = "2022"
TRAIN_DATASET_NAME = "Task10BGoldenEnriched"
TEST_DATASET_NAME = "Task10BGoldenEnriched"
__file__ = "notebooks/answer_extraction.ipynb"
# get file directory
# FILE_DIR = os.path.dirname(os.path.relpath(__file__))
FILE_DIR = "biomed_qa/answer_extraction/transformer/minilm_ft"

TRAIN_DATASET_DIR = f"{DATA_DIR}/raw/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}"
TRAIN_DOC_DIR = (
    f"{DATA_DIR}/processed/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}_documents/"
)
print(f"train dataset name:{TRAIN_DATASET_NAME}")
print(f"train dataset dir:{TRAIN_DATASET_DIR}")
print(f"train doc dir:{TRAIN_DOC_DIR}")

TEST_DATASET_DIR = f"{DATA_DIR}/raw/{DATASET}/{YEAR}/{TEST_DATASET_NAME}"
TEST_DOC_DIR = (
    f"{DATA_DIR}/processed/{DATASET}/{YEAR}/{TEST_DATASET_NAME}_documents/"
)
print(f"test dataset name:{TEST_DATASET_NAME}")
print(f"test dataset dir:{TEST_DATASET_DIR}")
print(f"test doc dir:{TEST_DOC_DIR}")

# set log dir directory according to current file directory
LOG_DIR = f"{LOG_DIR}/{FILE_DIR}"
print(f"log dir:{LOG_DIR}")

# set model directory according to current file directory
MODEL_DIR = f"{MODEL_DIR}/{FILE_DIR}/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}/"
print(f"model dir:{MODEL_DIR}")

# set results directory according to current file directory
RESULTS_DIR = f"{RESULTS_DIR}/{FILE_DIR}/{DATASET}/{YEAR}/{TEST_DATASET_NAME}/"
print(f"results dir:{RESULTS_DIR}")

# create directories
create_directories([LOG_DIR, MODEL_DIR, RESULTS_DIR])

# set log file name
log_file = os.path.join(
    LOG_DIR, os.path.basename(__file__).split(".")[0] + ".log"
)
print(f"LOG_FILE: {log_file}")

# initialize logger

logging.basicConfig(
    filename=log_file,
    format="%(process)d\t%(asctime)s\t%(levelname)s\t%(message)s",
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)
print("Logger initialized")
logger.info("Logger initialized")

train dataset name:Task10BGoldenEnriched
train dataset dir:/workspace/data/raw/bioasq/2022/Task10BGoldenEnriched
train doc dir:/workspace/data/processed/bioasq/2022/Task10BGoldenEnriched_documents/
test dataset name:Task10BGoldenEnriched
test dataset dir:/workspace/data/raw/bioasq/2022/Task10BGoldenEnriched
test doc dir:/workspace/data/processed/bioasq/2022/Task10BGoldenEnriched_documents/
log dir:/workspace/logs/biomed_qa/answer_extraction/transformer/minilm_ft
model dir:/workspace/models/biomed_qa/answer_extraction/transformer/minilm_ft/bioasq/2022/Task10BGoldenEnriched/
results dir:/workspace/results/biomed_qa/answer_extraction/transformer/minilm_ft/bioasq/2022/Task10BGoldenEnriched/
LOG_FILE: /workspace/logs/biomed_qa/answer_extraction/transformer/minilm_ft/answer_extraction.log
Logger initialized


In [10]:
# load documents
logger.info("loading documents")

train_doc_df = pd.read_pickle(
    f"{TRAIN_DOC_DIR}{TRAIN_DATASET_NAME}_documents_df.pkl"
)

test_doc_df = pd.read_pickle(
    f"{TEST_DOC_DIR}{TEST_DATASET_NAME}_documents_df.pkl"
)

# load datasets
logger.info("loading datasets")
train_df, _ = load_dataset(TRAIN_DATASET_NAME, TRAIN_DATASET_DIR, logger)
test_df, _ = load_dataset(TEST_DATASET_NAME, TEST_DATASET_DIR, logger)

# prepare datasets
train_df = prepare_dataset(
    TRAIN_DATASET_NAME, TRAIN_DATASET_DIR, train_doc_df, logger
)
test_df = prepare_dataset(
    TEST_DATASET_NAME, TEST_DATASET_DIR, test_doc_df, logger
)



Task10BGoldenEnriched - query: 486
Task10BGoldenEnriched - unique docs: 3478
Task10BGoldenEnriched - queries by type: type
factoid    166
list        85
summary    112
yesno      123
dtype: int64
Task10BGoldenEnriched - query: 486
Task10BGoldenEnriched - unique docs: 3478
Task10BGoldenEnriched - queries by type: type
factoid    166
list        85
summary    112
yesno      123
dtype: int64
Task10BGoldenEnriched - query: 486
Task10BGoldenEnriched - unique docs: 3478
Task10BGoldenEnriched - queries by type: type
factoid    166
list        85
summary    112
yesno      123
dtype: int64
num of docs found in corpus:3476
Task10BGoldenEnriched - docs per query: 7.292181069958848
Task10BGoldenEnriched - query: 486
Task10BGoldenEnriched - unique docs: 3478
Task10BGoldenEnriched - queries by type: type
factoid    166
list        85
summary    112
yesno      123
dtype: int64
num of docs found in corpus:3476
Task10BGoldenEnriched - docs per query: 7.292181069958848


In [11]:
# preprocess documents using gensim's preprocess_documents function
logger.info("preprocessing train documents")
train_doc_df["abstractText_preprocessed"] = preprocess_documents(
    train_doc_df["abstractText"]
)
logger.info("preprocessing test documents")
test_doc_df["abstractText_preprocessed"] = preprocess_documents(
    test_doc_df["abstractText"]
)

# Create a dictionary from the preprocessed documents of the training set
logger.info("creating dictionary")
dictionary = corpora.Dictionary(train_doc_df["abstractText_preprocessed"])

# create bag of words corpus of the training set
logger.info("creating bag of words for train documents")
train_corpus = [
    dictionary.doc2bow(text)
    for text in train_doc_df["abstractText_preprocessed"]
]
# Create bag of words corpus of the test set
logger.info("creating bag of words for test documents")
test_corpus = [
    dictionary.doc2bow(text)
    for text in test_doc_df["abstractText_preprocessed"]
]

# preprocess questions
logger.info("preprocessing test questions")
test_df["query_preprocessed"] = preprocess_documents(
    test_df["query"].to_list()
)

In [None]:
# train LDA model
lda_model = LdaModel(
    corpus=train_corpus,
    id2word=dictionary,
    num_topics=1241,
    chunksize=2877,
    passes=5,
    update_every=1,
    alpha="symmetric",
    eta="symmetric",
    decay=0.5,
    offset=1,
    eval_every=10,
    iterations=188,
    gamma_threshold=0.001,
    minimum_probability=0.01,
    random_state=1,
    minimum_phi_value=0.01,
)
# save model
logger.info("saving model")
LDA_MODEL_DIR = f"{BASE_DIR}/models/biomed_qa/document_retrieval/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}"
create_directories([LDA_MODEL_DIR])
lda_model.save(f"{MODEL_DIR}/lda")

In [6]:
# load model
logger.info("loading model")
LDA_MODEL_DIR = f"{BASE_DIR}/models/biomed_qa/document_retrieval/{DATASET}/{YEAR}/{TRAIN_DATASET_NAME}"
lda_model = LdaModel.load(f"{LDA_MODEL_DIR}/lda/lda_model")

In [10]:
evaluate(
    lda_model, test_corpus, test_df, test_doc_df, dictionary, "mean_f1", logger
)

0.32309417205931074

In [11]:
logger.info("creating similarity matrix")
sim_matrix = MatrixSimilarity(
    lda_model[test_corpus], num_features=len(dictionary)
)

In [7]:
display(dataset_info(TRAIN_DATASET_DIR))
display(dataset_info(TEST_DATASET_DIR))

Unnamed: 0,dataset_name,num_queries,num_docs,factoid,list,summary,yesno
2,10B1_golden,90,588,34,14,19,23
3,10B2_golden,90,621,34,15,23,18
1,10B3_golden,89,553,32,11,22,24
4,10B4_golden,90,564,31,12,23,24
0,10B5_golden,90,639,29,18,15,28
5,10B6_golden,37,547,6,15,10,6


Unnamed: 0,dataset_name,num_queries,num_docs,factoid,list,summary,yesno
2,10B1_golden,90,588,34,14,19,23
3,10B2_golden,90,621,34,15,23,18
1,10B3_golden,89,553,32,11,22,24
4,10B4_golden,90,564,31,12,23,24
0,10B5_golden,90,639,29,18,15,28
5,10B6_golden,37,547,6,15,10,6


In [12]:
# filter factoid questions from train_filtered_df
train_factoid_df = filter_factoid_questions(train_df)

# filter list questions from train_filtered_df
train_list_df = filter_list_questions(train_df)

# filter factoid questions from test_filtered_df
test_factoid_df = filter_factoid_questions(test_df)

# filter list questions from test_filtered_df
test_list_df = filter_list_questions(test_df)

In [13]:
from haystack import Document

def convert_docs_to_haystack_docs(docs):
    hs_docs = []
    for doc in docs:
        document = Document(content=doc)
        hs_docs.append(document)
    return hs_docs

def run_query_and_get_documents_lda(query, doc_df, lda_model, sim_matrix, dictionary, topn=10):
    query = preprocess_documents([query])
    doc_indexes = retrieve_doc_indexes(query[0], lda_model, sim_matrix, dictionary, topn)
    docs = get_text_from_doc_indexes(doc_indexes, doc_df)
    return docs

def run_query_and_get_documents(query):
    pmids = test_df[test_df['query']==query]['pmids'].values
    docs = test_doc_df[test_doc_df['pmid'].isin(pmids[0])]["abstractText"].values
    return docs

def run_query_and_get_snippets(query):
    snippets = test_df[test_df['query']==query]['snippets'].values
    snippets = [snippet['text'] for snippet in snippets[0]]
    return snippets

def predict(reader,query,context_src,k):
    minimal_answers = []
    if context_src == 'by_documents':
        docs = run_query_and_get_documents(query)
    elif context_src == 'by_snippets':
        docs = run_query_and_get_snippets(query)
    elif context_src == 'by_lda':
        docs = run_query_and_get_documents_lda(query, test_doc_df, lda_model, sim_matrix, dictionary)
    else:
        # raise error
        print('context_src must be one of the following: by_documents, by_snippets, by_lda')        
    docs = convert_docs_to_haystack_docs(docs)
    answers = reader.predict(query=query, documents=docs, top_k=k)
    [ minimal_answers.append([ans.answer]) for ans in answers['answers'] if [ans.answer] not in minimal_answers]
    return minimal_answers


In [14]:
# function for evaluating factoid questions
def manual_evaluate_factoid(DATA_DIR, MODEL_DIR, RESULTS_DIR, context_src):
    from haystack.nodes import FARMReader
    reader = FARMReader(model_name_or_path=MODEL_DIR, use_gpu=True)
    model_name = MODEL_DIR.split("/")[-1]
    RESULTS_DIR = f"{RESULTS_DIR}/{model_name}/{context_src}"
    create_directories([RESULTS_DIR])
    for json_file in glob.glob(os.path.join(DATA_DIR, '*.json')):
        print(json_file)
        with open(json_file) as fp:
            json_data = ''.join(fp)

        data = json.loads(json_data)
        data = data["questions"]
        df = pd.DataFrame(data)
        factoid_df = df[df['type'] == 'factoid']
        factoid_df['predicted_answers'] = factoid_df.apply( lambda x: predict(reader, x.body, context_src, 5), axis = 1 )
        # factoid_df['predicted_answers'] =  0
        factoid_df['predicted_answer_pos'] = "?"
        file_name = json_file.replace(DATA_DIR,'')[1:-5]+"_factoid.csv"
        factoid_df = factoid_df[['body','type','exact_answer','predicted_answers','predicted_answer_pos']]
        csv_file = os.path.join(RESULTS_DIR,file_name)
        factoid_df.to_csv(csv_file, index=False)
        print(f"Saved {csv_file}")

In [None]:
# manual_evaluate_factoid(TEST_DATASET_DIR, "models/factoid_model_5", RESULTS_DIR, "by_snippets")
manual_evaluate_factoid(TEST_DATASET_DIR, "models/factoid_model_5", RESULTS_DIR, "by_documents")
manual_evaluate_factoid(TEST_DATASET_DIR, "models/factoid_model_5", RESULTS_DIR, "by_lda")

In [15]:
import pandas as pd
import glob
import os
def calculate_strict_accuracy(predictions):
    """
    Calculates the strict accuracy.

    Args:
        predictions (Series): Series of predicted answer positions.

    Returns:
        float: Strict accuracy.
    """
    correct_predictions = predictions[predictions == 1].count()
    total_predictions = len(predictions)

    return correct_predictions / total_predictions

def calculate_lenient_accuracy(predictions):
    """
    Calculates the lenient accuracy of 5.

    Args:
        predictions (Series): Series of predicted answer positions.

    Returns:
        float: Lenient accuracy of 5.
    """
    correct_predictions = predictions[predictions > 0].count()
    total_predictions = len(predictions)

    return correct_predictions / total_predictions

def calculate_mrr(predictions):
    """
    Calculates the Mean Reciprocal Rank (MRR).

    Args:
        predictions (Series): Series of predicted answer positions.

    Returns:
        float: Mean Reciprocal Rank (MRR).
    """
    total_reciprocal_ranks = (1 / predictions[predictions > 0]).sum()
    total_predictions = len(predictions)

    return total_reciprocal_ranks / total_predictions

def calculate_metrics_factoid(dir):
    # Find CSV files that end with "manual_eval.csv"
    filenames = glob.glob(os.path.join(dir , "*factoid_manual_eval.csv"))

    # dictionary to store the results
    results = {}

    for filename in filenames:
        print(filename)
        df = pd.read_csv(filename)

        # Extract predicted answer positions
        predicted_answer_positions = df['predicted_answer_pos']
        print(predicted_answer_positions)

        # Calculate metrics
        strict_accuracy = calculate_strict_accuracy(predicted_answer_positions)
        lenient_accuracy = calculate_lenient_accuracy(predicted_answer_positions)
        mrr = calculate_mrr(predicted_answer_positions)

        # Store results
        batch = int(os.path.basename(filename)[3:4])
        results[batch] = [strict_accuracy, lenient_accuracy, mrr]

    # Create DataFrame from results sorted by filename
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Strict', 'Lenient', 'MRR']).sort_index()
    # set index as  1st column
    results_df.index.name = 'batch'
    results_df.reset_index(inplace=True)
    # save results to csv
    results_df.to_csv(os.path.join(dir, f"{TEST_DATASET_NAME}_factoid_results.csv"), index=False)
    return results_df

In [16]:
model_name = 'factoid_model_5'
# calculate_metrics_factoid(f"{RESULTS_DIR}/{model_name}/by_snippets")
calculate_metrics_factoid(f"{RESULTS_DIR}/{model_name}/by_documents")
# calculate_metrics_factoid(f"{RESULTS_DIR}/{model_name}/by_lda")

/workspace/results/biomed_qa/answer_extraction/transformer/minilm_ft/bioasq/2022/Task10BGoldenEnriched//factoid_model_5/by_documents/10B3_golden_factoid_manual_eval.csv
0     3
1     1
2     1
3     4
4     2
5     1
6     2
7     1
8     1
9     1
10    1
11    3
12    2
13    1
14    1
15    1
16    2
17    2
18    1
19    1
20    1
21    1
22    1
23    1
24    3
25    1
26    3
27    0
28    1
29    1
30    1
31    1
Name: predicted_answer_pos, dtype: int64
/workspace/results/biomed_qa/answer_extraction/transformer/minilm_ft/bioasq/2022/Task10BGoldenEnriched//factoid_model_5/by_documents/10B4_golden_factoid_manual_eval.csv
0     5
1     2
2     3
3     1
4     2
5     1
6     5
7     0
8     0
9     2
10    1
11    1
12    0
13    4
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    2
23    2
24    2
25    1
26    1
27    4
28    1
29    3
30    1
Name: predicted_answer_pos, dtype: int64
/workspace/results/biomed_qa/answer_extraction/transformer/minilm_ft/bioasq/

Unnamed: 0,batch,Strict,Lenient,MRR
0,1,0.441176,0.794118,0.569608
1,2,0.735294,0.852941,0.765686
2,3,0.65625,0.96875,0.783854
3,4,0.516129,0.903226,0.663441
4,5,0.586207,0.862069,0.683333
5,6,0.666667,0.666667,0.666667


In [17]:
# function for evaluating factoid questions
def manual_evaluate_list(DATA_DIR, MODEL_DIR, RESULTS_DIR, context_src):
    from haystack.nodes import FARMReader
    reader = FARMReader(model_name_or_path=MODEL_DIR, use_gpu=True)
    model_name = MODEL_DIR.split("/")[-1]
    RESULTS_DIR = f"{RESULTS_DIR}/{model_name}/{context_src}"
    create_directories([RESULTS_DIR])
    for json_file in glob.glob(os.path.join(DATA_DIR, '*.json')):
        print(json_file)
        with open(json_file) as fp:
            json_data = ''.join(fp)

        data = json.loads(json_data)
        data = data["questions"]
        df = pd.DataFrame(data)
        list_df = df[df['type'] == 'list']
        list_df['predicted_answers'] = list_df.apply( lambda x: predict(reader, x.body, context_src, 10), axis = 1 )
        list_df['TP'] = "?"
        list_df['FP'] = "?"
        list_df['FN'] = "?"
        file_name = json_file.replace(DATA_DIR,'')[1:-5]+"_list.csv"
        list_df = list_df[['body','type','exact_answer','predicted_answers','TP','FP','FN']]
        list_df.to_csv(os.path.join(RESULTS_DIR,file_name), index=False)
        print(file_name)

In [None]:
# manual_evaluate_list(TEST_DATASET_DIR, "models/list_model_15", RESULTS_DIR, "by_snippets")
manual_evaluate_list(TEST_DATASET_DIR, "models/list_model_15", RESULTS_DIR, "by_documents")
manual_evaluate_list(TEST_DATASET_DIR, "models/list_model_15", RESULTS_DIR, "by_lda")

In [24]:
def calculate_metrics_list(dir):
    file_names = glob.glob(os.path.join( dir, "*list_manual_eval.csv"))

    # dictionary to store the results
    results = {}

    # iterate through each csv file
    for file_name in file_names:
        # read csv file into pandas dataframe
        df = pd.read_csv(file_name)
        # calculate precision, recall and f-measure for each row
        df['precision'] = df['TP'] / (df['TP'] + df['FP'])
        df['recall'] = df['TP'] / (df['TP'] + df['FN'])
        df['f_measure'] = 2 * df['precision'] * df['recall'] / (df['precision'] + df['recall'])
        
        # calculate mean precision, recall and f-measure for the file
        mean_precision = df['precision'].mean()
        mean_recall = df['recall'].mean()
        mean_f_measure = df['f_measure'].mean()
        # Store results
        batch = os.path.basename(file_name)[3:4]
        results[batch] = [mean_precision, mean_recall, mean_f_measure]

    # Create DataFrame from results sorted by filename
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Mean Precision','Mean Recall','Mean F-Measure']).sort_index()
    # set index as  1st column
    results_df.index.name = 'batch'
    results_df.reset_index(inplace=True)
    # save results to csv
    results_df.to_csv(os.path.join(dir, f"{TEST_DATASET_NAME}_list_results.csv"), index=False)
    return results_df

In [34]:
model_name = 'list_model_15'
# calculate_metrics_list(f"{RESULTS_DIR}/{model_name}/by_snippets")
calculate_metrics_list(f"{RESULTS_DIR}/{model_name}/by_documents")
# calculate_metrics_list(f"{RESULTS_DIR}/{model_name}/by_lda")

Unnamed: 0,batch,Mean Precision,Mean Recall,Mean F-Measure
0,1,0.527183,0.713492,0.559418
1,2,0.449339,0.592963,0.51464
2,3,0.363095,0.632468,0.478786
3,4,0.504167,0.484325,0.400337
4,5,0.465929,0.623292,0.574664
5,6,0.415204,0.523118,0.458029
