In [None]:
import json
import os

import jsonlines
import pandas as pd

from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

In [None]:
# === Set your hyperparameters here ===

# Choose the dataset to use from ["en", "de"]
lang = "en"
assert lang in ["en", "de"]

# Set to True to split survey variables with sub-questions into separate corpus items
sep_answers = False
assert isinstance(sep_answers, bool)

# The string used to join survey variable information
join_str = "[UNK]"
assert isinstance(join_str, str)

# Evaluate each of the top k values
k_values = [1, 3, 5, 10, 100]
assert isinstance(k_values, list)
# The element check has to be asserted: as a bare expression its result is discarded
assert all(isinstance(e, int) for e in k_values)

# Embedding similarity function from ["cos_sim", "dot_score"]
score_function = "cos_sim"
assert score_function in ["cos_sim", "dot_score"]

# Any sentence-transformers (https://www.sbert.net/docs/pretrained_models.html)
# or HuggingFace model (https://huggingface.co/models) works
pretrained_model = "all-MiniLM-L6-v2"

# Set batch size for inference
batch_size = 16
assert isinstance(batch_size, int)

# ================ End ================

In [None]:
def split_variables(row):
    """Parse a raw variable string into a list of tuples.

    The sentinel values "No" and "NoSkip" are returned unchanged, wrapped in
    a one-element list. Any other value has its square brackets removed, is
    split on ",", and each piece is split on "-" into a tuple.
    """
    # Sentinels carry no variable references, so pass them straight through
    if row in ("No", "NoSkip"):
        return [row]

    without_brackets = row.replace('[', '').replace(']', '')
    parsed = []
    for piece in without_brackets.split(','):
        parsed.append(tuple(piece.split('-')))
    return parsed
    
def get_variables(row):
    """Return the first element of every entry in ``row.variables``."""
    leading_parts = []
    for entry in row.variables:
        leading_parts.append(entry[0])
    return leading_parts

def make_label(row, answer, join_str):
    """Build the corpus text for one survey variable.

    Args:
        row: Mapping-like record (e.g. a dict or pandas Series) providing the
            "v_label", "v_topic" and "v_question" fields.
        answer: Answer text used as the last part of the label.
        join_str: Separator placed between the four parts.

    Returns:
        The label, topic, question and answer joined by ``join_str``, in
        that order.
    """
    # Falsy fields (None, "") become "" so all four parts are always present
    parts = [row[field] or "" for field in ("v_label", "v_topic", "v_question")]
    parts.append(answer or "")

    return join_str.join(parts)

def get_labels(df, sep_answers=False, join_str="[UNK]"):
    """Build parallel lists of variable ids and corpus label texts.

    Args:
        df: DataFrame with one survey variable per row. This function reads
            "v_id" and "v_answer"; the row is also passed to ``make_label``,
            which reads "v_label", "v_topic" and "v_question". The frame is
            not modified.
        sep_answers: If True, "v_answer" is split on ";" and one label is
            produced per piece. All pieces share the row's "v_id", so
            ``ids`` can contain repeated values in this mode.
        join_str: Separator forwarded to ``make_label``.

    Returns:
        A tuple ``(ids, labels)`` of equal-length lists.
    """
    # Fill missing values on a copy so the caller's DataFrame is left untouched
    filled = df.fillna("")

    ids = []
    labels = []

    for _, row in filled.iterrows():
        if sep_answers:  # split survey variable answers into separate corpus items
            answers = row["v_answer"].split(";")
        else:  # do not split survey variable answers
            answers = [row["v_answer"] or ""]

        v_id = row["v_id"] or ""
        for v_answer in answers:
            ids.append(v_id)
            labels.append(make_label(row, v_answer, join_str))

    return ids, labels

In [None]:
# Load raw data

# Any language other than "de" uses the English files
if lang != "de":
    data_path = '../../data/trial/test/en.tsv'
    variables_path = '../../data/trial/vocabulary/en.tsv'
else:
    data_path = '../../data/trial/test/de.tsv'
    # NOTE(review): unlike the other three paths this one has no "trial/" segment — confirm this is intended
    variables_path = '../../data/vocabulary/de.tsv'

# Queries: rename the label column and parse the raw "variable" strings
data_df = pd.read_csv(data_path, sep="\t")
data_df = data_df.rename(columns={"is_variable": "label"})
data_df["variables"] = data_df["variable"].apply(split_variables)

# Vocabulary: build the parallel id / label lists used as the retrieval corpus
variable_df = pd.read_csv(variables_path, sep="\t")
ids, labels = get_labels(variable_df, sep_answers, join_str)

In [None]:
def make_beir_data(data_df, beir_data_dir, corpus_ids=None, corpus_labels=None):
    """Write queries, corpus and relevance judgements to disk in BEIR layout.

    Three files are written below ``beir_data_dir``: ``qrels/all.tsv``,
    ``queries.jsonl`` and ``corpus.jsonl``.

    Args:
        data_df: DataFrame with "text", "variable" and "variables" columns.
            The stringified index value of each row is used as the query id.
        beir_data_dir: Output directory; it and its "qrels" sub-directory are
            created when missing.
        corpus_ids: Corpus entry ids. Defaults to the notebook-level ``ids``.
        corpus_labels: Corpus entry texts, parallel to ``corpus_ids``.
            Defaults to the notebook-level ``labels``.

    Returns:
        A tuple ``(queries_beir, corpus_beir, qrels_beir)``: two lists of
        record dicts and a dict of the three qrels columns.
    """
    # Backward-compatible default: earlier callers relied on the notebook globals
    if corpus_ids is None:
        corpus_ids = ids
    if corpus_labels is None:
        corpus_labels = labels

    queries = {}
    qrels = {}
    for i, row in data_df.iterrows():
        query_id = str(i)
        queries[query_id] = row.text
        rel_labels = []
        if row.variable not in ["No", "NoSkip"]:
            # Relevant corpus ids are the variable prefixes with a leading "v"
            rel_labels = ["v" + x for x in get_variables(row)]
        qrels[query_id] = rel_labels

    # Keyed by id: a repeated id keeps only the label seen last
    corpus = {corpus_ids[i]: label for i, label in enumerate(corpus_labels)}

    beir_qrels_dir = os.path.join(beir_data_dir, "qrels")
    os.makedirs(beir_qrels_dir, exist_ok=True)

    queries_beir = [{"_id": k, "text": v} for k, v in queries.items()]
    corpus_beir = [{"_id": k, "title": "", "text": v} for k, v in corpus.items()]

    # One qrels row per (query, relevant corpus id) pair, all with score 1
    qrels_beir = {"query-id": [], "corpus-id": [], "score": []}
    for query_id, doc_ids in qrels.items():
        for doc_id in doc_ids:
            qrels_beir["query-id"].append(query_id)
            qrels_beir["corpus-id"].append(doc_id)
            qrels_beir["score"].append(1)

    qrels_df = pd.DataFrame(qrels_beir)[["query-id", "corpus-id", "score"]]
    qrels_df.to_csv(os.path.join(beir_qrels_dir, "all.tsv"), index=False, sep="\t")

    with jsonlines.open(os.path.join(beir_data_dir, "queries.jsonl"), "w") as writer:
        writer.write_all(queries_beir)

    with jsonlines.open(os.path.join(beir_data_dir, "corpus.jsonl"), "w") as writer:
        writer.write_all(corpus_beir)

    return queries_beir, corpus_beir, qrels_beir

In [None]:
# Load data in BEIR format

# Write the BEIR files to disk first, then read them back through the BEIR loader
beir_data_dir = f"../../data/trial/beir_data/{lang}"
_queries_beir, _corpus_beir, _qrels_beir = make_beir_data(data_df, beir_data_dir)

beir_loader = GenericDataLoader(data_folder=beir_data_dir)
corpus, queries, qrels = beir_loader.load(split="all")

In [None]:
# Initialize retriever model

# Wrap the pretrained encoder in the dense exact-search backend (DRES)
encoder = models.SentenceBERT(pretrained_model)
model = DRES(encoder, batch_size=batch_size)

retriever = EvaluateRetrieval(
    model,
    score_function=score_function,
    k_values=k_values,
)

In [None]:
# Run retrieval for the queries against the corpus.
# Note: no metrics are computed in this cell; it only produces `results`.

results = retriever.retrieve(corpus, queries, return_sorted=True)

In [None]:
# Save files

# The relevance judgements and the retrieval run are written as JSON files
# in the current working directory. `json` is imported in the import cell
# at the top of the notebook.
for path, payload in (("./qrels.json", qrels), ("./run.json", results)):
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(payload, fp)