In [1]:
import json
import os
from collections import Counter, defaultdict

import jsonlines
import pandas as pd

from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

In [2]:
# === Set your hyperparameters here ===
# Every setting is validated right after it is assigned so that a typo fails
# in this cell instead of deep inside data loading or retrieval.

# Choose the dataset to use from ["en", "de"]
lang = "en"
assert lang in ["en", "de"]

# Set to True to split survey variables with sub-questions into separate corpus items
sep_answers = False
assert isinstance(sep_answers, bool)

# The string used to join survey variable information
join_str = " [UNK] "
assert isinstance(join_str, str)

# Evaluate each of the top k values
k_values = [1, 3, 5, 10, 100]
assert isinstance(k_values, list)
# Every cutoff must be an integer (without `assert` this check has no effect)
assert all(isinstance(e, int) for e in k_values)

# Embedding similarity function from ["cos_sim", "dot_score"]
score_function = "cos_sim"
assert score_function in ["cos_sim", "dot_score"]

# Any sentence-transformers (https://www.sbert.net/docs/pretrained_models.html)
# or HuggingFace model (https://huggingface.co/models) works
pretrained_models = ["all-MiniLM-L6-v2", "paraphrase-MiniLM-L3-v2", "all-mpnet-base-v2", "paraphrase-multilingual-mpnet-base-v2"]
# pretrained_models = ["paraphrase-MiniLM-L3-v2", "paraphrase-multilingual-mpnet-base-v2", "multi-qa-distilbert-cos-v1", "paraphrase-multilingual-MiniLM-L12-v2"]

# Set batch size for inference
batch_size = 16
assert isinstance(batch_size, int)

# ================ End ================

In [3]:
def split_variables(row):
    """Parse the raw ``variable`` cell of a test row.

    The sentinel strings "No" and "NoSkip" are returned unchanged, wrapped in
    a one-element list. Any other string has its square brackets removed, is
    split on ",", and every piece is split on "-" and turned into a tuple.

    Returns:
        list: ``[row]`` for a sentinel, otherwise a list of tuples of strings.
    """
    if row in ("No", "NoSkip"):
        return [row]

    without_brackets = row.replace('[', '').replace(']', '')
    parsed = []
    for piece in without_brackets.split(','):
        parsed.append(tuple(piece.split('-')))
    return parsed
    
def get_variables(row):
    """Return the first element of every entry in ``row.variables``, in order."""
    first_elements = []
    for entry in row.variables:
        first_elements.append(entry[0])
    return first_elements

def make_label(row, answer, join_str):
    """Build the text of one corpus item from a survey-variable record.

    Args:
        row: Mapping-like record (for example a pandas Series or a dict) that
            provides the "label", "topic" and "question" fields. No other
            field is read, so an "id" entry is not required.
        answer: Answer text appended as the last part.
        join_str: Separator inserted between the four parts.

    Returns:
        str: label, topic, question and answer joined with ``join_str``, in
        that order. Falsy parts (empty string, None) contribute an empty
        string, so the result always contains three separators.
    """
    parts = [row[field] or "" for field in ("label", "topic", "question")]
    parts.append(answer or "")
    return join_str.join(parts)

def get_labels(df, sep_answers=False, join_str="[UNK]"):
    """Turn the survey-variable table into parallel lists of ids and label texts.

    Args:
        df: DataFrame with one survey variable per row. This function reads
            the "id" and "answer" columns; ``make_label`` additionally reads
            "label", "topic" and "question". The frame is not modified.
        sep_answers: When True, the "answer" cell is split on ";" and every
            piece becomes its own item; all pieces of a row share that row's
            id. When False, each row yields exactly one item.
        join_str: Separator forwarded to ``make_label``.

    Returns:
        tuple[list, list]: ``(ids, labels)`` of equal length, aligned by
        position.
    """
    # fillna returns a new frame, so the caller's DataFrame keeps its NaNs
    # (the in-place variant silently changed the argument).
    filled = df.fillna("")

    ids = []
    labels = []

    for _, row in filled.iterrows():
        v_id = row["id"] or ""

        if sep_answers:  # split survey variable answers into separate corpus items
            answers = row["answer"].split(";")
        else:  # do not split survey variable answers
            answers = [row["answer"] or ""]

        for v_answer in answers:
            ids.append(v_id)
            labels.append(make_label(row, v_answer, join_str))

    return ids, labels

In [4]:
# Load raw data

# Anything other than "de" resolves to the English files.
if lang != "de":
    data_path = '../../data/trial/test/en.tsv'
    variables_path = '../../data/trial/vocabulary/en.tsv'
else:
    data_path = '../../data/trial/test/de.tsv'
    variables_path = '../../data/trial/vocabulary/de.tsv'

# Test rows: rename "is_variable" to "label" and parse the raw "variable" cell
data_df = pd.read_csv(data_path, sep="\t").rename(columns={"is_variable": "label"})
data_df["variables"] = data_df.variable.apply(split_variables)

# Survey-variable vocabulary that is turned into the retrieval corpus
variable_df = pd.read_csv(variables_path, sep="\t")

ids, labels = get_labels(variable_df, sep_answers, join_str)

In [5]:
def make_beir_data(data_df, beir_data_dir, corpus_ids=None, corpus_labels=None):
    """Write queries, corpus and qrels into ``beir_data_dir`` and return them.

    Three files are produced: ``queries.jsonl``, ``corpus.jsonl`` and
    ``qrels/all.tsv`` (tab-separated, columns query-id / corpus-id / score).

    Args:
        data_df: DataFrame of test rows. Each row's index value (as a string)
            becomes the query id, ``row.text`` the query text, and
            ``row.variable`` / ``row.variables`` determine the relevant
            corpus ids.
        beir_data_dir: Output directory; it and its ``qrels`` sub-directory
            are created when missing.
        corpus_ids: Corpus item ids. Defaults to the notebook-level ``ids``.
        corpus_labels: Corpus item texts aligned with ``corpus_ids``.
            Defaults to the notebook-level ``labels``.

    Returns:
        tuple: ``(queries_beir, corpus_beir, qrels_beir)`` where the first two
        are lists of record dicts and the last is a dict of three parallel
        lists keyed "query-id", "corpus-id" and "score".
    """
    # Keep the old call signature working: fall back to the lists that the
    # data-loading cell created with get_labels.
    if corpus_ids is None:
        corpus_ids = ids
    if corpus_labels is None:
        corpus_labels = labels

    queries = {}
    qrels = {}
    for i, row in data_df.iterrows():
        query_id = str(i)
        queries[query_id] = row.text
        if row.variable in ["No", "NoSkip"]:
            # Rows without any linked variable get no relevance judgements
            qrels[query_id] = []
        else:
            # The "v" prefix presumably matches the id format of the
            # vocabulary file -- confirm against the data.
            qrels[query_id] = ["v" + x for x in get_variables(row)]

    # NOTE(review): ids repeat when get_labels runs with sep_answers=True, so
    # later labels overwrite earlier ones here and only the last answer of a
    # variable ends up in the corpus.
    corpus = dict(zip(corpus_ids, corpus_labels))

    # exist_ok avoids the check-then-create race and creates parents as needed
    os.makedirs(os.path.join(beir_data_dir, "qrels"), exist_ok=True)

    queries_beir = [{"_id": k, "text": v} for k, v in queries.items()]
    corpus_beir = [{"_id": k, "title": "", "text": v} for k, v in corpus.items()]

    qrels_beir = {"query-id": [], "corpus-id": [], "score": []}
    for query_id, relevant_ids in qrels.items():
        for corpus_id in relevant_ids:
            qrels_beir["query-id"].append(query_id)
            qrels_beir["corpus-id"].append(corpus_id)
            qrels_beir["score"].append(1)

    df = pd.DataFrame.from_records(qrels_beir)
    # Select the columns explicitly so the TSV column order is fixed
    df[["query-id", "corpus-id", "score"]].to_csv(os.path.join(beir_data_dir, "qrels", "all.tsv"), index=False, sep="\t")

    with jsonlines.open(os.path.join(beir_data_dir, "queries.jsonl"), "w") as writer:
        writer.write_all(queries_beir)

    with jsonlines.open(os.path.join(beir_data_dir, "corpus.jsonl"), "w") as writer:
        writer.write_all(corpus_beir)

    return queries_beir, corpus_beir, qrels_beir

In [6]:
# Write the BEIR-formatted files for the selected language, then read them back

beir_data_dir = f"../../data/trial/beir_data/{lang}"
_queries_beir, _corpus_beir, _qrels_beir = make_beir_data(data_df, beir_data_dir)

# split="all" selects the qrels/all.tsv file written by make_beir_data
beir_loader = GenericDataLoader(data_folder=beir_data_dir)
corpus, queries, qrels = beir_loader.load(split="all")

100%|██████████| 182/182 [00:00<00:00, 245612.40it/s]


In [7]:
# One retrieval run per pretrained model, keyed by model name
model_results = {}

for model_name in pretrained_models:
    # Wrap the checkpoint for dense retrieval (DRES = DenseRetrievalExactSearch)
    model = DRES(models.SentenceBERT(model_name), batch_size=batch_size)
    retriever = EvaluateRetrieval(model, k_values=k_values, score_function=score_function)

    # Retrieve only -- no metrics are computed in this cell.
    # After the loop `results` still refers to the last model's run.
    results = model_results[model_name] = retriever.retrieve(corpus, queries, return_sorted=True)

Batches: 100%|██████████| 2/2 [00:00<00:00, 110.49it/s]
Batches: 100%|██████████| 12/12 [00:00<00:00, 175.16it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 299.17it/s]
Batches: 100%|██████████| 12/12 [00:00<00:00, 238.23it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 79.82it/s]
Batches: 100%|██████████| 12/12 [00:00<00:00, 58.47it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00, 65.71it/s]
Batches: 100%|██████████| 12/12 [00:00<00:00, 56.53it/s]


In [8]:
def get_sorted_counted_results(model_results):
    """Collect, per outer key, how often and how highly each item was returned.

    Args:
        model_results: dict mapping a model name to that model's results,
            themselves a dict ``{outer_key: {item_id: score}}``. The outer
            keys are presumably query ids as returned by BEIR's ``retrieve``
            -- confirm.

    Returns:
        dict: ``{outer_key: {item_id: (n_models, mean_score)}}`` where
        ``n_models`` is the number of models whose results contain the item
        and ``mean_score`` is the average score over those models only. The
        outer keys are taken from the first model; nothing is sorted here.

    Raises:
        IndexError: if ``model_results`` is empty.
        AssertionError: if a model lacks one of the first model's outer keys.
    """
    reference_run = model_results[list(model_results)[0]]

    pooled = {}
    for outer_key in list(reference_run):
        # item id -> scores from every model that returned it, in model order
        scores_by_item = {}
        for run in model_results.values():
            assert (outer_key in run)
            for item_id, score in run[outer_key].items():
                scores_by_item.setdefault(item_id, []).append(score)

        # Each model contributes at most one score per item, so the number of
        # collected scores equals the number of models that returned it.
        pooled[outer_key] = {
            item_id: (len(scores), sum(scores) / len(scores))
            for item_id, scores in scores_by_item.items()
        }

    return pooled

In [9]:
# all_scores = get_sorted_counted_results(model_results)
# sorted_counts = sorted(counts.items(), key=lambda x: x[::-1], reverse=True)

In [10]:
def get_pooled_results(model_results):
    """Fuse the runs of several models into a single run.

    Every item is scored with ``count * (mean_score / n_models)``, where
    ``count`` and ``mean_score`` come from ``get_sorted_counted_results``.
    Per outer key only the highest-weighted items are kept, as many as the
    first ranking of the first model contains.

    Args:
        model_results: dict ``{model_name: {outer_key: {item_id: score}}}``.

    Returns:
        dict: ``{outer_key: {item_id: weighted_score}}``. Returns an empty
        dict when ``model_results`` is empty or the first model has no
        rankings.
    """
    # Length of the first ranking of the first model = size of each pooled ranking
    first_run = next(iter(model_results.values()), {})
    first_ranking = next(iter(first_run.values()), None)
    if first_ranking is None:
        return {}
    orig_results_length = len(first_ranking)
    n_models = len(model_results)

    all_scores = get_sorted_counted_results(model_results)
    all_sorted_counts = {}

    for outer_key, counts in all_scores.items():
        # Order by (count, mean score) and then item id, descending. This
        # fixes both the tie-breaking of the stable sort below and the key
        # order of the returned dict.
        sorted_counts = sorted(counts.items(), key=lambda x: x[::-1], reverse=True)

        weighted_counts = {
            item_id: (count * (score / n_models))
            for item_id, (count, score) in sorted_counts
        }

        ranked = sorted(weighted_counts.items(), key=lambda x: x[1], reverse=True)
        # A set makes the membership test below O(1) instead of a list scan
        kept = {item_id for item_id, _ in ranked[:orig_results_length]}

        all_sorted_counts[outer_key] = {
            item_id: weight for item_id, weight in weighted_counts.items() if item_id in kept
        }

    return all_sorted_counts

In [11]:
# Combine the per-model runs collected in model_results into one pooled run
pooled_results = get_pooled_results(model_results)

In [12]:
# Save files

# Relevance judgements as loaded from the BEIR folder
with open(f"./qrels_dense_{lang}.json", "w") as fp:
    json.dump(qrels, fp)

# Single-model run: explicitly the last entry of pretrained_models. This is
# the run the leftover loop variable `results` pointed to, without relying on
# a name leaked from the retrieval loop.
with open(f"./run_dense_{lang}.json", "w") as fp:
    json.dump(model_results[pretrained_models[-1]], fp)

# Run pooled over all models
with open(f"./run_dense_pooled_{lang}.json", "w") as fp:
    json.dump(pooled_results, fp)