In [1]:
import os
import json

# Dataset directories; the notebook reads a queries.tsv and a qrels.tsv from each.
Query1_dir = "data/beir-quora-test"
Query2_dir = "data/nano-beir-arguana"

def load_queries_tsv(path):
    """Read a two-column TSV of queries into a dict keyed by query ID.

    Each line is stripped of surrounding whitespace and split on tabs. Only
    lines that yield exactly two fields are kept; every other line (blank,
    no tab, or more than one tab) is silently ignored. When a query ID is
    repeated, the last occurrence supplies the stored entry.

    Args:
        path: Path to the UTF-8 encoded TSV file.

    Returns:
        dict mapping query ID -> {'text': <query text>, 'is_relevant': False}.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Lazily split each line; the comprehension below consumes the
        # generator while the file is still open.
        rows = (raw.strip().split('\t') for raw in handle)
        return {
            fields[0]: {'text': fields[1], 'is_relevant': False}
            for fields in rows
            if len(fields) == 2
        }

def extract_queries_from_qrels(qrels_path, queries_dict):
    """Select the queries referenced by a qrels TSV file and flag them relevant.

    The first tab-separated field of every line in the qrels file is treated
    as a query ID. Each ID that is also a key of ``queries_dict`` is placed in
    the result; IDs that are empty or unknown are skipped. Dict entries are
    copied with ``'is_relevant'`` set to True, so the caller's ``queries_dict``
    is left unmodified. Non-dict entries are passed through unchanged.

    Args:
        qrels_path: Path to the UTF-8 encoded qrels TSV file.
        queries_dict: Mapping of query ID -> query entry, e.g. the output of
            ``load_queries_tsv``.

    Returns:
        dict mapping query ID -> entry for every query ID found in the qrels
        file, in order of first appearance.
    """
    used_queries = {}
    with open(qrels_path, 'r', encoding='utf-8') as f:
        for line in f:
            # str.split always yields at least one field, so a length check
            # cannot detect blank lines; test the ID itself instead.
            query_id = line.strip().split('\t')[0]
            if not query_id or query_id not in queries_dict:
                continue
            entry = queries_dict[query_id]
            if isinstance(entry, dict):
                # Copy rather than mutate so the source pool keeps its flags.
                entry = {**entry, 'is_relevant': True}
            used_queries[query_id] = entry
    return used_queries


In [2]:

# Parse every (query ID, text) row from each dataset's queries file.
queries1 = load_queries_tsv(f"{Query1_dir}/queries.tsv")
queries2 = load_queries_tsv(f"{Query2_dir}/queries.tsv")

# Narrow each set down to the query IDs that its qrels file references.
queries_set_1 = extract_queries_from_qrels(f"{Query1_dir}/qrels.tsv", queries1)
queries_set_2 = extract_queries_from_qrels(f"{Query2_dir}/qrels.tsv", queries2)

# Merge both datasets into one pool; if an ID occurs in both, the entry from
# the second dataset replaces the one from the first.
all_queries = dict(queries_set_1)
all_queries.update(queries_set_2)

# Persist the pool (query ID -> entry) as JSON.
with open("query_pool.json", "w", encoding="utf-8") as f:
    json.dump(all_queries, f)

print(f"✅ Query pool built with {len(all_queries)} queries.")


✅ Query pool built with 10050 queries.


In [3]:
import json
from collections import defaultdict

# Query pool JSON that build_query_freq_dict reads.
INPUT_FILE = "query_pool.json"
# JSON file that receives the query text -> occurrence count mapping.
OUTPUT_FILE = "query_frequencies.json"

def build_query_freq_dict(input_file=None, output_file=None):
    """Count how often each distinct query text occurs in a query pool file.

    The pool is a JSON object whose values are dicts carrying a "text" field.
    Texts are stripped of surrounding whitespace before counting; entries
    whose text is missing, None, or blank are skipped. Several query IDs that
    share the same text therefore produce a count greater than one.

    Args:
        input_file: Path of the query pool JSON. Defaults to the module-level
            INPUT_FILE when omitted.
        output_file: Path the frequency JSON is written to. Defaults to the
            module-level OUTPUT_FILE when omitted.

    Returns:
        None. The mapping of query text -> count is written to ``output_file``
        (indented, non-ASCII characters kept as-is) and a confirmation line
        is printed.
    """
    # Resolve defaults at call time so later changes to the constants are honoured.
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = OUTPUT_FILE

    with open(input_file, "r", encoding="utf-8") as f:
        raw_queries = json.load(f)

    freq_dict = defaultdict(int)
    for item in raw_queries.values():
        # `or ""` guards against an explicit null text, which has no .strip().
        query_text = (item.get("text") or "").strip()
        if query_text:
            freq_dict[query_text] += 1

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(freq_dict, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved query frequency file to: {output_file}")

# Build the frequency file only when this code runs as the main module;
# importing it from elsewhere does not trigger the build.
if __name__ == "__main__":
    build_query_freq_dict()

✅ Saved query frequency file to: query_frequencies.json
