In [None]:
import json

# Merge two run files into one: records whose question_type is
# "inference_query" are taken from the local run, all others from the
# global run.
LOCAL_PATH  = "local_run.json"
GLOBAL_PATH = "global_run.json"
FULL_PATH   = "full_run.json"


def _load_run_by_query(path):
    """Load a run file (a JSON list of records) and index it by each record's "query".

    When the same query occurs more than once the last record wins (plain
    dict semantics), but the collapse is reported instead of passing silently.
    """
    with open(path, encoding="utf-8") as f:
        records = json.load(f)
    by_query = {r["query"]: r for r in records}
    n_collapsed = len(records) - len(by_query)
    if n_collapsed:
        print(f"⚠ {path}: {n_collapsed} duplicate queries collapsed (last record kept)")
    return by_query


# --------------------------------------------------------------------
# 1. load both runs into dict[query] → record
# --------------------------------------------------------------------
local_run  = _load_run_by_query(LOCAL_PATH)
global_run = _load_run_by_query(GLOBAL_PATH)

# Both runs must cover exactly the same queries; say which side is off if not.
only_local  = local_run.keys() - global_run.keys()
only_global = global_run.keys() - local_run.keys()
assert not (only_local or only_global), (
    "Local & global sets differ! "
    f"{len(only_local)} queries only in {LOCAL_PATH}, "
    f"{len(only_global)} queries only in {GLOBAL_PATH}"
)

# --------------------------------------------------------------------
# 2. merge according to question_type
# --------------------------------------------------------------------
# Iteration follows the local run's order, so the output keeps that order.
full_run = [
    local_rec if local_rec["question_type"] == "inference_query" else global_run[q]
    for q, local_rec in local_run.items()
]

# --------------------------------------------------------------------
# 3. save
# --------------------------------------------------------------------
with open(FULL_PATH, "w", encoding="utf-8") as f:
    json.dump(full_run, f, ensure_ascii=False, indent=2)

# Report success only once the file has actually been written.
print(f"✓ merged {len(full_run)} records → {FULL_PATH}")


✓ merged 2556 records → full_run.json


In [None]:
import json
from tqdm import tqdm
import re
from collections import Counter

In [None]:
def eval(fname, q_file='MultiHopRAG.json'):
    """Score a run file against gold answers; print per-type and overall metrics.

    NOTE: this definition shadows the builtin ``eval`` for the rest of the
    notebook. The name is kept because other cells call it.

    Parameters
    ----------
    fname  : JSON file holding a list of records; each record is read for its
             'query', 'question_type' and 'model_answer' keys.
    q_file : JSON file holding a list of records with 'query' and 'answer'
             keys (the gold answers). If a query appears more than once, the
             first record is used.

    A prediction counts as correct when, after lower-casing, it shares at
    least one whitespace-separated word with the gold answer. Records whose
    query has no (non-empty) gold answer in *q_file* are skipped.

    Returns
    -------
    None. All results are printed.
    """
    # Read files. UTF-8 is explicit because the run files produced in this
    # notebook are written as UTF-8 with ensure_ascii=False; relying on the
    # platform default encoding can fail on non-ASCII text.
    with open(fname, 'r', encoding='utf-8') as file:
        doc_data = json.load(file)

    with open(q_file, 'r', encoding='utf-8') as file:
        query_data = json.load(file)

    # Index gold records by query once (first occurrence wins) instead of
    # scanning the whole list for every evaluated record.
    gold_record_by_query = {}
    for q in query_data:
        gold_record_by_query.setdefault(q['query'], q)

    # Function to get the correct answer ('' when the query is unknown)
    def get_gold(query):
        record = gold_record_by_query.get(query)
        return record['answer'] if record is not None else ''

    # Function to check if there is an intersection of words between two strings
    def has_intersection(a, b):
        return not set(a.split()).isdisjoint(b.split())

    # Function to extract the answer; falls back to the full string
    def extract_answer(input_string):
        match = re.search(r'The answer to the question is "(.*?)"', input_string)
        return match.group(1) if match else input_string

    # Predictions and gold answers, grouped by question_type and overall
    type_data = {}
    overall_pred_list = []
    overall_gold_list = []

    # Main loop, iterate through document data
    for d in tqdm(doc_data):
        model_answer = d['model_answer']
        if 'The answer' in model_answer:
            model_answer = extract_answer(model_answer)
        gold = get_gold(d['query'])
        if not gold:
            # No usable gold answer: the record is left out of every metric.
            continue
        bucket = type_data.setdefault(d['question_type'], {'pred_list': [], 'gold_list': []})
        bucket['pred_list'].append(model_answer)
        bucket['gold_list'].append(gold)
        overall_pred_list.append(model_answer)
        overall_gold_list.append(gold)

    # Function to calculate evaluation metrics
    def calculate_metrics(pred_list, gold_list):
        hits = [has_intersection(pred.lower(), gold.lower())
                for pred, gold in zip(pred_list, gold_list)]
        total = len(hits)
        tp = sum(hits)
        fp = total - tp           # every miss is counted as a false positive ...
        fn = len(gold_list) - tp  # ... and as a false negative (so precision == recall)

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
        # Accuracy is the share of correct predictions. Every evaluated record
        # has a gold answer, so there are no true negatives; treating misses
        # as true negatives would report 0.333 for a run with zero hits.
        accuracy = tp / total if total > 0 else 0

        return precision, recall, f1, accuracy

    # Output evaluation data for each question_type
    for question_type, data in type_data.items():
        precision, recall, f1, accuracy = calculate_metrics(data['pred_list'], data['gold_list'])
        print(f"Question Type: {question_type}")
        print(f" Precision: {precision:.3f}")
        print(f" Recall: {recall:.3f}")
        print(f" F1 Score: {f1:.3f}")
        print(f" accuracy: {accuracy:.3f}")

    # Calculate overall evaluation metrics
    overall_precision, overall_recall, overall_f1, overall_accuracy = calculate_metrics(overall_pred_list, overall_gold_list)
    print("Overall Metrics:")
    print(f" Precision: {overall_precision:.3f}")
    print(f" Recall: {overall_recall:.3f}")
    print(f" F1 Score: {overall_f1:.3f}")
    print(f" Accuracy: {overall_accuracy:.3f}")

In [None]:
# Score 'full_run.json' using eval's default gold file (q_file is not passed).
# Alternative run files previously scored from this cell (disabled):
# eval('full_run_len_600_limit_50.json')
# eval('deep_full_run_len_600_limit_50.json')
eval('full_run.json')

100%|██████████| 2556/2556 [00:00<00:00, 14395.11it/s]


Question Type: inference_query
 Precision: 0.920
 Recall: 0.920
 F1 Score: 0.920
 accuracy: 0.863
Question Type: comparison_query
 Precision: 0.210
 Recall: 0.210
 F1 Score: 0.210
 accuracy: 0.388
Question Type: null_query
 Precision: 0.246
 Recall: 0.246
 F1 Score: 0.246
 accuracy: 0.399
Question Type: temporal_query
 Precision: 0.415
 Recall: 0.415
 F1 Score: 0.415
 accuracy: 0.461
Overall Metrics:
 Precision: 0.488
 Recall: 0.488
 F1 Score: 0.488
 Accuracy: 0.494


In [None]:
# Score 'local_run.json' using eval's default gold file (q_file is not passed).
# Alternative run files previously scored from this cell (disabled):
# eval('local_run_len_600_limit_50.json')
# eval('local_run_limit_10.json')
eval('local_run.json')

100%|██████████| 2556/2556 [00:00<00:00, 15459.22it/s]


Question Type: inference_query
 Precision: 0.920
 Recall: 0.920
 F1 Score: 0.920
 accuracy: 0.863
Question Type: comparison_query
 Precision: 0.072
 Recall: 0.072
 F1 Score: 0.072
 accuracy: 0.350
Question Type: null_query
 Precision: 0.033
 Recall: 0.033
 F1 Score: 0.033
 accuracy: 0.341
Question Type: temporal_query
 Precision: 0.148
 Recall: 0.148
 F1 Score: 0.148
 accuracy: 0.370
Overall Metrics:
 Precision: 0.356
 Recall: 0.356
 F1 Score: 0.356
 Accuracy: 0.437


In [None]:
# Score 'global_run.json' using eval's default gold file (q_file is not passed).
# Alternative run files previously scored from this cell (disabled):
# eval('global_run_len_600_limit_50.json')
# eval('deep_global_run_len_600_limit_50.json')
eval('global_run.json')

100%|██████████| 2556/2556 [00:00<00:00, 15251.56it/s]


Question Type: inference_query
 Precision: 0.450
 Recall: 0.450
 F1 Score: 0.450
 accuracy: 0.476
Question Type: comparison_query
 Precision: 0.210
 Recall: 0.210
 F1 Score: 0.210
 accuracy: 0.388
Question Type: null_query
 Precision: 0.246
 Recall: 0.246
 F1 Score: 0.246
 accuracy: 0.399
Question Type: temporal_query
 Precision: 0.415
 Recall: 0.415
 F1 Score: 0.415
 accuracy: 0.461
Overall Metrics:
 Precision: 0.338
 Recall: 0.338
 F1 Score: 0.338
 Accuracy: 0.430


In [None]:
# Score each Qwen run file in turn, printing the file name as a header so the
# two metric blocks can be told apart in the output.
for fname in ('qwen_llm_run.json', 'qwen_rag_run.json'):
    print(fname)
    eval(fname=fname)

qwen_llm_run.json


100%|██████████| 2556/2556 [00:00<00:00, 15502.11it/s]


Question Type: inference_query
 Precision: 0.645
 Recall: 0.645
 F1 Score: 0.645
 accuracy: 0.585
Question Type: comparison_query
 Precision: 0.030
 Recall: 0.030
 F1 Score: 0.030
 accuracy: 0.340
Question Type: null_query
 Precision: 0.043
 Recall: 0.043
 F1 Score: 0.043
 accuracy: 0.343
Question Type: temporal_query
 Precision: 0.106
 Recall: 0.106
 F1 Score: 0.106
 accuracy: 0.359
Overall Metrics:
 Precision: 0.245
 Recall: 0.245
 F1 Score: 0.245
 Accuracy: 0.399
qwen_rag_run.json


100%|██████████| 2556/2556 [00:00<00:00, 15914.47it/s]


Question Type: inference_query
 Precision: 0.886
 Recall: 0.886
 F1 Score: 0.886
 accuracy: 0.814
Question Type: comparison_query
 Precision: 0.090
 Recall: 0.090
 F1 Score: 0.090
 accuracy: 0.355
Question Type: null_query
 Precision: 0.233
 Recall: 0.233
 F1 Score: 0.233
 accuracy: 0.394
Question Type: temporal_query
 Precision: 0.170
 Recall: 0.170
 F1 Score: 0.170
 accuracy: 0.376
Overall Metrics:
 Precision: 0.379
 Recall: 0.379
 F1 Score: 0.379
 Accuracy: 0.446


In [None]:
import json
from typing import Sequence

def filter_json_by_indices(
    input_path: str,
    output_path: str,
    indices_path: str,
) -> None:
    """
    Keep only the elements whose 0-based positions are listed in *indices_path*.

    Parameters
    ----------
    input_path   : JSON file with the full list (e.g. answers_all.json)
    output_path  : where the selected subset is written (UTF-8, indent=2)
    indices_path : JSON file containing a list of integer positions

    The output follows the order of the index list, so it matches the input
    order only when the indices are sorted ascending. An empty index list
    yields an empty output file.

    Raises
    ------
    IndexError : if any index is negative or >= the input length. The check
                 runs before anything is written.
    """
    # load everything
    with open(input_path, encoding="utf-8") as f:
        full_list = json.load(f)

    with open(indices_path, encoding="utf-8") as f:
        keep = json.load(f)

    # Sanity check over the whole list: use max()/min() rather than the last
    # element so the check does not depend on the indices being sorted.
    # Negative values are rejected because they would silently index from
    # the end of the list.
    if keep and (max(keep) >= len(full_list) or min(keep) < 0):
        raise IndexError("Index list contains out-of-range values.")

    # select in the order given by *keep*
    filtered = [full_list[i] for i in keep]

    # save
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(filtered, f, ensure_ascii=False, indent=2)

    print(f"✓ wrote {len(filtered)} items → {output_path}")


# --- quick example --------------------------------------------------
# filter_json_by_indices(
#     input_path="answers_all.json",
#     output_path="answers_sampled.json",
#     indices_path="sampled_indices.json",
# )


In [None]:
# Cut both run files down to the positions listed in "sampled_indices.json";
# each subset is written next to its source with a 'sample_' prefix.
for fname in ('qwen_rag_run.json', 'full_run.json'):
    filter_json_by_indices(fname, 'sample_' + fname, "sampled_indices.json")

✓ wrote 125 items → sample_qwen_rag_run.json
✓ wrote 125 items → sample_full_run.json


In [None]:
# Score the 'sample_'-prefixed subsets against 'sampled_queries.json'.
# The printed header is the original (unprefixed) file name.
for fname in ('qwen_rag_run.json', 'full_run.json'):
    print(fname)
    eval('sample_' + fname, q_file='sampled_queries.json')

qwen_rag_run.json


100%|██████████| 125/125 [00:00<00:00, 159019.71it/s]


Question Type: inference_query
 Precision: 1.000
 Recall: 1.000
 F1 Score: 1.000
 accuracy: 1.000
Question Type: comparison_query
 Precision: 0.167
 Recall: 0.167
 F1 Score: 0.167
 accuracy: 0.375
Question Type: temporal_query
 Precision: 0.143
 Recall: 0.143
 F1 Score: 0.143
 accuracy: 0.368
Question Type: null_query
 Precision: 0.400
 Recall: 0.400
 F1 Score: 0.400
 accuracy: 0.455
Overall Metrics:
 Precision: 0.456
 Recall: 0.456
 F1 Score: 0.456
 Accuracy: 0.479
full_run.json


100%|██████████| 125/125 [00:00<00:00, 156550.61it/s]

Question Type: inference_query
 Precision: 0.975
 Recall: 0.975
 F1 Score: 0.975
 accuracy: 0.952
Question Type: comparison_query
 Precision: 0.190
 Recall: 0.190
 F1 Score: 0.190
 accuracy: 0.382
Question Type: temporal_query
 Precision: 0.321
 Recall: 0.321
 F1 Score: 0.321
 accuracy: 0.424
Question Type: null_query
 Precision: 0.267
 Recall: 0.267
 F1 Score: 0.267
 accuracy: 0.405
Overall Metrics:
 Precision: 0.480
 Recall: 0.480
 F1 Score: 0.480
 Accuracy: 0.490





In [None]:
# Score the ms_local / ms_global run files against 'sampled_queries.json',
# printing each file name as a header before its metrics.
for fname in ('ms_local.json', 'ms_global.json'):
    print(fname)
    eval(fname, q_file='sampled_queries.json')

ms_local.json


100%|██████████| 125/125 [00:00<00:00, 145031.26it/s]


Question Type: inference_query
 Precision: 0.975
 Recall: 0.975
 F1 Score: 0.975
 accuracy: 0.952
Question Type: comparison_query
 Precision: 0.071
 Recall: 0.071
 F1 Score: 0.071
 accuracy: 0.350
Question Type: temporal_query
 Precision: 0.107
 Recall: 0.107
 F1 Score: 0.107
 accuracy: 0.359
Question Type: null_query
 Precision: 0.000
 Recall: 0.000
 F1 Score: 0.000
 accuracy: 0.333
Overall Metrics:
 Precision: 0.360
 Recall: 0.360
 F1 Score: 0.360
 Accuracy: 0.439
ms_global.json


100%|██████████| 125/125 [00:00<00:00, 29188.73it/s]

Question Type: inference_query
 Precision: 0.750
 Recall: 0.750
 F1 Score: 0.750
 accuracy: 0.667
Question Type: comparison_query
 Precision: 0.071
 Recall: 0.071
 F1 Score: 0.071
 accuracy: 0.350
Question Type: temporal_query
 Precision: 0.179
 Recall: 0.179
 F1 Score: 0.179
 accuracy: 0.378
Question Type: null_query
 Precision: 0.067
 Recall: 0.067
 F1 Score: 0.067
 accuracy: 0.349





Overall Metrics:
 Precision: 0.312
 Recall: 0.312
 F1 Score: 0.312
 Accuracy: 0.421
