In [6]:
import json, os, csv
from tqdm.auto import tqdm

# Root folder of the support-evaluation outputs; each task sub-folder holds JSON-lines result files.
DIR = "/store/scratch/n3thakur/trec-rag-2024/trec2024-rag/support_eval/results/support"
OUTPUT_DIR = "/store/scratch/n3thakur/trec-rag-2024/trec2024-rag/support_eval/competition/results/llm_pred_all_301_topics"
RESULTS_DIR = [f"{DIR}/gen/pairwise", f"{DIR}/auggen/pairwise"]
tasks = ["gen", "auggen"]
# Credit per support label: the "weighted" scheme gives PS half credit,
# the "hard" scheme only rewards FS.
SCORES_WEIGHTED = {"NS": 0, "PS": 0.5, "FS": 1.0}
SCORES_HARD = {"NS": 0, "PS": 0, "FS": 1.0}
# Fixed denominator of the run-level "all" row. Presumably the full topic set
# (the output folder is named "all_301_topics"), so topics missing from a file
# count as zero -- confirm before changing.
NUM_TOPICS = 301
METRIC_KEYS = ("weighted_precision", "weighted_recall", "hard_precision", "hard_recall", "sentences")


def _score_topic(support_eval):
    """Compute the support metrics for one topic of one result file.

    Only the first entry of each row's ``eval_scores`` is considered, and only
    when it is one of NS / PS / FS. Precision averages the credit over the rows
    that carry such a label; recall divides the same credit by the total number
    of rows. Either value is 0 when its denominator is 0.

    Args:
        support_eval: list of dicts, each with an ``eval_scores`` list.

    Returns:
        dict with the keys listed in ``METRIC_KEYS``.
    """
    sentences = len(support_eval)
    labels = [
        row["eval_scores"][0]
        for row in support_eval
        if row["eval_scores"] and row["eval_scores"][0] in ("NS", "PS", "FS")
    ]
    weighted_total = sum(SCORES_WEIGHTED[label] for label in labels)
    hard_total = sum(SCORES_HARD[label] for label in labels)
    return {
        "weighted_precision": weighted_total / len(labels) if labels else 0,
        # Guard against an empty support_eval, which used to raise ZeroDivisionError.
        "weighted_recall": weighted_total / sentences if sentences else 0,
        "hard_precision": hard_total / len(labels) if labels else 0,
        "hard_recall": hard_total / sentences if sentences else 0,
        "sentences": sentences,
    }


for results_dir, task in zip(RESULTS_DIR, tasks):
    # Remove the markdown files
    input_filepaths = [name for name in os.listdir(results_dir) if not name.endswith(".md")]
    # Create the output folder once per task, so it exists even for empty input files.
    merged_dir = f"{OUTPUT_DIR}/{task}/merged"
    os.makedirs(merged_dir, exist_ok=True)

    for input_filepath in tqdm(input_filepaths):
        results = {}
        totals = dict.fromkeys(METRIC_KEYS, 0)
        with open(os.path.join(results_dir, input_filepath), 'r') as fin:
            for line in fin:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                data = json.loads(line)
                if len(data['answer']) != len(data['support_eval']):
                    raise ValueError(
                        f"{input_filepath}: topic {data['topic_id']} has {len(data['answer'])} "
                        f"answer entries but {len(data['support_eval'])} support judgments"
                    )
                topic_scores = _score_topic(data['support_eval'])
                for key in METRIC_KEYS:
                    totals[key] += topic_scores[key]
                results[data['topic_id']] = topic_scores

        output_filepath = f"{merged_dir}/{input_filepath}"
        print("Writing to file: ", input_filepath)
        with open(output_filepath, 'w') as fout:
            # One JSON line per topic ...
            for topic_id, result in results.items():
                fout.write(json.dumps({"topic_id": topic_id, **result}) + "\n")
            # ... followed by the run-level row averaged over NUM_TOPICS.
            fout.write(json.dumps({
                "topic_id": "all",
                **{key: totals[key] / NUM_TOPICS for key in METRIC_KEYS},
            }) + "\n")
                    
                

  4%|▍         | 4/97 [00:00<00:05, 16.05it/s]

Writing to file:  IITD-IRL.zeph_rag_mistral_expansion_rrf_5.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p1_straight_ht.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p1_reverse.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p2_reverse_ht.jsonl


  9%|▉         | 9/97 [00:00<00:04, 18.85it/s]

Writing to file:  IITD-IRL.zeph_rag_mistral_expansion_rrf_10.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p2_straight_ht.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p2_reverse.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p1_reverse_ht.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p1_straight.jsonl


 12%|█▏        | 12/97 [00:00<00:04, 19.57it/s]

Writing to file:  IIIA-UNIPD.iiia_standard_p1_straight_ht.jsonl
Writing to file:  ldisnu.ragnarok.jsonl
Writing to file:  ielab.ielab_custom_baseline_blender_70b_filtered_meta-llama-Meta-Llama-3.1-70B-Instruct_llm_based_attribution_trec_rag_few_shots.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p2_straight.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p2_straight_ht.jsonl


 18%|█▊        | 17/97 [00:00<00:04, 18.82it/s]

Writing to file:  ielab.ielab_custom_baseline_blender_70b_filtered_meta-llama-Meta-Llama-3.1-70B-Instruct_ad_hoc_attribution.jsonl
Writing to file:  ielab.ielab_custom_baseline_blender_8b_filtered_meta-llama-Meta-Llama-3.1-8B-Instruct_ad_hoc_attribution.jsonl
Writing to file:  ncsu-las.LAS-splade-mxbai-rrf-mmr8-rag24test-doc-multistep.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p1_reverse.jsonl


 24%|██▎       | 23/97 [00:01<00:03, 22.01it/s]

Writing to file:  ielab.ielab_custom_baseline_meta-llama-Meta-Llama-3.1-8B-Instruct_llm_based_attribution_no_trec_rag_few_shots.jsonl
Writing to file:  IITD-IRL.zeph_test_rag_rrf_raw_query.jsonl
Writing to file:  IITD-IRL.zeph_rag_mistral_expansion_rrf_15.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p1_straight.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p2_straight.jsonl


 30%|██▉       | 29/97 [00:01<00:02, 22.75it/s]

Writing to file:  buw.buw_3.jsonl
Writing to file:  buw.oneshot_post_sentenced.jsonl
Writing to file:  IRIT.ISIR-IRIT-zephyr_query_gen.jsonl
Writing to file:  IRIT.ISIR-IRIT-zephyr_sprompt_3p.jsonl
Writing to file:  uog-tht.uog-tht.jsonl
Writing to file:  buw.buw_1.jsonl


 33%|███▎      | 32/97 [00:01<00:02, 23.74it/s]

Writing to file:  IRIT.ISIR-IRIT-zephyr_query_gen_3p.jsonl
Writing to file:  InfoLab.bge-queryAgm.jsonl
Writing to file:  gpt-4o_InfoLab.bm25-ro-defl.jsonl
Writing to file:  ielab.ielab_custom_baseline_blender_70b_filtered_meta-llama-Meta-Llama-3.1-70B-Instruct_llm_based_attribution_trec_rag_few_shots.jsonl
Writing to file:  InfoLab.bge-AnsAI.jsonl


 39%|███▉      | 38/97 [00:01<00:02, 22.33it/s]

Writing to file:  IITD-IRL.zeph_rag_mistral_expansion_rrf_20.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p2_reverse.jsonl
Writing to file:  ncsu-las.LAS-enn-mmr8-rag24test.jsonl
Writing to file:  WaterlooClarke.UWCgarag.jsonl
Writing to file:  ncsu-las.LAS-splade-mxbai-rrf-mmr8-rag24test.jsonl


 45%|████▌     | 44/97 [00:02<00:02, 21.14it/s]

Writing to file:  webis.webis-reuserag-promptedreuse-k10.jsonl
Writing to file:  Ruc01.Ruc01.jsonl
Writing to file:  buw.buw_5.jsonl
Writing to file:  InfoLab.bge-ranker.jsonl
Writing to file:  IRIT.ISIR-IRIT-zephyr_p2.jsonl


 48%|████▊     | 47/97 [00:02<00:02, 19.78it/s]

Writing to file:  citi.academia sinica.jsonl
Writing to file:  buw.buw.jsonl
Writing to file:  citi.academia sinica.jsonl
Writing to file:  citi.academia sinica.jsonl


 55%|█████▍    | 53/97 [00:02<00:02, 19.96it/s]

Writing to file:  citi.academia sinica.jsonl
Writing to file:  ii_research.iiresearch-bm25-top10-llama3-8b-instruct.jsonl
Writing to file:  citi.academia sinica.jsonl
Writing to file:  WaterlooClarke.UWCrag.jsonl
Writing to file:  citi.academia sinica.jsonl


 58%|█████▊    | 56/97 [00:02<00:02, 19.78it/s]

Writing to file:  neu.neu.jsonl
Writing to file:  neu.neu.jsonl
Writing to file:  ielab.ielab_custom_baseline_blender_8b_meta-llama-Meta-Llama-3.1-8B-Instruct_llm_based_attribution_no_trec_rag_few_shots.jsonl
Writing to file:  ielab.ielab_custom_baseline_blender_8b_filtered_meta-llama-Meta-Llama-3.1-8B-Instruct_ad_hoc_attribution.jsonl


 63%|██████▎   | 61/97 [00:02<00:01, 21.65it/s]

Writing to file:  IIIA-UNIPD.iiia_standard_p2_reverse_ht.jsonl
Writing to file:  webis.webis-taskrag-zephyr-gpt4omini-k10.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p1_reverse_ht.jsonl
Writing to file:  webis.webis-manual.jsonl
Writing to file:  webis.webis-taskrag-zephyr-llama31-k10.jsonl
Writing to file:  ielab.ielab_custom_baseline_blender_8b_filtered_meta-llama-Meta-Llama-3.1-8B-Instruct_llm_based_attribution_no_trec_rag.jsonl


 69%|██████▉   | 67/97 [00:03<00:01, 22.75it/s]

Writing to file:  uog-tht.uog-tht.jsonl
Writing to file:  SGU.Sai Gon University.jsonl
Writing to file:  uog-tht.uog-tht.jsonl
Writing to file:  ldisnu.ragnarok.jsonl
Writing to file:  ldisnu.ragnarok.jsonl


 72%|███████▏  | 70/97 [00:03<00:01, 22.54it/s]

Writing to file:  webis.webis-taskrag-zephyr-gpt4omini-k20.jsonl
Writing to file:  citi.academia sinica.jsonl
Writing to file:  IITD-IRL.zeph_test_rag_rrf_expand_query.jsonl
Writing to file:  ncsu-las.LAS-splade-mxbai-rrf-occams_segment_selection_50-titleraggy_test.jsonl


 78%|███████▊  | 76/97 [00:03<00:01, 19.98it/s]

Writing to file:  ldisnu.ragnarok.jsonl
Writing to file:  coordinators.baseline_frag_rag24.test_command-r-plus_top20.jsonl
Writing to file:  softbank-meisei.ragtask-bm25-rank_zephyr-gpt4o-llama70b.jsonl
Writing to file:  ncsu-las.LAS-splade-mxbai-mmr8-rag24test.jsonl


 85%|████████▍ | 82/97 [00:03<00:00, 22.22it/s]

Writing to file:  IITD-IRL.zeph_test.rag24.rrf.jsonl
Writing to file:  TREMA-UNH.baseline.jsonl
Writing to file:  ielab.ielab_custom_baseline_blender_8b_filtered_meta-llama-Meta-Llama-3.1-8B-Instruct_llm_based_attribution_no_trec_rag_few_shots.jsonl
Writing to file:  webis.webis-reuserag-promptedreuse-clustered.jsonl
Writing to file:  gpt-4o_coordinators.coordinators.all_nuggets.jsonl
Writing to file:  gpt-4o_coordinators.coordinators.anserini_bm25.rag24.test_top1.jsonl


 90%|████████▉ | 87/97 [00:04<00:00, 25.60it/s]

Writing to file:  gpt-4o_coordinators.coordinators.fs4_bm25+rocchio_snowael_snowaem_gtel+monot5_rrf+rz_rrf.rag24.test_top1.jsonl
Writing to file:  coordinators.coordinators.all_nuggets.jsonl
Writing to file:  coordinators.coordinators.all_nuggets.jsonl
Writing to file:  coordinators.coordinators.fs4_bm25+rocchio_snowael_snowaem_gtel+monot5_rrf+rz_rrf.rag24.test_top1.jsonl
Writing to file:  InfoLab.UdInfo-RAG-bge-t.jsonl.jsonl
Writing to file:  h2oloo.listgalore_gpt4o_ragnarokv4nocite_top20.jsonl
Writing to file:  h2oloo.listgalore_l31-70b_ragnarokv4nocite_top20.jsonl


 97%|█████████▋| 94/97 [00:04<00:00, 25.91it/s]

Writing to file:  coordinators.baseline_frag_rag24.test_gpt-4o_top20.jsonl
Writing to file:  WaterlooClarke.UWCrag_stepbystep.jsonl
Writing to file:  softbank-meisei.rag_bm25-colbert_faiss-gpt4o-llama70b.jsonl
Writing to file:  coordinators.coordinators.anserini_bm25.rag24.test_top1.jsonl
Writing to file:  InfoLab.UdInfo-RAG-bgeQueryAgm-t.jsonl


100%|██████████| 97/97 [00:04<00:00, 21.67it/s]


Writing to file:  InfoLab.UdInfo-RAG-bgeAnsAi-t.jsonl
Writing to file:  h2oloo.listgalore_gpt4o_ragnarokv4_top20.jsonl
Writing to file:  h2oloo.listgalore_l31-70b_ragnarokv4_top20.jsonl


  0%|          | 0/53 [00:00<?, ?it/s]

Writing to file:  CIR.cir_gpt-4o-mini_Cosine_50_0.5_100_301_p2.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p1_straight_ht_ag.jsonl


  6%|▌         | 3/53 [00:00<00:02, 23.32it/s]

Writing to file:  CIR.cir_gpt-4o-mini_Cosine_50_0.25_100_301_p1.jsonl
Writing to file:  CIR.cir_gpt-4o-mini_Cosine_50_0.5_100_301_p3.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p1_reverse_ht_ag.jsonl


 11%|█▏        | 6/53 [00:00<00:02, 22.43it/s]

Writing to file:  webis.webis-taskrag-gpt4omini-k20.jsonl
Writing to file:  CIR.cir_gpt-4o-mini_Cosine_20_0.5_100_301_p1.jsonl


 17%|█▋        | 9/53 [00:00<00:01, 25.15it/s]

Writing to file:  CIR.cir_gpt-4o-mini_Jaccard_50_1.0_100_301_p0.jsonl
Writing to file:  CIR.cir_gpt-4o-mini_Cosine_50_0.5_100_301_p1.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p1_straight_ht_ag.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p2_reverse_ag.jsonl


 23%|██▎       | 12/53 [00:00<00:01, 26.63it/s]

Writing to file:  IIIA-UNIPD.iiia_dedup_p2_straight_ht_ag.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p2_straight_ag.jsonl


 28%|██▊       | 15/53 [00:00<00:01, 25.80it/s]

Writing to file:  webis.webis-reuserag-baseline-promptedreuse-clustered.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p1_straight_ag.jsonl
Writing to file:  KML.cmd_plus_prompt.jsonl
Writing to file:  IITD-IRL.ag_rag_gpt35_expansion_rrf_20.jsonl


 34%|███▍      | 18/53 [00:00<00:01, 21.57it/s]

Writing to file:  IIIA-UNIPD.iiia_standard_p2_reverse_ag.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p2_reverse_ht_ag.jsonl
Writing to file:  IIIA-UNIPD.iiia_standard_p2_straight_ag.jsonl


 40%|███▉      | 21/53 [00:00<00:01, 20.38it/s]

Writing to file:  IIIA-UNIPD.iiia_standard_p2_straight_ht_ag.jsonl


 45%|████▌     | 24/53 [00:01<00:01, 20.24it/s]

Writing to file:  IIIA-UNIPD.iiia_standard_p1_straight_ag.jsonl
Writing to file:  CIR.cir_gpt-4o-mini_no_reranking_50_0.5_100_301_p1.jsonl
Writing to file:  IITD-IRL.ag_rag_mistral_expansion_rrf_20.jsonl
Writing to file:  webis.webis-taskrag-gpt4omini-k10.jsonl
Writing to file:  IITD-IRL.ag_rag_gpt35_expansion_rrf_7.jsonl


 57%|█████▋    | 30/53 [00:01<00:00, 23.14it/s]

Writing to file:  IIIA-UNIPD.iiia_standard_p1_reverse_ag.jsonl
Writing to file:  CIR.cir_gpt-4o-mini_Jaccard_50_0.5_100_301_p0.jsonl
Writing to file:  IITD-IRL.ag_rag_mistral_expansion_rrf_7.jsonl
Writing to file:  IITD-IRL.ag_rag_mistral_expansion_rrf_15.jsonl
Writing to file:  IITD-IRL.ag_rag_gpt35_expansion_rrf_15.jsonl


 62%|██████▏   | 33/53 [00:01<00:00, 22.82it/s]

Writing to file:  KML.chatgpt_4_mini.jsonl
Writing to file:  uis-iai.ginger.jsonl
Writing to file:  TREMA-UNH.generated_pormpt_based_sort_flag_True_ragnarbm25.jsonl
Writing to file:  TREMA-UNH.generated_rule_based_sort_flag_False_ragnarbm25.jsonl


 68%|██████▊   | 36/53 [00:01<00:00, 23.40it/s]

Writing to file:  coordinators.baseline_rag24.test_command-r-plus_top20.jsonl


 74%|███████▎  | 39/53 [00:01<00:00, 23.51it/s]

Writing to file:  softbank-meisei.agtask-bm25-colbert_faiss-gpt4o-llama70b.jsonl
Writing to file:  uis-iai.baseline_top_5.jsonl
Writing to file:  ldisnu.ragnarok.jsonl
Writing to file:  uis-iai.ginger-fluency_top_5.jsonl


 79%|███████▉  | 42/53 [00:01<00:00, 23.49it/s]

Writing to file:  uis-iai.ginger-fluency_top_20.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p1_reverse_ag.jsonl


 85%|████████▍ | 45/53 [00:01<00:00, 22.88it/s]

Writing to file:  webis.webis-reuserag-baseline-promptedreuse-k10.jsonl
Writing to file:  uis-iai.ginger-fluency_top_10.jsonl
Writing to file:  CIR.cir_gpt-4o-mini_Cosine_50_0.75_100_301_p1.jsonl


 91%|█████████ | 48/53 [00:02<00:00, 24.16it/s]

Writing to file:  CIR.cir_gpt-4o-mini_Cosine_50_1.0_100_301_p1.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p1_reverse_ht_ag.jsonl
Writing to file:  IIIA-UNIPD.iiia_dedup_p2_reverse_ht_ag.jsonl
Writing to file:  InfoLab.ag-v1.jsonl


 96%|█████████▌| 51/53 [00:02<00:00, 19.42it/s]

Writing to file:  KML.chatgpt_4_mini.jsonl
Writing to file:  InfoLab.AGv2.jsonl


100%|██████████| 53/53 [00:02<00:00, 22.29it/s]

Writing to file:  coordinators.baseline_rag24.test_gpt-4o_top20.jsonl
Writing to file:  coordinators.baseline_rag24.test_l31_70b_instruct_top20.jsonl





In [None]:
import json, os, csv
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt

DIR = "/store/scratch/n3thakur/trec-rag-2024/trec2024-rag/support_eval/results/support"
RESULTS_DIR = [f"{DIR}/rag-track/pairwise", f"{DIR}/ag-track/pairwise"]
RESULTS_DIR_OUTPUT = "/store/scratch/n3thakur/trec-rag-2024/trec2024-rag/support_eval/summary/support"
# Credit per support label, and the counter each label increments.
SUPPORT_SCORES = {"NS": 0, "PS": 0.5, "FS": 1.0}
COUNT_KEYS = {"FS": "full_support_count", "PS": "partial_support_count", "NS": "no_support_count"}


def _sentence_support_score(eval_scores):
    """Return the mean credit over one row's labels.

    A row with no labels, or with any label outside NS / PS / FS (including
    ``None``), scores 0.
    """
    if not eval_scores or any(score not in SUPPORT_SCORES for score in eval_scores):
        return 0
    return sum(SUPPORT_SCORES[score] for score in eval_scores) / len(eval_scores)


# The plot and CSV are written here; make sure the folder exists up front.
os.makedirs(RESULTS_DIR_OUTPUT, exist_ok=True)

for idx, results_dir in enumerate(RESULTS_DIR):

    topic_results = {}

    # Remove the markdown files
    input_filepaths = [name for name in os.listdir(results_dir) if not name.endswith(".md")]
    for input_filepath in tqdm(input_filepaths, desc=f"Processing {results_dir}", total=len(input_filepaths)):
        with open(os.path.join(results_dir, input_filepath), 'r') as fin:
            for line in fin:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                data = json.loads(line)
                topic_id = data["topic_id"]
                if topic_id not in topic_results:
                    topic_results[topic_id] = {
                        "topic": data["topic"],
                        "num_responses": 0,
                        "num_sentences": 0,
                        "avg_response_length": 0,
                        "full_support_count": 0,
                        "partial_support_count": 0,
                        "no_support_count": 0,
                        "support_score": 0
                    }
                stats = topic_results[topic_id]

                support_scores = []
                for row in data['support_eval']:
                    support_scores.append(_sentence_support_score(row['eval_scores']))
                    # Label counts include every recognised label, even in rows scored 0.
                    for score in row['eval_scores']:
                        if score in COUNT_KEYS:
                            stats[COUNT_KEYS[score]] += 1

                stats["num_responses"] += 1
                stats["avg_response_length"] += data['response_length']
                stats["num_sentences"] += len(data['support_eval'])
                # A response without any judged rows contributes 0 instead of dividing by zero.
                stats["support_score"] += sum(support_scores) / len(support_scores) if support_scores else 0

    # Average over the responses actually seen for each topic, so a topic that is
    # missing from some files is not biased towards zero.
    for stats in topic_results.values():
        stats["avg_response_length"] /= stats["num_responses"]
        stats["num_sentences"] /= stats["num_responses"]
        stats["support_score"] /= stats["num_responses"]

    # Topic-wise scatter plot: avg. response length on the x-axis, support score on the y-axis.
    title = "RAG Track" if idx == 0 else "AG Track"
    plt.figure(figsize=(7, 7))
    plt.scatter(
        x=[stats["avg_response_length"] for stats in topic_results.values()],
        y=[stats["support_score"] for stats in topic_results.values()],
    )
    plt.ylabel("Support Score", fontsize=14)
    plt.xlabel("Avg. Response Length", fontsize=14)
    plt.title("Topic wise statistics in {}".format(title), fontsize=16)
    plt.legend(["Topics"], loc='upper right')
    plt.savefig(f"{RESULTS_DIR_OUTPUT}/topicwise_support_score_vs_avg_response_length_{title.lower().replace(' ', '_')}.png")

    output_filepath = "topicwise_support_results_rag_track.csv" if idx == 0 else "topicwise_support_results_ag_track.csv"
    # sort topics based on highest to lowest support score
    topic_results = dict(sorted(topic_results.items(), key=lambda item: item[1]["support_score"], reverse=True))

    # newline="" lets the csv module control line endings (avoids blank rows on Windows).
    with open(f"{RESULTS_DIR_OUTPUT}/{output_filepath}", 'w', newline="") as fout:
        writer = csv.writer(fout, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Topic ID", "Topic", "Avg. #Sentences", "Avg. Response Length", "Support Score", "FS Count", "PS Count", "NS Count"])
        for topic_id, stats in topic_results.items():
            writer.writerow([
                topic_id,
                stats["topic"],
                round(stats["num_sentences"], 2),
                round(stats["avg_response_length"], 2),
                round(stats["support_score"], 2),
                round(stats["full_support_count"], 2),
                round(stats["partial_support_count"], 2),
                round(stats["no_support_count"], 2)
                ])


In [None]:
import csv
import json
import os

import matplotlib.pyplot as plt
from tqdm.autonotebook import tqdm

DIR = "/store/scratch/n3thakur/trec-rag-2024/trec2024-rag/support_eval/results/support"
RESULTS_DIR = [f"{DIR}/rag-track/pairwise", f"{DIR}/ag-track/pairwise"]
RESULTS_DIR_OUTPUT = "/store/scratch/n3thakur/trec-rag-2024/trec2024-rag/support_eval/summary/support"
# Credit per support label, and the counter each label increments.
SUPPORT_SCORES = {"NS": 0, "PS": 0.5, "FS": 1.0}
COUNT_KEYS = {"FS": "full_support_count", "PS": "partial_support_count", "NS": "no_support_count"}


def _sentence_support_score(eval_scores):
    """Return the mean credit over one row's labels.

    A row with no labels, or with any label outside NS / PS / FS (including
    ``None``), scores 0.
    """
    if not eval_scores or any(score not in SUPPORT_SCORES for score in eval_scores):
        return 0
    return sum(SUPPORT_SCORES[score] for score in eval_scores) / len(eval_scores)


# The plot and CSV are written here; make sure the folder exists up front.
os.makedirs(RESULTS_DIR_OUTPUT, exist_ok=True)

for idx, results_dir in enumerate(RESULTS_DIR):
    participant_results = {}
    # Remove the markdown files
    input_filepaths = [name for name in os.listdir(results_dir) if not name.endswith(".md")]
    for input_filepath in tqdm(input_filepaths, desc=f"Processing {results_dir}", total=len(input_filepaths)):
        total_topics = 0
        with open(os.path.join(results_dir, input_filepath), 'r') as fin:
            for line in fin:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                total_topics += 1
                data = json.loads(line)

                if input_filepath not in participant_results:
                    participant_results[input_filepath] = {
                        "run_id": data["run_id"],
                        "num_sentences": 0,
                        "avg_response_length": 0,
                        "full_support_count": 0,
                        "partial_support_count": 0,
                        "no_support_count": 0,
                        "support_score": 0
                    }
                stats = participant_results[input_filepath]

                support_scores = []
                for row in data['support_eval']:
                    support_scores.append(_sentence_support_score(row['eval_scores']))
                    # Label counts include every recognised label, even in rows scored 0.
                    for score in row['eval_scores']:
                        if score in COUNT_KEYS:
                            stats[COUNT_KEYS[score]] += 1

                stats["avg_response_length"] += data['response_length']
                stats["num_sentences"] += len(data['support_eval'])
                # A response without any judged rows contributes 0 instead of dividing by zero.
                stats["support_score"] += sum(support_scores) / len(support_scores) if support_scores else 0

        if total_topics == 0:
            # A file without records has no entry and nothing to average; leave it out.
            continue
        stats = participant_results[input_filepath]
        stats["avg_response_length"] /= total_topics
        stats["num_sentences"] /= total_topics
        stats["support_score"] /= total_topics

    # Run-wise scatter plot: avg. response length on the x-axis, support score on the y-axis.
    # Iterates participant_results (not input_filepaths) so skipped files cannot raise KeyError.
    title = "RAG Track" if idx == 0 else "AG Track"
    plt.figure(figsize=(7, 7))
    plt.scatter(
        x=[stats["avg_response_length"] for stats in participant_results.values()],
        y=[stats["support_score"] for stats in participant_results.values()],
    )
    plt.ylabel("Support Score", fontsize=14)
    plt.xlabel("Avg. Response Length", fontsize=14)
    plt.title("Participant wise statistics in {}".format(title), fontsize=16)
    plt.legend(["Runs"], loc='upper right')
    plt.savefig(f"{RESULTS_DIR_OUTPUT}/participantwise_support_score_vs_avg_response_length_{title.lower().replace(' ', '_')}.png")

    output_filepath = "participantwise_support_results_rag_track.csv" if idx == 0 else "participantwise_support_results_ag_track.csv"

    # sort participant results based on highest to lowest support score
    participant_results = dict(sorted(participant_results.items(), key=lambda item: item[1]["support_score"], reverse=True))

    # newline="" lets the csv module control line endings (avoids blank rows on Windows).
    with open(f"{RESULTS_DIR_OUTPUT}/{output_filepath}", 'w', newline="") as fout:
        writer = csv.writer(fout, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["File Name", "Run ID", "Avg. #Sentences", "Avg. Response Length", "Support Score", "FS Count", "PS Count", "NS Count"])
        for input_filepath, stats in participant_results.items():
            writer.writerow([
                input_filepath.replace(".jsonl", ""),
                stats["run_id"],
                round(stats["num_sentences"], 2),
                round(stats["avg_response_length"], 2),
                round(stats["support_score"], 2),
                round(stats["full_support_count"], 2),
                round(stats["partial_support_count"], 2),
                round(stats["no_support_count"], 2)
                ])