In [1]:
import json
import os

In [2]:
def make_first_letter_uppercase(string):
    """Return ``string`` with its first character upper-cased.

    Only the first character is changed; the remainder of the string is
    returned untouched. An empty string is returned as-is: slicing
    (``string[:1]``) is used instead of indexing (``string[0]``) so that empty
    input does not raise ``IndexError``.
    """
    return string[:1].upper() + string[1:]

def convert_item(item, idx, dataset, category, subdataset):
    """Convert one probe record into a two-choice question about its negated statement.

    Args:
        item: Parsed JSON record. Reads ``obj_label``, the first entry of
            ``masked_misprimed`` and the single entry of ``masked_negations``.
        idx: Position of the record inside its source file; its parity decides
            which choice receives label "A" and which receives "B".
        dataset, category, subdataset: Provenance, copied into ``metadata``.

    Returns:
        A dict with ``question`` (stem + two labelled choices), ``answerKey``
        and ``metadata``.

    Raises:
        AssertionError: if the record does not have exactly one negated
            statement, or that statement does not end with ".".
    """
    assert len(item["masked_negations"]) == 1

    choice_correct = item["obj_label"].strip().lower()
    # The misprimed text is taken up to the first "?"; that prefix is used as
    # the second choice.
    choice_wrong = item["masked_misprimed"][0].split("?")[0].strip().lower()

    negated_question = item["masked_negations"][0].replace("[MASK]", "___")
    # endswith() (rather than [-1]) so an empty statement fails the assertion
    # instead of raising IndexError.
    assert negated_question.endswith(".")
    negated_question = negated_question[:-1] + "?"

    # Alternate the labels by parity of idx. The answer key is always the
    # label given to the misprimed choice (presumably because the stem is
    # negated -- confirm with the dataset design).
    # NOTE(review): the list order is always [obj_label choice, misprimed
    # choice], so for odd idx the labels appear as "B" then "A". Consumers
    # must match on "label", not on list position.
    label_correct, label_wrong = ("A", "B") if idx % 2 == 0 else ("B", "A")

    return {
        "question": {
            "stem": negated_question,
            "choices": [
                {"text": choice_correct, "label": label_correct},
                {"text": choice_wrong, "label": label_wrong},
            ],
        },
        "answerKey": label_wrong,
        "metadata": {
            "dataset": dataset,
            "category": category,
            "subdataset": subdataset,
            "idx": idx,
        },
    }


def collect_items(data_root="data"):
    """Read every ``<data_root>/<dataset>/<category>/*.jsonl`` file and convert its records.

    Missing category directories are skipped. Files are visited in sorted
    name order because ``os.listdir`` returns entries in arbitrary order;
    a stable order keeps the output (and any seeded sampling done on it
    afterwards) reproducible across machines. Blank lines in a file are
    ignored.

    Returns:
        A list of converted items, in (dataset, category, file name, line) order.
    """
    items = []
    for dataset in ["ConceptNet", "GoogleRE", "SQUAD", "TREx"]:
        for category in ["high_ranked", "low_ranked", "random"]:
            category_dir = os.path.join(data_root, dataset, category)
            if not os.path.exists(category_dir):
                continue
            for subdataset in sorted(os.listdir(category_dir)):
                if not subdataset.endswith(".jsonl"):
                    continue
                # Explicit UTF-8 so non-ASCII labels do not depend on the
                # platform's default locale encoding.
                with open(os.path.join(category_dir, subdataset), "r", encoding="utf-8") as f:
                    records = [json.loads(line) for line in f if line.strip()]
                # Conversion happens after the file has been closed.
                for idx, record in enumerate(records):
                    items.append(convert_item(record, idx, dataset, category, subdataset))
    return items


new_data = collect_items()

# json.dumps escapes non-ASCII by default, so the output file is plain ASCII.
with open("processed.jsonl", "w") as f:
    for item in new_data:
        f.write(json.dumps(item) + "\n")

In [3]:
import random

random.seed(1234)

# Bucket the converted items by their (dataset, category, subdataset) origin,
# then draw a random subset from every bucket: at most 5 items when the
# dataset is "TREx", at most 50 otherwise. Buckets smaller than the cap are
# taken in full (in shuffled order).
data_categorized = {}
for item in new_data:
    meta = item["metadata"]
    key = (meta["dataset"], meta["category"], meta["subdataset"])
    data_categorized.setdefault(key, []).append(item)

sampled_data = []
for key, group in data_categorized.items():
    cap = 5 if key[0] == "TREx" else 50
    sampled_data += random.sample(group, min(cap, len(group)))

# One JSON document per line.
with open("processed_sampled.jsonl", "w") as f:
    for item in sampled_data:
        print(json.dumps(item), file=f)
