In [1]:
import json
from pprint import pprint
from glob import glob

dataset_name = "commonsenseqa2"

# Read the raw dataset: one JSON object per line.
with open(f"{dataset_name}/CSQA.jsonl") as f:
    questions_raw = [json.loads(line.strip()) for line in f]
print(len(questions_raw))
pprint(questions_raw[:3])

# Dump the bare question texts, one per line.
with open(f"{dataset_name}/question_stems.txt", "w") as f:
    f.writelines(raw["question"] + "\n" for raw in questions_raw)

# Recast every yes/no item as a two-way multiple-choice question:
# choice A is always "Yes", choice B is always "No".
questions = [
    {
        "id": raw["id"],
        "question": {
            "stem": raw["question"],
            "choices": [
                {"text": "Yes", "label": "A"},
                {"text": "No", "label": "B"},
            ]
        },
        "answerKey": "A" if raw["answer"] == "yes" else "B",
    }
    for raw in questions_raw
]


11805
[{'answer': 'yes',
  'confidence': 0.89,
  'date': '12/16/2020',
  'id': '0000488c294c99bd1a6cf10258dae8c1',
  'question': 'The world trade center is no more because of 9/11?',
  'relational_prompt': 'because',
  'relational_prompt_used': True,
  'topic_prompt': 'world trade center',
  'topic_prompt_used': True,
  'validations': ['yes', 'yes', 'yes', 'no']},
 {'answer': 'yes',
  'confidence': 0.98,
  'date': '2/12/2021',
  'id': '0007ec77ec8db1d0b9f4e72fd1a931c5',
  'question': 'a pupil can be either a student or part of an eye',
  'relational_prompt': 'sometimes',
  'relational_prompt_used': False,
  'topic_prompt': 'pupil',
  'topic_prompt_used': True,
  'validations': ['yes', 'yes']},
 {'answer': 'yes',
  'confidence': 0.71,
  'date': '4/7/2021',
  'id': '000a24f0db61226f9a4a744960a16104',
  'question': 'Unity has a lot to do with family.',
  'relational_prompt': 'has',
  'relational_prompt_used': True,
  'topic_prompt': 'unity',
  'topic_prompt_used': True,
  'validations': [

In [2]:
import re 
import string
from collections import defaultdict

import spacy 
from tqdm.notebook import tqdm
nlp = spacy.load("en_core_web_sm")

# Negation strategies (working notes):
#   - not: add the word "not"
#       - be not
#       - can not
#       - does not
#       - not because
#   - word prefix: "un-", "in-", ...
#   - antonym: "highest" -> "lowest"
#   - prompt: add "which of the following is not true", "choose the wrong answer"

# Running count of questions per negation rule name.
counters = defaultdict(int)
# Questions that were assigned a negation rule, in input order.
processed_questions = []
cnt = 0

def join_spacy_tokens(tokens):
    ret = ""
    for t in tokens:
        if t[0] in string.punctuation:
            ret += t
        else:
            ret += (" " + t)
    return ret.strip()


def negate_root_verb(nlp, q_body, rank):
    """Negate the root verb of ``q_body`` using do-support.

    The sentence is parsed with ``nlp``; the first token whose dependency
    label is ``ROOT`` and whose coarse POS is ``VERB`` is replaced by
    "<auxiliary> not <lemma>" (odd ``rank``) or "<auxiliary>n't <lemma>"
    (even ``rank``).

    Args:
        nlp: callable returning an iterable of tokens exposing ``text``,
            ``dep_``, ``pos_``, ``tag_`` and ``lemma_``.
        q_body: sentence (or final clause) to negate; stripped before parsing.
        rank: integer whose parity selects the long or contracted form.

    Returns:
        The re-joined negated sentence, or ``None`` when there is no root
        verb or its tag is not one of VBD / VBP / VBZ.
    """
    # Penn Treebank verb tags
    # (https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html):
    #   VB  base form               VBD past tense
    #   VBG gerund / pres. part.    VBN past participle
    #   VBP non-3rd person present  VBZ 3rd person singular present
    # Only finite forms take do-support: tag -> (contracted, long form).
    do_support = {
        "VBD": ("didn't", "did not"),
        "VBP": ("don't", "do not"),
        "VBZ": ("doesn't", "does not"),
    }

    doc = nlp(q_body.strip())
    tokens = [token.text for token in doc]

    root_i = None
    root_token = None
    for i, token in enumerate(doc):
        if token.dep_ == "ROOT" and token.pos_ == "VERB":
            root_i = i
            root_token = token
            break

    # Compare against None explicitly: index 0 is a valid position and must
    # not be mistaken for "no root verb found".
    if root_i is None:
        return None

    auxiliary = do_support.get(root_token.tag_)
    if auxiliary is None:
        return None

    negated = auxiliary[rank % 2].split() + [root_token.lemma_]
    # Keep a sentence-initial capital when the replaced verb opened the sentence.
    if root_i == 0 and root_token.text[:1].isupper():
        negated[0] = negated[0].capitalize()

    return join_spacy_tokens(tokens[:root_i] + negated + tokens[root_i + 1:])
    

### MAIN ###
# Clause separators, compiled once. Raw string: "\." and "\?" in a normal
# string literal are invalid escape sequences.
_CLAUSE_SEPARATORS = re.compile(r"[,.?]")

# Words marking a clause as already negated ("gold" rules: stem left as is).
_GOLD_NOT_WORDS = ("Not", "not", "nothing", "none", "never", "nobody", "nowhere", "no")
_GOLD_UN_IN_WORDS = (
    "unable", "unlike", "unlikely", "incapable", "unusual", "impossible",
    "rarely", "rare", "barely", "scarcely",
)

# word -> (replacement when rank is even, replacement when rank is odd)
_BECAUSE_NEGATIONS = {"because": ("not because", "not because")}
_BE_NEGATIONS = {
    "is": ("isn't", "is not"),
    "are": ("aren't", "are not"),
    "was": ("wasn't", "was not"),
    "were": ("weren't", "were not"),
}
_MODAL_NEGATIONS = {
    "may": ("may not", "may not"),
    "might": ("might not", "might not"),
    "can": ("can't", "can not"),
    "could": ("couldn't", "could not"),
    "will": ("won't", "will not"),
    "would": ("wouldn't", "would not"),
}


def _negate_whole_words(body, negations, rank):
    """Replace each whitespace-delimited word found in ``negations``.

    Only whole words are touched, so "island", "area", "candy" or "maybe"
    are left alone (a plain ``str.replace(" is", ...)`` would corrupt them).
    """
    pattern = r"(?<!\S)(?:" + "|".join(re.escape(w) for w in negations) + r")(?!\S)"
    return re.sub(pattern, lambda m: negations[m.group(0)][rank % 2], body)


def _negate_stem(q_text, counters):
    """Choose the first applicable negation rule for ``q_text``.

    Only the last non-blank clause (text after the final ',', '.' or '?'
    separator) is inspected and edited. ``counters`` supplies the per-rule
    rank whose parity alternates between long and contracted forms.

    Returns ``(rule_name, new_stem)``, or ``None`` when no rule applies.
    """
    # Filter on stripped text so trailing whitespace after the final
    # punctuation mark does not hide the real last clause.
    segments = [s for s in _CLAUSE_SEPARATORS.split(q_text) if s.strip()]
    if not segments:
        return None
    body_old = segments[-1].strip()
    words = body_old.split()
    body = body_old

    if "n't" in body or any(w in words for w in _GOLD_NOT_WORDS):
        rule = "gold/not"
    elif any(w in words for w in _GOLD_UN_IN_WORDS):
        rule = "gold/un-in-"
    elif "because" in words:
        rule = "not/because"
        body = _negate_whole_words(body, _BECAUSE_NEGATIONS, 0)
    elif sum(words.count(w) for w in _BE_NEGATIONS) == 1:
        rule = "not/be"
        body = _negate_whole_words(body, _BE_NEGATIONS, counters[rule])
    elif sum(words.count(w) for w in _MODAL_NEGATIONS) == 1:
        rule = "not/modal"
        body = _negate_whole_words(body, _MODAL_NEGATIONS, counters[rule])
    else:
        # Parse once and reuse the result (the parse is the expensive step).
        verb_negated = negate_root_verb(nlp, body, counters["not/do"])
        if verb_negated:
            rule = "not/do"
            body = verb_negated
        elif words[0] not in ("Which", "which") and q_text.strip()[-1] != "?":
            rule = "prompt"
        else:
            return None

    if rule == "prompt":
        prefix = (
            "Choose the incorrect answer. "
            if counters[rule] % 2
            else "Which of the following is not true? "
        )
        return rule, prefix + q_text

    # Rewrite only the final clause; an identical phrase earlier in the
    # question must stay untouched.
    head, _, tail = q_text.rpartition(body_old)
    return rule, head + body.strip() + tail


for q in tqdm(questions):
    negated = _negate_stem(q["question"]["stem"], counters)
    if negated is None:
        continue
    rule, new_stem = negated

    # Build a new record instead of mutating the entry in `questions`, so
    # re-running this cell always starts from the original stems.
    processed_questions.append({
        **q,
        "question": {**q["question"], "stem": new_stem},
        "negation_rule": rule,
    })
    counters[rule] += 1

print(len(questions))
print(len(processed_questions))
pprint(dict(counters))
pprint(processed_questions[:10])

  0%|          | 0/11805 [00:00<?, ?it/s]

11805
10130
{'gold/not': 1143,
 'gold/un-in-': 20,
 'not/be': 4291,
 'not/because': 270,
 'not/do': 1784,
 'not/modal': 1105,
 'prompt': 1517}
[{'answerKey': 'A',
  'id': '0000488c294c99bd1a6cf10258dae8c1',
  'negation_rule': 'gold/not',
  'question': {'choices': [{'label': 'A', 'text': 'Yes'},
                           {'label': 'B', 'text': 'No'}],
               'stem': 'The world trade center is no more because of 9/11?'}},
 {'answerKey': 'A',
  'id': '0007ec77ec8db1d0b9f4e72fd1a931c5',
  'negation_rule': 'not/modal',
  'question': {'choices': [{'label': 'A', 'text': 'Yes'},
                           {'label': 'B', 'text': 'No'}],
               'stem': "a pupil can't be either a student or part of an eye"}},
 {'answerKey': 'A',
  'id': '000a24f0db61226f9a4a744960a16104',
  'negation_rule': 'not/do',
  'question': {'choices': [{'label': 'A', 'text': 'Yes'},
                           {'label': 'B', 'text': 'No'}],
               'stem': "Unity doesn't have a lot to do with family

In [3]:
import random
from copy import deepcopy

random.seed(1234)

new_questions = []

for src_q in processed_questions:
    options = src_q["question"]["choices"]
    # Choice that was correct before negation, plus one randomly drawn distractor.
    gold_option = deepcopy(options[ord(src_q["answerKey"]) - ord("A")])
    distractors = [opt for opt in options if opt["label"] != src_q["answerKey"]]
    distractor = deepcopy(random.choice(distractors))
    new_q = deepcopy(src_q)

    # Coin flip: does the originally-correct choice go first (label A)?
    gold_first = random.random() > 0.5
    ordered = [gold_option, distractor] if gold_first else [distractor, gold_option]
    for label, opt in zip("AB", ordered):
        opt["label"] = label
    new_q["question"]["choices"] = ordered

    # "gold/" rules left the stem unchanged, so the original answer still
    # holds; every other rule negated the stem, so the answer flips.
    gold_label, other_label = ("A", "B") if gold_first else ("B", "A")
    new_q["answerKey"] = gold_label if src_q["negation_rule"].startswith("gold/") else other_label
    new_questions.append(new_q)
pprint(new_questions[:5])

with open(f"{dataset_name}/all_negated.jsonl", "w") as f:
    f.writelines(json.dumps(new_q) + "\n" for new_q in new_questions)

with open(f"{dataset_name}/sample_un_in.jsonl", "w") as f_un_in,\
    open(f"{dataset_name}/sample_gold_not.jsonl", "w") as f_gold_not,\
    open(f"{dataset_name}/sample_not_be.jsonl", "w") as f_not_be,\
    open(f"{dataset_name}/sample_not_because.jsonl", "w") as f_not_because,\
    open(f"{dataset_name}/sample_not_do.jsonl", "w") as f_not_verb,\
    open(f"{dataset_name}/sample_not_modal.jsonl", "w") as f_not_modal,\
    open(f"{dataset_name}/sample_prompt.jsonl", "w") as f_prompt:

    # Output file for each negation rule.
    rule_to_file = {
        "gold/not": f_gold_not,
        "gold/un-in-": f_un_in,
        "not/be": f_not_be,
        "not/because": f_not_because,
        "not/do": f_not_verb,
        "not/modal": f_not_modal,
        "prompt": f_prompt,
    }

    # Group the questions by rule, keeping first-seen rule order.
    buffers = defaultdict(list)
    for new_q in new_questions:
        buffers[new_q["negation_rule"]].append(new_q)

    for negation_rule, buffer in buffers.items():
        # Shuffle before the lookup so the random stream advances for every
        # rule, including any rule that has no output file.
        random.shuffle(buffer)
        target_file = rule_to_file.get(negation_rule)
        if target_file is None:
            continue
        target_file.writelines(json.dumps(item) + "\n" for item in buffer)

[{'answerKey': 'A',
  'id': '0000488c294c99bd1a6cf10258dae8c1',
  'negation_rule': 'not/because',
  'question': {'choices': [{'label': 'A', 'text': 'No'},
                           {'label': 'B', 'text': 'Yes'}],
               'stem': 'The world trade center is no more not because of '
                       '9/11?'}},
 {'answerKey': 'B',
  'id': '0007ec77ec8db1d0b9f4e72fd1a931c5',
  'negation_rule': 'not/modal',
  'question': {'choices': [{'label': 'A', 'text': 'Yes'},
                           {'label': 'B', 'text': 'No'}],
               'stem': "a pupil can't be either a student or part of an eye"}},
 {'answerKey': 'B',
  'id': '000a24f0db61226f9a4a744960a16104',
  'negation_rule': 'not/do',
  'question': {'choices': [{'label': 'A', 'text': 'Yes'},
                           {'label': 'B', 'text': 'No'}],
               'stem': "Unity doesn't have a lot to do with family."}},
 {'answerKey': 'A',
  'id': '001c7dabce40afff007d3d0aae04865d',
  'negation_rule': 'not/be',
  'question

In [4]:
import os
import json
from glob import glob

def read_jsonl(path):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        path: path of a UTF-8 text file holding one JSON document per line.

    Returns:
        The parsed documents in file order. Blank or whitespace-only lines
        (for example a trailing empty line) are skipped.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# Rebalance every per-rule sample so the correct label alternates B, A, B, A, ...
for path in glob(f"{dataset_name}/sample_*.jsonl"):
    # Use a dedicated name: rebinding the global `questions` here would
    # replace the dataset that the negation cell iterates over.
    sample_questions = read_jsonl(path)
    for i, q in enumerate(sample_questions):
        # Even rows must be answered "B", odd rows "A"; swap the two choices
        # whenever the current key is the other one.
        if i % 2 == 0 and q["answerKey"] == "A":
            q["question"]["choices"] = q["question"]["choices"][::-1]
            q["answerKey"] = "B"
        elif i % 2 == 1 and q["answerKey"] == "B":
            q["question"]["choices"] = q["question"]["choices"][::-1]
            q["answerKey"] = "A"

        # Labels follow position, whatever order the choices ended up in.
        q["question"]["choices"][0]["label"] = "A"
        q["question"]["choices"][1]["label"] = "B"

    # Prefix the file name only, leaving the directory part untouched.
    new_path = os.path.join(os.path.dirname(path), "rebalanced_" + os.path.basename(path))
    with open(new_path, "w") as f:
        for q in sample_questions:
            f.write(json.dumps(q) + "\n")

In [5]:
"""
,prompt,classes,answer_index
0,"Question: {Q}
A. {A1}
B. {A2}
Answer:","[' A', ' B']",0/1
1,"Question: {Q}
A. {A1}
B. {A2}
Answer:","[' A', ' B']",0/1
"""
import json, csv

# Convert each rebalanced sample into the CSV layout shown at the top of this
# cell: an index column, the prompt, the class strings and the answer index.
for path in glob(f"{dataset_name}/rebalanced_sample_*.jsonl"):
    # NOTE(review): the output keeps the ".jsonl" suffix although it holds CSV;
    # left unchanged because the file name may be relied on elsewhere.
    out_path = path.replace("rebalanced_sample_", "final_")
    # newline="" is what the csv module requires for files it writes; the
    # prompt field contains embedded newlines, which must pass through as is.
    with open(path) as fin, open(out_path, "w", newline="") as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',')
        csvwriter.writerow(["", "prompt", "classes", "answer_index"])
        for i, line in enumerate(fin):
            q = json.loads(line)
            # Double quotes in the stem are replaced by `` before CSV quoting.
            Q = q["question"]["stem"].replace("\"", "``")
            A1 = q["question"]["choices"][0]["text"]
            A2 = q["question"]["choices"][1]["text"]
            # Assemble the prompt line by line: a triple-quoted literal nested
            # in indented code would carry the source indentation into the
            # prompt text, unlike the flush-left layout shown in the header.
            prompt = (
                "The following are multiple choice questions (with answers) about common sense.\n"
                "\n"
                f"Question: {Q}\n"
                f"A. {A1}\n"
                f"B. {A2}\n"
                "Answer:"
            )
            classes = "[' A', ' B']"
            answer_index = (0 if q["answerKey"] == "A" else 1)
            csvwriter.writerow([i, prompt, classes, answer_index])