In [None]:
import json

# Load the quiz corpus. The encoding is given explicitly so that decoding does
# not depend on the platform's locale default (JSON text is normally UTF-8).
with open('../corpus/en.json', 'r', encoding='utf-8') as f:
    quiz_obj = json.load(f)

### Create a simple QA dataset _without_ options

In [None]:
# Flatten every lesson's quizzes into plain question/answer records (no options).
dataset = []
lesson_quizzes: list = quiz_obj[0]["quizzes"]
for quiz_info in lesson_quizzes:
    for quiz in quiz_info["quiz"]:  # every quiz of a single lesson
        # Take the first option flagged as correct. The flag is compared against
        # the string "true" -- presumably how the corpus encodes it; verify if
        # the corpus ever switches to real JSON booleans.
        correct_option = next(
            (item for item in quiz["answerOptions"] if item["isCorrect"] == "true"),
            None,
        )
        if correct_option is None:
            # Fail with a message that identifies the offending quiz instead of
            # an opaque IndexError on an empty list.
            raise ValueError(
                f"No correct answer marked for question: {quiz['questionText']!r}"
            )
        dataset.append({
            'question': quiz["questionText"],
            'answer': correct_option["answerText"],
        })

In [None]:
import pandas as pd

# Tabulate the question/answer records built from the quiz corpus.
df_dataset = pd.DataFrame(dataset)
# Last expression of the cell: shows the first rows as a quick sanity check.
df_dataset.head()

In [None]:
# Export the plain QA records as JSON Lines: one JSON object per row.
df_dataset.to_json('../datasets/itml_qa.jsonl', lines=True, orient='records')

> Some examples in the simple QA dataset require manual review and removal. These examples include questions that provide options where significant context is embedded within the options themselves. For instance, questions like "Which of these options...?" or "Fill in the blank with the correct option" fall into this category.

### Create a simple QA dataset _with_ options provided as MCQ

Here each question is combined with its answer options into a single multiple-choice prompt.

In [None]:
# Build question/answer records in which each question embeds its answer
# options as a multiple-choice prompt.

# The instruction line is the same for every question, so it is defined once.
prompt = "Select the correct answer from the options given for the question below:"

dataset = []
lesson_quizzes: list = quiz_obj[0]["quizzes"]
for quiz_info in lesson_quizzes:
    for quiz in quiz_info["quiz"]:  # every quiz of a single lesson
        question_text = quiz["questionText"]
        options = quiz["answerOptions"]

        # Take the first option flagged as correct. The flag is compared against
        # the string "true" -- presumably how the corpus encodes it; verify if
        # the corpus ever switches to real JSON booleans.
        correct_option = next(
            (item for item in options if item["isCorrect"] == "true"),
            None,
        )
        if correct_option is None:
            # Fail with a message that identifies the offending quiz instead of
            # an opaque IndexError on an empty list.
            raise ValueError(
                f"No correct answer marked for question: {question_text!r}"
            )

        # One "- <option text>" bullet per answer option, in corpus order.
        mcq = "\n".join("- " + item['answerText'] for item in options)
        question = (
            f"{prompt}\n"
            f"\nQuestion:\n{question_text}\n"
            f"\nOptions:\n{mcq}"
        )

        dataset.append({
            'question': question,
            'answer': correct_option["answerText"],
        })

# Preview one formatted prompt; guarded so that a corpus yielding fewer than
# two records does not abort the cell.
if len(dataset) > 1:
    print(dataset[1]['question'])

In [None]:
# Tabulate the multiple-choice question/answer records.
ds = pd.DataFrame(dataset)

In [None]:
# Export the multiple-choice records as JSON Lines: one JSON object per row.
ds.to_json('../datasets/itml_mcq.jsonl', lines=True, orient='records')