In [259]:
from typing import Optional
import numpy as np
import datasets
from datasets import Value, Dataset

DEFAULT_LETTER_ORDER = np.array(["A", "B", "C", "D"])
def to_mmmlu_format(mmlu_dataset: Dataset):
    """
    Reshape the English MMLU dataset so its columns match the MMMLU layout.

    Every column name is passed through ``str.capitalize``, the ``Choices``
    column is expanded into one column per option letter, the numeric
    ``Answer`` is replaced by its option letter, and a running index of each
    question within its ``Subject`` is placed as the first column.
    """
    # Capitalize every column name
    for original_name in mmlu_dataset.features:
        mmlu_dataset = mmlu_dataset.rename_column(original_name, original_name.capitalize())

    # Expand the choices into one column per option letter
    all_choices = np.array(mmlu_dataset['Choices'])
    for column_idx, option_letter in enumerate(DEFAULT_LETTER_ORDER):
        mmlu_dataset = mmlu_dataset.add_column(option_letter, all_choices[:, column_idx])
    mmlu_dataset = mmlu_dataset.remove_columns('Choices')

    # Replace the numeric answer with the matching option letter
    def _answer_as_letter(row):
        return {"Answer": DEFAULT_LETTER_ORDER[int(row['Answer'])]}

    mmlu_dataset = mmlu_dataset.cast_column('Answer', Value('string'))
    mmlu_dataset = mmlu_dataset.map(_answer_as_letter)

    # Append the per-subject running index, then move that last column to the front
    frame = mmlu_dataset.to_pandas()
    frame['Question id in subtask'] = frame.groupby("Subject").cumcount()
    ordered_columns = list(frame.columns)
    ordered_columns.insert(0, ordered_columns.pop())
    return Dataset.from_pandas(frame[ordered_columns])

In [224]:
from datasets import load_dataset
# Supported languages: [English, JA_JP]
lang_list = ["English", "JA_JP"]
# NOTE: the name `curr_langauage` is misspelled, but later cells read it under
# this exact name, so it is kept as is.
curr_langauage = lang_list[0]
if curr_langauage == 'English':
    # English comes from the original MMLU dataset and is converted to the MMMLU column layout
    mmmlu_ds = load_dataset('cais/mmlu', 'all', split='test')
    mmmlu_ds = to_mmmlu_format(mmmlu_ds)
else:
    # Other languages come from MMMLU; only the index column needs renaming
    mmmlu_ds = load_dataset("openai/MMMLU", curr_langauage, split='test')
    mmmlu_ds = mmmlu_ds.rename_column('Unnamed: 0', "Question id in subtask")

In [225]:
# Source: https://github.com/hendrycks/test/blob/master/categories.py
# Maps each subtask name to a one-element list holding its subcategory.
subcategories = {
    "abstract_algebra": ["math"],
    "anatomy": ["health"],
    "astronomy": ["physics"],
    "business_ethics": ["business"],
    "clinical_knowledge": ["health"],
    "college_biology": ["biology"],
    "college_chemistry": ["chemistry"],
    "college_computer_science": ["computer science"],
    "college_mathematics": ["math"],
    "college_medicine": ["health"],
    "college_physics": ["physics"],
    "computer_security": ["computer science"],
    "conceptual_physics": ["physics"],
    "econometrics": ["economics"],
    "electrical_engineering": ["engineering"],
    "elementary_mathematics": ["math"],
    "formal_logic": ["philosophy"],
    "global_facts": ["other"],
    "high_school_biology": ["biology"],
    "high_school_chemistry": ["chemistry"],
    "high_school_computer_science": ["computer science"],
    "high_school_european_history": ["history"],
    "high_school_geography": ["geography"],
    "high_school_government_and_politics": ["politics"],
    "high_school_macroeconomics": ["economics"],
    "high_school_mathematics": ["math"],
    "high_school_microeconomics": ["economics"],
    "high_school_physics": ["physics"],
    "high_school_psychology": ["psychology"],
    "high_school_statistics": ["math"],
    "high_school_us_history": ["history"],
    "high_school_world_history": ["history"],
    "human_aging": ["health"],
    "human_sexuality": ["culture"],
    "international_law": ["law"],
    "jurisprudence": ["law"],
    "logical_fallacies": ["philosophy"],
    "machine_learning": ["computer science"],
    "management": ["business"],
    "marketing": ["business"],
    "medical_genetics": ["health"],
    "miscellaneous": ["other"],
    "moral_disputes": ["philosophy"],
    "moral_scenarios": ["philosophy"],
    "nutrition": ["health"],
    "philosophy": ["philosophy"],
    "prehistory": ["history"],
    "professional_accounting": ["other"],
    "professional_law": ["law"],
    "professional_medicine": ["health"],
    "professional_psychology": ["psychology"],
    "public_relations": ["politics"],
    "security_studies": ["politics"],
    "sociology": ["culture"],
    "us_foreign_policy": ["politics"],
    "virology": ["health"],
    "world_religions": ["philosophy"],
}

# Maps each top-level category to the subcategories it contains.
categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
    "humanities": ["history", "philosophy", "law"],
    "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
    "other (business, health, misc.)": ["other", "business", "health"],
}

# Group subtasks by subcategory; the first subtask seen for each
# subcategory is also recorded in chosen_subtasks.
subtasks = {}
chosen_subtasks = []
for subtask, task_subcategory in subcategories.items():
    curr_category = task_subcategory[0]
    is_first_of_category = curr_category not in subtasks
    if is_first_of_category:
        chosen_subtasks.append(subtask)
    subtasks.setdefault(curr_category, []).append(subtask)

In [226]:
# Manually selected subtasks. Prefer professional over high school.
target_subtasks = [
    'abstract_algebra',
    'virology',
    'astronomy',
    'marketing',
    'college_biology',
    'college_chemistry',
    'machine_learning',
    'econometrics',
    'electrical_engineering',
    'philosophy',
    'global_facts',
    'prehistory',
    'high_school_geography',
    'security_studies',
    'professional_psychology',
    'sociology',
    'jurisprudence',
]

# Rebuild the subcategory -> subtasks grouping
subtasks = {}
for subtask, task_subcategory in subcategories.items():
    curr_category = task_subcategory[0]
    subtasks.setdefault(curr_category, []).append(subtask)

# The manual selection replaces any earlier choice (same list object, not a copy)
chosen_subtasks = target_subtasks

In [227]:
# Display the subtasks selected for the experiment
chosen_subtasks

['abstract_algebra',
 'virology',
 'astronomy',
 'marketing',
 'college_biology',
 'college_chemistry',
 'machine_learning',
 'econometrics',
 'electrical_engineering',
 'philosophy',
 'global_facts',
 'prehistory',
 'high_school_geography',
 'security_studies',
 'professional_psychology',
 'sociology',
 'jurisprudence']

In [230]:
datasets.disable_progress_bar()
def sample_first_n_data_from_subtask(ds: Dataset, target_subtasks: list[str], sample_first_n: int = 100) -> Dataset:
    """
    Keep the rows of the requested subtasks whose per-subtask question id is below a limit.

    ds - dataset with 'Subject' and 'Question id in subtask' columns
    target_subtasks - subject names to keep
    sample_first_n - exclusive upper bound on 'Question id in subtask'
    """
    wanted_subjects = set(target_subtasks)

    def _is_selected(row) -> bool:
        return row['Subject'] in wanted_subjects and row['Question id in subtask'] < sample_first_n

    return ds.filter(_is_selected)


In [231]:
# Restrict the dataset to the chosen subtasks (default limit of sample_first_n applies)
mmmlu_subset = sample_first_n_data_from_subtask(mmmlu_ds, chosen_subtasks)
mmmlu_subset

Dataset({
    features: ['Question id in subtask', 'Question', 'Subject', 'Answer', 'A', 'B', 'C', 'D'],
    num_rows: 1700
})

In [232]:
import copy
# common.py

# The original base prompt.
# Placeholders: {Question} and the four option texts {A}, {B}, {C}, {D}.
QUERY_TEMPLATE_MULTICHOICE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

# Base prompt with variable output format.
# Same as above, except that the output-format sentence is the {Output_format} placeholder.
BASE_TEMPLATE_FREE = """
Answer the following multiple choice question. {Output_format} Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

# The individual sentences of the base prompt
BASE_TASK = "Answer the following multiple choice question."
BASE_OUTPUT_FORMAT = "The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD."
BASE_INSTRUCTION = "Think step by step before answering."


# Output-format instruction: free-form response whose last line is a JSON object holding the answer
JSON_ANSWER_OUTPUT_FORMAT = """The last line of your response should be of the following format:
'
{
    "Answer": "$LETTER"
}
'
where LETTER is one of ABCD. Namely, a JSON format for the output is required.
Each value in JSON should be single line.
""".strip()

# Output-format instruction: the whole response is a JSON object with reasoning and answer
JSON_FULL_OUTPUT_FORMAT = """Your response should be of the following format:
'
{
    "Reasoning": ...,
    "Answer": "$LETTER"
}
'
where LETTER is one of ABCD. Namely, a JSON format for the output is required.
Each value in JSON should be single line.
""".strip()

# Output-format instruction: free-form response whose last line is an XML document holding the answer
XML_ANSWER_OUTPUT_FORMAT = """The last line of your response should be of the following format:
'
<root>
    <Answer> $LETTER </Answer>
</root>
'
where LETTER is one of ABCD. Namely, a XML format for the output is required.
""".strip()

# Output-format instruction: the whole response is an XML document with reasoning and answer
XML_FULL_OUTPUT_FORMAT = """Your response should be of the following format:
'
<root>
    <Reasoning> ... </Reasoning>
    <Answer> $LETTER </Answer>
</root>
'
where LETTER is one of ABCD. Namely, a XML format for the output is required.
""".strip()

# Draft JSON layout of a query. Not confirmed yet.
JSON_TEMPLATE_DICT = {
    "Task": "",
    "Output_format": "",
    "Instruction": "",
    "Question": "",
    "Options": dict.fromkeys("ABCD", ""),
}

# Independent copy of the template with the task and instruction taken from the base prompt
base_json_dict = copy.deepcopy(JSON_TEMPLATE_DICT)
base_json_dict.update(Task=BASE_TASK, Instruction=BASE_INSTRUCTION)

# Option letters in their default order, and the matching positional indices
DEFAULT_LETTER_ORDER = np.array(list("ABCD"))
DEFAULT_ORDER = np.arange(len(DEFAULT_LETTER_ORDER))

In [233]:
# Display the default positional order of the options
DEFAULT_ORDER

array([0, 1, 2, 3])

In [234]:
import xml.etree.ElementTree as ET
import html

def get_xml_input(common_query_filling: dict, options_filling: dict) -> str:
    '''
    Render a query as an indented XML document.

    common_query_filling - provides 'Question' and 'Output_format'
    options_filling - option letter -> option content

    The document has a <root> element with Task, Output_format, Instruction,
    Question and Options children; Options holds one child per option letter.
    '''
    root = ET.Element("root")

    ET.SubElement(root, 'Task').text = BASE_TASK
    ET.SubElement(root, "Output_format").text = common_query_filling['Output_format']
    ET.SubElement(root, "Instruction").text = BASE_INSTRUCTION
    ET.SubElement(root, "Question").text = common_query_filling['Question']

    options_node = ET.SubElement(root, "Options")
    for letter in DEFAULT_LETTER_ORDER:
        option_key = letter.upper()
        ET.SubElement(options_node, option_key).text = options_filling[option_key]

    # Pretty-print in place before serializing to a str
    ET.indent(root)
    return ET.tostring(root, encoding='unicode')

In [235]:
# Show the JSON answer-only output instruction as the model will see it
print(JSON_ANSWER_OUTPUT_FORMAT)

The last line of your response should be of the following format:
'
{
    "Answer": "$LETTER"
}
'
where LETTER is one of ABCD. Namely, a JSON format for the output is required.
Each value in JSON should be single line.


In [236]:
from enum import Enum

class InputFormat(Enum):
    """Format in which the query is presented to the model."""
    BASE = "free text"
    JSON = "json"
    XML = "xml"

class OutputFormat(Enum):
    """Format the model is asked to respond in."""
    BASE = "free text"
    JSON_ANSWER = "json answer"
    JSON_FULL = "json full"
    XML_ANSWER = "xml answer only"
    XML_FULL = "xml full"


class ShuffleMethod(Enum):
    """Strategies for reordering the answer options of a question."""
    DEFAULT = "default"
    REVERSE = "reverse"
    # Order by the character length of the option text
    LONGEST_FIRST = "longest-first"
    SHORTEST_FIRST = "shortest-first"

    # Order by the number of Japanese kana characters in the option text
    MOST_KANA = "Japanese-kana-most"
    FEWEST_KANA = "Japanese-kana-fewest"

    # Not used
    # GOLD_A = "gold A"          # Always Put the answer in the option A
    # GOLD_B = "gold B"
    # GOLD_C = "gold C"
    # GOLD_D = "gold D"

# Maps each OutputFormat member to the instruction text inserted into the prompt
output_format_dict = {
    OutputFormat.BASE: BASE_OUTPUT_FORMAT,
    OutputFormat.JSON_ANSWER: JSON_ANSWER_OUTPUT_FORMAT,
    OutputFormat.JSON_FULL: JSON_FULL_OUTPUT_FORMAT,
    OutputFormat.XML_ANSWER: XML_ANSWER_OUTPUT_FORMAT,
    OutputFormat.XML_FULL: XML_FULL_OUTPUT_FORMAT,
}


In [237]:
# Sanity check: an enum member's .value is the plain string it was declared with
ShuffleMethod.DEFAULT.value == "default"

True

In [238]:
import json
# Serialize the base JSON query; ensure_ascii=False keeps non-ASCII text unescaped
query_json_string = json.dumps(base_json_dict, ensure_ascii=False, indent=4)
print(query_json_string)

{
    "Task": "Answer the following multiple choice question.",
    "Output_format": "",
    "Instruction": "Think step by step before answering.",
    "Question": "",
    "Options": {
        "A": "",
        "B": "",
        "C": "",
        "D": ""
    }
}


In [None]:
import copy
import re

# Identity mapping: every option letter keeps its own position
DEFAULT_MAPPING = {str(letter):str(letter) for letter in DEFAULT_LETTER_ORDER}

# hiragana \u3040 - \u309F
# katakana \u30A0 - \u30FF
# katakana phonetic extensions \u31F0-\u31FF
# Matches runs of one or more consecutive kana characters
kana_pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF]+')

def get_option_mapping(option_contents, shuffle_method: ShuffleMethod = ShuffleMethod.DEFAULT, output_format: OutputFormat = OutputFormat.BASE) -> dict[str, str]:
    '''
    Build the mapping from each option's original letter to its letter after shuffling.

    option_contents - the option texts, given in the default A-D order
    shuffle_method - strategy used to reorder the options
    output_format - accepted for interface compatibility; not used by this function

    Return: original_to_shuffled, a dict {original_letter: shuffled_letter}.
    A new dict is returned on every call, so callers may store or modify it
    without affecting DEFAULT_MAPPING.
    '''
    if shuffle_method is ShuffleMethod.REVERSE:
        return {str(source): str(target) for source, target in zip(DEFAULT_LETTER_ORDER, np.flip(DEFAULT_LETTER_ORDER))}

    if shuffle_method in (ShuffleMethod.LONGEST_FIRST, ShuffleMethod.SHORTEST_FIRST):
        sort_keys = np.char.str_len(option_contents)
        descending = shuffle_method is ShuffleMethod.LONGEST_FIRST
    elif shuffle_method in (ShuffleMethod.MOST_KANA, ShuffleMethod.FEWEST_KANA):
        # Total number of kana characters in each option
        sort_keys = np.array([sum(len(kanas) for kanas in kana_pattern.findall(option_content)) for option_content in option_contents])
        descending = shuffle_method is ShuffleMethod.MOST_KANA
    else:
        # DEFAULT (or any method without a reordering rule): keep the original order
        return dict(DEFAULT_MAPPING)

    # A stable sort keeps tied options in their original relative order, so the
    # result does not depend on the sorting algorithm numpy picks by default.
    sorted_source = np.argsort(-sort_keys if descending else sort_keys, kind="stable")

    # sorted_source[pos_idx] is the original index of the option shown at position pos_idx
    return {str(DEFAULT_LETTER_ORDER[source_idx]): str(DEFAULT_LETTER_ORDER[pos_idx]) for pos_idx, source_idx in enumerate(sorted_source)}


def get_output_format(output_format_type: OutputFormat = OutputFormat.BASE) -> str:
    """Look up the output-format instruction text registered for the given output format type."""
    instruction_text = output_format_dict[output_format_type]
    return instruction_text

# Base Prompt
def get_query_shuffle_pair(curr_question: dict,
                           shuffle_method: ShuffleMethod = ShuffleMethod.DEFAULT,
                           input_format_type: InputFormat = InputFormat.BASE,
                           output_format_type: OutputFormat = OutputFormat.BASE) -> tuple[str, dict[str, str]]:
    '''
    Build the query text for one question together with the option mapping used.

    curr_question - one dataset row; provides 'Question' and the option texts under 'A'-'D'
    shuffle_method - how the options are reordered
    input_format_type - how the query is rendered (free text, JSON or XML)
    output_format_type - which output-format instruction is embedded in the query

    Return: (query, original_to_shuffled)
    Raises: ValueError if input_format_type is not a supported InputFormat
    '''
    output_format = get_output_format(output_format_type)

    # Strip once, so the ordering is computed on exactly the text shown in the query
    option_texts = [curr_question[opt].strip() for opt in DEFAULT_LETTER_ORDER]
    original_to_shuffled = get_option_mapping(np.array(option_texts), shuffle_method)

    common_query_filling = {'Question': curr_question['Question'].strip(), "Output_format": output_format}

    # {shuffled_pos: content}
    option_filling = {original_to_shuffled[opt]: text for opt, text in zip(DEFAULT_LETTER_ORDER, option_texts)}

    # Ensure the order. Sort to A -> D
    option_filling = dict(sorted(option_filling.items()))

    if input_format_type is InputFormat.BASE:
        common_query_filling.update(option_filling)
        return BASE_TEMPLATE_FREE.format_map(common_query_filling), original_to_shuffled
    if input_format_type is InputFormat.JSON:
        query = copy.deepcopy(base_json_dict)
        query.update(common_query_filling)
        query["Options"] = option_filling
        return json.dumps(query, ensure_ascii=False, indent=4), original_to_shuffled
    if input_format_type is InputFormat.XML:
        return get_xml_input(common_query_filling, option_filling), original_to_shuffled

    # Fail with a clear message rather than an UnboundLocalError on an unassigned result
    raise ValueError(f"Unsupported input format: {input_format_type!r}")


# Test concept: build the queries for the first chosen subtask only (note the `break`)
for task_i, curr_subtask in enumerate(chosen_subtasks):
    curr_ds = mmmlu_subset.filter(lambda x: x['Subject'] == curr_subtask)
    shuffle_method = ShuffleMethod.DEFAULT
    input_format = InputFormat.BASE
    output_format = OutputFormat.BASE
    query_list = []
    # Bounded by the subset size so a subtask with fewer than 100 questions does not raise IndexError
    for subtask_question_idx in range(min(100, len(curr_ds))):
        curr_row = curr_ds[subtask_question_idx]  # fetch the row once instead of once per field
        curr_query, curr_orig_to_shuffled = get_query_shuffle_pair(curr_row,
                                                                   shuffle_method=shuffle_method,
                                                                   input_format_type=input_format,
                                                                   output_format_type=output_format)
        orig_ans = curr_row['Answer']
        curr_ans = curr_orig_to_shuffled[orig_ans] # Get the new position of answer
        query_dict = {
            "Question id in subtask": curr_row['Question id in subtask'],
            "Shuffle method": shuffle_method.value,
            "Original to shuffled": curr_orig_to_shuffled,
            "Input format": input_format.value,
            "Output format": output_format.value,
            "Language": curr_langauage,
            "Query": curr_query,
            "Original correct answer": orig_ans,
            "Shuffled correct answer": curr_ans
        }
        # Store the full record; it includes every key that was stored before
        # ("Query", "Original to shuffled", and both answer fields).
        query_list.append(query_dict)
    break


In [240]:
import getpass
import os

try:
    # load environment variables from .env file (requires `python-dotenv`)
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    # python-dotenv is optional; fall back to variables already set in the process environment
    pass

# Langsmith
# Secrets are read from the environment / .env file or asked for interactively.
# Never place an API key literal in the notebook: any key that has been
# committed to source must be treated as compromised and rotated.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter API key for LangSmith (leave blank to disable tracing): ")
# Tracing is only enabled when a key is available
os.environ["LANGSMITH_TRACING"] = "true" if os.environ["LANGSMITH_API_KEY"] else "false"
os.environ.setdefault("LANGSMITH_PROJECT", "nlp_final")

# Left empty when absent; the Gemini key is requested where the model is created
os.environ.setdefault("GOOGLE_API_KEY", "")

In [241]:
import getpass
import os

# Ask for the Gemini key interactively when it is missing or empty
if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

from langchain.chat_models import init_chat_model

# Chat model used for all queries in this notebook
model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

In [245]:
from langchain_core.prompts import ChatPromptTemplate

# The prompt is a single user message.
# text: the fully formatted multiple-choice query that fills the {text} placeholder


# A single user turn whose whole content is the {text} placeholder
prompt_template = ChatPromptTemplate.from_messages(
    [("user", "{text}")]
)

# NOTE: curr_query is whatever value the query-building loop assigned last
prompt = prompt_template.invoke({"text": curr_query})
print(prompt)
print(curr_query)

messages=[HumanMessage(content="Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\nStatement 1 | Every ideal in a ring is a subring of the ring. Statement 2 | Every subring of every ring is an ideal of the ring.\n\nA) True, True\nB) False, False\nC) True, False\nD) False, True", additional_kwargs={}, response_metadata={})]
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

Statement 1 | Every ideal in a ring is a subring of the ring. Statement 2 | Every subring of every ring is an ideal of the ring.

A) True, True
B) False, False
C) True, False
D) False, True


In [246]:
# Send the prompt to the chat model and keep the response object
response = model.invoke(prompt)

In [247]:
# Show the text content of the model response
print(response.content)

Let's analyze each statement.
Statement 1: Every ideal in a ring is a subring of the ring.
For a subset $I$ of a ring $R$ to be an ideal, it must satisfy:
1. $I$ is a subgroup of $R$ under addition.
2. For any $r \in R$ and $x \in I$, $rx \in I$ and $xr \in I$.
For a subset $S$ of a ring $R$ to be a subring, it must satisfy:
1. $S$ is a subgroup of $R$ under addition.
2. $S$ is closed under multiplication.

An ideal $I$ must be a subgroup under addition, but it is not necessarily closed under multiplication. For example, consider the ring $\mathbb{Z}$ and the ideal $2\mathbb{Z}$. $2\mathbb{Z}$ is an ideal of $\mathbb{Z}$, but $2 \in 2\mathbb{Z}$ and $2 \cdot 2 = 4 \in 2\mathbb{Z}$. If we consider the ideal $2\mathbb{Z}$ in $\mathbb{Z}$, $2\mathbb{Z}$ is a subring of $\mathbb{Z}$ as well. However, consider the ring $\mathbb{Z}_6$ and the ideal $I = \{0, 2, 4\}$. $I$ is an ideal of $\mathbb{Z}_6$ and is a subring of $\mathbb{Z}_6$ because $2 \cdot 2 = 4 \in I$, $2 \cdot 4 = 8 \equiv 2 \p

In [248]:
# From common.py
def normalize_response(response: str) -> str:
    """
    Strip markdown and LaTeX markup from a response so it does not prevent an answer match.

    The fragments are removed one after another in the listed order. The order
    matters: multi-character fragments have to be removed before the shorter
    fragments they contain.
    """
    fragments_to_drop = (
        "**",
        "$\\boxed{",
        "}$",
        "\\$",
        "$\\text{",
        "$",
        "\\mathrm{",
        "\\{",
        "\\text",
        "\\(",
        "\\mathbf{",
        "{",
        "\\boxed",
    )
    cleaned = response
    for fragment in fragments_to_drop:
        cleaned = cleaned.replace(fragment, "")
    return cleaned

# Regex fragments matching the "Answer:" prefix in the supported languages.
# Each fragment is inserted into the answer pattern template and tried in list order.
# Fragments containing `\s` are raw strings: in a normal string literal `\s` is an
# invalid escape sequence, which Python reports with a warning.
MULTILINGUAL_ANSWER_REGEXES = [
    r"Answer\s*:",
    r"Answer\s*:​​​​​​",  # Korean invisible character
    r"উত্তর\s*:",
    r"उत्तर\s*:",
    "উত্তরঃ",
    r"উত্তর\s*:",
    r"Antwort\s*:",
    r"답변\s*:",
    r"정답\s*:",
    r"답\s*:",
    r"答案\s*：",
    r"答案\s*:",
    r"答\s*：",
    r"答\s*:",
    r"答复\s*：",
    r"答曰\s*：",
    "الإجابة:",
    "الجواب:",
    "إجابة:",
    "الإجابة النهائية:",
    "الإجابة الصحيحة:",
    "الإجابة الصحيحة هي:",
    "الإجابة هي:",
    "الجواب النهائي:",
    r"Respuesta\s*:",
    r"Risposta\s*:",
    r"答え\s*:",
    r"答え\s*：",
    r"回答\s*:",
    r"回答\s*：",
    r"解答\s*:",
    r"Jawaban\s*:",
    r"Réponse\s*:",
    r"Resposta\s*:",
    r"Jibu\s*:",
    r"Idahun\s*:",
    r"Ìdáhùn\s*:",
    r"Idáhùn\s*:",
    r"Àmọ̀nà\s*:",
    r"Àdáhùn\s*:",
    r"Ànúgọ\s*:",
    r"Àṣàyàn\s*:",
    r'"Answer":\s*',   # Added
    r"'Answer':\s*",   # Added
    r"<Answer>\s*",     # Added
]

# Modified to also accept a quoted letter: "A-D" or 'A-D'.
# `{}` is filled with one answer-prefix regex. The single capture group holds the
# answer letter (Latin, Arabic, Bengali or full-width).
# (?i) applies to the whole pattern, so the captured Latin letter may be lower case.
# NOTE(review): nothing anchors the end of the captured letter, so the first letter
# of a longer word that follows the prefix can be captured as well.
MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = (
    r"(?i){}[ \t]*[\"\']?([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[Ａ]|[Ｂ]|[Ｃ]|[Ｄ])[\"\']?"
)

def normalize_extracted_answer(extracted_answer: str) -> str:
    """
    Convert a non-Latin answer letter to its Latin equivalent and trim surrounding whitespace.

    Each listed letter is replaced by a space plus the Latin letter; the final
    strip() removes that space again when it ends up at the edge of the string.
    """
    letter_replacements = (
        # In arabic these are the letters used for A-D in multiple choice questions
        ("أ", " A"),
        ("ب", " B"),
        ("ج", " C"),
        ("د", " D"),
        # In Bengali these are the letters used for A-D in multiple choice questions
        ("অ", " A"),
        ("ব", " B"),
        ("ড", " C"),
        ("ঢ", " D"),
        # In Japanese these are the letters sometimes used for A-D in multiple choice questions
        ("Ａ", " A"),
        ("Ｂ", " B"),
        ("Ｃ", " C"),
        ("Ｄ", " D"),
    )
    normalized = extracted_answer
    for native_letter, latin_letter in letter_replacements:
        normalized = normalized.replace(native_letter, latin_letter)
    return normalized.strip()

In [260]:
import re
# From common.py


def extract_answer_from_response(response_text: str) -> Optional[str]:
    """
    Extract the answer letter from a model response.

    The answer-prefix regexes are tried in list order; the first one that
    matches anywhere in the text determines the result. Returns the normalized
    captured letter, or None when no prefix matches.
    """
    for prefix_regex in MULTILINGUAL_ANSWER_REGEXES:
        answer_pattern = MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(prefix_regex)
        found = re.search(answer_pattern, response_text)
        if found is not None:
            return normalize_extracted_answer(found.group(1))
    return None

In [265]:
# A text without an answer prefix yields None
print(extract_answer_from_response("TEST A TEST"))
# Extraction applied to the actual model response
print(extract_answer_from_response(response.content))

None
B


In [250]:
# Serialized form of the response: a candidate for the experimental result to save?
response.to_json()#['kwargs']
 #to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'messages', 'AIMessage'],
 'kwargs': {'content': "Let's analyze each statement.\nStatement 1: Every ideal in a ring is a subring of the ring.\nFor a subset $I$ of a ring $R$ to be an ideal, it must satisfy:\n1. $I$ is a subgroup of $R$ under addition.\n2. For any $r \\in R$ and $x \\in I$, $rx \\in I$ and $xr \\in I$.\nFor a subset $S$ of a ring $R$ to be a subring, it must satisfy:\n1. $S$ is a subgroup of $R$ under addition.\n2. $S$ is closed under multiplication.\n\nAn ideal $I$ must be a subgroup under addition, but it is not necessarily closed under multiplication. For example, consider the ring $\\mathbb{Z}$ and the ideal $2\\mathbb{Z}$. $2\\mathbb{Z}$ is an ideal of $\\mathbb{Z}$, but $2 \\in 2\\mathbb{Z}$ and $2 \\cdot 2 = 4 \\in 2\\mathbb{Z}$. If we consider the ideal $2\\mathbb{Z}$ in $\\mathbb{Z}$, $2\\mathbb{Z}$ is a subring of $\\mathbb{Z}$ as well. However, consider the ring $\\mathbb{Z}_6$ and the ideal $I 

In [None]:
import pandas as pd

# Draft of one experiment record: every field starts as an empty string
experiment_save_dict = dict.fromkeys(
    [
        "Model",
        "Question id",
        "Shuffle method",
        "Original to shuffled",
        "Input format",
        "Output format",
        "Query",
        "Language",
        "Subtask",
        "Original correct answer",
        "Shuffled correct answer",
        "Response answer",
        "Model output",   # Output text only
        "Full response",  # All the output
    ],
    "",
)

# One-row frame built from the draft record, to preview the column layout
experiment_list = [experiment_save_dict]
experiment_df = pd.DataFrame(experiment_list)

In [267]:
# Display the draft experiment table
experiment_df

Unnamed: 0,Model,Question id,Shuffle method,Original to shuffled,Input format,Output format,Query,Language,Subtask,Original correct answer,Shuffled correct answer,Response answer,Model output,Full response
0,,,,,,,,,,,,,,
