In [1]:
import glob
import json
import os
import re
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path of the input CSV; it is loaded with pd.read_csv and its file stem is reused in the output name.
data_path = "~/work/eedi_synthetic_data/MalAlgoQA_format.csv"

In [3]:
# Load the question table; rows are later selected by position with df.iloc.
df = pd.read_csv(data_path)

In [4]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,QuestionId,QuestionText,CorrectAnswer,AnswerAText,AnswerBText,AnswerCText,AnswerDText,Rationale_A,Rationale_B,Rationale_C,Rationale_D,CorrectAnswerText
0,malalgoqa_0,Which list shows the following number in order...,C,235 237 254 276,237 276 235 254,276 254 237 235,276 254 235 237,Ordered least to greatest,Ordered greatest to least by ones place.,Correctly ordered the values from greatest to ...,Switched last 2 numbers.,276 254 237 235
1,malalgoqa_1,"Which is another way to represent 2,819?",B,"2,000 + 80 + 10 + 9","2,000 + 800 + 10 + 9","2,000 + 8,000 + 10 + 9","2,000 + 800 + 100 + 90",Extends tens place to the 8.,Place values are correct.,Misrepresents the place value of 8 as thousand...,Misrepresents the place value of 1 as hundreds...,"2,000 + 800 + 10 + 9"
2,malalgoqa_2,"Which is another representation for 3,236?",D,3 + 2 + 3 + 6,32 + 36,300 + 236,"3,000 + 236","Uses only ""+"" between all of the digits and do...","Uses ""+"" between 2 sets of the digits and does...","Omits zero after ""3"" to make ""300"", not ""3,000""",,"3,000 + 236"
3,malalgoqa_3,A series of numbers is shown: 18 25 32 39 ...,B,46,53,57,60,Chooses the 5th number in the sequence.,39 + 2*7 = 53,Chooses the 2nd and 3rd terms added together. ...,Chooses the 7th number in the sequence.,53
4,malalgoqa_4,Mia baked cookies from 2:45 p.m. to 3:22 p.m. ...,B,23 minutes,37 minutes,67 minutes,77 minutes,Calculates 45 - 22 = 23,15 + 22 = 37,Calculates 45 + 22 with no carrying from the o...,Calculates 45 + 22 = 77,37 minutes


In [5]:
# Positional row range of df to process; index_end ends up as an exclusive iloc stop.
index_start = 0
index_end = len(df)
# Number of rows per batch; each batch is cached in its own JSON file.
step = 100
# Pool size handed to the executor that issues the model calls.
max_workers = 2

In [6]:
# Keyword arguments for LLMChat; the notebook unpacks them as LLMChat(**model_config).
model_config = dict(
    openai_api_base = "https://testshellapi.kimi.asia/v1", # API host (base URL)
    # Placeholder value. NOTE(review): supply the real key at run time (for example from an
    # environment variable) rather than committing it to the notebook.
    api_key = "****", # model api key
    model = "gpt-4o", # model name
    # The JSON template that closes the prompt must itself be valid JSON, because the model
    # is asked to answer in exactly that format.
    default_system_prompt = """##Task
You are a Mathematics teacher. Your task is to reason and identify the ConstructName and SubjectName and then the misconception behind the user input Incorrect Answers with the Question.
ConstructName is Most granular level of knowledge related to question, appears to describe the specific mathematical method or procedure used to solve the question. It explains the technique or approach needed to reach the answer.
SubjectName is More general context than the construct, represents the broader mathematical topic or category that the question belongs to.
Misconceptions are a mistake in conceptual understanding and they have relations with all the applications of those concepts. For example, a single misconception on the connections among proportional relationships (part/whole, part/part, whole/part) can cause problems in identifying those patterns in drawings and can be the cause of failing to realize all parts must be of equal size, therefore associating the denominator of the fraction with the total number of parts regardless their size.
Answer concisely what misconception it is to lead to getting the incorrect answer.
Do not use "The misconception is" to start your answers.
Do not mention the concrete details of the question or answers. 

##User input
Question: The question text
A: multiple choice answer A text
B: multiple choice answer B text
C: multiple choice answer C text
D: multiple choice answer D text
Correct Answer: The correct answer text

##You should answer in the following JSON format
{
    "ConstructName": "here writes the constructName",
    "SubjectName": "here writes the SubjectName",
    "MisconceptionAName": "here writes the answer A's misconception.",
    "MisconceptionBName": "here writes the answer B's misconception.",
    "MisconceptionCName": "here writes the answer C's misconception.",
    "MisconceptionDName": "here writes the answer D's misconception."
}
""", # system prompt
    default_temperature=0.5, # sampling temperature: higher values give more random output
    max_tokens=256, # maximum number of tokens the model may generate
)

In [7]:
# Directory holding one cached JSON file per processed batch; the model name is part of the
# folder name, so results from different models are kept apart.
cache_folder = f"./cache_{model_config['model']}_model_misconceptions_result"
# exist_ok=True keeps the cell safe to re-run and avoids a check-then-create race.
os.makedirs(cache_folder, exist_ok=True)

In [8]:
# Output CSV name of the form "misconception_data_<input file stem>_<model name>.csv";
# the path is relative, so the file lands in the current working directory.
output_data_path = f"misconception_data_{os.path.splitext(os.path.basename(data_path))[0]}_{model_config['model']}.csv"

In [9]:
class LLMChat:
    """Small wrapper around an OpenAI-compatible chat-completions client.

    The model name, default system prompt, default temperature and output-token limit are
    stored on the instance so that callers only need to pass the user prompt.
    """

    def __init__(self, openai_api_base, api_key, model, default_temperature, default_system_prompt, max_tokens=512):
        """Create the client and remember the per-instance defaults.

        Args:
            openai_api_base: Base URL passed to the client as ``base_url``.
            api_key: API key passed to the client.
            model: Model name sent with every request.
            default_temperature: Temperature used when ``chat`` is given none.
            default_system_prompt: System prompt used when ``chat`` is given none.
            max_tokens: Output-token limit sent with every request.
        """
        self.client = OpenAI(
            api_key = api_key,
            base_url=openai_api_base,
            )
        self.model = model
        self.default_temperature = default_temperature
        self.default_system_prompt = default_system_prompt
        self.max_tokens = max_tokens

    def chat(self, user_prompt, system_prompt=None, temperature=None):
        """Send one system + user message pair and return the reply text.

        Args:
            user_prompt: Content of the user message.
            system_prompt: Content of the system message; a missing or empty value falls
                back to ``default_system_prompt``.
            temperature: Sampling temperature; ``None`` falls back to
                ``default_temperature``.

        Returns:
            The ``content`` of the first choice. The request asks for a JSON object via
            ``response_format``.
        """
        if not system_prompt:
            system_prompt = self.default_system_prompt

        # Compare against None: 0 is a legitimate temperature and must not be replaced by
        # the default.
        if temperature is None:
            temperature = self.default_temperature

        chat_response = self.client.chat.completions.create(
            model=self.model,
            temperature=temperature,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=self.max_tokens,
            response_format={"type": "json_object"} #{"type": "text"}
        )
        return chat_response.choices[0].message.content

In [10]:
# Chat client built from model_config; process_row reads it as a global.
vc = LLMChat(**model_config)

In [11]:
def process_row(args, debug=False):
    """Ask the model for the construct, subject and per-answer misconceptions of one question.

    Args:
        args: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``. ``row`` must
            provide ``QuestionText``, ``CorrectAnswer`` and ``AnswerAText`` to
            ``AnswerDText``.
        debug: When true, print the system prompt, the user prompt and the model reply.

    Returns:
        str: The raw reply returned by ``vc.chat``. When the call raises or yields no
        content, a JSON string of the form ``{"error": "<message>"}`` is returned instead,
        so every result has the same type and can be decoded with ``json.loads``.
    """
    user_prompt = """
Question: {question}
A: {answer_a}
B: {answer_b}
C: {answer_c}
D: {answer_d}
Correct Answer: {correct_answer}
"""
    _, row = args
    # CorrectAnswer holds the option letter; look up the matching answer text.
    ca = row["CorrectAnswer"]
    correctanswer = row[f"Answer{ca}Text"]
    input_user_prompt = user_prompt.format(
        question=row['QuestionText'],
        answer_a=row['AnswerAText'],
        answer_b=row['AnswerBText'],
        answer_c=row['AnswerCText'],
        answer_d=row['AnswerDText'],
        correct_answer=correctanswer,
    )
    try:
        ret_data = vc.chat(input_user_prompt)
    except Exception as e:
        # Best effort: report the failure and keep the batch going, but keep the result a
        # JSON string like the successful replies.
        print(f'An exception occur {str(e)}')
        ret_data = json.dumps({'error': str(e)}, ensure_ascii=False)
    else:
        if ret_data is None:
            # A reply without content cannot be decoded later; record it as an error.
            ret_data = json.dumps({'error': 'empty response'}, ensure_ascii=False)
        if debug:
            print(f'{ret_data}\n')
    if debug:
        print('system: ', model_config['default_system_prompt'])
        print('>'* 50)
        print('user_input: ', input_user_prompt)
        print('>'* 50)
        print('assistant: ', ret_data)
    return ret_data

In [12]:
# Smoke test on a single row, with debug output, before launching the full run.
r = process_row((7, df.iloc[7]), debug=True)

{
    "ConstructName": "Ordering Numbers",
    "SubjectName": "Number Sense",
    "MisconceptionAName": "Confusing the arrangement by focusing on the first or last digits rather than comparing the entire numbers.",
    "MisconceptionBName": "Reversing the order, possibly misunderstanding ascending and descending order.",
    "MisconceptionCName": "Misinterpreting the order by not correctly identifying the relative size of each number.",
    "MisconceptionDName": "No misconception, as this is the correct answer."
}

system:  ##Task
You are a Mathematics teacher. Your task is to reason and identify the ConstructName and SubjectName and then the misconception behind the user input Incorrect Answers with the Question.
ConstructName is Most granular level of knowledge related to question, appears to describe the specific mathematical method or procedure used to solve the question. It explains the technique or approach needed to reach the answer.
SubjectName is More general context than the 

In [13]:
def save_json(fn, obj):
    """Serialize ``obj`` as indented JSON into the file ``fn`` and print the path.

    The file is opened in write mode, so existing content is replaced. Non-ASCII characters
    are written as-is (``ensure_ascii=False``).
    """
    with open(fn, mode='w') as out_file:
        json.dump(obj, out_file, indent=4, ensure_ascii=False)
    print(f"save file to {fn}")

def slice_range(start, end, step):
    """Split the range from ``start`` to ``end`` into boundary points ``step`` apart.

    Args:
        start (int): First boundary.
        end (int): Last boundary; it is always the final element of a non-empty result.
        step (int): Distance between consecutive boundaries; must be positive.

    Returns:
        list: ``start, start + step, ...`` while the value does not exceed ``end``, with
        ``end`` appended when it is not already the last value. Empty when ``start > end``.

    Raises:
        ValueError: If ``step`` is not greater than zero.
    """
    if step <= 0:
        raise ValueError("步长必须大于0")

    result = []
    while start <= end:
        result.append(start)
        start += step
    # Guard against an empty list (start > end) before looking at the last element.
    if result and result[-1] < end:
        result.append(end)
    return result


def process_pairs(sliced_range):
    """Pair every boundary in ``sliced_range`` with its successor.

    Args:
        sliced_range (list): Boundary points, for example the output of ``slice_range``.

    Returns:
        list: One ``[first, second]`` list per adjacent pair, in order. Empty when fewer
        than two boundaries are given.
    """
    return [[lower, upper] for lower, upper in zip(sliced_range, sliced_range[1:])]

In [14]:
# Turn the boundary list into [start, stop] batches covering rows index_start..index_end.
sliced_range = process_pairs(slice_range(index_start, index_end, step))
print(sliced_range)

[[0, 100], [100, 200], [200, 300], [300, 400], [400, 500], [500, 600], [600, 700], [700, 800], [800, 807]]


In [15]:
# Process the rows batch by batch. Each finished batch is cached as one JSON file, so an
# interrupted run resumes without repeating the model calls of completed batches.
for slices in tqdm(sliced_range, total=len(sliced_range)):
    output_filepath = f'{cache_folder}/cache_res_{slices[0]}.json'
    if os.path.exists(output_filepath):
        print(f'cache file exists, skip {output_filepath}')
        continue
    df_tasks = df.iloc[slices[0]:slices[1]]
    # The work is network-bound, so threads provide the needed concurrency. A process pool
    # would have to import process_row from __main__, which is not possible for functions
    # defined in a notebook unless the platform forks.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Wrap executor.map in tqdm to show a progress bar; map yields results in input order.
        results = list(tqdm(executor.map(process_row, df_tasks.iterrows()), total=len(df_tasks)))
    save_json(output_filepath, results)

  0%|          | 0/9 [00:00<?, ?it/s]

cache file exists, skip ./cache_gpt-4o_model_misconceptions_result/cache_res_0.json
cache file exists, skip ./cache_gpt-4o_model_misconceptions_result/cache_res_100.json



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:02<04:31,  2.74s/it][A
  3%|▎         | 3/100 [00:04<02:21,  1.46s/it][A
  4%|▍         | 4/100 [00:04<01:38,  1.02s/it][A
  5%|▌         | 5/100 [00:07<02:30,  1.59s/it][A
  7%|▋         | 7/100 [00:09<02:04,  1.34s/it][A
  8%|▊         | 8/100 [00:10<01:45,  1.14s/it][A
  9%|▉         | 9/100 [00:12<01:59,  1.32s/it][A
 10%|█         | 10/100 [00:13<02:07,  1.41s/it][A
 11%|█         | 11/100 [00:15<02:03,  1.38s/it][A
 12%|█▏        | 12/100 [00:16<01:57,  1.33s/it][A
 13%|█▎        | 13/100 [00:17<01:56,  1.34s/it][A
 14%|█▍        | 14/100 [00:18<01:50,  1.29s/it][A
 15%|█▌        | 15/100 [00:19<01:39,  1.17s/it][A
 16%|█▌        | 16/100 [00:21<01:56,  1.39s/it][A
 17%|█▋        | 17/100 [00:22<01:34,  1.13s/it][A
 18%|█▊        | 18/100 [00:23<01:42,  1.25s/it][A
 19%|█▉        | 19/100 [00:24<01:32,  1.14s/it][A
 20%|██        | 20/100 [00:26<01:42,  1.28s/it][A
 21%|██        | 21/100 [00

save file to ./cache_gpt-4o_model_misconceptions_result/cache_res_200.json



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:03<05:27,  3.31s/it][A
  3%|▎         | 3/100 [00:05<02:33,  1.58s/it][A
  4%|▍         | 4/100 [00:06<02:23,  1.50s/it][A
  5%|▌         | 5/100 [00:07<02:12,  1.40s/it][A
  6%|▌         | 6/100 [00:08<01:55,  1.23s/it][A
  7%|▋         | 7/100 [00:11<02:35,  1.67s/it][A
  8%|▊         | 8/100 [00:11<01:53,  1.24s/it][A
  9%|▉         | 9/100 [00:13<02:06,  1.39s/it][A
 10%|█         | 10/100 [00:13<01:45,  1.17s/it][A
 11%|█         | 11/100 [00:20<04:21,  2.93s/it][A
 14%|█▍        | 14/100 [00:21<02:05,  1.46s/it][A
 15%|█▌        | 15/100 [00:25<02:38,  1.87s/it][A
 17%|█▋        | 17/100 [00:25<01:46,  1.28s/it][A
 18%|█▊        | 18/100 [00:27<01:54,  1.40s/it][A
 19%|█▉        | 19/100 [00:28<01:40,  1.25s/it][A
 20%|██        | 20/100 [00:30<01:45,  1.31s/it][A
 21%|██        | 21/100 [00:30<01:30,  1.15s/it][A
 22%|██▏       | 22/100 [00:32<01:38,  1.26s/it][A
 23%|██▎       | 23/100 [00:

save file to ./cache_gpt-4o_model_misconceptions_result/cache_res_300.json



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:03<05:04,  3.07s/it][A
  3%|▎         | 3/100 [00:05<02:36,  1.61s/it][A
  4%|▍         | 4/100 [00:06<02:06,  1.31s/it][A
  5%|▌         | 5/100 [00:07<02:17,  1.45s/it][A
  6%|▌         | 6/100 [00:08<01:42,  1.09s/it][A
  7%|▋         | 7/100 [00:10<02:13,  1.44s/it][A
  9%|▉         | 9/100 [00:14<02:37,  1.73s/it][A
 11%|█         | 11/100 [00:16<02:13,  1.50s/it][A
 13%|█▎        | 13/100 [00:18<01:58,  1.36s/it][A
 15%|█▌        | 15/100 [00:21<01:59,  1.41s/it][A
 16%|█▌        | 16/100 [00:22<01:40,  1.19s/it][A
 17%|█▋        | 17/100 [00:24<01:57,  1.42s/it][A
 18%|█▊        | 18/100 [00:25<01:39,  1.22s/it][A
 19%|█▉        | 19/100 [00:27<02:13,  1.64s/it][A
 21%|██        | 21/100 [00:30<01:51,  1.41s/it][A
 22%|██▏       | 22/100 [00:30<01:33,  1.20s/it][A
 23%|██▎       | 23/100 [00:32<01:41,  1.32s/it][A
 24%|██▍       | 24/100 [00:33<01:30,  1.19s/it][A
 25%|██▌       | 25/100 [00

save file to ./cache_gpt-4o_model_misconceptions_result/cache_res_400.json



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:04<07:17,  4.42s/it][A
  3%|▎         | 3/100 [00:07<03:23,  2.10s/it][A
  4%|▍         | 4/100 [00:07<02:20,  1.46s/it][A
  5%|▌         | 5/100 [00:10<03:09,  1.99s/it][A
  7%|▋         | 7/100 [00:12<02:25,  1.57s/it][A
  9%|▉         | 9/100 [00:14<02:00,  1.32s/it][A
 10%|█         | 10/100 [00:15<01:43,  1.15s/it][A
 11%|█         | 11/100 [00:17<02:03,  1.39s/it][A
 12%|█▏        | 12/100 [00:17<01:33,  1.06s/it][A
 13%|█▎        | 13/100 [00:19<02:00,  1.38s/it][A
 14%|█▍        | 14/100 [00:20<01:37,  1.13s/it][A
 15%|█▌        | 15/100 [00:22<02:00,  1.42s/it][A
 16%|█▌        | 16/100 [00:22<01:41,  1.21s/it][A
 17%|█▋        | 17/100 [00:25<02:12,  1.60s/it][A
 19%|█▉        | 19/100 [00:27<01:52,  1.39s/it][A
 21%|██        | 21/100 [00:30<01:42,  1.30s/it][A
 22%|██▏       | 22/100 [00:34<02:40,  2.06s/it][A
 24%|██▍       | 24/100 [00:35<01:40,  1.33s/it][A
 25%|██▌       | 25/100 [0

save file to ./cache_gpt-4o_model_misconceptions_result/cache_res_500.json



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:02<04:51,  2.95s/it][A
  3%|▎         | 3/100 [00:04<02:11,  1.36s/it][A
  4%|▍         | 4/100 [00:05<01:50,  1.15s/it][A
  5%|▌         | 5/100 [00:07<02:18,  1.46s/it][A
  6%|▌         | 6/100 [00:07<01:41,  1.08s/it][A
  7%|▋         | 7/100 [00:10<02:27,  1.59s/it][A
  9%|▉         | 9/100 [00:12<02:01,  1.34s/it][A
 10%|█         | 10/100 [00:12<01:33,  1.04s/it][A
 11%|█         | 11/100 [00:15<02:10,  1.47s/it][A
 13%|█▎        | 13/100 [00:17<01:59,  1.37s/it][A
 14%|█▍        | 14/100 [00:17<01:32,  1.08s/it][A
 15%|█▌        | 15/100 [00:20<01:55,  1.36s/it][A
 16%|█▌        | 16/100 [00:20<01:29,  1.06s/it][A
 17%|█▋        | 17/100 [00:22<01:57,  1.42s/it][A
 18%|█▊        | 18/100 [00:22<01:28,  1.09s/it][A
 19%|█▉        | 19/100 [00:24<01:49,  1.35s/it][A
 20%|██        | 20/100 [00:25<01:20,  1.01s/it][A
 21%|██        | 21/100 [00:27<01:44,  1.32s/it][A
 22%|██▏       | 22/100 [00

save file to ./cache_gpt-4o_model_misconceptions_result/cache_res_600.json



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:02<04:47,  2.91s/it][A
  3%|▎         | 3/100 [00:04<02:20,  1.45s/it][A
  4%|▍         | 4/100 [00:05<01:48,  1.13s/it][A
  5%|▌         | 5/100 [00:07<02:32,  1.60s/it][A
  7%|▋         | 7/100 [00:10<02:13,  1.44s/it][A
  8%|▊         | 8/100 [00:10<01:43,  1.13s/it][A
  9%|▉         | 9/100 [00:12<02:01,  1.33s/it][A
 10%|█         | 10/100 [00:12<01:30,  1.01s/it][A
 11%|█         | 11/100 [00:16<02:45,  1.86s/it][A
 13%|█▎        | 13/100 [00:17<01:43,  1.19s/it][A
 14%|█▍        | 14/100 [00:19<01:55,  1.34s/it][A
 15%|█▌        | 15/100 [00:20<01:45,  1.24s/it][A
 16%|█▌        | 16/100 [00:21<01:51,  1.33s/it][A
 17%|█▋        | 17/100 [00:22<01:43,  1.24s/it][A
 18%|█▊        | 18/100 [00:24<01:42,  1.25s/it][A
 19%|█▉        | 19/100 [00:26<02:09,  1.60s/it][A
 20%|██        | 20/100 [00:26<01:39,  1.24s/it][A
 21%|██        | 21/100 [00:29<02:08,  1.63s/it][A
 23%|██▎       | 23/100 [00

save file to ./cache_gpt-4o_model_misconceptions_result/cache_res_700.json



  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:02<00:16,  2.77s/it][A
 29%|██▊       | 2/7 [00:03<00:07,  1.50s/it][A
 43%|████▎     | 3/7 [00:05<00:06,  1.65s/it][A
 57%|█████▋    | 4/7 [00:06<00:04,  1.34s/it][A
 71%|███████▏  | 5/7 [00:07<00:02,  1.37s/it][A
 86%|████████▌ | 6/7 [00:08<00:01,  1.15s/it][A
100%|██████████| 7/7 [00:10<00:00,  1.52s/it][A
100%|██████████| 9/9 [13:08<00:00, 87.66s/it] 

save file to ./cache_gpt-4o_model_misconceptions_result/cache_res_800.json





In [16]:
# Collect every .json file in the cache folder.
f_names = glob.glob(f'{cache_folder}/*.json')

In [17]:
def natural_sort_key(filename):
    """Build a numeric sort key from every run of digits in ``filename``.

    Returns:
        tuple: The digit runs converted to ``int``, in order of appearance; empty when the
        name contains no digits.
    """
    return tuple(int(digits) for digits in re.findall(r'\d+', filename))

In [18]:
# Order the cache files by the numbers in their paths so batches are read in row order.
sorted_filenames = f_names = sorted(f_names, key=natural_sort_key)

In [19]:
# Read the cached batches in file order and flatten them into one list of raw replies.
results = []
for cache_path in f_names:
    with open(cache_path, 'r') as cache_file:
        results += json.load(cache_file)

In [20]:
# Number of cached results; used below to trim df to the same number of rows.
l = len(results)

In [21]:
def _parse_result(raw):
    """Decode one cached reply into a dict without raising.

    Dicts pass through unchanged, which keeps the cell re-runnable and accepts entries that
    were cached as dicts. Strings are decoded as JSON. Anything that cannot be decoded
    becomes ``{"error": "<message>"}`` so the result list keeps one entry per row.
    """
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        return {"error": f"unexpected result type: {type(raw).__name__}"}
    try:
        return json.loads(raw)
    except json.JSONDecodeError as e:
        # For example a reply cut off by the output-token limit.
        return {"error": f"invalid JSON: {e}"}


results = [_parse_result(r) for r in results]

In [22]:
# Keep only the first l rows so df has as many rows as there are results.
# NOTE(review): this lines up with the results only if processing started at row 0
# (index_start == 0) — confirm before changing index_start.
df = df.iloc[:l]

In [23]:
# One row per result; the keys of each decoded reply become columns.
gen_df = pd.DataFrame(results)

In [24]:
# Put the generated columns next to the source columns (axis=1 aligns on the row index).
# NOTE(review): df is overwritten, so re-running this cell appends the generated columns again.
df = pd.concat([df, gen_df], axis=1)

In [25]:
# Write the combined table; index=False leaves the pandas row index out of the file.
df.to_csv(output_data_path, index=False)