In [1]:
import pandas as pd
import torch
import numpy as np
from vllm import LLM, SamplingParams
from tqdm.auto import tqdm
import random
import os

2024-11-04 12:42:14.732224: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730691734.749731  423556 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730691734.755015  423556 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-04 12:42:14.776384: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


In [2]:
# Seed shared by every RNG in this notebook (also passed to SamplingParams).
# Alternative seeds kept for reference: 20241027, 20240617, 355643
seed = 777


def set_seed(seed = 42):
    """Seed Python, NumPy and PyTorch so that reruns give the same results.

    Also switches cuDNN to its deterministic, non-benchmarking mode and
    exports PYTHONHASHSEED, then prints a confirmation line.

    Args:
        seed: Integer applied to every random number generator.
    """
    # NOTE(review): the running interpreter presumably read PYTHONHASHSEED at
    # startup, so this export likely only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for apply_seed in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    # The cuDNN backend needs both switches set for deterministic runs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print('> SEEDING DONE')


set_seed(seed)

> SEEDING DONE


In [3]:
# Model identifier passed to vLLM's LLM(...) when the engine is created.
MODEL_NAME = "Qwen/Qwen2.5-72B-Instruct-AWQ"
# Number of CUDA devices reported by torch.
# NOTE(review): no other visible cell reads NUM_GPU (the engine is created
# with tensor_parallel_size=1) - confirm whether it is still needed.
NUM_GPU = torch.cuda.device_count()
# How many training rows are sampled as few-shot examples for each prompt.
few_shot = 3

In [None]:
# Create the vLLM engine for MODEL_NAME and fetch its tokenizer, which is
# later used to render chat-formatted prompts.
llm = LLM(
    MODEL_NAME,
    dtype="half",
    trust_remote_code=True,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.95,
    max_model_len=4096 * 2,  # 8192 tokens
    max_num_seqs=256,
    enforce_eager=True,
    disable_log_stats=True,
)
tokenizer = llm.get_tokenizer()

# Decoding settings shared by every generation call in this notebook.
sampling_params = SamplingParams(
    n=1,  # one output sequence per prompt
    temperature=1.0,  # randomness of the sampling
    top_p=0.7,  # cumulative probability of the top tokens to consider
    max_tokens=3072,  # upper bound on tokens generated per output sequence
    seed=seed,  # notebook-wide seed, for reproducibility
    skip_special_tokens=True,  # leave special tokens out of the output text
)

INFO 11-04 12:42:32 awq_marlin.py:97] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-04 12:42:32 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='Qwen/Qwen2.5-72B-Instruct-AWQ', speculative_config=None, tokenizer='Qwen/Qwen2.5-72B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_

Loading safetensors checkpoint shards:   0% Completed | 0/11 [00:00<?, ?it/s]


INFO 11-04 12:42:50 model_runner.py:1071] Loading model weights took 38.7697 GB
INFO 11-04 12:42:56 gpu_executor.py:122] # GPU blocks: 6846, # CPU blocks: 819
INFO 11-04 12:42:56 gpu_executor.py:126] Maximum concurrency for 8192 tokens per request: 13.37x


In [None]:
# Misconception table: provides the MisconceptionId / MisconceptionName
# columns that the prompt-building and export cells read.
df = pd.read_csv('../../../../data/raw/misconception_mapping.csv')
# Bare last expression so the notebook renders the frame for inspection.
df

Unnamed: 0,MisconceptionId,MisconceptionName
0,0,Does not know that angles in a triangle sum to...
1,1,Uses dividing fractions method for multiplying...
2,2,Believes there are 100 degrees in a full turn
3,3,Thinks a quadratic without a non variable term...
4,4,Believes addition of terms and powers of terms...
...,...,...
2582,2582,"When multiplying numbers with the same base, m..."
2583,2583,Does not know what a cube number is
2584,2584,Believes that any percentage of a larger numbe...
2585,2585,Believes a cubic expression should have three ...


In [None]:
# NOTE(review): pandas is already imported in the first cell; this re-import
# is redundant on a top-to-bottom run and only matters if the cell is run alone.
import pandas as pd
# Training questions; rows are sampled from this frame as few-shot examples.
df_train = pd.read_csv('../../../../data/1.chain_of_thought/train.csv')
# Bare last expression so the notebook renders the frame for inspection.
df_train

Unnamed: 0,QuestionId_Answer,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,QuestionText,Answer,AnswerText,Correct,CorrectAnswer,CorrectAnswerText,FirstSubjectId,FirstSubjectName,SecondSubjectId,SecondSubjectName,ThirdSubjectId,ThirdSubjectName,MisconceptionId,p000-qwen25-32b-instruct-cot_misunderstanding
0,0_D,0,856,Use the order of operations to carry out calcu...,33,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,D,Does not need brackets,0,A,\( 3 \times(2+4)-5 \),32,Number,144,Basic Arithmetic,33,BIDMAS,1672.0,The students' misunderstanding lies in their l...
1,1_A,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",A,\( m+1 \),0,D,Does not simplify,49,Algebra,255,Algebraic Fractions,1077,Simplifying Algebraic Fractions,2142.0,The misunderstanding likely stems from a conce...
2,1_B,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",B,\( m+2 \),0,D,Does not simplify,49,Algebra,255,Algebraic Fractions,1077,Simplifying Algebraic Fractions,143.0,The misunderstanding likely stems from a conce...
3,1_C,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",C,\( m-1 \),0,D,Does not simplify,49,Algebra,255,Algebraic Fractions,1077,Simplifying Algebraic Fractions,2142.0,### Explanation of the Mistake\n\n**Step-by-St...
4,2_A,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,A,Only\nTom,0,B,Only\nKatie,101,Data and Statistics,338,Data Processing,339,Range and Interquartile Range from a List of Data,1287.0,The misunderstanding here revolves around how ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4365,1867_C,1867,2634,Distinguish between congruency and similarity,274,Congruency in Other Shapes,Tom and Katie are discussing congruence and si...,C,Both Tom and Katie,0,B,Only Katie,71,Geometry and Measure,272,Similarity and Congruency,274,Congruency in Other Shapes,2312.0,The misunderstanding here revolves around the ...
4366,1867_D,1867,2634,Distinguish between congruency and similarity,274,Congruency in Other Shapes,Tom and Katie are discussing congruence and si...,D,Neither is correct,0,B,Only Katie,71,Geometry and Measure,272,Similarity and Congruency,274,Congruency in Other Shapes,2312.0,The misunderstanding here revolves around the ...
4367,1868_A,1868,2680,Describe a 90° or 270° rotation giving the ang...,93,Rotation,Jo and Paul are arguing about how to fully des...,A,Only\nJo,0,B,Only Paul,71,Geometry and Measure,278,Transformations,93,Rotation,801.0,The misunderstanding here revolves around the ...
4368,1868_C,1868,2680,Describe a 90° or 270° rotation giving the ang...,93,Rotation,Jo and Paul are arguing about how to fully des...,C,Both Jo and Paul,0,B,Only Paul,71,Geometry and Measure,278,Transformations,93,Rotation,801.0,The misunderstanding here revolves around the ...


In [8]:
# System message. It is used verbatim (never passed through str.format), so
# the "{subject}" / "{question}" braces reach the model literally as a format hint.
system_prompt_template = 'You are an excellent math teacher about to teach students of year group 1 to 14. You will be provided a misconception that your students may have. Please write 5 diagnostic questions that can help you identify the misconception. Only write the question. No need to provide explanation for correct answer. Each question should be in format of "Topic: {subject}. Question: {question}."'
# One few-shot example; filled with misconception, subject and question.
example_template = 'Misconception: {misconception}\nTopic: {subject}. Question: {question}.'
# User message; filled with the joined examples, the subject list and the
# target misconception.
user_prompt_template = "Here are some examples:\n{examples}.\nHere are all of the topics of your lesson: {subjects}\n\nPlease start writing the question. Misconception: {misconception}\n"

In [9]:
# Preview: build and print the full chat prompt for one misconception so the
# template can be checked by eye before prompts are built for every row.
# NOTE(review): `all_subjects_str` is not defined in any visible cell, and a
# `Misconception` column is not among the columns displayed when train.csv is
# loaded - confirm both exist after Restart & Run All.
i = 36
row = df.iloc[i]
misconception = row['MisconceptionName']

# Few-shot examples are drawn at random from the training questions.
# NOTE(review): sample() gets no random_state, so the draw presumably comes
# from NumPy's global RNG and depends on earlier draws - confirm.
df_train_sample = df_train.sample(few_shot).reset_index(drop=True)
examples = []
for j in range(len(df_train_sample)):
    row_sample = df_train_sample.iloc[j]
    # Only the three fields consumed by `example_template` are read. The
    # previously unused lookups were dropped; one of them named a column that
    # does not match the displayed train.csv schema.
    example = f'example{j+1}: \n' + example_template.format(
        misconception=row_sample['Misconception'],
        subject=row_sample['ThirdSubjectName'],
        question=row_sample['QuestionText'],
    )
    examples.append(example)

example_str = '\n\n'.join(examples)

user_prompt = user_prompt_template.format(misconception=misconception, examples=example_str, subjects=all_subjects_str)
system_prompt = system_prompt_template
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

# Render the messages with the model's chat template, leaving the prompt open
# for the assistant's reply.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)

<|im_start|>system
You are an excellent math teacher about to teach students of year group 1 to 14. You will be provided a misconception that your students may have. Please write 5 diagnostic questions that can help you identify the misconception. Only write the question. No need to provide explanation for correct answer. Each question should be in format of "Topic: {subject}. Question: {question}."<|im_end|>
<|im_start|>user
Here are some examples:
example1: 
Misconception: Does not know how to identify common factors from algebraic terms
Topic: Factors and Highest Common Factor. Question: Tom and Katie are discussing factors

Tom says \( 4 \) is a common factor of \( 6 x \) and \( 9 y^{2} \)

Katie says \( y \) is a common factor of \( 6 x \) and \( 9 y^{2} \)

Who is correct?.

example2: 
Misconception: Divides rather than adds when answering worded problems
Topic: Mental Addition and Subtraction. Question: Which of the following calculations would solve this problem?
Harvey is moni

In [10]:
# Smoke test: generate for the single preview prompt and print the text of
# the first completion.
# NOTE(review): the batch-generation cell later rebinds the same name `output`.
output = llm.generate(text, sampling_params=sampling_params)
print(output[0].outputs[0].text)

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:22<00:00, 22.02s/it, est. speed input: 198.83 toks/s, output: 13.90 toks/s]

Topic: Enlargement. Question: A shape is to be enlarged by a scale factor of 2 with the center of enlargement at point (3, 3). If a point on the shape is at (1, 1), where will the corresponding point be after the enlargement?

Topic: Enlargement. Question: A triangle has vertices at (2, 2), (4, 2), and (2, 4). It is to be enlarged by a scale factor of 3 with the center of enlargement at the origin (0, 0). What are the coordinates of the new vertices?

Topic: Enlargement. Question: A rectangle has vertices at (1, 1), (1, 3), (4, 3), and (4, 1). It is to be enlarged by a scale factor of 0.5 with the center of enlargement at (2, 2). What are the coordinates of the new vertices?

Topic: Enlargement. Question: A point (5, 5) is to be enlarged by a scale factor of -2 with the center of enlargement at (1, 1). What are the coordinates of the new point?

Topic: Enlargement. Question: A shape is to be enlarged by a scale factor of 1.5 with the center of enlargement at (2, 2). If a point on the s




In [11]:
# Build one chat-formatted prompt per misconception, each with its own random
# draw of `few_shot` training examples. Prompts are appended in df row order.
# NOTE(review): `all_subjects_str` is not defined in any visible cell, and a
# `Misconception` column is not among the columns displayed when train.csv is
# loaded - confirm both exist after Restart & Run All.
all_texts = []
for misconception in tqdm(df['MisconceptionName']):
    # NOTE(review): sample() gets no random_state, so the draw presumably
    # comes from NumPy's global RNG and depends on earlier draws - confirm.
    df_train_sample = df_train.sample(few_shot).reset_index(drop=True)
    examples = []
    for j in range(len(df_train_sample)):
        row_sample = df_train_sample.iloc[j]
        # Only the three fields consumed by `example_template` are read. The
        # previously unused lookups were dropped; one of them named a column
        # that does not match the displayed train.csv schema.
        example = f'example{j+1}: \n' + example_template.format(
            misconception=row_sample['Misconception'],
            subject=row_sample['ThirdSubjectName'],
            question=row_sample['QuestionText'],
        )
        examples.append(example)

    example_str = '\n\n'.join(examples)

    user_prompt = user_prompt_template.format(misconception=misconception, examples=example_str, subjects=all_subjects_str)
    system_prompt = system_prompt_template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Render with the model's chat template, leaving the prompt open for the
    # assistant's reply.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    all_texts.append(text)

  0%|          | 0/2587 [00:00<?, ?it/s]

In [12]:
# Batch generation over every prompt in `all_texts`.
# NOTE(review): this rebinds `output`, which the single-prompt preview cell
# also assigns; if this call does not finish, `output` presumably still holds
# the preview result - confirm before running the next cell.
output = llm.generate(all_texts, sampling_params=sampling_params)

Processed prompts:   1%|          | 24/2587 [02:30<4:09:37,  5.84s/it, est. speed input: 696.09 toks/s, output: 23.09 toks/s]

In [None]:
# Store the text of the first completion of each request (n=1 is requested)
# as a new column. Assumes `output` is in the same order as the rows of df.
df['synthetic_question'] = [request_output.outputs[0].text for request_output in output]

In [5]:
def extract_question(text: str):
    """Parse generated text into parallel lists of topics and questions.

    The text is cut at every ``Topic:`` marker, and each piece is expected to
    contain exactly one ``Question:`` marker separating the topic from the
    question. Both parts are stripped of surrounding whitespace and lose a
    single trailing ``.`` if they have one.

    Malformed pieces are skipped rather than raised on:
      * pieces without exactly one ``Question:`` marker (this includes any
        text before the first ``Topic:``);
      * pieces whose topic or question is empty after stripping.

    Unlike a blanket ``except``, unrelated errors (and KeyboardInterrupt)
    are not swallowed.

    Args:
        text: Raw completion string.

    Returns:
        ``(topics, questions)``: two lists of equal length, where
        ``topics[k]`` belongs to ``questions[k]``.
    """
    topics = []
    questions = []
    for segment in text.split('Topic:'):
        parts = segment.split('Question:')
        if len(parts) != 2:
            # Zero or several 'Question:' markers: not a well-formed entry.
            continue
        topic, question = (part.strip() for part in parts)
        if not topic or not question:
            continue
        if topic.endswith('.'):
            topic = topic[:-1]
        if question.endswith('.'):
            question = question[:-1]
        topics.append(topic)
        questions.append(question)
    return topics, questions

In [None]:
# Flatten each misconception's raw completion into one record per parsed
# (topic, question) pair.
rows = []
for mis_id, mis_name, synthetic_question in zip(
    df['MisconceptionId'], df['MisconceptionName'], df['synthetic_question']
):
    topics, questions = extract_question(synthetic_question)
    rows.extend(
        [mis_id, mis_name, topic, question]
        for topic, question in zip(topics, questions)
    )
df_generated = pd.DataFrame(rows, columns=['MisconceptionId', 'MisconceptionName', 'SubjectName', 'QuestionText'])

In [None]:
# Persist the flattened synthetic questions; index=False leaves the
# DataFrame index out of the CSV.
df_generated.to_csv('../../../../data/2.synthetic_data_generation/generation1_qwen.csv', index=False)