In [1]:
import pandas as pd
import torch
import numpy as np
import random
import os
from vllm import LLM, SamplingParams
from tqdm.auto import tqdm

seed = 99

def set_seed(seed = 42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    print('> SEEDING DONE')

set_seed(seed)

2024-11-06 00:07:22.853616: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730819242.908712 1299302 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730819242.926243 1299302 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 00:07:23.049098: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


> SEEDING DONE


In [2]:
few_shot = 5

In [None]:

df = pd.read_csv('../../../../data/2.synthetic_data_generation/generation1_all_render.csv')
df_train = pd.read_csv('../../../../data/1.chain_of_thought/train.csv')
df

Unnamed: 0,MisconceptionId,MisconceptionName,ThirdSubjectName,QuestionText,author,ThirdSubjectId,SecondSubjectId,SecondSubjectName,FirstSubjectId,FirstSubjectName,s000-AnswerText-qwen-42,s000-CorrectAnswerText-qwen-42,s000-AnswerText-qwen-must-answer-42,CorrectAnswerText-qwen-seed42,AnswerText-qwen-seed42,AnswerText-qwen-must-answer-seed42,AnswerText-qwen-answer-final-seed42,CorrectAnswerText,AnswerText
0,2,Believes there are 100 degrees in a full turn,"Basic Angle Facts (straight line, opposite, ar...","If Sally spins around in a full circle, how ma...",llama,181,74.0,Angles,71.0,Geometry and Measure,To solve the problem under the constraint that...,To determine how many degrees Sally has turned...,To solve the problem under the constraint that...,360,100,100,100,360,100
1,2,Believes there are 100 degrees in a full turn,"Basic Angle Facts (straight line, opposite, ar...",What is the sum of the angles around a point?,llama,181,74.0,Angles,71.0,Geometry and Measure,To solve the problem under the constraint that...,To determine the sum of the angles around a po...,To solve the problem under the constraint that...,360,100,100,100,360,100
2,2,Believes there are 100 degrees in a full turn,Angles-Others,"A car wheel makes one full rotation, how many ...",llama,1177,74.0,Angles,71.0,Geometry and Measure,To solve the problem under the constraint that...,To determine how many degrees a car wheel turn...,To solve the problem under the constraint that...,360,100,100,100,360,100
3,2,Believes there are 100 degrees in a full turn,"Basic Angle Facts (straight line, opposite, ar...","If an angle is 90 degrees, what fraction of a ...",llama,181,74.0,Angles,71.0,Geometry and Measure,To solve the problem under the constraint that...,To determine what fraction of a full turn an a...,To solve the problem under the constraint that...,\frac{1}{4},\frac{9}{10},\frac{9}{10},\frac{9}{10},\frac{1}{4},\frac{9}{10}
4,2,Believes there are 100 degrees in a full turn,Angles-Others,If a person turns 180 degrees and then another...,llama,1177,74.0,Angles,71.0,Geometry and Measure,To solve the problem under the constraint that...,To determine the total degrees turned by a per...,To solve the problem under the constraint that...,360,100,100,100,360,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16356,2053,"When reading integers on a number line, assume...",Mental Addition and Subtraction,On a number line with dashes marked at every 5...,llama,203,144.0,Basic Arithmetic,32.0,Number,To solve the problem under the given constrain...,To determine the value of the point that is 3 ...,To solve the problem under the given constrain...,25 or -5,Not enough information,13,13,25 or -5,13
16357,1271,Confuses chord and radius,Parts of a Circle,"In a circle, a line segment that connects two ...",qwen,184,77.0,Circles,71.0,Geometry and Measure,To solve the problem under the constraint of t...,To determine the name of a line segment that c...,To solve the problem under the constraint that...,diameter,Not enough information,chord,chord,diameter,chord
16358,1741,Confuses diameter and chord,Circles-Others,"In a circle with a radius of 5 cm, what is the...",qwen,1178,77.0,Circles,71.0,Geometry and Measure,To solve the problem under the constraint that...,To find the length of the diameter of a circle...,To solve the problem under the constraint that...,10,Not enough information,5,5,10,5
16359,566,Thinks the inverse of subtraction is division,Linear Equations,"If x = 5 - 2, what operation would you use to ...",llama,64,154.0,Solving Equations,49.0,Algebra,To solve the problem under the constraint that...,To isolate the 5 in the equation \( x = 5 - 2 ...,To solve the problem under the constraint that...,addition,Not enough information,division,division,addition,division


In [4]:

model_path = "Qwen/Qwen2.5-72B-Instruct-AWQ"
num_gpu = 1

llm = LLM(
    model_path,
    tensor_parallel_size=num_gpu,
    gpu_memory_utilization=0.95,
    trust_remote_code=True,
    dtype="half",
    enforce_eager=True,
    max_model_len=4096,
    disable_log_stats=True,
    max_num_seqs=128,
)
tokenizer = llm.get_tokenizer()

sampling_params = SamplingParams(
    n=1,  # Number of output sequences to return for each prompt.
    top_p=0.9,  # Float that controls the cumulative probability of the top tokens to consider.
    temperature=0.7,  # randomness of the sampling
    seed=seed,  # Seed for reprodicibility
    skip_special_tokens=True,  # Whether to skip special tokens in the output.
    max_tokens=512,  # Maximum number of tokens to generate per output sequence.
)

INFO 11-06 00:07:42 awq_marlin.py:97] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-06 00:07:42 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='Qwen/Qwen2.5-72B-Instruct-AWQ', speculative_config=None, tokenizer='Qwen/Qwen2.5-72B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_

Loading safetensors checkpoint shards:   0% Completed | 0/11 [00:00<?, ?it/s]


INFO 11-06 00:08:22 model_runner.py:1071] Loading model weights took 38.7697 GB
INFO 11-06 00:08:25 gpu_executor.py:122] # GPU blocks: 7171, # CPU blocks: 819
INFO 11-06 00:08:25 gpu_executor.py:126] Maximum concurrency for 4096 tokens per request: 28.01x


In [5]:
system_prompt_template = 'You are an excellent math teacher about to teach students of year group 1 to 14. The detail of your lesson is as follows. Subject:{first_subject}, Topic: {second_subject}, Subtopic {third_subject}. You will be provided a question. Please tell me the most granular level of knowledge related to question. Your answer should be in the format of "ConstructName: {construct name}"'
example_template = 'Question: {question}\nConstructName: {construct_name}'
user_prompt_template = "Here are some examples:\n{examples}.\n\nPlease start answering. Question: {question}\nConstructName: "

In [8]:
i = 0
row = df.iloc[i]
question = row['QuestionText']
question = question
third_subject = row['ThirdSubjectName']
second_subject = row['SecondSubjectName']
first_subject = row['FirstSubjectName']
df_train_third = df_train[df_train['ThirdSubjectName'] == third_subject].reset_index(drop=True)
df_train_second = df_train[df_train['SecondSubjectName'] == second_subject].reset_index(drop=True)
df_train_first = df_train[df_train['FirstSubjectName'] == first_subject].reset_index(drop=True)

df_train_sampler = None
if len(df_train_third) > 0:
    df_train_sampler = df_train_third
    diff = few_shot - len(df_train_third)
    if diff > 0:
        if len(df_train_second)>=diff:
            df_train_sampler = pd.concat([df_train_sampler, df_train_second.sample(diff).reset_index(drop=True)])
        else:
            df_train_sampler = pd.concat([df_train_sampler, df_train_second])
            diff = diff - len(df_train_second)
            if len(df_train_first)>=diff:
                df_train_sampler = pd.concat([df_train_sampler, df_train_first.sample(diff).reset_index(drop=True)])
            else:
                df_train_sampler = pd.concat([df_train_sampler, df_train_first])
                diff = diff - len(df_train_first)
                df_not_first
                df_train_sampler = pd.concat([df_train_sampler,  df_train_not_first.sample(diff)]).reset_index(drop=True)
elif len(df_train_second) > 0:
    df_train_sampler = df_train_second
    diff = few_shot - len(df_train_second)
    if diff > 0:
        if len(df_train_first)>=diff:
            df_train_sampler = pd.concat([df_train_sampler, df_train_first.sample(diff).reset_index(drop=True)])
        else:
            df_train_sampler = pd.concat([df_train_sampler, df_train_first])
            diff = diff - len(df_train_first)
            df_train_sampler = pd.concat([df_train_sampler, df_train_not_first.sample(diff)]).reset_index(drop=True)
elif len(df_train_first) > 0:
    df_train_sampler = df_train_first
    diff = few_shot - len(df_train_first)
    if diff > 0:
        df_train_sampler = pd.concat([df_train_sampler, df_train_not_first.sample(diff)]).reset_index(drop=True)
else:
    df_train_sampler = df_train


df_train_sample = df_train_sampler.sample(few_shot).reset_index(drop=True)
examples = []
for j in range(len(df_train_sample)):
    row_sample = df_train_sample.iloc[j]
    question_sample = row_sample['QuestionText']
    ans_a_sample = row_sample['AnswerAText']
    ans_b_sample = row_sample['AnswerBText']
    ans_c_sample = row_sample['AnswerCText']
    ans_d_sample = row_sample['AnswerDText']
    question_sample = question_sample + f'\nA. {ans_a_sample}\nB. {ans_b_sample}\nC. {ans_c_sample}\nD. {ans_d_sample}'
    construct_name = row_sample['ConstructName']
    example = f'example{j+1}: \n' + example_template.format(question=question_sample, construct_name=construct_name)
    examples.append(example)

example_str = '\n\n'.join(examples)

user_prompt = user_prompt_template.format(examples=example_str, question=question)
system_prompt = system_prompt_template
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)

<|im_start|>system
You are an excellent math teacher about to teach students of year group 1 to 14. The detail of your lesson is as follows. Subject:{first_subject}, Topic: {second_subject}, Subtopic {third_subject}. You will be provided a question. Please tell me the most granular level of knowledge related to question. Your answer should be in the format of "ConstructName: {construct name}"<|im_end|>
<|im_start|>user
Here are some examples:
example1: 
Question: What is the size of angle \( k \) ? ![Three angles which meet to form a straight line. They are labelled 46 degrees, 115 degrees and k.]()
A. \( 46^{\circ} \)
B. \( 65^{\circ} \)
C. \( 19^{\circ} \)
D. Not enough information
ConstructName: Find missing angles using angles on a straight line

example2: 
Question: What is the size of angle \( w \) ? ![An isosceles triangle with a base angle labelled with w and the angle vertically opposite the vertex angle labelled with 33 degrees]()
A. \( 49^{\degree} \)
B. \( 34^{\degree} \)
C

In [9]:
output = llm.generate(text, sampling_params=sampling_params)
print(output[0].outputs[0].text)

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it, est. speed input: 441.03 toks/s, output: 11.07 toks/s]

ConstructName: Understand the measure of a full turn (360 degrees)





In [10]:
all_texts = []
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    question = row['QuestionText']
    question = question
    third_subject = row['ThirdSubjectName']
    second_subject = row['SecondSubjectName']
    first_subject = row['FirstSubjectName']
    df_train_third = df_train[df_train['ThirdSubjectName'] == third_subject].reset_index(drop=True)
    df_train_second = df_train[df_train['SecondSubjectName'] == second_subject].reset_index(drop=True)
    df_train_first = df_train[df_train['FirstSubjectName'] == first_subject].reset_index(drop=True)
    df_train_not_first = df_train[df_train['FirstSubjectName'] != first_subject].reset_index(drop=True)

    df_train_sampler = None
    if len(df_train_third) > 0:
        df_train_sampler = df_train_third
        diff = few_shot - len(df_train_third)
        if diff > 0:
            if len(df_train_second)>=diff:
                df_train_sampler = pd.concat([df_train_sampler, df_train_second.sample(diff).reset_index(drop=True)])
            else:
                df_train_sampler = pd.concat([df_train_sampler, df_train_second])
                diff = diff - len(df_train_second)
                if len(df_train_first)>=diff:
                    df_train_sampler = pd.concat([df_train_sampler, df_train_first.sample(diff).reset_index(drop=True)])
                else:
                    df_train_sampler = pd.concat([df_train_sampler, df_train_first])
                    diff = diff - len(df_train_first)
                    df_not_first
                    df_train_sampler = pd.concat([df_train_sampler,  df_train_not_first.sample(diff)]).reset_index(drop=True)
    elif len(df_train_second) > 0:
        df_train_sampler = df_train_second
        diff = few_shot - len(df_train_second)
        if diff > 0:
            if len(df_train_first)>=diff:
                df_train_sampler = pd.concat([df_train_sampler, df_train_first.sample(diff).reset_index(drop=True)])
            else:
                df_train_sampler = pd.concat([df_train_sampler, df_train_first])
                diff = diff - len(df_train_first)
                df_train_sampler = pd.concat([df_train_sampler, df_train_not_first.sample(diff)]).reset_index(drop=True)
    elif len(df_train_first) > 0:
        df_train_sampler = df_train_first
        diff = few_shot - len(df_train_first)
        if diff > 0:
            df_train_sampler = pd.concat([df_train_sampler, df_train_not_first.sample(diff)]).reset_index(drop=True)
    else:
        df_train_sampler = df_train


    df_train_sample = df_train_sampler.sample(few_shot).reset_index(drop=True)
    examples = []
    for j in range(len(df_train_sample)):
        row_sample = df_train_sample.iloc[j]
        question_sample = row_sample['QuestionText']
        ans_a_sample = row_sample['AnswerAText']
        ans_b_sample = row_sample['AnswerBText']
        ans_c_sample = row_sample['AnswerCText']
        ans_d_sample = row_sample['AnswerDText']
        question_sample = question_sample + f'\nA. {ans_a_sample}\nB. {ans_b_sample}\nC. {ans_c_sample}\nD. {ans_d_sample}'
        construct_name = row_sample['ConstructName']
        example = f'example{j+1}: \n' + example_template.format(question=question_sample, construct_name=construct_name)
        examples.append(example)

    example_str = '\n\n'.join(examples)

    user_prompt = user_prompt_template.format(examples=example_str, question=question)
    system_prompt = system_prompt_template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    all_texts.append(text)

  0%|          | 0/16361 [00:00<?, ?it/s]

In [11]:
output = llm.generate(all_texts, sampling_params=sampling_params)

Processed prompts:   2%|▏         | 305/16361 [03:49<2:23:20,  1.87it/s, est. speed input: 909.26 toks/s, output: 19.31 toks/s]

In [10]:
output_texts = [x.outputs[0].text for x in output]
output_texts[0:10]

['ConstructName: Understand the properties of square numbers and their multiples',
 'ConstructName: Calculate the difference between two fractions whose denominators are not multiples of the same number',
 'ConstructName: Recognise the equations of lines of symmetry from a diagram',
 'ConstructName: Understand and complete a factor tree',
 'ConstructName: Understand properties of regular polygons and their construction from a set of points.',
 'ConstructName: Interpret Venn diagrams to identify common elements',
 'ConstructName: Carry out multiplication problems involving one negative integer.',
 'ConstructName: Understand the placement of numbers in a Venn diagram based on common factors and multiples.',
 'ConstructName: Find missing angles in a triangle with given angles.',
 'ConstructName: Understand and apply properties of angles formed by intersecting lines and parallel lines cut by a transversal.']

In [14]:
output_texts_processed = []
for o in output_texts:
    output_texts_processed.append(o.replace('ConstructName:', '').strip())

In [None]:
df[f'ConstructName'] = output_texts_processed
df

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,FirstSubjectId,FirstSubjectName,SecondSubjectId,SecondSubjectName,ThirdSubjectId,ThirdSubjectName,QuestionTextAll,ConstructName-qwen25-72b-instruct
0,0,-1,Not Available,245,"Squares, Cubes, etc",A,If you multiply a square number by 9 you get a...,always true,sometimes true,never true,Impossible to say.,32,Number,40,"Indices, Powers and Roots",245,"Squares, Cubes, etc","The problem is: ""If you multiply a square numb...",Understand the properties of square numbers an...
1,1,-1,Not Available,228,Ordering Fractions,C,How much bigger is $\dfrac{3}{8}$ than $\dfrac...,$\dfrac{2}{24}$,$\dfrac{2}{5}$,$\dfrac{1}{24}$,$\dfrac{4}{11}$,32,Number,39,Fractions,228,Ordering Fractions,The problem is: How much bigger is $\dfrac{3}{...,Calculate the difference between two fractions...
2,10,-1,Not Available,258,Horizontal and Vertical Lines,A,What is the equation of this line of symmetry?,y = -3.5,y = -7,y = -2,Not possible to work out,49,Algebra,54,Straight Line Graphs,258,Horizontal and Vertical Lines,The problem is: What is the equation of this l...,Recognise the equations of lines of symmetry f...
3,100,-1,Not Available,221,Prime Numbers and Prime Factors,A,This is the first row of a factor tree. What n...,8,12,21,72,32,Number,37,"Factors, Multiples and Primes",221,Prime Numbers and Prime Factors,The problem is: This is the first row of a fac...,Understand and complete a factor tree
4,101,-1,Not Available,219,Factors and Highest Common Factor,A,Tom and Katie are arguing about this ring of 1...,Only Tom,Only Katie,Both Tom and Katie,Neither is correct.,32,Number,37,"Factors, Multiples and Primes",219,Factors and Highest Common Factor,The problem is: Tom and Katie are arguing abou...,Understand properties of regular polygons and ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,95,-1,Not Available,191,Construct Triangle,C,Construct the following triangle. Measure Angl...,76°,47°,58°,117°,71,Geometry and Measure,83,"Construction, Loci and Scale Drawing",191,Construct Triangle,The problem is: Construct the following triang...,Measure an angle using a protractor
944,96,-1,Not Available,229,Converting Mixed Number and Improper Fractions,A,What is 4 written as a fraction?,4/1,1/4,4/4,You cannot write whole numbers as fractions.,32,Number,39,Fractions,229,Converting Mixed Number and Improper Fractions,The problem is: What is 4 written as a fractio...,Write a whole number as a fraction
945,97,-1,Not Available,70,Writing Expressions,A,Using the key on the right which of the follow...,2r,3p,2p + q,2q.,49,Algebra,62,Writing and Simplifying Expressions,70,Writing Expressions,"The problem is: Using the key on the right, wh...",Compare algebraic expressions to determine the...
946,98,-1,Not Available,220,Multiples and Lowest Common Multiple,B,What is the lowest common multiple of 12 and 15?,180,60,15,3,32,Number,37,"Factors, Multiples and Primes",220,Multiples and Lowest Common Multiple,The problem is: What is the lowest common mult...,Identify the Lowest Common Multiple of two num...


In [None]:
df.to_csv('../../../../data/2.synthetic_data_generation/generation1_all_render.csv', index=False)