In [1]:
import pandas as pd
import torch
import numpy as np
from vllm import LLM, SamplingParams
from tqdm.auto import tqdm
import random
import os

2024-10-29 21:26:35.766743: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730204795.821456  537640 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730204795.839411  537640 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-29 21:26:35.946613: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


In [None]:
seed = 201
# seed = 20240706
# seed = 20240617
# seed = 355643

def set_seed(seed = 42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    print('> SEEDING DONE')

set_seed(seed)

> SEEDING DONE


In [None]:
MODEL_NAME = "xmadai/Llama-3.1-405B-Instruct-xMADai-INT4"
NUM_GPU = torch.cuda.device_count()
few_shot = 3

In [None]:
llm = LLM(
    MODEL_NAME,
    tensor_parallel_size=NUM_GPU,
    gpu_memory_utilization=0.95,
    trust_remote_code=True,
    dtype="half",
    enforce_eager=True,
    max_model_len=256,
    disable_log_stats=True,
    max_num_seqs=256,
)
tokenizer = llm.get_tokenizer()

sampling_params = SamplingParams(
    n=1,  # Number of output sequences to return for each prompt.
    top_p=0.7,  # Float that controls the cumulative probability of the top tokens to consider.
    temperature=1.0,  # randomness of the sampling
    seed=seed,  # Seed for reprodicibility
    skip_special_tokens=True,  # Whether to skip special tokens in the output.
    max_tokens=1024,  # Maximum number of tokens to generate per output sequence.
)

In [2]:
import pandas as pd
df = pd.read_csv('../../../data/raw/misconception_mapping.csv')
df

Unnamed: 0,MisconceptionId,MisconceptionName
0,0,Does not know that angles in a triangle sum to...
1,1,Uses dividing fractions method for multiplying...
2,2,Believes there are 100 degrees in a full turn
3,3,Thinks a quadratic without a non variable term...
4,4,Believes addition of terms and powers of terms...
...,...,...
2582,2582,"When multiplying numbers with the same base, m..."
2583,2583,Does not know what a cube number is
2584,2584,Believes that any percentage of a larger numbe...
2585,2585,Believes a cubic expression should have three ...


In [53]:
system_prompt_template = 'You are an excellent math teacher about to teach students of year group 1 to 14. The subject of your lesson includes Number, Algebra, Data and Statistics, Geometry and Measure. You will be provided a misconception that your students may have. Please explain the misconception in detail and provide some short cases when the misconception will occur. No need to provide the correct approach. The explanation should be in format of "Explanation: {explanation}"'
user_prompt_template = 'Misconception: {misconception}'

In [63]:
i = 4
row = df.iloc[i]
misconception = row['MisconceptionName']

user_prompt = user_prompt_template.format(misconception=misconception)
system_prompt = system_prompt_template
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are an excellent math teacher about to teach students of year group 1 to 14. The subject of your lesson includes Number, Algebra, Data and Statistics, Geometry and Measure. You will be provided a misconception that your students may have. Please explain the misconception in detail and provide cases when the misconception will occur. No need to provide the correct approach. The question should be in format of "Explanation: {explanation}."<|eot_id|><|start_header_id|>user<|end_header_id|>

Misconception: Believes addition of terms and powers of terms are equivalent e.g. a + c = a^c<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [64]:
output = llm.generate(text, sampling_params=sampling_params)
print(output[0].outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.95s/it, est. speed input: 24.88 toks/s, output: 18.32 toks/s]

Explanation: This misconception arises from a lack of understanding of the fundamental difference between addition and exponentiation. Students may think that the operation of adding two terms is the same as raising one term to the power of another, resulting in incorrect expressions such as a + c = a^c. This misconception may occur when students are first introduced to algebraic expressions and are still developing their understanding of mathematical notation and operations.

For example, when solving equations or simplifying expressions, a student with this misconception may write a + 2 as a^2,





In [65]:
all_texts = []
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    misconception = row['MisconceptionName']

    user_prompt = user_prompt_template.format(misconception=misconception)
    system_prompt = system_prompt_template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    all_texts.append(text)

  0%|          | 0/2587 [00:00<?, ?it/s]

In [67]:
output = llm.generate(all_texts, sampling_params=sampling_params)

Processed prompts:  21%|██        | 541/2587 [14:57<55:28,  1.63s/it, est. speed input: 89.24 toks/s, output: 65.73 toks/s]  

In [None]:
output_texts = [x.outputs[0].text for x in output]

In [None]:
exp_name = f'a000-llama3-mega-misconception-aug-seed{seed}'
df[f'{exp_name}_misunderstanding'] = output_texts
df

Unnamed: 0,MisconceptionId,MisconceptionName,a000-llama3-mega-misconception-aug-seed201_misunderstanding
0,0,Does not know that angles in a triangle sum to...,Explanation: This misconception arises when st...
1,1,Uses dividing fractions method for multiplying...,Explanation: This misconception arises when st...
2,2,Believes there are 100 degrees in a full turn,Explanation: This misconception arises when st...
3,3,Thinks a quadratic without a non variable term...,Explanation: This misconception arises when st...
4,4,Believes addition of terms and powers of terms...,Explanation: This misconception arises when st...
...,...,...,...
2582,2582,"When multiplying numbers with the same base, m...",Explanation: This misconception arises when st...
2583,2583,Does not know what a cube number is,Explanation: This misconception arises when st...
2584,2584,Believes that any percentage of a larger numbe...,Explanation: This misconception arises when st...
2585,2585,Believes a cubic expression should have three ...,Explanation: This misconception arises when st...


In [8]:
df.to_csv('../../../data/2.synthetic_data_generation/misconception_augmentation/misconception_mapping.csv', index=False)