# Eedi - Mining Misconceptions in Mathematics
### Predict affinity between misconceptions and incorrect answers (distractors) in multiple-choice questions
##### Competition Link : <https://www.kaggle.com/competitions/eedi-mining-misconceptions-in-mathematics>
##### Competition Dataset Link : <https://www.kaggle.com/competitions/eedi-mining-misconceptions-in-mathematics/data>

## Install LLM Library

In [None]:
installDir = "/kaggle/input/universal-llm-install-package2/Universal-LLM-install-page"
!pip install transformers --no-index --no-deps --find-links=file://{installDir}/tranforemers
!pip install -U accelerate   --no-index --no-deps --find-links=file://{installDir}/accelerate
!pip install build        --no-index  --no-deps --find-links=file://{installDir}/build-1.2.1-py3-none-any.whl
!pip install -U bitsandbytes --no-index --no-deps --find-links=file://{installDir}/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
!pip install -U peft  --no-index --no-deps --find-links=file://{installDir}/peft-0.11.1-py3-none-any.whl
!pip install -U trl  --no-index --no-deps --find-links=file://{installDir}/trl-0.9.4-py3-none-any.whl     



# Config

In [None]:
class CFG:
    """Notebook-wide configuration: model selection flags, model path and data file paths."""

    # Model selection flags. NOTE(review): none of these flags is read anywhere
    # in the visible notebook code — confirm whether they are still needed.
    USE_LLIAMA3 = True
    USE_GEMMA2  = False
    USE_PHI_MINI = False

    # LLM Config
    # Passed to TrainingArguments(report_to=...).
    reportTo = "none"

    # Local path of the model loaded by AutoModelForCausalLM / AutoTokenizer.
    Model1 = "/kaggle/input/llama-3.2/transformers/3b-instruct/1" # for Llama 3.2

    # Competition CSV files read with pd.read_csv.
    trainFile = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv"
    testFile = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv"
    submitSample = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/sample_submission.csv"
    misconcept = "/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv"

## import library

In [None]:

import os, time , gc , json
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import transformers

from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig, 
                          AutoModelForCausalLM,
                         TrainingArguments)

from datasets import Dataset, DatasetDict, load_dataset


# Fine tuning 

from trl import SFTTrainer
from peft import LoraConfig, PeftModel, get_peft_model , prepare_model_for_kbit_training #prepare_model_for_int8_training deprecated 


# text chunk splitter


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
def clearMemory():
    """Free cached CUDA memory and run the garbage collector, five times over.

    Every pass empties the CUDA cache, triggers a garbage collection and then
    sleeps for 0.3 seconds before the next pass.
    """
    passesLeft = 5
    while passesLeft > 0:
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(0.3)
        passesLeft -= 1

In [None]:
clearMemory()

# Load dataset

In [None]:
trainDF = pd.read_csv(CFG.trainFile)
trainDF

In [None]:
trainDF.columns

In [None]:
testDF = pd.read_csv(CFG.testFile)
testDF

In [None]:
testDF.columns

In [None]:
misconceptDF = pd.read_csv(CFG.misconcept)
misconceptDF

In [None]:
print(misconceptDF[misconceptDF["MisconceptionId"] == 1]["MisconceptionName"].values[0])

In [None]:
submit = pd.read_csv(CFG.submitSample)
submit

# Data Cleaning 

In [None]:
trainDF.isna().sum()

# create Training dataset

In [None]:
testDF.columns

In [None]:
### Columns kept in the reshaped training frame built by createTrainDataset
trainCol = ["QuestionId", "QuestionText", "ConstructName", "SubjectName", "CorrectAnswer", "CorrectAnswerText",
            "AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText", "misconceptIDList", "misconceptTextList"]
### Columns kept in the reshaped test (submission) frame built by createTestDataset
testCol = ["QuestionId", "QuestionText", "ConstructName", "SubjectName", "CorrectAnswer", "CorrectAnswerText",
            "AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]

In [None]:
def createTrainDataset(df):
    """Reshape the raw training frame into one row per question.

    For each question the text of the correct option is extracted, and the
    misconceptions attached to the incorrect options (A-D order) are collected
    as two parallel, de-duplicated lists: integer IDs and their names.

    Args:
        df: DataFrame with the columns QuestionId, QuestionText, ConstructName,
            SubjectName, CorrectAnswer, Answer{A-D}Text and Misconception{A-D}Id.

    Returns:
        A DataFrame with the columns listed in ``trainCol``.

    Raises:
        KeyError: if a misconception ID found in ``df`` is missing from
            ``misconceptDF``.
    """
    # Build the id -> name lookup once instead of filtering misconceptDF for
    # every distractor. setdefault keeps the first name if an ID repeats.
    misconceptNames = {}
    for mapId, mapName in zip(misconceptDF["MisconceptionId"],
                              misconceptDF["MisconceptionName"]):
        misconceptNames.setdefault(mapId, mapName)

    records = []
    # Use the row yielded by iterrows() directly: its index *label* is not a
    # valid position for .iloc when the frame's index is not 0..n-1.
    for _, rowData in df.iterrows():
        correctAns = rowData["CorrectAnswer"]
        # Reset per row so an unexpected CorrectAnswer value cannot silently
        # reuse the previous question's answer text.
        correctAnsText = None
        misconceptIDList = []
        misconcepTxtList = []
        for option in ["A", "B", "C", "D"]:
            if option == correctAns:
                correctAnsText = rowData[f"Answer{option}Text"]
                continue
            # Incorrect option: pick up its misconception, if one is labelled.
            misconceptID = rowData[f"Misconception{option}Id"]
            if pd.isna(misconceptID):
                continue
            misconceptID = int(misconceptID)  # IDs are read as floats when NaNs are present
            if misconceptID not in misconceptIDList:
                misconceptIDList.append(misconceptID)
                misconcepTxtList.append(misconceptNames[misconceptID])

        records.append({
            "QuestionId": rowData["QuestionId"],
            "QuestionText": rowData["QuestionText"],
            "ConstructName": rowData["ConstructName"],
            "SubjectName": rowData["SubjectName"],
            "CorrectAnswer": correctAns,
            "CorrectAnswerText": correctAnsText,
            "AnswerAText": rowData["AnswerAText"],
            "AnswerBText": rowData["AnswerBText"],
            "AnswerCText": rowData["AnswerCText"],
            "AnswerDText": rowData["AnswerDText"],
            "misconceptIDList": misconceptIDList,
            "misconceptTextList": misconcepTxtList,
        })
    return pd.DataFrame(data=records, columns=trainCol)
        

In [None]:
newTrainDF= createTrainDataset(trainDF)
newTrainDF

In [None]:
def createTestDataset(df):
    """Reshape the raw test frame into one row per question for inference.

    Args:
        df: DataFrame with the columns QuestionId, QuestionText, ConstructName,
            SubjectName, CorrectAnswer and Answer{A-D}Text.

    Returns:
        A DataFrame with the columns listed in ``testCol``. CorrectAnswerText
        is the text of the option named by CorrectAnswer, or None when
        CorrectAnswer is not one of A-D.
    """
    records = []
    # Use the row yielded by iterrows() directly: its index *label* is not a
    # valid position for .iloc when the frame's index is not 0..n-1.
    for _, rowData in df.iterrows():
        correctAns = rowData["CorrectAnswer"]
        # Computed fresh for every row so a bad CorrectAnswer value cannot
        # reuse the previous question's answer text.
        if correctAns in ("A", "B", "C", "D"):
            correctAnsText = rowData[f"Answer{correctAns}Text"]
        else:
            correctAnsText = None

        records.append({
            "QuestionId": rowData["QuestionId"],
            "QuestionText": rowData["QuestionText"],
            "ConstructName": rowData["ConstructName"],
            "SubjectName": rowData["SubjectName"],
            "CorrectAnswer": correctAns,
            "CorrectAnswerText": correctAnsText,
            "AnswerAText": rowData["AnswerAText"],
            "AnswerBText": rowData["AnswerBText"],
            "AnswerCText": rowData["AnswerCText"],
            "AnswerDText": rowData["AnswerDText"],
        })
    return pd.DataFrame(data=records, columns=testCol)

In [None]:
newTestDF =  createTestDataset(testDF)
newTestDF

In [None]:
newTestDF["AnswerAText"]

# Load LLM model

In [None]:
# 4-bit NF4 quantization settings; passed as quantization_config when the model is loaded on CUDA.
bnbConfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True ) # Activate nested (double) quantization for the 4-bit base model

In [None]:
# Load the causal LM once; only the precision argument depends on the device:
# 4-bit quantization on CUDA, plain bfloat16 weights otherwise.
model = AutoModelForCausalLM.from_pretrained(
    CFG.Model1,
    device_map="auto",
    trust_remote_code=True,
    **(
        {"quantization_config": bnbConfig}
        if device.type == "cuda"
        else {"torch_dtype": torch.bfloat16}
    ),
)
tokenizer = AutoTokenizer.from_pretrained(CFG.Model1)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
model

# Test LLM Model response 

In [None]:
def generateResponse(query, maxOutToken=256):
    """
    Send a prompt directly to the LLM and return only the newly generated text.

    Args:
        query: Prompt string to tokenize and feed to the model.
        maxOutToken: Maximum number of new tokens to generate.

    Returns:
        The decoded continuation, without the prompt and without special tokens.
    """
    inputIds = tokenizer(query, return_tensors="pt").to(device)
    response = model.generate(**inputIds,
                              do_sample=True,
                              top_p=0.95,
                              top_k=2,
                              temperature=0.2,
                              max_new_tokens=maxOutToken
                              )

    # input_ids has shape (batch, seq_len). len() on it is the batch size (1),
    # which stripped a single token and left the prompt in the output. The
    # prompt length is the second dimension.
    promptLength = inputIds["input_ids"].shape[1]
    return tokenizer.decode(response[0][promptLength:], skip_special_tokens=True)

In [None]:
%%time
result = generateResponse("What is Deep learning and LLM Model?")
print(result)

In [None]:
newTrainDF.columns

In [None]:
# Convert a list-valued pandas cell into a single comma-separated string
def dataProcessListToStr(arr):
    """Join the items of ``arr`` into one string separated by ", ".

    Each item is converted with ``str``; an empty input yields "".
    """
    return ", ".join(str(item) for item in arr)
    

## Prepare training dataset into LLM model

In [None]:
# Build the text of each training example: the question, the four answer
# options, the correct answer letter, and the target misconception IDs/names.
# NOTE(review): in pandas, string + NaN yields NaN, so a missing text field
# would blank out the whole row's content — confirm these columns have no NaNs.
newTrainDF["LLM Content"] = (
    "question : " + newTrainDF["QuestionText"] +
    "\nAnswer A Text : " + newTrainDF["AnswerAText"] +
    "\nAnswer B Text : " + newTrainDF["AnswerBText"] +
    "\nAnswer C Text : " + newTrainDF["AnswerCText"] +
    "\nAnswer D Text : " + newTrainDF["AnswerDText"] +
    "\nCorrect Answer : "  + newTrainDF["CorrectAnswer"] +
    "\nMisconcept ID List : " +  newTrainDF["misconceptIDList"].apply(dataProcessListToStr) +
    "\nMisconcept Text List : " + newTrainDF["misconceptTextList"].apply(dataProcessListToStr)
)

In [None]:
newTrainDF

In [None]:
print(newTrainDF["LLM Content"][2])

In [None]:
print(newTrainDF["LLM Content"][0])

### Prepare Test Dataset into LLM for Submit

In [None]:
newTestDF.columns

In [None]:
# Build the text of each test example: same layout as the training text but
# without the misconception lists, which are what the model should produce.
# NOTE(review): in pandas, string + NaN yields NaN, so a missing text field
# would blank out the whole row's content — confirm these columns have no NaNs.
newTestDF["LLM Content"] = (
    "question : " + newTestDF["QuestionText"] +
    "\nAnswer A Text : " + newTestDF["AnswerAText"] +
    "\nAnswer B Text : " + newTestDF["AnswerBText"] +
    "\nAnswer C Text : " + newTestDF["AnswerCText"] +
    "\nAnswer D Text : " + newTestDF["AnswerDText"] +
    "\nCorrect Answer : "  + newTestDF["CorrectAnswer"]
)

In [None]:
newTestDF

In [None]:
print(newTestDF["LLM Content"][2])

In [None]:
print(newTestDF["LLM Content"][0])

## Tokenization for LLM Content

In [None]:
len(newTrainDF) , round(len(newTrainDF) *0.8) 

In [None]:
# set max training data size
maxTrainData = 1500  # around 80% for training

## Create Dataset for train/validation 

In [None]:
# Positional split: the first maxTrainData rows are used for training and the
# remaining rows are held out for evaluation (no shuffling is done here).
tempTrainDF = newTrainDF[:maxTrainData]
tempValDF = newTrainDF[maxTrainData:]
# Wrap each frame as a Hugging Face Dataset; subDataset holds the rows to run inference on.
trainDataset = Dataset.from_pandas(tempTrainDF, split="train")
evalDataset = Dataset.from_pandas(tempValDF, split="test")
subDataset = Dataset.from_pandas(newTestDF, split="test")


In [None]:
trainDataset

In [None]:
evalDataset

In [None]:
subDataset

In [None]:
submitDict = DatasetDict({
        "test" : subDataset
})

In [None]:
datasetDict = DatasetDict({
        'train': trainDataset,
        "test" : evalDataset
})

In [None]:
datasetDict

In [None]:
submitDict

In [None]:
del tempTrainDF 
del tempValDF

In [None]:
clearMemory()

### Convert Tokenizer function

In [None]:
def tokenizeFunc1(sample):
    """Tokenize the "LLM Content" field with padding enabled and truncation at 512 tokens."""
    llmText = sample["LLM Content"]
    tokenizeArgs = {"max_length": 512, "padding": True, "truncation": True}
    return tokenizer(llmText, **tokenizeArgs)

In [None]:
datasetDict = datasetDict.map(tokenizeFunc1, batched=True) # generate token value

In [None]:
datasetDict

In [None]:
submitDict = submitDict.map(tokenizeFunc1, batched=True) # generate token value

In [None]:
submitDict

# LoRA fine tuning 

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 32, no bias
# training, adapters attached to the projection modules listed below.
lora_config = LoraConfig(
   r=16,
   lora_alpha=32,
   task_type="CAUSAL_LM",
   bias="none",
   target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
)

In [None]:
model = get_peft_model(model, lora_config)

In [None]:
model.print_trainable_parameters()

In [None]:
# Extract the training text column in the list-of-strings form expected from a formatting function
def formatFuc1(sample):
    """Return the "LLM Content" text of ``sample`` as a list of strings.

    Works for a single example (the value is one string) and for a batch (the
    value is a list of strings). For a batch, one string per example is
    returned; formatting the whole list with an f-string would instead
    collapse the batch into a single sample holding the list's repr.
    """
    content = sample["LLM Content"]
    if isinstance(content, (list, tuple)):
        return [f"{text}" for text in content]
    return [f"{content}"]

# SFT Trainer Parameter

In [None]:
# Training hyper-parameters for the LoRA fine-tuning run.
trainArg = transformers.TrainingArguments(
    output_dir="/kaggle/working/lora_model",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,  # effective batch of 2 examples per optimizer step per device
    evaluation_strategy="steps",
    warmup_steps=2,
    max_steps= 100,  # short run: training stops after 100 optimizer steps
    learning_rate=2e-4,
    fp16=True,  # NOTE(review): bnbConfig uses bfloat16 as compute dtype — confirm mixing with fp16 is intended
    logging_steps=1,
    eval_steps = 20,
    optim="paged_adamw_8bit",
    report_to=CFG.reportTo
)

In [None]:
# Initialize the supervised fine-tuning trainer.
# NOTE(review): `model` was already wrapped with get_peft_model(model, lora_config)
# earlier in the notebook and the same config is passed again as peft_config —
# confirm the adapter is not expected to be applied twice.
# NOTE(review): datasetDict was already tokenized via tokenizeFunc1 and a
# formatting_func is also supplied — confirm which of the two SFTTrainer uses.
trainer = SFTTrainer(
    model=model,
    train_dataset= datasetDict["train"],
    eval_dataset= datasetDict["test"],
    args=trainArg,
    peft_config=lora_config, 
    formatting_func=formatFuc1
)

In [None]:
clearMemory()

In [None]:
trainer.train()

In [None]:
model.save_pretrained('/kaggle/working/lora_model')

In [None]:
# Prompt templates used to ask the model for misconception IDs and names.
# promptTemplate1: question fields filled individually; no number of misconceptions is requested.
promptTemplate1 ="""###Act as Mathematician,  it prompt given math question , 4 different answer option and 1 is correct answer letter 
use these of data to complete the Task are output misconception Text list and misconption ID list as below: 
question : {question}
Answer A Text : {answerA}
Answer B Text : {answerB} 
Answer C Text : {answerC}
Answer D Text : {answerD}
Correct Answer : {correctAns}
###
Misconcept ID List : 
Misconcept Text List :
"""
# promptTemplate2: same fields as promptTemplate1, but asks for 25 misconceptions.
promptTemplate2 ="""###Act as Mathematician,  it prompt given math question , 4 different answer option and 1 is correct answer letter 
use these of data to complete the Task is output 25 misconception Text list and 25 misconption ID list as below:

question : {question}
Answer A Text : {answerA}
Answer B Text : {answerB} 
Answer C Text : {answerC}
Answer D Text : {answerD}
Correct Answer : {correctAns}
###
Misconcept ID List : 
Misconcept Text List :
"""

# promptTemplate3: asks for 25 misconceptions and takes one pre-built {llmcontent} string
# instead of individual fields. NOTE(review): not referenced by any visible code.
promptTemplate3 ="""###Act as Mathematician,  it prompt given math question , 4 different answer option and 1 is correct answer letter 
use these of data to complete the Task is output 25 misconception Text list and 25 misconption ID list as below:
{llmcontent}
###
Misconcept ID List : 
Misconcept Text List :
"""

## Use validation dataset for testing

In [None]:
type(datasetDict["test"][0])

In [None]:
def testValidDataset(ds , maxNumData=10):
    """Print the model's response for at most ``maxNumData`` rows of ``ds["test"]``.

    Each row is rendered with ``promptTemplate1`` and sent to
    ``generateResponse``; the row number and the response are printed.

    Args:
        ds: Mapping with a "test" split whose rows provide QuestionText,
            Answer{A-D}Text and CorrectAnswer.
        maxNumData: Maximum number of rows to run through the model.
    """
    for i, data in enumerate(ds["test"]):
        # Check the limit before generating: checking afterwards processed
        # maxNumData + 1 rows and paid for one extra generation.
        if i >= maxNumData:
            break
        print(f"data {i}:")
        # Generate the prompt query from the template.
        newPrompt = promptTemplate1.format(
            question=data["QuestionText"],
            answerA=data["AnswerAText"],
            answerB=data["AnswerBText"],
            answerC=data["AnswerCText"],
            answerD=data["AnswerDText"],
            correctAns=data["CorrectAnswer"],
        )
        result = generateResponse(newPrompt)
        print(result)
        

In [None]:
def inferDF(ds):
    """Run every row of ``ds["test"]`` through the model and print each response.

    Each row is rendered with ``promptTemplate2`` and sent to
    ``generateResponse``; the row number and the response are printed.
    """
    # Template placeholder -> dataset column that fills it.
    placeholderToColumn = {
        "question": "QuestionText",
        "answerA": "AnswerAText",
        "answerB": "AnswerBText",
        "answerC": "AnswerCText",
        "answerD": "AnswerDText",
        "correctAns": "CorrectAnswer",
    }
    for idx, record in enumerate(ds["test"]):
        print(f"data {idx}:")
        fields = {name: record[column] for name, column in placeholderToColumn.items()}
        print(generateResponse(promptTemplate2.format(**fields)))

In [None]:
print(submitDict["test"]["LLM Content"][0])

In [None]:
submitDict["test"]["LLM Content"][0]
# promptTemplate1.format()

In [None]:
# print(promptTemplate2.format(llmcontent=submitDict["test"]["LLM Content"][0]))

In [None]:
testValidDataset(datasetDict, 2)

In [None]:
inferDF(submitDict)