In [1]:
# Offline environment setup: every install resolves wheels from the attached
# dataset directory only (--no-index + --find-links), never from PyPI.
# 1) Install the pip3_autoremove helper used by the next line.
!pip install --no-index --find-links /kaggle/input/eedi-library-lln pip3_autoremove
# 2) Remove the preinstalled torch stack (-y skips the confirmation prompt).
!pip-autoremove torch torchvision torchaudio -y
# 3) Reinstall torch/torchvision/torchaudio from the local wheel directory.
!pip install --no-index --find-links /kaggle/input/eedi-library-lln torch torchvision torchaudio
# 4) Install the libraries imported in the next cell.
!pip install --no-index --find-links /kaggle/input/eedi-library-lln xformers unsloth sentence_transformers


Looking in links: /kaggle/input/eedi-library-lln
Processing /kaggle/input/eedi-library-lln/pip3_autoremove-1.2.2-py2.py3-none-any.whl
Installing collected packages: pip3_autoremove
Successfully installed pip3_autoremove-1.2.2
dill 0.3.8 is installed but dill<0.3.2,>=0.3.1.1 is required
Redoing requirement with just package name...
cloudpickle 3.1.0 is installed but cloudpickle~=2.2.1 is required
Redoing requirement with just package name...
numpy 1.26.4 is installed but numpy<1.25.0,>=1.14.3 is required
Redoing requirement with just package name...
pyarrow 17.0.0 is installed but pyarrow<10.0.0,>=3.0.0 is required
Redoing requirement with just package name...
jupyterlab 4.3.1 is installed but jupyterlab~=3.6.0 is required
Redoing requirement with just package name...
google-cloud-bigquery 2.34.4 is installed but google-cloud-bigquery[bqstorage,pandas]>=3.10.0 is required
Redoing requirement with just package name...
google-cloud-storage 1.44.0 is installed but google-cl

In [2]:
import numpy as np
import pandas as pd 
import os
import torch
import gc

from datasets import Dataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from unsloth import FastLanguageModel
from peft import PeftConfig, PeftModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Read the competition inputs: the test questions and the misconception lookup table.
DATA_PATH = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
test = pd.read_csv(f"{DATA_PATH}/test.csv")
mis_map = pd.read_csv(f"{DATA_PATH}/misconception_mapping.csv")

In [4]:
# Reshape the test set from one row per question to one row per (question, wrong answer).
_question_cols = ['QuestionId','ConstructId', 'ConstructName', 'SubjectId', 'SubjectName', 'QuestionText', 'CorrectAnswer']
_answer_cols = ['AnswerAText','AnswerBText', 'AnswerCText', 'AnswerDText']

test_long = (
    test
    .melt(id_vars=_question_cols, value_vars=_answer_cols,
          var_name="Answer", value_name="AnswerText")
    # Character 6 of "Answer<X>Text" is the option letter <X>.
    .assign(AnswerOption=lambda frame: frame['Answer'].str[6:7])
    .sort_values(['QuestionId', 'AnswerOption'])
    # Row key in the form "<QuestionId>_<option letter>".
    .assign(QuestionId_Answer=lambda frame: frame['QuestionId'].astype(str) + '_' + frame['AnswerOption'])
    # Drop the row holding the correct option; only wrong answers remain.
    .loc[lambda frame: frame['AnswerOption'] != frame['CorrectAnswer']]
)

# Inference of LLM

In [5]:
# Loader settings for the first model (the step-by-step math solver).
max_seq_length = 2048  # sequence length passed to the loader
dtype = None           # None lets the loader auto-detect the dtype
load_in_4bit = True    # load with 4-bit quantization to reduce memory usage

_solver_load_kwargs = dict(
    model_name = "/kaggle/input/eedi-load-models/math_solver",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_solver_load_kwargs)



==((====))==  Unsloth 2024.12.4: Fast Qwen2 patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# first speaker
# Prompt template with three slots, filled in order: instruction, input, response.
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Instruction sent to the solver model. Text and whitespace are kept exactly as
# they were when written inline, so the tokenized prompt is unchanged.
solver_instruction = """You are a math expert. Solve this question step by step
            """

# Inference time
FastLanguageModel.for_inference(model) # switch the Unsloth model to inference mode

# Materialise the column once; it used to be converted to a list on every
# iteration, which made the loop quadratic in the number of questions.
question_texts = test['QuestionText'].to_list()

output_list=[]
for question_text in tqdm(question_texts):
    inputs = tokenizer(
        [
            prompt_style.format(
                solver_instruction, # instruction
                question_text, # input
                "", # output - leave this blank for generation!
            )
        ], return_tensors = "pt").to("cuda")

    # NOTE(review): no do_sample flag is passed here; temperature presumably only
    # takes effect if the model's generation config enables sampling - confirm.
    outputs = model.generate(input_ids = inputs.input_ids, attention_mask = inputs.attention_mask,
                             max_new_tokens = 512, use_cache = True,temperature=0.2)
    # The decoded sequence is stored whole; the next cell splits on "Response:"
    # to isolate the generated part.
    outputs_decoded = tokenizer.batch_decode(outputs)
    output_list.append(outputs_decoded[0])
#output_list

100%|██████████| 3/3 [01:00<00:00, 20.07s/it]


In [7]:
# Create input for llm
# Keep the text between the first "Response:" marker and any later repeat of it,
# i.e. the solver's generated answer without the prompt that precedes it.
test['steps_to_solve'] = output_list
test['steps_to_solve'] = test['steps_to_solve'].str.split("Response:",expand=True)[1]

# Join on the QuestionId key instead of the free-text QuestionText: two questions
# sharing the same text would otherwise match each other and multiply rows,
# yielding duplicate QuestionId_Answer entries in the submission.
# Dropping a pre-existing column first keeps the cell re-runnable (no _x/_y suffixes).
test_long = (
    test_long
    .drop(columns=["steps_to_solve"], errors="ignore")
    .merge(test[["QuestionId", "steps_to_solve"]], how="left", on="QuestionId")
)

# NOTE(review): the "Math construct tested" label is filled with the solver's
# worked steps, not ConstructName. The string is left untouched because it is
# part of the prompt the second model receives - confirm this is intended.
test_long['QSA'] = ("Math question: " + test_long['QuestionText'] + "\n" + 
                       "Math construct tested: " + test_long['steps_to_solve'] + "\n" + 
                       "Wrong Answer: " + test_long['AnswerText'])
test_long['Q&A'] = "Math question: " + test_long['QuestionText'] + "\n" + "Wrong Answer: " + test_long['AnswerText']


In [8]:
# 2nd llm
# Loader settings for the second model (same values as for the first one).
max_seq_length = 2048  # sequence length passed to the loader
dtype = None           # None lets the loader auto-detect the dtype
load_in_4bit = True    # load with 4-bit quantization to reduce memory usage

# Drop the previous model and clear the CUDA cache before loading the next one.
del model
gc.collect()
torch.cuda.empty_cache()

_llm_load_kwargs = dict(
    model_name = "/kaggle/input/eedi-load-models/llm",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_llm_load_kwargs)


==((====))==  Unsloth 2024.12.4: Fast Qwen2 patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# # Loading adapter
# adapter_model_name = "/kaggle/input/eedi-qlora-adapter-synthonly/finetuned_lora"
# model = PeftModel.from_pretrained(model, adapter_model_name)

In [10]:
# Second speaker
# Prompt template with three slots, filled in order: instruction, input, response.
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Instruction sent to the misconception model. Text and whitespace are kept
# exactly as they were when written inline, so the tokenized prompt is unchanged.
misconception_instruction = """You are a math teacher. You are given a math question, and the steps required to solve the question.
            A student selected the wrong answer to the question. What misundestanding or misconception does the student have?
            Summarise concisely in 1-2 sentences."""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Build EOS-terminated prompt texts from a batch with "Q&A" and "MisconceptionName".

    Not called anywhere in this notebook. Returns {"text": [...]} with one
    formatted prompt per (input, output) pair.

    NOTE(review): the original passed only two values to the three-slot
    template, which raises IndexError when called. The inference instruction
    is assumed to be the intended first slot - confirm against the training code.
    """
    question_inputs = examples["Q&A"] # examples["problem"]
    target_outputs  = examples["MisconceptionName"] # examples["solution"]
    texts = [
        prompt_style.format(misconception_instruction, question_input, target_output) + EOS_TOKEN
        for question_input, target_output in zip(question_inputs, target_outputs)
    ]
    return { "text" : texts}

# Inference time
FastLanguageModel.for_inference(model) # switch the Unsloth model to inference mode

# Materialise the column once; it used to be converted to a list on every
# iteration, which made the loop quadratic in the number of rows.
qsa_prompts = test_long['QSA'].to_list()

output_list =[]

for qsa_text in tqdm(qsa_prompts):
    inputs = tokenizer(
        [
            prompt_style.format(
                misconception_instruction, # instruction
                qsa_text, # input
                "", # output - leave this blank for generation!
            )
        ], return_tensors = "pt").to("cuda")

    outputs = model.generate(input_ids = inputs.input_ids, attention_mask = inputs.attention_mask,
                             max_new_tokens = 128, use_cache = True)
    outputs_decoded = tokenizer.batch_decode(outputs)
    output_list.append(outputs_decoded[0])

# Add LLM response to dataset
test_long['llm_response'] = output_list
# Keep the text after the first "Response:\n" marker (up to any repeat of it), then
# strip the end-of-text token. regex=False makes the literal match explicit: as a
# regex, "|" is alternation and the pattern would delete every "<" and ">" instead.
test_long['llm_response'] = (
    test_long['llm_response']
    .str.split("Response:\n",expand=True,n=2)[1]
    .str.replace('<|endoftext|>', '', regex=False)
)

100%|██████████| 9/9 [01:18<00:00,  8.77s/it]


In [11]:
# For misconceptions
# Release the language model before loading the embedding model. Rebinding
# `model` directly would keep the LLM alive until the new model had finished
# loading. This mirrors the cleanup done before the second LLM was loaded.
del model
gc.collect()
torch.cuda.empty_cache()

# `model` is now the sentence-embedding model; the helper functions below read it
# as a module-level name.
model = SentenceTransformer("/kaggle/input/eedi-load-models/embedding_model", 
                            trust_remote_code=True)
# Embed every known misconception name once; each LLM response is compared to these.
misconceptions = mis_map["MisconceptionName"].to_list()
misconception_embeddings = model.encode(misconceptions)

Batches:   0%|          | 0/81 [00:00<?, ?it/s]

In [12]:
# Supporting functions
def find_most_similar_misconception(generated_misconception, misconception_embeddings, misconceptions):
    """Rank all known misconceptions by cosine similarity to a generated description.

    Args:
        generated_misconception: text to embed with the module-level `model`.
        misconception_embeddings: embeddings of the known misconceptions, in the
            same order as `misconceptions`.
        misconceptions: list of misconception names.

    Returns:
        A list of (misconception name, similarity) tuples, most similar first.
    """
    # Embed the query as a batch of one.
    query_embedding = model.encode([generated_misconception])

    # Row 0 holds the query's similarity to every known misconception.
    scores = cosine_similarity(query_embedding, misconception_embeddings)[0]

    # Ascending argsort reversed gives positions from highest to lowest score.
    ranking = np.argsort(scores)[::-1]

    ranked_pairs = []
    for position in ranking:
        ranked_pairs.append((misconceptions[position], scores[position]))
    return ranked_pairs

def find_id(list_misconception):
    """Turn ranked (name, score) pairs into a space-separated string of positions.

    Each name is looked up in the module-level `misconceptions` list and replaced
    by its position there (the first position if a name occurs more than once).

    Args:
        list_misconception: iterable of tuples whose first element is a
            misconception name present in `misconceptions`.

    Returns:
        The positions joined by single spaces, in input order; "" for empty input.

    Raises:
        ValueError: if a name is not present in `misconceptions`.
    """
    # str.join replaces the manual counter and trailing-separator bookkeeping.
    return " ".join(str(misconceptions.index(misc[0])) for misc in list_misconception)

In [13]:
# Find misconception to LLM response
# Number of best-matching misconceptions kept for each row.
TOP_K = 25

# Materialise the column once; it used to be converted to a list on every
# iteration, which made the loop quadratic in the number of rows.
llm_responses = test_long["llm_response"].tolist()

MisconceptionId=[]
for response in llm_responses:
    # Rank all known misconceptions against this response, then keep the top ones
    # as a space-separated id string.
    simil_misc=find_most_similar_misconception(response, misconception_embeddings, misconceptions)
    MisconceptionId.append(find_id(simil_misc[:TOP_K]))

test_long['MisconceptionId'] = MisconceptionId

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
#test_long.to_csv('chk_output.csv', index=False)

In [15]:
# Write the two submission columns to disk, without the DataFrame index.
submission = test_long[['QuestionId_Answer', 'MisconceptionId']]
submission.to_csv("submission.csv", index=False)