# Overview

- Load AWQ Quantized Qwen2.5-32B-Instruct using vLLM on 2x T4
- For each test question, we retrieve a set of similar questions that "support" it using train data.
    - Consider things like "construct" and "subject"
- Using the set of similar questions, we create a conversation that can be fed to our model
- We can then pass these messages into our quantized Qwen2.5-32B-Instruct. Towards the end of the conversation, we give it the question/answer pair we want to predict the misconception for.
    - Use the recently supported `chat()` functionality on the conversation
    - Generate `n` responses with a slightly higher temperature to create diverse outputs
- For each question/answer pair, we now have `n` inferred misconceptions; for each of them, we retrieve the top 25 misconceptions using BGE embeddings.
- The 25 nearest misconceptions for each of our `n` inferred misconceptions for each question/answer pair can now be combined using Borda Ranking. This is sort of like the simplest form of ensembling.
- Submit :)

### Note:
- This notebook is a fork of the beautiful work by Rich Olson @ https://www.kaggle.com/code/richolson/eedi-phi-mini-3-5
- Credits, Load 32B AWQ on 2x T4 by Chris Deotte @ https://www.kaggle.com/code/cdeotte/infer-34b-with-vllm

# Imports

In [None]:
import os
import gc
import ctypes
import numpy as np
import pandas as pd

from random import sample
from tqdm.auto import tqdm
from eedi_metrics import mapk, apk
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer, AutoModel

In [None]:
# Make CUDA devices 0 and 1 visible to this process and turn off
# parallelism in the `tokenizers` library.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0,1",
    "TOKENIZERS_PARALLELISM": "false",
})


def clean_memory(deep=False):
    """Run the garbage collector and empty the CUDA cache.

    Args:
        deep: When true, additionally ask the C allocator in `libc.so.6`
            to trim its heap via `malloc_trim(0)`.
    """
    gc.collect()
    if deep:
        libc = ctypes.CDLL("libc.so.6")
        libc.malloc_trim(0)
    torch.cuda.empty_cache()

# Load data

In [None]:
# Number of similar training questions used as few-shot context for each test row.
k = 3

# When True, the pipeline is evaluated on a sample of `n_train_eval_rows`
# rows from train.csv instead of running on test.csv.
train_eval = True
n_train_eval_rows = 100

# Directory holding the competition CSV files.
comp_dir  = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'

# Local checkpoints for the generator LLM and the embedding model.
llm_model_pth   = '/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1'
embed_model_pth = '/kaggle/input/baai/transformers/bge-large-en-v1.5/1'


# Force the real test-set path whenever KAGGLE_IS_COMPETITION_RERUN is set.
if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    train_eval = False

In [None]:
# Competition tables: labelled questions, the submission template and the
# id -> name mapping of misconceptions.
train          = pd.read_csv(f'{comp_dir}/train.csv')
sample_sub     = pd.read_csv(f'{comp_dir}/sample_submission.csv')
misconceptions = pd.read_csv(f'{comp_dir}/misconception_mapping.csv')

if train_eval:
    # Evaluate on a fixed-seed sample of the training questions, ordered by id.
    test = (
        train.sample(n_train_eval_rows, random_state=3)
        .sort_values(['QuestionId'], ascending=True)
        .reset_index(drop=True)
    )
else:
    test = pd.read_csv(f'{comp_dir}/test.csv')

len(train), len(test), len(misconceptions)

# Spin up our `Qwen2.5-32B-Instruct-AWQ` using vLLM

Credits: https://www.kaggle.com/code/cdeotte/infer-34b-with-vllm

The model fits very comfortably even with a max model length of 4096. Even longer lengths could be experimented with. In case of OOM errors, it might be helpful to decrease `max_num_seqs` to 4 or 8, or even 1! (Default is 256)

In [None]:
# Build the vLLM engine from the local checkpoint: half-precision dtype, a
# 4096-token maximum model length, and tensor parallelism across 2 GPUs.
# `gpu_memory_utilization=0.95` is the fraction of GPU memory vLLM may use.
llm = LLM(
    llm_model_pth,
    trust_remote_code=True,
    dtype="half", max_model_len=4096,
    tensor_parallel_size=2, gpu_memory_utilization=0.95, 
)

# NOTE: `tokenizer` is rebound to the embedding model's tokenizer later in the notebook.
tokenizer = llm.get_tokenizer()

# Post process data

In [None]:
# Per-option columns of the wide question table.
answer_cols = [
    "AnswerAText",
    "AnswerBText",
    "AnswerCText",
    "AnswerDText",
]
misconception_cols = [
    "MisconceptionAId",
    "MisconceptionBId",
    "MisconceptionCId",
    "MisconceptionDId",
]

# Question-level columns repeated on every long-format row.
keep_cols = [
    "QuestionId",
    "CorrectAnswer",
    "ConstructName",
    "SubjectName",
    "QuestionText",
]


def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a one-row-per-question frame into one row per answer option.

    The result has the `keep_cols` columns plus `Answer` (source column name)
    and `Value` (answer text), ordered by question id and answer column. If
    `df` has the misconception columns, `Misconception` and `MisconceptionId`
    are added, melted and ordered the same way so rows line up by position.
    """

    def _melt_sorted(value_cols, var_name, value_name):
        # Melt one family of per-option columns and order it deterministically.
        melted = df[keep_cols + value_cols].melt(
            id_vars=keep_cols, var_name=var_name, value_name=value_name
        )
        return melted.sort_values(["QuestionId", var_name]).reset_index(drop=True)

    long_df = _melt_sorted(answer_cols, 'Answer', 'Value')

    # Labelled data only: attach the misconception of each option.
    if misconception_cols[0] in df.columns:
        labels = _melt_sorted(misconception_cols, 'Misconception', 'MisconceptionId')
        long_df[['Misconception', 'MisconceptionId']] = labels[['Misconception', 'MisconceptionId']]

    return long_df


# Reshape both tables to one row per answer option and derive the option
# letter (`AnswerId`) by removing 'Answer' and 'Text' from the column name.
test = wide_to_long(test).assign(
    AnswerId=lambda frame: frame.Answer.str.replace('Answer', '').str.replace('Text', '')
)
train = wide_to_long(train).assign(
    AnswerId=lambda frame: frame.Answer.str.replace('Answer', '').str.replace('Text', '')
)

# Attach the misconception columns from the mapping table; rows without a
# matching id are kept (left join).
train = train.merge(misconceptions, on='MisconceptionId', how='left')
if train_eval:
    test = test.merge(misconceptions, on='MisconceptionId', how='left')

In [None]:
train.head(3)

In [None]:
test.head(3)

# Helper Functions

Credits: https://www.kaggle.com/code/richolson/eedi-phi-mini-3-5

## Get the most similar question_ids' given the subject and construct

The following function returns `top_k` question ids, first by checking for questions that share both the construct and the subject.

If that does not yield `top_k` ids, questions sharing either the subject or the construct are chosen. If we are still short of question ids, we select random questions for the remainder of `top_k`.

In [None]:
def get_topk_similar_rows(question_id: int, construct: str, subject: str, top_k: int) -> list[int]:
    """Pick up to `top_k` training question ids to use as context.

    Candidates are drawn from the module-level `train` frame in three tiers
    of decreasing similarity:

    1. same construct and same subject,
    2. same construct or same subject (but not both),
    3. neither matches.

    Every id of a tier is kept while the tier is too small to fill the
    remaining slots; the first tier large enough is randomly sampled to fill
    them. Better tiers therefore always come first in the result.

    Args:
        question_id: Id of the question being predicted. It is excluded from
            the candidates when the module-level `train_eval` flag is set.
        construct: Construct name to match against `train.ConstructName`.
        subject: Subject name to match against `train.SubjectName`.
        top_k: Number of question ids wanted.

    Returns:
        A list of distinct question ids. It has `top_k` entries unless the
        training data holds fewer eligible questions than that.
    """
    same_construct = train.ConstructName == construct
    same_subject = train.SubjectName == subject

    tiers = (
        same_construct & same_subject,
        same_construct ^ same_subject,
        ~(same_construct | same_subject),
    )

    chosen: list[int] = []
    for mask in tiers:
        pool = set(train.loc[mask, "QuestionId"].tolist())
        if train_eval:
            # Never show the model the question it is being evaluated on.
            pool.discard(question_id)
        pool = list(pool)

        remaining = top_k - len(chosen)
        if len(pool) >= remaining:
            return chosen + sample(pool, remaining)

        # Tier too small: keep all of it and fall through to the next tier.
        chosen += pool

    return chosen

## Get the chat conversation for each question

In [None]:
def get_conversation_msgs(question, correct_ans, incorrect_ans, misconception):
    """Build the chat turns for one question / incorrect-answer pair.

    The question, correct answer and incorrect answer are stripped and sent
    as user turns, separated by fixed assistant turns. When `misconception`
    is not None a final assistant turn states it; passing None leaves the
    conversation ending on the user's incorrect answer.

    Returns:
        A list of `{'role': ..., 'content': ...}` dicts in conversation order.
    """
    turns = [
        ('user', 'Question: ' + question.strip()),
        ('assistant', 'Provide me with the correct answer for a baseline.'),
        ('user', 'Correct Answer: ' + correct_ans.strip()),
        ('assistant', 'Now provide the incorrect answer and I will anaylze the difference to infer the misconception.'),
        ('user', 'Incorrect Answer: ' + incorrect_ans.strip()),
    ]

    if misconception is not None:
        turns.append(('assistant', 'Misconception for incorrect answer: ' + misconception))

    return [{'role': role, 'content': content} for role, content in turns]

# Infer the misconception using `llm.chat()`

Note: `llm.chat()` has only been introduced recently and is available only in the later releases

We generate n outputs, with a higher temperature to create diverse representations of the outputs, which can then be used later to rank our results!

In [None]:
# Sampling settings used for every `llm.chat()` call below.
sampling_params = SamplingParams(
    n=10,                     # Number of output sequences to return for each prompt.
    # top_p=0.5,               # Float that controls the cumulative probability of the top tokens to consider.
    temperature=0.7,          # Randomness of the sampling.
    seed=1,                   # Seed for reproducibility.
    skip_special_tokens=True, # Whether to skip special tokens in the output.
    max_tokens=64,            # Maximum number of tokens to generate per output sequence.
    stop=['\n\n', '. '],      # Strings that stop the generation when they are generated (blank line or '. ').
)

In [None]:
# For every incorrect answer, build a few-shot conversation from similar
# training questions and sample candidate misconceptions from the LLM.
submission_cols = ['QuestionId_Answer', 'InferredMisconception', 'TopKQuestionIDs',
                   'MisconceptionIdGT', 'MisconceptionNameGT']
if not train_eval:
    # Without ground truth only the row key and the generations are recorded.
    submission_cols = submission_cols[:2]

submission = []
for row_idx, row in tqdm(test.iterrows(), total=len(test)):

    # Release memory once every 50 rows.
    if row_idx % 50 == 0:
        clean_memory()
        clean_memory()

    # The correct option has no misconception to infer.
    if row['CorrectAnswer'] == row['AnswerId']: continue
    # In evaluation mode skip rows without a label (NaN fails the `>= 0` test).
    if train_eval and not row['MisconceptionId'] >= 0: continue

    context_qids   = get_topk_similar_rows(row['QuestionId'], row['ConstructName'], row['SubjectName'], k)
    correct_answer = test[(test.QuestionId == row['QuestionId']) & (test.CorrectAnswer == test.AnswerId)].Value.tolist()[0]

    # Few-shot examples: every labelled incorrect option of each context question.
    messages = []
    for qid in context_qids:
        correct_option = train[(train.QuestionId == qid) & (train.CorrectAnswer == train.AnswerId)]
        incorrect_options = train[(train.QuestionId == qid) & (train.CorrectAnswer != train.AnswerId)]

        for _, incorrect_option in incorrect_options.iterrows():
            if isinstance(incorrect_option['MisconceptionName'], str):  # Filter out NaNs
                messages += get_conversation_msgs(
                    question = correct_option.QuestionText.tolist()[0],
                    correct_ans = correct_option.Value.tolist()[0],
                    incorrect_ans = incorrect_option['Value'],
                    misconception = incorrect_option['MisconceptionName'],
                )

    # Conversation for the incorrect answer whose misconception we want
    messages += get_conversation_msgs(
        question = row['QuestionText'],
        correct_ans = correct_answer,
        incorrect_ans = row['Value'],
        misconception = None,
    )

    output = llm.chat(messages, sampling_params, use_tqdm=False)
    # Keep only the text after the last ':' of each sampled completion.
    inferred_misconceptions = [imc.text.split(':')[-1].strip() for imc in output[0].outputs]

    if not train_eval:
        submission.append([f"{row['QuestionId']}_{row['AnswerId']}", inferred_misconceptions])
    else:
        submission.append([
            f"{row['QuestionId']}_{row['AnswerId']}",
            inferred_misconceptions,
            context_qids,
            [int(row['MisconceptionId'])],
            row['MisconceptionName']
        ])


# Column names are chosen from the mode, so an empty result still builds a frame.
submission = pd.DataFrame(submission, columns=submission_cols)

len(submission)

In [None]:
submission.head()

# Find most similar misconceptions

Delete the model and clean memory to load up embedding model

In [None]:
# Drop the notebook's reference to the vLLM engine before loading the embedding model.
del llm

# Collect garbage, trim the C heap and empty the CUDA cache (run twice).
clean_memory(deep=True)
clean_memory(deep=True)

In [None]:
# Load the embedding model and its tokenizer from `embed_model_pth`; the model
# is moved to the first GPU. This rebinds `tokenizer`, which previously held
# the LLM's tokenizer.
tokenizer   = AutoTokenizer.from_pretrained(embed_model_pth, trust_remote_code=True)
embed_model = AutoModel.from_pretrained(embed_model_pth, trust_remote_code=True).to("cuda:0")

In [None]:
def generate_embeddings(texts, batch_size=8, max_length=512):
    """Embed `texts` with the module-level `embed_model` and `tokenizer`.

    Each text is represented by the L2-normalised hidden state of its first
    (CLS) token.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts tokenised and encoded per forward pass.
        max_length: Token limit used for truncation. It defaults to 512, the
            maximum sequence length documented for bge-large-en-v1.5; longer
            inputs would index past the model's position embeddings.

    Returns:
        A NumPy array with one row per input text. For empty input an array
        with zero rows is returned (width taken from
        `embed_model.config.hidden_size`, dtype assumed to be float32).
    """
    texts = list(texts)
    if not texts:
        return np.empty((0, embed_model.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model was placed on.
    device = embed_model.device

    batches = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts, padding=True, truncation=True,
            return_tensors="pt", max_length=max_length,
        ).to(device)
        with torch.no_grad():
            outputs = embed_model(**inputs)
        cls_state = outputs.last_hidden_state[:, 0, :]  # CLS token
        cls_state = torch.nn.functional.normalize(cls_state, p=2, dim=1)
        batches.append(cls_state.cpu().numpy())

    return np.concatenate(batches, axis=0)

In [None]:
# Embed every misconception name once; row i of the result corresponds to
# row i of `misconceptions`.
all_ctx_vector  = generate_embeddings(list(misconceptions.MisconceptionName.values))

all_ctx_vector.shape

In [None]:
# One ranking of all misconceptions per generation index and submission row.
n_results = []

# Transposing the per-row lists yields one array per generation index, holding
# that generation's inferred text for every submission row.
for results in tqdm(pd.DataFrame(submission.InferredMisconception.values.tolist()).T.values):
    all_text_vector = generate_embeddings(list(results))
    cosine_similarities = cosine_similarity(all_text_vector, all_ctx_vector)
    # Row positions in `all_ctx_vector`, ordered from most to least similar.
    test_sorted_indices = np.argsort(-cosine_similarities, axis=1)
    n_results.append(test_sorted_indices)

# Stacked as (generation, submission row, rank).
n_results = np.array(n_results)
n_results.shape

In [None]:
# Swap the first two axes so the leading axis indexes submission rows and the
# second axis indexes the generation.
n_results = np.transpose(n_results, (1, 0, 2))
n_results.shape

## Combine ranking of each generated output for each question

Borda count is a very simple ranking mechanism

In [None]:
def borda_count(rankings):
    """Fuse several rankings into one using Borda count.

    An item at position `i` of a ranking earns `m - i` points, where `m` is
    the length of the first ranking. Points are summed over all rankings and
    items are returned from highest to lowest total; ties keep the order in
    which the items were first seen.

    Args:
        rankings: Iterable of rankings (each an iterable of hashable items,
            best first). One-shot iterators are supported.

    Returns:
        A list of items ordered by descending total score, or an empty list
        when `rankings` is empty.
    """
    # Materialise first: peeking at a one-shot iterator would otherwise
    # consume its first ranking before scoring starts.
    rankings = list(rankings)
    if not rankings:
        return []

    num_elements = len(rankings[0])
    scores = {}
    for ranking in rankings:
        for position, item in enumerate(ranking):
            scores[item] = scores.get(item, 0) + (num_elements - position)

    # `sorted` is stable, so equal scores keep first-seen order.
    return sorted(scores, key=scores.get, reverse=True)

# Compute the final ranking: fuse the per-generation rankings of each
# submission row into a single ranking with Borda count.
final_rankings = np.array([borda_count(result) for result in n_results])

final_rankings.shape

In [None]:
# `final_rankings` holds row positions of `misconceptions` (the order in which
# the names were embedded); translate the top 25 positions of every row into
# the actual misconception ids instead of assuming position == id.
submission['MisconceptionId'] = misconceptions.MisconceptionId.to_numpy()[final_rankings[:, :25]].tolist()

# Submit :)

In [None]:
# Evaluation mode only: score each row's predictions against its ground truth.
if train_eval:
    # `apk` comes from `eedi_metrics`; presumably average precision at k with
    # (actual, predicted) arguments -- confirm against that module.
    submission['apk@25'] = submission.apply(lambda row: apk(row['MisconceptionIdGT'], row['MisconceptionId']), axis=1)
    # Keep the full frame (generations, context ids, scores) for inspection.
    submission.to_csv('submission_debug.csv', index=False)
    
    print(submission['apk@25'].mean())

In [None]:
# Serialise each list of predicted ids as a space-separated string and write
# the two submission columns to disk.
submission["MisconceptionId"] = submission["MisconceptionId"].apply(lambda x: ' '.join(map(str, x)))
submission[['QuestionId_Answer', 'MisconceptionId']].to_csv('submission.csv', index=False)

In [None]:
submission.head(25)