In [1]:
!pip install /kaggle/input/eedi-library-new/autoawq*.whl /kaggle/input/eedi-library-new/peft-0.13.2-py3-none-any.whl  --no-index --find-links=/kaggle/input/eedi-library-new 

Looking in links: /kaggle/input/eedi-library-new
Processing /kaggle/input/eedi-library-new/autoawq-0.2.7.post2-py3-none-any.whl
Processing /kaggle/input/eedi-library-new/peft-0.13.2-py3-none-any.whl
Processing /kaggle/input/eedi-library-new/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from autoawq==0.2.7.post2)
Installing collected packages: triton, peft, autoawq
Successfully installed autoawq-0.2.7.post2 peft-0.13.2 triton-3.1.0


In [2]:
%%writefile run1.py
import os, math, numpy as np
import sys
import os
from transformers import AutoTokenizer
import pandas as pd
from tqdm import tqdm
import re, gc
import torch
pd.set_option('display.max_rows', 300)
IS_SUBMISSION = True
#bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))


print('IS_SUBMISSION:', IS_SUBMISSION)

model_path = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
df_train = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv").fillna(-1).sample(10, random_state=42).reset_index(drop=True)
df_test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
df_misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
# tokenizer = AutoTokenizer.from_pretrained(model_path)
import pandas as pd
# from sentence_transformers import SentenceTransformer, util
if not IS_SUBMISSION:
    df_ret = df_train.copy()
else:
    df_ret = df_test.copy()
TEMPLATE_INPUT_V3 = '{QUESTION}\nCorrect answer: {CORRECT_ANSWER}\nStudent wrong answer: {STUDENT_WRONG_ANSWER}'
def format_input_v3(row, wrong_choice):

    assert wrong_choice in "ABCD"
    # Extract values from the row
    question_text = row.get("QuestionText", "No question text provided")
    subject_name = row.get("SubjectName", "Unknown subject")
    construct_name = row.get("ConstructName", "Unknown construct")
    # Extract the correct and wrong answer text based on the choice
    correct_answer = row.get("CorrectAnswer", "Unknown")
    assert wrong_choice != correct_answer
    correct_answer_text = row.get(f"Answer{correct_answer}Text", "No correct answer text available")
    wrong_answer_text = row.get(f"Answer{wrong_choice}Text", "No wrong answer text available")

    # Construct the question format
    formatted_question = f"""Question: {question_text}
    
SubjectName: {subject_name}
ConstructName: {construct_name}"""

    # Return the extracted data
    ret = {
        "QUESTION": formatted_question,
        "CORRECT_ANSWER": correct_answer_text,
        "STUDENT_WRONG_ANSWER": wrong_answer_text,
        "MISCONCEPTION_ID": row.get('Misconception{wrong_choice}Id'),
    }
    ret["PROMPT"] = TEMPLATE_INPUT_V3.format(**ret)

    return ret


items = []
target_ids = []
for _, row in df_ret.iterrows():
    for choice in ['A', 'B', 'C', 'D']:
        if choice == row["CorrectAnswer"]:
            continue
        if not IS_SUBMISSION and row[f'Misconception{choice}Id'] == -1:
            continue
            
        correct_col = f"Answer{row['CorrectAnswer']}Text"
        item = {'QuestionId_Answer': '{}_{}'.format(row['QuestionId'], choice)}
        item['Prompt'] = format_input_v3(row, choice)['PROMPT']
        items.append(item)
        target_ids.append(int(row.get(f'Misconception{choice}Id', -1)))
        
df_input = pd.DataFrame(items)
def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'<instruct>{task_description}\n<query>{query}'

def get_detailed_example(task_description: str, query: str, response: str) -> str:
    return f'<instruct>{task_description}\n<query>{query}\n<response>{response}'

def get_new_queries(queries, query_max_len, examples_prefix, tokenizer):
    inputs = tokenizer(
        queries,
        max_length=query_max_len - len(tokenizer('<s>', add_special_tokens=False)['input_ids']) - len(
            tokenizer('\n<response></s>', add_special_tokens=False)['input_ids']),
        return_token_type_ids=False,
        truncation=True,
        return_tensors=None,
        add_special_tokens=False
    )
    prefix_ids = tokenizer(examples_prefix, add_special_tokens=False)['input_ids']
    suffix_ids = tokenizer('\n<response>', add_special_tokens=False)['input_ids']
    new_max_length = (len(prefix_ids) + len(suffix_ids) + query_max_len + 8) // 8 * 8 + 8
    new_queries = tokenizer.batch_decode(inputs['input_ids'])
    for i in range(len(new_queries)):
        new_queries[i] = examples_prefix + new_queries[i] + '\n<response>'
    return new_max_length, new_queries
task =  "Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions"
queries = [
    get_detailed_instruct(task, q) for q in df_input['Prompt']
]
documents = df_misconception_mapping['MisconceptionName'].tolist()
query_max_len, doc_max_len = 320, 48
LORA_PATH = '/kaggle/input/2211-lora-14b/transformers/default/1'
tokenizer = AutoTokenizer.from_pretrained(LORA_PATH)
examples_prefix = ''
new_query_max_len, new_queries = get_new_queries(queries, query_max_len, examples_prefix, tokenizer)

import pickle
with open('queries.pkl', 'wb') as f:
    pickle.dump(new_queries, f)
    
with open('documents.pkl', 'wb') as f:
    pickle.dump(documents, f)

import json
with open('data.json', 'w') as f:
    data = {'texts': new_queries+ documents}
    f.write(json.dumps(data))

Writing run1.py


In [3]:
%%writefile run_embed.py
import argparse
import os
import json
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import peft

MAX_LENGTH = 320


def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[
            torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths
        ]


def get_embeddings_in_batches(model, tokenizer, texts, max_length, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch_texts = texts[i : i + batch_size]
        batch_dict = tokenizer(
            batch_texts,
            max_length=max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to("cuda")
        with torch.no_grad(), torch.amp.autocast("cuda"):
            outputs = model(**batch_dict)
            batch_embeddings = last_token_pool(
                outputs.last_hidden_state, batch_dict["attention_mask"]
            )
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1).cpu()
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)


def load_model_and_tokenizer(base_model_path, lora_path, load_in_4bit=True):
    model = AutoModel.from_pretrained(
        base_model_path,
        device_map=0,
        torch_dtype=torch.float16,
        load_in_4bit=load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        lora_path if lora_path else base_model_path
    )
    model.resize_token_embeddings(len(tokenizer))
    if lora_path:
        model = peft.PeftModel.from_pretrained(model, lora_path)
    return model, tokenizer


def main(args):
    output_file = args.input_text.replace(
        ".json", ".pt.fold.{}.{}.embed".format(*args.fold)
    )
    if os.path.exists(output_file):
        print(f"Output file {output_file} already exists. Skipping...")
        return
    model, tokenizer = load_model_and_tokenizer(
        args.base_model, args.lora_path, load_in_4bit=args.load_in_4bit
    )
    texts = json.load(open(args.input_text))["texts"][args.fold[0] :: args.fold[1]]
    embeddings = get_embeddings_in_batches(
        model,
        tokenizer,
        texts,
        max_length=MAX_LENGTH,
        batch_size=4,
    )
    text2embeds = {text: emb for text, emb in zip(texts, embeddings)}
    torch.save(text2embeds, output_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base_model",
        type=str,
        default="Qwen/Qwen2.5-7B",
        help="Path to the base model",
    )
    parser.add_argument(
        "--lora_path",
        type=str,
        default=None,
        help="Path to the LoRA model",
    )
    parser.add_argument(
        "--input_text",
        type=str,
        default=".cache/data.json",
    )
    parser.add_argument(
        "--load_in_4bit",
        action="store_true",
        help="Load model in 4-bit mode",
    )
    parser.add_argument("--fold", nargs=2, type=int, default=[0, 1])
    args = parser.parse_args()
    if not os.path.exists(args.lora_path):
        args.lora_path = None
    main(args)


Writing run_embed.py


In [4]:
!python run1.py

IS_SUBMISSION: True


In [5]:
!sleep 1 & sleep 3

In [6]:
# %%writefile run.sh
lora_path = '/kaggle/input/2211-lora-14b/transformers/default/1'
cmd = f"(CUDA_VISIBLE_DEVICES=0 python run_embed.py --base_model /kaggle/input/qw14b-awq/transformers/default/1 --lora_path {lora_path} --input_text data.json --fold 0 2) & (CUDA_VISIBLE_DEVICES=1 python run_embed.py --base_model /kaggle/input/qw14b-awq/transformers/default/1 --lora_path {lora_path} --input_text data.json --fold 1 2)"
import os
os.system(cmd)

Loading checkpoint shards: 100%|██████████| 2/2 [01:56<00:00, 58.36s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [01:57<00:00, 58.54s/it]
Embedding: 100%|██████████| 325/325 [02:58<00:00,  1.82it/s]
Embedding:  92%|█████████▏| 300/325 [02:59<00:16,  1.49it/s]

0

In [7]:
from glob import glob
import time
import torch
text_to_embed = {}
files = glob('*.pt*')
while len(files) != 2:
    time.sleep(1)
    files = glob('*.pt*')


time.sleep(3)    
for path in files:
    print(path)
    text_to_embed.update(torch.load(path))


Embedding: 100%|██████████| 325/325 [03:13<00:00,  1.68it/s]


data.pt.fold.1.2.embed
data.pt.fold.0.2.embed


  text_to_embed.update(torch.load(path))


In [8]:
import pickle
with open('/kaggle/working/queries.pkl', 'rb') as f:
    queries=pickle.load(f)
with open('/kaggle/working/documents.pkl', 'rb') as f:
    documents=pickle.load(f)

In [9]:
query_embeddings = torch.stack([text_to_embed[t] for t in queries])
doc_embeddings = torch.stack([text_to_embed[t] for t in documents])
# query_embeddings.shape, doc_embeddings.shape

In [10]:
%%time
!pip uninstall -y torch
!pip install -q --no-index --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-vllm vllm
!pip install -q -U /kaggle/input/vllm-t4-fix/grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q -U /kaggle/input/vllm-t4-fix/ray-2.11.0-cp310-cp310-manylinux2014_x86_64.whl
!pip install -q --no-deps --no-index /kaggle/input/hf-libraries/sentence-transformers/sentence_transformers-3.1.0-py3-none-any.whl
!pip install --no-deps --no-index /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl

Found existing installation: torch 2.4.0
Uninstalling torch-2.4.0:
  Successfully uninstalled torch-2.4.0
Processing /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl
Installing collected packages: logits-processor-zoo
Successfully installed logits-processor-zoo-0.1.0
CPU times: user 2.1 s, sys: 584 ms, total: 2.69 s
Wall time: 3min 30s


In [11]:
!pip install transformers peft accelerate \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

Looking in links: /kaggle/input/lmsys-wheel-files


In [12]:
%%capture
!pip install --no-index /kaggle/input/bitsandbytes0-42-0/bitsandbytes-0.42.0-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0

In [13]:
%%writefile run2.py
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import gc
import pandas as pd
import pickle
import sys
import numpy as np
from tqdm.autonotebook import trange
from sklearn.model_selection import GroupKFold
import json
import torch
from numpy.linalg import norm
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel,BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
)
import json
import copy
import warnings
import os
warnings.filterwarnings('ignore')

def batch_to_device(batch, target_device):
    """
    send a pytorch batch to a device (CPU/GPU)
    """
    for key in batch:
        if isinstance(batch[key], Tensor):
            batch[key] = batch[key].to(target_device)
    return batch

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

def inference(df, model, tokenizer, device):
    batch_size = 16
    max_length = 512
    sentences = list(df['query_text'].values)
    pids = list(df['order_index'].values)
    all_embeddings = []
    length_sorted_idx = np.argsort([-len(sen) for sen in sentences])
    sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
    for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=False):
        sentences_batch = sentences_sorted[start_index: start_index + batch_size]
        features = tokenizer(sentences_batch, max_length=max_length, padding=True, truncation=True,
                             return_tensors="pt")
        features = batch_to_device(features, device)
        with torch.no_grad():
            # outputs = model.model(**features)
            outputs = model(**features)
            embeddings = last_token_pool(outputs.last_hidden_state, features['attention_mask'])
            embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
            embeddings = embeddings.detach().cpu().numpy().tolist()
        all_embeddings.extend(embeddings)

    all_embeddings = [np.array(all_embeddings[idx]).reshape(1, -1) for idx in np.argsort(length_sorted_idx)]
    
    # sentence_embeddings = np.concatenate(all_embeddings, axis=0)
    # result = {pids[i]: em for i, em in enumerate(sentence_embeddings)}
    return np.concatenate(all_embeddings, axis=0)
    
path_prefix = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
# model_path = "/kaggle/input/sfr-embedding-mistral/SFR-Embedding-2_R"
device='cuda:0'
VALID = False
model_path = "/kaggle/input/qwen2.5-14/pytorch/default/1"

lora_path='/kaggle/input/qwen14b-it-lora/lora_weights/adapter.bin'
tokenizer = AutoTokenizer.from_pretrained(lora_path.replace("/adapter.bin",""))
bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
model = AutoModel.from_pretrained(model_path, 
                                  quantization_config=bnb_config, 
                                  device_map="auto",
                                  trust_remote_code=True)

if lora_path:
    print("loading lora")
    config = LoraConfig(
        r=64,
        lora_alpha=128,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional
        task_type="FEATURE_EXTRACTION",
    )
    model = get_peft_model(model, config)
    d = torch.load(lora_path, map_location=model.device)
    model.load_state_dict(d, strict=False)
    model = model.merge_and_unload()
model = model.eval()
# model = model.to(device)
task_description = 'Given a math question with correct answer and a misconcepted incorrect answer, retrieve the most accurate misconception for the incorrect answer.'
if VALID:
    # tra = pd.read_parquet("/kaggle/input/val-parquet/v1_val.parquet")
    tra = pd.read_csv(f"{path_prefix}/train.csv").sample(10, random_state=2025, ignore_index=True)
    print(tra.shape)
else:
    tra = pd.read_csv(f"{path_prefix}/test.csv")
    print(tra.shape)
misconception_mapping = pd.read_csv(f"{path_prefix}/misconception_mapping.csv")
# if tra.shape[0]<10:
#     misconception_mapping = misconception_mapping.sample(n=5,random_state=2023)
if VALID:
    train_data = []
    for _,row in tra.iterrows():
        for c in ['A','B','C','D']:
            if str(row[f"Misconception{c}Id"])!="nan":
                # print(row[f"Misconception{c}Id"])
                real_answer_id = row['CorrectAnswer']
                real_text = row[f'Answer{real_answer_id}Text']
                query_text = f"### SubjectName: {row['SubjectName']}\n### ConstructName: {row['ConstructName']}\n### Question: {row['QuestionText']}\n### Correct Answer: {real_text}\n### Misconcepte Incorrect answer: {row[f'Answer{c}Text']}"
                row['query_text'] = get_detailed_instruct(task_description,query_text)
                row['answer_id'] = int(row[f"Misconception{c}Id"])
                train_data.append(copy.deepcopy(row))
    train_df = pd.DataFrame(train_data)
    train_df['order_index'] = list(range(len(train_df)))
else:
    train_data = []
    for _,row in tra.iterrows():
        for c in ['A','B','C','D']:
            if c ==row['CorrectAnswer']:
                continue
            if f'Answer{c}Text' not in row:
                continue
            real_answer_id = row['CorrectAnswer']
            real_text = row[f'Answer{real_answer_id}Text']
            query_text = f"### SubjectName: {row['SubjectName']}\n### ConstructName: {row['ConstructName']}\n### Question: {row['QuestionText']}\n### Correct Answer: {real_text}\n### Misconcepte Incorrect answer: {row[f'Answer{c}Text']}"
            row['query_text'] = get_detailed_instruct(task_description,query_text)
            row['answer_name'] = c
            train_data.append(copy.deepcopy(row))
    train_df = pd.DataFrame(train_data)
    train_df['order_index'] = list(range(len(train_df)))
train_embeddings = inference(train_df, model, tokenizer, device)
misconception_mapping['query_text'] = misconception_mapping['MisconceptionName']
misconception_mapping['order_index'] = misconception_mapping['MisconceptionId']
doc_embeddings = inference(misconception_mapping, model, tokenizer, device)
import pickle
with open('qwen_embeddings.pkl', 'wb') as f:
    pickle.dump(train_embeddings, f)
with open('qwen_misconception.pkl', 'wb') as f:
    pickle.dump(doc_embeddings, f)

Writing run2.py


In [14]:
!python run2.py

Loading checkpoint shards: 100%|██████████████████| 8/8 [02:09<00:00, 16.25s/it]
loading lora
(3, 11)
Batches: 100%|████████████████████████████████████| 1/1 [00:20<00:00, 20.26s/it]
Batches: 100%|████████████████████████████████| 162/162 [12:07<00:00,  4.49s/it]


In [15]:
import pickle

with open('/kaggle/working/qwen_embeddings.pkl', 'rb') as f:
    q_embeddings = pickle.load(f)
import pickle

with open('/kaggle/working/qwen_misconception.pkl', 'rb') as f:
    q_misconception_mapping = pickle.load(f)

In [16]:
embeddings_data=query_embeddings*0.6+q_embeddings*0.33

In [17]:
doc=doc_embeddings*0.7+q_misconception_mapping*0.33

In [18]:
scores = embeddings_data @ doc.T  # Shape: (M, N)
sorted_indices = torch.argsort(scores,1, descending=True)[:,:25].tolist()

### Eval (testing purpose only)
This will not be ran in real submition

In [19]:
import torch
from typing import List

# def compute_metrics(q_embeds: torch.Tensor, d_embeds: torch.Tensor, target_ids: List[int]):
#     """
#     Compute MAP@25 and Recall@100 metrics.
    
#     Args:
#         q_embeds (torch.Tensor): Query embeddings of shape (M, dim), where M is the number of queries.
#         d_embeds (torch.Tensor): Document embeddings of shape (N, dim), where N is the number of documents.
#         target_ids (List[int]): List of target document indices (length M, one target index per query).
        
#     Returns:
#         None: Prints MAP@25 and Recall@100.
#     """
#     # Compute similarity scores
#     scores = q_embeds @ d_embeds.T  # Shape: (M, N)

#     # Initialize variables for metrics
#     avg_precisions = []  # To store average precision for each query
#     recall_counts = []   # To store recall@100 counts for each query

#     # Compute metrics for each query
#     for i, target_id in enumerate(target_ids):
#         # Sort document indices by score in descending order
#         sorted_indices = torch.argsort(scores[i], descending=True)

#         # Compute precision@k and recall@100
#         relevant_docs = (sorted_indices[:100] == target_id).nonzero(as_tuple=True)[0]  # Find rank within top 100
#         recall_count = 1 if len(relevant_docs) > 0 else 0  # Check if target is in the top 100
#         recall_counts.append(recall_count)

#         # Compute average precision for top 25 (MAP@25)
#         precision_at_k = 0.0
#         num_relevant = 0
#         for rank, idx in enumerate(sorted_indices[:25]):
#             if idx == target_id:
#                 num_relevant += 1
#                 precision_at_k += num_relevant / (rank + 1)
#         avg_precisions.append(precision_at_k / 1 if num_relevant > 0 else 0)

#     # Calculate metrics
#     map25 = sum(avg_precisions) / len(avg_precisions)
#     recall100 = sum(recall_counts) / len(recall_counts)

#     # Print results
#     print(f"MAP@25: {map25:.4f}")
#     print(f"Recall@100: {recall100:.4f}")
# if not IS_SUBMISSION:
#     compute_metrics(query_embeddings, doc_embeddings, target_ids)

In [20]:
import pandas as pd
df_test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
IS_SUBMISSION =True
df_ret = df_test.copy()
TEMPLATE_INPUT_V3 = '{QUESTION}\nCorrect answer: {CORRECT_ANSWER}\nStudent wrong answer: {STUDENT_WRONG_ANSWER}'
def format_input_v3(row, wrong_choice):

    assert wrong_choice in "ABCD"
    # Extract values from the row
    question_text = row.get("QuestionText", "No question text provided")
    subject_name = row.get("SubjectName", "Unknown subject")
    construct_name = row.get("ConstructName", "Unknown construct")
    # Extract the correct and wrong answer text based on the choice
    correct_answer = row.get("CorrectAnswer", "Unknown")
    assert wrong_choice != correct_answer
    correct_answer_text = row.get(f"Answer{correct_answer}Text", "No correct answer text available")
    wrong_answer_text = row.get(f"Answer{wrong_choice}Text", "No wrong answer text available")

    # Construct the question format
    formatted_question = f"""Question: {question_text}
    
SubjectName: {subject_name}
ConstructName: {construct_name}"""

    # Return the extracted data
    ret = {
        "QUESTION": formatted_question,
        "CORRECT_ANSWER": correct_answer_text,
        "STUDENT_WRONG_ANSWER": wrong_answer_text,
        "MISCONCEPTION_ID": row.get('Misconception{wrong_choice}Id'),
    }
    ret["PROMPT"] = TEMPLATE_INPUT_V3.format(**ret)

    return ret


items = []
target_ids = []
for _, row in df_ret.iterrows():
    for choice in ['A', 'B', 'C', 'D']:
        if choice == row["CorrectAnswer"]:
            continue
        if not IS_SUBMISSION and row[f'Misconception{choice}Id'] == -1:
            continue
            
        correct_col = f"Answer{row['CorrectAnswer']}Text"
        item = {'QuestionId_Answer': '{}_{}'.format(row['QuestionId'], choice)}
        item['Prompt'] = format_input_v3(row, choice)['PROMPT']
        items.append(item)
        target_ids.append(int(row.get(f'Misconception{choice}Id', -1)))
        
df_input = pd.DataFrame(items)

In [21]:
df_input["MisconceptionId"] = [" ".join([str(x) for x in row]) for row in sorted_indices]
df_input[["QuestionId_Answer", "MisconceptionId"]].to_csv("s1.csv", index=False)

s=pd.read_csv('s1.csv')

In [22]:
# pd.read_csv('s1.csv')

In [23]:
import numpy as np
n1=s["MisconceptionId"].apply(lambda x: [int(y) for y in x.split()])
n1=[i for i in n1]
np.save("n1.npy", np.array(n1))

In [24]:
import pandas as pd

full_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")


rows = []
for idx, row in full_df.iterrows():
    for option in ["A", "B", "C", "D"]:
        if option == row.CorrectAnswer:
            continue
            
        correct_answer = row[f"Answer{row.CorrectAnswer}Text"]

        query_text =f"###question###:{row['SubjectName']}-{row['ConstructName']}-{row['QuestionText']}\n###Correct Answer###:{correct_answer}\n###Misconcepte Incorrect answer###:{option}.{row[f'Answer{option}Text']}"

        rows.append({"query_text": query_text, 
                     "QuestionId_Answer": f"{row.QuestionId}_{option}",
                     "ConstructName": row.ConstructName,
                     "SubjectName": row.SubjectName,
                     "QuestionText": row.QuestionText,
                     "correct_answer": correct_answer,
                     "incorrect_answer": row[f"Answer{option}Text"]
                     })

df = pd.DataFrame(rows)
df.to_parquet("data.parquet", index=False)

In [25]:
%%writefile run_vllm.py

import vllm
import numpy as np
import pandas as pd
from transformers import PreTrainedTokenizer, AutoTokenizer
from typing import List
import torch
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
import re

model_path = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
tokenizer = AutoTokenizer.from_pretrained(model_path)


def preprocess_text(x):
    x = re.sub("http\w+", '',x)   # Delete URL
    x = re.sub(r"\.+", ".", x)    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\,+", ",", x)
    x = re.sub(r"\\\(", " ", x)
    x = re.sub(r"\\\)", " ", x)
    x = re.sub(r"[ ]{1,}", " ", x)
    x = x.strip()                 # Remove empty characters at the beginning and end
    return x

PROMPT  = """Here is a question about {ConstructName}({SubjectName}).
Question: {Question}
Correct Answer: {CorrectAnswer}
Incorrect Answer: {IncorrectAnswer}

You are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.
Answer concisely what misconception it is to lead to getting the incorrect answer.
Pick the correct misconception number from the below:

{Retrival}
"""
# just directly give your answers.

def apply_template(row, tokenizer):
    messages = [
        {
            "role": "user", 
            "content": preprocess_text(
                PROMPT.format(
                    ConstructName=row["ConstructName"],
                    SubjectName=row["SubjectName"],
                    Question=row["QuestionText"],
                    IncorrectAnswer=row[f"incorrect_answer"],
                    CorrectAnswer=row[f"correct_answer"],
                    Retrival=row[f"retrieval"]
                )
            )
        }
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return text


misconception_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

df = pd.read_parquet("data.parquet")
indices = np.load("n1.npy")

model_path = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"

llm = vllm.LLM(
    model_path,
    quantization="awq",
    tensor_parallel_size=2,
    gpu_memory_utilization=0.90, 
    trust_remote_code=True,
    dtype="half", 
    enforce_eager=True,
    max_model_len=5120,
    disable_log_stats=True
)
tokenizer = llm.get_tokenizer()


def get_candidates(c_indices):
    candidates = []

    mis_names = misconception_df["MisconceptionName"].values
    for ix in c_indices:
        c_names = []
        for i, name in enumerate(mis_names[ix]):
            c_names.append(f"{i+1}. {name}")

        candidates.append("\n".join(c_names))
        
    return candidates

survivors = indices[:, -1:]

for i in range(3):
    c_indices = np.concatenate([indices[:, -8*(i+1)-1:-8*i-1], survivors], axis=1)
    
    df["retrieval"] = get_candidates(c_indices)
    df["text"] = df.apply(lambda row: apply_template(row, tokenizer), axis=1)
    
    print("Example:")
    print(df["text"].values[0])
    print()
    
    responses = llm.generate(
        df["text"].values,
        vllm.SamplingParams(
            n=1,  # Number of output sequences to return for each prompt.
            top_k=1,  # Float that controls the cumulative probability of the top tokens to consider.
            temperature=0,  # randomness of the sampling
            seed=777, # Seed for reprodicibility
            skip_special_tokens=False,  # Whether to skip special tokens in the output.
            max_tokens=1,  # Maximum number of tokens to generate per output sequence.
            logits_processors=[MultipleChoiceLogitsProcessor(tokenizer, choices=["1", "2", "3", "4", "5", "6", "7", "8", "9"])]
        ),
        use_tqdm=True
    )
    
    responses = [x.outputs[0].text for x in responses]
    df["response"] = responses
    
    
    llm_choices = df["response"].astype(int).values - 1
    
    survivors = np.array([cix[best] for best, cix in zip(llm_choices, c_indices)]).reshape(-1, 1)



results1 = []

for i in range(indices.shape[0]):
    ix = indices[i]
    llm_choice = survivors[i, 0]
    
    results1.append(" ".join([str(llm_choice)] + [str(x) for x in ix if x != llm_choice]))


df["MisconceptionId"] = results1
df.to_csv("submission.csv", columns=["QuestionId_Answer", "MisconceptionId"], index=False)

Writing run_vllm.py


In [26]:
!python run_vllm.py

INFO 12-12 11:06:44 config.py:715] Defaulting to use mp for distributed inference
INFO 12-12 11:06:44 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=5120, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1, use_v2_block_manager=Fa

In [27]:
pd.read_csv("submission.csv")

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,1345 706 1507 2306 328 2181 1516 2532 1005 167...
1,1869_C,1345 1507 2306 706 1005 2488 2532 2181 328 251...
2,1869_D,315 2532 1345 1507 328 1516 2488 1005 2306 167...
3,1870_A,891 1755 167 418 2142 2068 979 1535 1871 1593 ...
4,1870_B,891 167 979 1755 1593 1871 2142 143 2068 418 5...
5,1870_C,891 1755 167 418 2142 2068 113 1535 979 1593 1...
6,1871_A,1287 1073 2439 1306 1059 2551 1098 1677 1200 3...
7,1871_C,1287 1073 2439 1059 2551 1306 1098 365 1677 16...
8,1871_D,1287 1073 2439 1059 2471 2551 397 365 1923 167...
