In [None]:
# Install triton, autoawq, bitsandbytes and peft from wheel files stored in the
# /kaggle/input/eedi-wheel dataset; --find-links makes pip also look in that
# directory when resolving their dependencies.
!pip install /kaggle/input/eedi-wheel/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --find-links /kaggle/input/eedi-wheel
!pip install /kaggle/input/eedi-wheel/autoawq-0.2.7.post2-py3-none-any.whl --find-links /kaggle/input/eedi-wheel
!pip install /kaggle/input/eedi-wheel/bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl --find-links /kaggle/input/eedi-wheel
!pip install /kaggle/input/eedi-wheel/peft-0.13.2-py3-none-any.whl --find-links /kaggle/input/eedi-wheel

In [None]:
#https://www.kaggle.com/datasets/wuwenmin/bge-large-en-v1-5
#https://www.kaggle.com/datasets/syzong/qwen2-5-14b-instruct
#https://www.kaggle.com/datasets/gmhost/qwen2-5-32b-instruct-quant
#https://www.kaggle.com/datasets/abdurrafae/vllm-t4-fix
#https://www.kaggle.com/datasets/eugenkrylov/vllm-0-6-3-post1-wheels
#https://www.kaggle.com/datasets/emiz6413/lmsys-wheel-files
#https://www.kaggle.com/datasets/nbroad/hf-libraries
#https://www.kaggle.com/models/anhvth226/2211-lora-14b/Transformers/default/1
#https://www.kaggle.com/models/anhvth226/qw14b-awq/Transformers/default/1
#https://www.kaggle.com/code/ironbar/making-wheels-of-necessary-packages-for-vllm
#https://www.kaggle.com/models/takanashihumbert/qwen2.5/Transformers/32b-instruct-awq/1

### 检索模型
     1.开源模型-Qwen-14b-AWQ的微调
     2.Qwen-14b-instruct的量化微调
     3.Qwen-32b-instruct的量化微调

In [None]:
%%writefile run_embed.py
import argparse
import os
import json
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import peft

MAX_LENGTH = 384


def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Return, for every sequence, the hidden state of its last attended token.

    Args:
        last_hidden_states: tensor indexed as [row, position, ...].
        attention_mask: tensor indexed as [row, position]; non-zero marks a real token.

    Returns:
        One hidden-state vector per row.
    """
    n_rows = attention_mask.shape[0]
    # If every row attends to the final position the batch is left-padded,
    # so the final position is the last real token for all rows.
    if attention_mask[:, -1].sum() == n_rows:
        return last_hidden_states[:, -1]
    # Right-padded: the last real token sits at (number of attended tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(
        last_hidden_states.shape[0], device=last_hidden_states.device
    )
    return last_hidden_states[row_index, last_positions]


def get_embeddings_in_batches(model, tokenizer, texts, max_length, batch_size=32):
    """Embed `texts` batch by batch and return one L2-normalised vector per text.

    Each batch is tokenised (padded, truncated to `max_length`), moved to
    "cuda", run through `model` under no-grad + CUDA autocast, pooled with
    `last_token_pool`, normalised and moved back to the CPU.

    Returns:
        A CPU tensor with the per-batch results concatenated along dim 0,
        in the same order as `texts`.
    """
    chunks = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Embedding"):
        encoded = tokenizer(
            texts[start : start + batch_size],
            max_length=max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        encoded = encoded.to("cuda")
        with torch.no_grad(), torch.amp.autocast("cuda"):
            hidden = model(**encoded).last_hidden_state
            pooled = last_token_pool(hidden, encoded["attention_mask"])
            unit_vectors = F.normalize(pooled, p=2, dim=1).cpu()
        chunks.append(unit_vectors)
    return torch.cat(chunks, dim=0)


def load_model_and_tokenizer(base_model_path, lora_path, load_in_4bit=True):
    """Load the base model on device 0 and, optionally, a LoRA adapter on top.

    Args:
        base_model_path: path or hub id passed to `AutoModel.from_pretrained`.
        lora_path: adapter directory; when falsy, no adapter is applied and the
            tokenizer is loaded from `base_model_path` instead.
        load_in_4bit: forwarded to `AutoModel.from_pretrained`.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    backbone = AutoModel.from_pretrained(
        base_model_path,
        device_map=0,
        torch_dtype=torch.float16,
        load_in_4bit=load_in_4bit,
    )
    # The tokenizer is taken from the adapter directory when one is given.
    tokenizer_source = lora_path or base_model_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
    # Align the embedding matrix with the tokenizer vocabulary before the
    # adapter is attached.
    backbone.resize_token_embeddings(len(tokenizer))
    if not lora_path:
        return backbone, tokenizer
    adapted = peft.PeftModel.from_pretrained(backbone, lora_path)
    return adapted, tokenizer


def main(args):
    """Embed one fold of the texts in `args.input_text` and save them to disk.

    The output path is `args.input_text` with ".json" replaced by
    ".pt.fold.<start>.<step>.embed". If that file already exists the function
    returns without doing any work. Otherwise it loads the model, embeds
    `texts[start::step]` and saves a `{text: embedding}` dict with `torch.save`.

    Args:
        args: namespace with `input_text`, `fold` (two ints: start, step),
            `base_model`, `lora_path` and `load_in_4bit`.
    """
    import tempfile

    output_file = args.input_text.replace(
        ".json", ".pt.fold.{}.{}.embed".format(*args.fold)
    )
    if os.path.exists(output_file):
        print(f"Output file {output_file} already exists. Skipping...")
        return
    model, tokenizer = load_model_and_tokenizer(
        args.base_model, args.lora_path, load_in_4bit=args.load_in_4bit
    )
    # Use a context manager so the input file handle is closed deterministically.
    with open(args.input_text) as f:
        all_texts = json.load(f)["texts"]
    fold_start, fold_step = args.fold[0], args.fold[1]
    texts = all_texts[fold_start::fold_step]
    embeddings = get_embeddings_in_batches(
        model,
        tokenizer,
        texts,
        max_length=MAX_LENGTH,
        batch_size=4,
    )
    text2embeds = dict(zip(texts, embeddings))

    # Write to a temporary file in the same directory and rename it into place,
    # so a process watching for `output_file` never sees a partially written
    # file. mkstemp's random part uses only letters, digits and "_", so the
    # temporary name cannot contain ".pt".
    out_dir = os.path.dirname(os.path.abspath(output_file))
    fd, tmp_path = tempfile.mkstemp(dir=out_dir, prefix="embed_", suffix=".tmp")
    os.close(fd)
    try:
        torch.save(text2embeds, tmp_path)
        os.replace(tmp_path, output_file)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


if __name__ == "__main__":
    # Command-line entry point: parse the options and embed one fold.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base_model",
        type=str,
        default="Qwen/Qwen2.5-7B",
        help="Path to the base model",
    )
    parser.add_argument(
        "--lora_path",
        type=str,
        default=None,
        help="Path to the LoRA model",
    )
    parser.add_argument(
        "--input_text",
        type=str,
        default=".cache/data.json",
    )
    parser.add_argument(
        "--load_in_4bit",
        action="store_true",
        help="Load model in 4-bit mode",
    )
    # --fold START STEP selects texts[START::STEP].
    parser.add_argument("--fold", nargs=2, type=int, default=[0, 1])
    args = parser.parse_args()
    # os.path.exists(None) raises TypeError, so only probe the path when one
    # was actually supplied. A missing adapter directory falls back to the
    # plain base model, as before.
    if args.lora_path is not None and not os.path.exists(args.lora_path):
        print(f"LoRA path {args.lora_path} not found. Using the base model only.")
        args.lora_path = None
    main(args)


In [None]:
%%writefile qwen14b_awq_infer.py
import os, math, numpy as np
import sys
import os
from transformers import AutoTokenizer
import pandas as pd
from tqdm import tqdm
import re, gc
import torch
pd.set_option('display.max_rows', 300)

# Switch between scoring the test set (True) and a 10-row sample of train (False).
IS_SUBMISSION = True
print('IS_SUBMISSION:', IS_SUBMISSION)
# Missing values in train are replaced by -1; the loop that builds the items
# uses -1 to detect answers without a misconception label.
df_train = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv").fillna(-1).sample(10, random_state=42).reset_index(drop=True)
df_test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
df_misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
# With a tiny test set only the first 25 misconceptions are kept
# (presumably to keep dry runs fast -- confirm).
if len(df_test)<10:
    df_misconception_mapping = df_misconception_mapping.head(25)

import pandas as pd
# df_ret is the frame the queries are built from.
if not IS_SUBMISSION:
    df_ret = df_train.copy()
else:
    df_ret = df_test.copy()

# Prompt layout: the formatted question followed by the correct and the
# student's wrong answer text.
TEMPLATE_INPUT_V3 = '{QUESTION}\nCorrect answer: {CORRECT_ANSWER}\nStudent wrong answer: {STUDENT_WRONG_ANSWER}'
def format_input_v3(row, wrong_choice):
    """Build the retrieval prompt for one (question, wrong answer) pair.

    Args:
        row: mapping-like object with a `.get` method (a dict or a pandas
            Series) holding the question columns.
        wrong_choice: one of "A", "B", "C", "D"; must differ from the row's
            "CorrectAnswer".

    Returns:
        dict with keys "QUESTION", "CORRECT_ANSWER", "STUDENT_WRONG_ANSWER",
        "MISCONCEPTION_ID" (value of the `Misconception<choice>Id` column, or
        None when the column is absent) and "PROMPT".

    Raises:
        AssertionError: if `wrong_choice` is not a single valid letter or is
            the correct answer.
    """
    # A substring test against "ABCD" would also accept "" and "AB".
    assert wrong_choice in ("A", "B", "C", "D")
    # Extract values from the row
    question_text = row.get("QuestionText", "No question text provided")
    subject_name = row.get("SubjectName", "Unknown subject")
    construct_name = row.get("ConstructName", "Unknown construct")
    # Extract the correct and wrong answer text based on the choice
    correct_answer = row.get("CorrectAnswer", "Unknown")
    assert wrong_choice != correct_answer
    correct_answer_text = row.get(f"Answer{correct_answer}Text", "No correct answer text available")
    wrong_answer_text = row.get(f"Answer{wrong_choice}Text", "No wrong answer text available")

    # Construct the question format. The second line is four spaces, exactly
    # as in the original template, so the prompt text stays unchanged.
    formatted_question = (
        f"Question: {question_text}\n"
        "    \n"
        f"SubjectName: {subject_name}\n"
        f"ConstructName: {construct_name}"
    )

    # Return the extracted data
    ret = {
        "QUESTION": formatted_question,
        "CORRECT_ANSWER": correct_answer_text,
        "STUDENT_WRONG_ANSWER": wrong_answer_text,
        # The column name must be interpolated; without the f-prefix the
        # lookup key was the literal text "Misconception{wrong_choice}Id".
        "MISCONCEPTION_ID": row.get(f"Misconception{wrong_choice}Id"),
    }
    ret["PROMPT"] = TEMPLATE_INPUT_V3.format(**ret)

    return ret


# Build one item per (question, wrong answer) pair. `target_ids` holds the
# labelled misconception id for each item, or -1 when the column is absent.
items = []
target_ids = []
for _, row in df_ret.iterrows():
    for choice in ['A', 'B', 'C', 'D']:
        # The correct answer has no misconception attached.
        if choice == row["CorrectAnswer"]:
            continue
        # In validation mode, skip wrong answers without a label (-1 after fillna).
        if not IS_SUBMISSION and row[f'Misconception{choice}Id'] == -1:
            continue

        items.append({
            'QuestionId_Answer': '{}_{}'.format(row['QuestionId'], choice),
            'Prompt': format_input_v3(row, choice)['PROMPT'],
        })
        target_ids.append(int(row.get(f'Misconception{choice}Id', -1)))

df_input = pd.DataFrame(items)

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Wrap a query in the `<instruct>...` / `<query>...` prompt layout."""
    return '<instruct>{}\n<query>{}'.format(task_description, query)

def get_detailed_example(task_description: str, query: str, response: str) -> str:
    """Format a worked example as instruct / query / response sections."""
    return '<instruct>{}\n<query>{}\n<response>{}'.format(
        task_description, query, response
    )

def get_new_queries(queries, query_max_len, examples_prefix, tokenizer):
    """Truncate queries to a token budget and wrap them with prefix and suffix.

    The budget is `query_max_len` minus the token counts of '<s>' and
    '\\n<response></s>'. Each truncated query is decoded back to text and
    returned as `examples_prefix + query + '\\n<response>'`.

    Returns:
        `(new_max_length, new_queries)` where `new_max_length` is the combined
        prefix + suffix + `query_max_len` length plus 8, rounded down to a
        multiple of 8, plus another 8.
    """
    def token_count(text):
        return len(tokenizer(text, add_special_tokens=False)['input_ids'])

    response_suffix = '\n<response>'
    budget = query_max_len - token_count('<s>') - token_count('\n<response></s>')
    truncated = tokenizer(
        queries,
        max_length=budget,
        return_token_type_ids=False,
        truncation=True,
        return_tensors=None,
        add_special_tokens=False
    )
    prefix_len = token_count(examples_prefix)
    suffix_len = token_count(response_suffix)
    new_max_length = (prefix_len + suffix_len + query_max_len + 8) // 8 * 8 + 8
    decoded = tokenizer.batch_decode(truncated['input_ids'])
    new_queries = [examples_prefix + text + response_suffix for text in decoded]
    return new_max_length, new_queries
task =  "Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions"
queries = [
    get_detailed_instruct(task, q) for q in df_input['Prompt']
]
documents = df_misconception_mapping['MisconceptionName'].tolist()
query_max_len, doc_max_len = 420, 48
LORA_PATH = '/kaggle/input/2211-lora-14b/transformers/default/1'
tokenizer = AutoTokenizer.from_pretrained(LORA_PATH)
examples_prefix = ''
new_query_max_len, new_queries = get_new_queries(queries, query_max_len, examples_prefix, tokenizer)


import json
import subprocess

# All texts (queries first, then documents) go into one JSON file that the
# run_embed.py workers read.
INPUT_JSON = 'data.json'
with open(INPUT_JSON, 'w') as f:
    json.dump({'texts': new_queries + documents}, f)

BASE_MODEL_PATH = '/kaggle/input/qw14b-awq/transformers/default/1'
lora_path = LORA_PATH
NUM_FOLDS = 2

# Start one worker per GPU; worker k embeds texts[k::NUM_FOLDS]. Popen + wait
# guarantees that BOTH workers have exited before results are read and lets us
# detect failures, instead of polling the directory for files that may be
# stale, half-written, or never produced.
workers = []
for fold in range(NUM_FOLDS):
    worker_env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(fold))
    workers.append(subprocess.Popen(
        [
            sys.executable, 'run_embed.py',
            '--base_model', BASE_MODEL_PATH,
            '--lora_path', lora_path,
            '--input_text', INPUT_JSON,
            '--fold', str(fold), str(NUM_FOLDS),
        ],
        env=worker_env,
    ))
exit_codes = [worker.wait() for worker in workers]
if any(code != 0 for code in exit_codes):
    raise RuntimeError(f"run_embed.py failed, exit codes: {exit_codes}")

# Load exactly the files the workers write: run_embed.py derives the output
# name by replacing ".json" with ".pt.fold.<fold>.<num_folds>.embed".
text_to_embed = {}
for fold in range(NUM_FOLDS):
    path = INPUT_JSON.replace('.json', '.pt.fold.{}.{}.embed'.format(fold, NUM_FOLDS))
    print(path)
    # NOTE: torch.load unpickles the file; only load files produced by this pipeline.
    text_to_embed.update(torch.load(path))


# Re-assemble the embeddings in the original query / document order.
query_embeddings = torch.stack([text_to_embed[t] for t in new_queries])
doc_embeddings = torch.stack([text_to_embed[t] for t in documents])
print(query_embeddings.shape, doc_embeddings.shape)
query_embeddings = query_embeddings.numpy()
doc_embeddings = doc_embeddings.numpy()
np.save("qwen14b_awq_query_embeddings.npy", query_embeddings)
np.save("qwen14b_awq_doc_embeddings.npy", doc_embeddings)

In [None]:
# Run the script written above; it saves qwen14b_awq_{query,doc}_embeddings.npy.
!python qwen14b_awq_infer.py

In [None]:
%%writefile qwen14b_inst.py
import ctypes
import gc
import torch
def clean_memory(deep=True):
    """Free Python garbage and the CUDA cache.

    Args:
        deep: when true, additionally call glibc's `malloc_trim(0)` through
            ctypes (loads "libc.so.6").
    """
    gc.collect()
    if deep:
        libc = ctypes.CDLL("libc.so.6")
        libc.malloc_trim(0)
    torch.cuda.empty_cache()
clean_memory()
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import gc
import pandas as pd
import pickle
import sys
import numpy as np
from tqdm.auto import trange
from sklearn.model_selection import GroupKFold
import json
import torch
import torch.nn as nn
from numpy.linalg import norm
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer,AutoConfig,AutoModel,BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel,
)
import json
import copy
import warnings
import re
warnings.filterwarnings('ignore')
from concurrent.futures import ThreadPoolExecutor
from sklearn.neighbors import NearestNeighbors

# Input locations: competition data, base model and LoRA adapter.
path_prefix = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
model_path = "/kaggle/input/qwen2-5-14b-instruct"
lora_path="/kaggle/input/eedi-qwen-lora/qwen_v10_last/output-qwen_v10/last"
# One model copy is placed on each of these two GPUs further down.
device_0 = torch.device('cuda:0')
device_1 = torch.device('cuda:1')
# Tokenizer max_length for queries and for misconception names respectively.
q_max_len = 512
p_max_len = 50
# Only referenced by the commented-out get_matches call near the end of the script.
top_k = 25


# %%
def batch_to_device(batch, target_device):
    """
    Move every tensor value of `batch` to `target_device`, in place.

    Non-tensor values are left untouched; the same `batch` object is returned.
    """
    tensor_keys = [key for key in batch if isinstance(batch[key], Tensor)]
    for key in tensor_keys:
        batch[key] = batch[key].to(target_device)
    return batch

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Select each row's hidden state at its last attended position.

    When every row attends to the final position (left padding) the final
    position is used for all rows; otherwise the position is derived from the
    number of attended tokens in each row.
    """
    attended_at_end = attention_mask[:, -1].sum()
    if attended_at_end == attention_mask.shape[0]:
        return last_hidden_states[:, -1]
    rows = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    last_index = attention_mask.sum(dim=1) - 1
    return last_hidden_states[rows, last_index]

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Prefix a query with its task instruction ("Instruct: ...\\nQuery: ...")."""
    return 'Instruct: {}\nQuery: {}'.format(task_description, query)

# %%
# 4-bit NF4 quantisation with double quantisation; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    #llm_int8_skip_modules=["proj_head"]
)

# %%
model_config = AutoConfig.from_pretrained(model_path)

# %%
tokenizer = AutoTokenizer.from_pretrained(model_path)
# When a text exceeds max_length, tokens are dropped from its start, so the
# end of the query is always kept.
tokenizer.truncation_side = "left"

# %%
class CustomSimCSEModel(nn.Module):
    """Sentence encoder: a backbone, pooling and an optional linear head.

    `encode` turns tokenised features into (optionally projected and
    L2-normalised) sentence vectors; `forward` scores queries against
    documents and computes an in-batch cross-entropy loss in which the i-th
    document is the target of the i-th query.
    """

    # Pooling strategies understood by `sentence_embedding`.
    _POOLING_METHODS = ('mean', 'cls', 'last')

    def __init__(self, path, config, device, quantization_config, top_linear=True, emb_size=1024, sentence_pooling_method='last', normlized=True, temperature=0.02):
        """Load the backbone from `path` and set up pooling / projection.

        Args:
            path: model path passed to `AutoModel.from_pretrained`.
            config: model config; `config.hidden_size` sizes the projection.
            device: used as `device_map` and as the projection head's device.
            quantization_config: forwarded to `AutoModel.from_pretrained`.
            top_linear: add a `hidden_size -> emb_size` linear head.
            emb_size: output size of the linear head.
            sentence_pooling_method: 'mean', 'cls' or 'last'.
            normlized: L2-normalise the output vectors (parameter name kept
                as-is for compatibility with existing callers).
            temperature: divisor applied to similarity scores in `forward`.

        Raises:
            ValueError: if `sentence_pooling_method` is not supported.
        """
        super().__init__()
        # Fail before the expensive model load when the pooling mode is invalid.
        if sentence_pooling_method not in self._POOLING_METHODS:
            raise ValueError(
                f"Unsupported sentence_pooling_method {sentence_pooling_method!r}; "
                f"expected one of {self._POOLING_METHODS}"
            )
        self.model = AutoModel.from_pretrained(path, config=config, quantization_config=quantization_config, trust_remote_code=True, device_map=device)
        self.config = self.model.config
        self.top_linear = top_linear
        if self.top_linear:
            self.proj_head = nn.Linear(config.hidden_size, emb_size)
            self.proj_head.to(device)
        self.sentence_pooling_method = sentence_pooling_method
        self.normlized = normlized
        self.temperature = temperature
        self.cross_entropy = nn.CrossEntropyLoss(reduction='mean')

    def last_token_pool(self, last_hidden_states: Tensor,
                        attention_mask: Tensor) -> Tensor:
        """Return each row's hidden state at its last attended position."""
        # Every row attends to the final position => left-padded batch.
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = last_hidden_states.shape[0]
            return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

    def sentence_embedding(self, hidden_state, mask):
        """Pool token states into one vector per row.

        Raises:
            ValueError: if `self.sentence_pooling_method` is not supported
                (previously this silently returned None).
        """
        if self.sentence_pooling_method == 'mean':
            # Masked mean over the sequence dimension.
            s = torch.sum(hidden_state * mask.unsqueeze(-1).float(), dim=1)
            d = mask.sum(axis=1, keepdim=True).float()
            return s / d
        elif self.sentence_pooling_method == 'cls':
            return hidden_state[:, 0]
        elif self.sentence_pooling_method == 'last':
            return self.last_token_pool(hidden_state, mask)
        raise ValueError(
            f"Unsupported sentence_pooling_method {self.sentence_pooling_method!r}; "
            f"expected one of {self._POOLING_METHODS}"
        )

    def encode(self, features):
        """Embed tokenised `features`; returns None when `features` is None."""
        if features is None:
            return None
        psg_out = self.model(input_ids=features['input_ids'], attention_mask=features['attention_mask'],
                             return_dict=True)
        p_reps = self.sentence_embedding(psg_out.last_hidden_state, features['attention_mask'])
        # Projection and normalisation are done in float32.
        p_reps = p_reps.to(torch.float32)
        if self.top_linear:
            p_reps = self.proj_head(p_reps)
        if self.normlized:
            p_reps = torch.nn.functional.normalize(p_reps, dim=-1)
        return p_reps.contiguous()

    def compute_similarity(self, q_reps, p_reps):
        """Dot-product similarity between query and passage vectors."""
        if len(p_reps.size()) == 2:
            return torch.matmul(q_reps, p_reps.transpose(0, 1))
        return torch.matmul(q_reps, p_reps.transpose(-2, -1))

    def forward(self, query, doc):
        """Encode both sides and return loss, scores and embeddings as a dict."""
        query_emb = self.encode(query)
        doc_emb = self.encode(doc)
        scores = self.compute_similarity(query_emb, doc_emb) / self.temperature
        scores = scores.view(query_emb.size(0), -1)
        # The matching document of query i is at column i.
        target = torch.arange(scores.size(0), device=scores.device, dtype=torch.long)
        loss = self.cross_entropy(scores, target)
        return dict(
            loss=loss,
            scores=scores,
            query_emb=query_emb,
            doc_emb=doc_emb,
        )


# %%
# Instruction text prepended to every query by get_detailed_instruct.
task_description = 'Given a math question and a misconcepte incorrect answer, please retrieve the most accurate reason for the misconception.'

# %%

# `tra` holds the test questions despite its name.
tra = pd.read_csv(f"{path_prefix}/test.csv")
print(tra.shape)
misconception_mapping = pd.read_csv(f"{path_prefix}/misconception_mapping.csv")
# With a tiny test set only the first 25 misconceptions are embedded
# (presumably to keep dry runs fast -- confirm).
if tra.shape[0]<10:
    misconception_mapping = misconception_mapping.head(25)


# %%
def create_train_df(train_df, misconception_mapping, is_train=True):
    """Expand each question into one row per wrong answer, with a query text.

    For every answer letter other than the correct one, the question row is
    copied and extended with 'CorrectAnswerText', 'IncorrectAnswerText',
    'query' and 'answer_name'. With `is_train=True`, answers whose
    `Misconception<letter>Id` is NaN are skipped and 'answer_id' / 'doc'
    (looked up by position in `misconception_mapping`) are added as well.

    Returns:
        A DataFrame built from the expanded rows.
    """
    expanded_rows = []
    for _, row in train_df.iterrows():
        correct_letter = row['CorrectAnswer']
        for c in 'ABCD':
            if is_train:
                misconception_id = row[f"Misconception{c}Id"]
                if np.isnan(misconception_id):
                    continue
                misconception_id = int(misconception_id)
            if c == correct_letter or f'Answer{c}Text' not in row:
                continue
            real_text = row[f'Answer{correct_letter}Text']
            incorrect_text = row[f'Answer{c}Text']
            query_text = (
                f"###question###:{row['SubjectName']}-{row['ConstructName']}-{row['QuestionText']}"
                f"\n###Correct Answer###:{real_text}"
                f"\n###Misconcepte Incorrect answer###:{incorrect_text}"
            )
            record = row.copy()
            record['CorrectAnswerText'] = real_text
            record['IncorrectAnswerText'] = incorrect_text
            record['query'] = get_detailed_instruct(task_description, query_text)
            record['answer_name'] = c
            if is_train:
                record['answer_id'] = misconception_id
                record['doc'] = misconception_mapping.iloc[misconception_id]['MisconceptionName']
            expanded_rows.append(record)
    return pd.DataFrame(expanded_rows)

# %%
# One row per (question, wrong answer) pair of the test set; is_train=False
# means no misconception labels are read or attached.
new_val_df = create_train_df(tra, misconception_mapping, is_train=False)

# %%
def inference(df, model, tokenizer, max_length, device):
    """Embed `df['query']` with `model.encode` and return a 2-D numpy array.

    Texts are processed longest-first in batches of 8, then the rows are put
    back into the original order of `df`.
    """
    batch_size = 8
    sentences = list(df['query'].values)
    # Longest texts first; `order[k]` is the original index of the k-th
    # processed sentence.
    order = np.argsort([-len(text) for text in sentences])
    ordered_sentences = [sentences[i] for i in order]

    embedded_rows = []
    for start in trange(0, len(sentences), batch_size, desc="Batches", disable=False):
        features = tokenizer(
            ordered_sentences[start: start + batch_size],
            max_length=max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        features = batch_to_device(features, device)
        with torch.no_grad():
            batch_vectors = model.encode(features).detach().cpu().numpy().tolist()
        embedded_rows.extend(batch_vectors)

    # argsort of the permutation is its inverse: it maps original index -> processed slot.
    restore = np.argsort(order)
    restored = [np.array(embedded_rows[slot]).reshape(1, -1) for slot in restore]
    return np.concatenate(restored, axis=0)

# %%
# Two independent copies of the quantised model + LoRA adapter, one per GPU,
# so the two halves of the data can be embedded concurrently.
model_0 = CustomSimCSEModel(model_path, config=model_config, quantization_config=bnb_config, top_linear=False, device=device_0)
model_0 = PeftModel.from_pretrained(model_0, lora_path, device_map=device_0)


model_1 = CustomSimCSEModel(model_path, config=model_config, quantization_config=bnb_config, top_linear=False, device=device_1)
model_1 = PeftModel.from_pretrained(model_1, lora_path, device_map=device_1)
# %%

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")


# %%
# Split the queries into two contiguous halves, one per model/GPU.
mid = len(new_val_df) // 2
sub_1 = new_val_df.iloc[:mid].copy()
sub_2 = new_val_df.iloc[mid:].copy()

# executor.map yields results in submission order, so concatenating them
# restores the original row order.
with ThreadPoolExecutor(max_workers=2) as executor:
    q_results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (tokenizer, tokenizer), (q_max_len, q_max_len), (device_0, device_1))

query_embeddings = np.concatenate(list(q_results), axis=0)
np.save("qwen14b_inst_query_embeddings.npy", query_embeddings)
print("query_embeddings:")
print(query_embeddings.shape)
# inference() reads the 'query' column, so expose the misconception names under that name.
misconception_mapping['query'] = misconception_mapping['MisconceptionName']

mmid = len(misconception_mapping) // 2
msub_1 = misconception_mapping.iloc[:mmid].copy()
msub_2 = misconception_mapping.iloc[mmid:].copy()

with ThreadPoolExecutor(max_workers=2) as executor:
    d_results = executor.map(inference, (msub_1, msub_2), (model_0, model_1), (tokenizer, tokenizer), (p_max_len, p_max_len), (device_0, device_1))

doc_embeddings = np.concatenate(list(d_results), axis=0)
np.save("qwen14b_inst_doc_embeddings.npy", doc_embeddings)
print("doc_embeddings:")
print(doc_embeddings.shape)
#indices = get_matches(query_embeddings, doc_embeddings, n_neighbors=top_k)
# Submission key "<QuestionId>_<answer letter>"; the expanded frame is saved as df.parquet.
new_val_df["QuestionId_Answer"] = new_val_df["QuestionId"].astype(str) + "_" + new_val_df["answer_name"]
new_val_df.to_parquet('df.parquet', index=False)
print("Recall data file created successfully!")

In [None]:
# Run the script written above; it saves qwen14b_inst_{query,doc}_embeddings.npy and df.parquet.
!python qwen14b_inst.py

In [None]:
%%writefile qwen32b_infer.py
import ctypes
import gc
import torch
def clean_memory(deep=True):
    """Run the garbage collector and empty the CUDA cache.

    With `deep` set, glibc's `malloc_trim(0)` is also invoked via ctypes
    (loads "libc.so.6").
    """
    gc.collect()
    if deep:
        glibc = ctypes.CDLL("libc.so.6")
        glibc.malloc_trim(0)
    torch.cuda.empty_cache()
clean_memory()
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import gc
import pandas as pd
import pickle
import sys
import numpy as np
from tqdm.auto import trange
from sklearn.model_selection import GroupKFold
import json
import torch
import torch.nn as nn
from numpy.linalg import norm
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer,AutoConfig,AutoModel,BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel,
)
import json
import copy
import warnings
import re
warnings.filterwarnings('ignore')
from concurrent.futures import ThreadPoolExecutor
from sklearn.neighbors import NearestNeighbors



def batch_to_device(batch, target_device):
    """
    Send the tensor entries of a batch to a device (CPU/GPU).

    The batch is modified in place and returned; non-tensor entries are kept.
    """
    for name, value in list(batch.items()):
        if not isinstance(value, Tensor):
            continue
        batch[name] = value.to(target_device)
    return batch

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pool a batch by taking the hidden state of each row's last real token.

    If all rows attend to the final position the batch is treated as
    left-padded and the final position is returned for every row.
    """
    row_count = attention_mask.shape[0]
    if attention_mask[:, -1].sum() == row_count:
        return last_hidden_states[:, -1]
    token_counts = attention_mask.sum(dim=1)
    row_ids = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_ids, token_counts - 1]

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the "Instruct: <task>\\nQuery: <query>" prompt string."""
    parts = ('Instruct: ', format(task_description), '\nQuery: ', format(query))
    return ''.join(parts)

# 4-bit NF4 quantisation with double quantisation; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Input locations: competition data, base model and LoRA adapter.
path_prefix = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
model_path = "/kaggle/input/qwen2-5-32b-instruct-quant"
lora_path="/kaggle/input/qwen2-5-32b-n6-convert/pp_adapter-n6-last/"
# Passed as is_train to create_train_df: False means no labels are read.
VALID = False
# Tokenizer max_length for queries and for misconception names respectively.
q_max_len = 384
p_max_len = 50
top_k = 25
model_config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Over-long texts lose tokens from their start, keeping the end of the query.
tokenizer.truncation_side = "left"
# device_map="auto" leaves the placement of the layers to the library.
model = AutoModel.from_pretrained(model_path, config=model_config, 
                                  quantization_config=bnb_config, 
                                  trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16)

model = PeftModel.from_pretrained(model, lora_path)
model.config.use_cache = False

# Instruction text prepended to every query by get_detailed_instruct.
task_description = 'Given a math question and a misconcepte incorrect answer, please retrieve the most accurate reason for the misconception.'

# `tra` holds the test questions despite its name.
tra = pd.read_csv(f"{path_prefix}/test.csv")
print(tra.shape)
misconception_mapping = pd.read_csv(f"{path_prefix}/misconception_mapping.csv")
# With a tiny test set only the first 25 misconceptions are embedded
# (presumably to keep dry runs fast -- confirm).
if tra.shape[0]<10:
    misconception_mapping = misconception_mapping.head(25)

def create_train_df(train_df, misconception_mapping, is_train=True):
    """Turn each question into one row per wrong answer and attach a query.

    Every emitted row is a copy of the question row extended with
    'CorrectAnswerText', 'IncorrectAnswerText', 'query' and 'answer_name'.
    When `is_train` is true, answers with a NaN `Misconception<letter>Id` are
    dropped and 'answer_id' plus 'doc' (the misconception name found by
    position in `misconception_mapping`) are added.

    Returns:
        A DataFrame of the emitted rows.
    """
    collected = []
    for _, row in train_df.iterrows():
        for c in ('A', 'B', 'C', 'D'):
            if is_train:
                misconception_id = row[f"Misconception{c}Id"]
                if np.isnan(misconception_id):
                    continue
                misconception_id = int(misconception_id)
            is_correct = c == row['CorrectAnswer']
            if is_correct or f'Answer{c}Text' not in row:
                continue
            real_answer_id = row['CorrectAnswer']
            real_text = row[f'Answer{real_answer_id}Text']
            incorrect_text = row[f'Answer{c}Text']
            question_part = f"###question###:{row['SubjectName']}-{row['ConstructName']}-{row['QuestionText']}"
            correct_part = f"\n###Correct Answer###:{real_text}"
            wrong_part = f"\n###Misconcepte Incorrect answer###:{incorrect_text}"
            new_row = row.copy()
            new_row['CorrectAnswerText'] = real_text
            new_row['IncorrectAnswerText'] = incorrect_text
            new_row['query'] = get_detailed_instruct(
                task_description, question_part + correct_part + wrong_part
            )
            new_row['answer_name'] = c
            if is_train:
                new_row['answer_id'] = misconception_id
                new_row['doc'] = misconception_mapping.iloc[misconception_id]['MisconceptionName']
            collected.append(new_row)
    return pd.DataFrame(collected)

# One row per (question, wrong answer) pair; VALID controls whether labels are attached.
new_val_df = create_train_df(tra, misconception_mapping, is_train=VALID)
# Switch the model to evaluation mode before inference.
model = model.eval()
def inference(df, model, tokenizer, max_length):
    """Embed `df['query']` and return the vectors as a 2-D numpy array.

    Texts are run longest-first in batches of 4 on 'cuda'; each batch is
    pooled with `last_token_pool`, cast to float32 and L2-normalised. The
    rows are returned in the original order of `df`.
    """
    batch_size = 4
    sentences = list(df['query'].values)
    # `order[k]` is the original index of the k-th processed sentence.
    order = np.argsort([-len(text) for text in sentences])
    by_length = [sentences[i] for i in order]

    vectors = []
    for start in trange(0, len(sentences), batch_size, desc="Batches", disable=False):
        features = tokenizer(
            by_length[start: start + batch_size],
            max_length=max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        features = batch_to_device(features, 'cuda')
        with torch.no_grad():
            hidden = model(**features).last_hidden_state
            pooled = last_token_pool(hidden, features['attention_mask'])
            unit = torch.nn.functional.normalize(pooled.to(torch.float32), dim=-1)
            vectors.extend(unit.detach().cpu().numpy().tolist())

    # Inverse permutation: original index -> slot in `vectors`.
    inverse = np.argsort(order)
    rows = [np.array(vectors[slot]).reshape(1, -1) for slot in inverse]
    return np.concatenate(rows, axis=0)

# Embed the queries, then the misconception names (exposed under the 'query'
# column because inference() reads that column).
query_embeddings = inference(new_val_df, model, tokenizer, q_max_len)
misconception_mapping['query'] = misconception_mapping['MisconceptionName']
doc_embeddings = inference(misconception_mapping, model, tokenizer, p_max_len)

np.save("qwen32b_query_embeddings.npy", query_embeddings)
print("query_embeddings:")
print(query_embeddings.shape)
print(query_embeddings)

np.save("qwen32b_doc_embeddings.npy", doc_embeddings)
print("doc_embeddings:")
print(doc_embeddings.shape)
# Submission key "<QuestionId>_<answer letter>"; the expanded frame is saved as df.parquet.
new_val_df["QuestionId_Answer"] = new_val_df["QuestionId"].astype(str) + "_" + new_val_df["answer_name"]
new_val_df.to_parquet('df.parquet', index=False)
print("Recall data file created successfully!")

In [None]:
# Run the script written above; it saves qwen32b_{query,doc}_embeddings.npy and df.parquet.
!python qwen32b_infer.py

#### embedding融合

In [None]:
%%writefile retrive_model.py
import ctypes
import gc
import torch
def clean_memory(deep=True):
    """Collect garbage and release cached CUDA memory.

    `deep=True` additionally calls glibc's `malloc_trim(0)` via ctypes
    (loads "libc.so.6").
    """
    gc.collect()
    if deep:
        c_library = ctypes.CDLL("libc.so.6")
        c_library.malloc_trim(0)
    torch.cuda.empty_cache()
clean_memory()

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Number of candidate misconceptions retrieved per query (capped below by the
# number of available documents).
top_k = 25
path_prefix = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"

def get_matches(V_topic, V_content, n_neighbors=25):
    """Return, for each row of `V_topic`, the indices of its nearest `V_content` rows.

    Uses a brute-force cosine-distance nearest-neighbour search on all cores.
    """
    knn = NearestNeighbors(
        n_neighbors=n_neighbors,
        metric='cosine',
        algorithm="brute",
        n_jobs=-1,
    ).fit(V_content)
    _, neighbor_indices = knn.kneighbors(V_topic)
    return neighbor_indices

# Load the query rows and the embeddings saved by the three retrieval scripts.
new_val_df = pd.read_parquet("df.parquet")
q_emb1 = np.load('/kaggle/working/qwen14b_awq_query_embeddings.npy').astype("float16")
q_emb2 = np.load('/kaggle/working/qwen32b_query_embeddings.npy').astype("float16")
q_emb3 = np.load('/kaggle/working/qwen14b_inst_query_embeddings.npy').astype("float16")

d_emb1 = np.load('/kaggle/working/qwen14b_awq_doc_embeddings.npy').astype("float16")
d_emb2 = np.load('/kaggle/working/qwen32b_doc_embeddings.npy').astype("float16")
d_emb3 = np.load('/kaggle/working/qwen14b_inst_doc_embeddings.npy').astype("float16")

# Fuse the models by concatenating their embeddings along the feature axis;
# this requires all three to have the same number of rows in the same order.
query_embeddings = np.concatenate([q_emb1,q_emb2, q_emb3], axis=1)

doc_embeddings = np.concatenate([d_emb1, d_emb2, d_emb3], axis=1)

doc_embeddings.shape

# Cannot ask for more neighbours than there are documents.
top_k = min(top_k, len(doc_embeddings))
indices = get_matches(query_embeddings, doc_embeddings, n_neighbors=top_k)
misconception_mapping = pd.read_csv(f"{path_prefix}/misconception_mapping.csv")
np.save("indices.npy", indices)
# Row positions of the top-k documents for each query, saved for the next stage.
new_val_df['MisconceptionId_topk'] = indices.tolist()
new_val_df.to_parquet('submission1.parquet', index=False)
# new_val_df["MisconceptionId"] = new_val_df["recall_ids"].apply(lambda x: " ".join(map(str, x)))
# new_val_df.to_csv("submission.csv", columns=["QuestionId_Answer", "MisconceptionId"], index=False)
# print("Submission file created successfully!")

# new_val_df[['QuestionId_Answer', "MisconceptionId"]]
print(new_val_df.head())

In [None]:
# Run the fusion script written above; it saves indices.npy and submission1.parquet.
!python retrive_model.py

### 重排
    1.利用Qwen2.5-32B-Instruct-AWQ的zero-shot能力输出预测的语句
    2.利用bge-large-en-v1.5检索最相似的候选

In [None]:
# Install torchvision, vllm and sentence-transformers from local wheels only
# (--no-index disables the package index).
!pip install --no-index --find-links=/kaggle/input/vllm-0-6-3-post1-wheels torchvision==0.19.1
!pip install --no-index --find-links=/kaggle/input/vllm-0-6-3-post1-wheels vllm
!pip install --no-deps --no-index /kaggle/input/hf-libraries/sentence-transformers/sentence_transformers-3.1.0-py3-none-any.whl

In [None]:
%%writefile run_vllm.py

import pandas as pd
import numpy as np
import vllm
import re
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
import argparse

# Command-line options; all are optional integers.
ap = argparse.ArgumentParser()
ap.add_argument('--GROUP', type=int, default=3, required=False)
ap.add_argument('--NUM_1', type=int, default=7,required=False)
ap.add_argument('--NUM_2', type=int, default=5,required=False)
args = ap.parse_args()

# Retrieval output of the previous stage: one list of top-k candidate indices per query row.
test_long = pd.read_parquet('submission1.parquet')
test_sorted_indices_topk = test_long['MisconceptionId_topk'].tolist()
# Each cell has a .tolist() method after the parquet round trip; convert to plain lists.
test_sorted_indices_topk = [item.tolist() for item in test_sorted_indices_topk]

Misconception_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv') # author's note: rename Misconception_df
# Lookup from misconception id to its text.
Misconception_id2name = dict(zip(Misconception_df['MisconceptionId'], Misconception_df['MisconceptionName']))
MisconceptionName = Misconception_df['MisconceptionName'].tolist()

llm_model_name = '/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1' # /kaggle/input/qwq-32b-preview-awq, /kaggle/input/qwen2-5-32b-instruct-awq

def apply_template(row, tokenizer):
    """Render the teacher-style prompt for one row as a chat-formatted string.

    Args:
        row: mapping with "ConstructName", "QuestionText", "CorrectAnswerText",
            "IncorrectAnswerText", "SubjectName" and "Retrival".
        tokenizer: object providing `apply_chat_template`.

    Returns:
        The result of `tokenizer.apply_chat_template` for a single user
        message, untokenised and with the generation prompt appended.
    """
    PROMPT  = """Here is a mathematics question about 
Curriculum knowledge: {constructName}({subjectName})
Question: {problem}
Incorrect Answer: {wrong_ans}
Correct Answer: {correctAnswerValue}
    
You are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.
Answer concisely what misconception it is to lead to getting the incorrect answer.
No need to give the reasoning process and do not use "The misconception is" to start your answers.
There are some relative and possible misconceptions below to help you make the decision:

{retrival}"""

    # Collect the template fields from the row.
    fields = dict(
        constructName=row["ConstructName"],
        problem=row["QuestionText"],
        correctAnswerValue=row["CorrectAnswerText"],
        wrong_ans=row["IncorrectAnswerText"],
        subjectName=row["SubjectName"],
        retrival=row["Retrival"],
    )
    user_turn = {"role": "user", "content": PROMPT.format(**fields)}
    return tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

def number2sentence(llm_output, test_sorted_indices_top, Misconception_id2name):
    """Normalise raw LLM answers into misconception sentences.

    For every answer the chat markers ``<|im_start|>`` / ``<|im_end|>`` and the
    surrounding whitespace are removed. The cleaned text is then handled as:

    * no leading word character (empty text, or text starting with a quote,
      bracket, ``*`` ...): kept verbatim;
    * leading ``word.`` prefix, or an all-digit answer:
        - longer than 5 characters: the prefix is stripped once, from the start
          only (e.g. ``"3. Adds 1.5"`` -> ``"Adds 1.5"``);
        - otherwise the text is read as a 1-based choice among the candidates
          that were numbered in the prompt; anything that is not a valid choice
          (not a number, ``0``, past the end, unknown id) falls back to the
          first candidate;
    * anything else: kept verbatim.

    Args:
        llm_output: list of generated texts; it is rewritten in place.
        test_sorted_indices_top: for each answer, the candidate misconception
            ids in the order they were numbered in the prompt.
        Misconception_id2name: mapping from misconception id to its name.

    Returns:
        The same ``llm_output`` list, now holding the normalised sentences.
    """
    for i, raw in enumerate(llm_output):
        text = str(raw).replace("<|im_start|>", "").replace("<|im_end|>", "")
        text = text.rstrip("\n").strip()

        prefix_match = re.search(r'^\w+\.{0,1}', text)
        if prefix_match is None:
            # Nothing that looks like "N." or a leading word: there is no
            # prefix to strip and no number to resolve, so keep the text.
            llm_output[i] = text
            continue

        potential = prefix_match.group()
        if '.' in potential or re.fullmatch(r"\d+", text):
            if len(text) > 5:
                # Drop only the matched prefix; a global str.replace would also
                # eat later occurrences such as the "1." inside "1.5".
                sentence = text[prefix_match.end():].strip()
            else:
                digits = re.sub(r"^(\d+)\.$", r"\1", text)
                candidates = test_sorted_indices_top[i]
                try:
                    choice = int(digits)
                    # Choices are 1-based; reject 0 explicitly, otherwise
                    # candidates[-1] would silently select the last entry.
                    if not 1 <= choice <= len(candidates):
                        raise IndexError(choice)
                    sentence = Misconception_id2name[candidates[choice - 1]]
                except (ValueError, IndexError, KeyError):
                    # Unusable choice: fall back to the best retrieved candidate.
                    sentence = Misconception_id2name[candidates[0]]
        else:
            sentence = text
        llm_output[i] = sentence
    return llm_output

# Generator LLM, loaded through vLLM from `llm_model_name` with AWQ quantization
# in half precision, tensor-parallel size 2, and a 5120-token context limit.
llm = vllm.LLM(
        llm_model_name,
        quantization="awq",
        tensor_parallel_size=2, 
        gpu_memory_utilization=0.95,  # author's per-model notes: qwen 0.96, mistral 0.95, Mythalion 0.9
        trust_remote_code=False,
        dtype="half", 
        enforce_eager=True,
        max_model_len=5120,
        disable_log_stats=True,
    )

# The engine's own tokenizer is reused to render chat prompts in apply_template.
tokenizer = llm.get_tokenizer()

from sentence_transformers import SentenceTransformer
import torch

# Sentence embedder; below it encodes both the LLM answers and every
# misconception name so the two can be matched by cosine similarity.
model = SentenceTransformer('/kaggle/input/bge-large-en-v1-5', trust_remote_code=True)

# Command-line tunables:
#   GROUP - number of LLM rounds (round 1 plus `range(1, GROUP)`)
#   num_1 - candidates listed in the round-1 prompt
#   num_2 - candidates listed in each later round's prompt
GROUP = args.GROUP
num_1 = args.NUM_1
num_2 = args.NUM_2

# ---- Round 1: show the LLM the first `num_1` retrieved candidates per row ----
retrivals_groups = []
test_sorted_indices_search = []
for candidate_ids in test_sorted_indices_topk:
    # Slicing already yields a new list, and iterating over the slice (rather
    # than over range(num_1)) keeps this working when a row has fewer than
    # num_1 candidates instead of raising IndexError.
    shown_ids = candidate_ids[:num_1]
    test_sorted_indices_search.append(shown_ids)
    retrivals_groups.append(
        "\n".join(
            f"{k+1}. {Misconception_id2name[mid]}" for k, mid in enumerate(shown_ids)
        )
    )
test_long['Retrival'] = retrivals_groups
test_long["Prompt"] = test_long.apply(lambda row: apply_template(row, tokenizer), axis=1)

# Generate one misconception description per row.
construct_responses = llm.generate(
    test_long["Prompt"].values,
    vllm.SamplingParams(
        n=1,
        top_p=0.8,
        temperature=0,
        seed=777,
        skip_special_tokens=False,  # chat markers are stripped later by number2sentence
        max_tokens=324,
    ),
    use_tqdm=True,
)
gen_construct_texts = [x.outputs[0].text for x in construct_responses]
gen_construct_texts = number2sentence(gen_construct_texts, test_sorted_indices_search, Misconception_id2name)
test_long["llmMisconception_clean"] = gen_construct_texts

# Match every generated description against all misconception names.
# `misconception_mapping_vec` is computed once here and reused by later rounds.
test_long_vec = model.encode(test_long['llmMisconception_clean'].tolist(), normalize_embeddings=True)
misconception_mapping_vec = model.encode(Misconception_df["MisconceptionName"].to_list(), normalize_embeddings=True)
test_cos_sim_arr = cosine_similarity(test_long_vec, misconception_mapping_vec)
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

# Best match per row starts that row's LLM-derived ranking.
# NOTE(review): argsort yields row positions in Misconception_df, which are used
# as MisconceptionIds from here on; this assumes MisconceptionId == row position
# in misconception_mapping.csv -- confirm.
Misconception_2d = test_sorted_indices[:, :1].tolist()

# Remove the pick from the row's remaining candidates so the next round's
# prompt offers different options.
for picked, candidate_ids in zip(Misconception_2d, test_sorted_indices_topk):
    if picked[0] in candidate_ids:
        candidate_ids.remove(picked[0])

# ---- Rounds 2..GROUP: re-prompt with the first `num_2` remaining candidates ----
for g in range(1, GROUP):
    retrivals_groups = []
    test_sorted_indices_search = []
    for candidate_ids in test_sorted_indices_topk:
        # Every row uses the same num_2 cut-off (the former `if i == 1` branch
        # was identical to its `else`). Iterating over the slice instead of
        # range(num_2) avoids an IndexError when fewer candidates remain.
        shown_ids = candidate_ids[:num_2]
        test_sorted_indices_search.append(shown_ids)
        retrivals_groups.append(
            "\n".join(
                f"{k+1}. {Misconception_id2name[mid]}" for k, mid in enumerate(shown_ids)
            )
        )
    test_long['Retrival'] = retrivals_groups
    test_long["Prompt"] = test_long.apply(lambda row: apply_template(row, tokenizer), axis=1)

    # Generate one misconception description per row for this round.
    construct_responses = llm.generate(
        test_long["Prompt"].values,
        vllm.SamplingParams(
            n=1,
            top_p=0.8,
            temperature=0,
            seed=777,
            skip_special_tokens=False,  # chat markers are stripped later by number2sentence
            max_tokens=324,
        ),
        use_tqdm=True,
    )
    gen_construct_texts = [x.outputs[0].text for x in construct_responses]
    gen_construct_texts = number2sentence(gen_construct_texts, test_sorted_indices_search, Misconception_id2name)
    test_long["llmMisconception_clean"] = gen_construct_texts

    # Cosine similarity of this round's descriptions against all misconception names.
    test_long_vec = model.encode(test_long['llmMisconception_clean'].tolist(), normalize_embeddings=True)
    test_cos_sim_arr = cosine_similarity(test_long_vec, misconception_mapping_vec)
    test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

    if g != GROUP - 1:
        # Intermediate round: record the best match and retire it from the
        # row's remaining candidates.
        test_sorted_indices_group = test_sorted_indices[:, :1].tolist()
        for i in range(len(Misconception_2d)):
            best = test_sorted_indices_group[i][0]
            if best in test_sorted_indices_topk[i]:
                test_sorted_indices_topk[i].remove(best)
            Misconception_2d[i].extend(test_sorted_indices_group[i])
    else:
        # Final round: append the top 30 matches that were not picked before,
        # preserving their similarity order.
        test_sorted_indices_group = test_sorted_indices[:, :30].tolist()
        for i in range(len(Misconception_2d)):
            unseen = [item for item in test_sorted_indices_group[i] if item not in Misconception_2d[i]]
            test_sorted_indices_group[i] = unseen
            Misconception_2d[i].extend(unseen)

# De-duplicate each row's LLM-derived ranking, keeping the first occurrence.
for i, picks in enumerate(Misconception_2d):
    already_seen = set()
    unique_picks = []
    for item in picks:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_picks.append(item)
    Misconception_2d[i] = unique_picks

# Fresh copy of the original retrieval ranking (the working copy was mutated
# during the LLM rounds).
llm_result_2d = [ids.tolist() for ids in test_long['MisconceptionId_topk'].tolist()]

# Final ranking per row: the first GROUP LLM-derived ids, followed by the
# retrieval ranking with those ids taken out.
Misconception_2d_emsemble = []
for i, picks in enumerate(Misconception_2d):
    retrieved = llm_result_2d[i]
    leading = picks[:GROUP]
    for misconception in leading:
        if misconception in retrieved:
            retrieved.remove(misconception)
    Misconception_2d_emsemble.append(leading + retrieved)

test_long["MisconceptionId"] = Misconception_2d_emsemble
test_long.reset_index(drop=True, inplace=True)

# Serialise each ranking as space-separated ids and write the submission file.
test_long["MisconceptionId"] = test_long["MisconceptionId"].apply(
    lambda ids: ' '.join(str(mid) for mid in ids)
)
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)
submission.to_csv("submission.csv", index=False)
print(submission)

In [None]:
!python run_vllm.py