Thanks to [Rich Olson](https://www.kaggle.com/richolson) for the Phi 3.5 Mini Instruct and [sinchir0](https://www.kaggle.com/sinchir0) for the BGE model.

This is a baseline hybrid search approach, in which:

1) a vector search is run directly on the dataset question + incorrect answer, matched against the misconception names

2) a keyword search is run on Phi's response when we ask it what the misconception is

In [None]:
# Install rank-bm25 and sentence-transformers from wheel files stored under
# /kaggle/input rather than from a package index.
# NOTE(review): presumably because the notebook runs without internet access — confirm.
!pip install /kaggle/input/rank-bm25/rank_bm25-0.2.2-py3-none-any.whl
!pip install /kaggle/input/sentence-transformers/sentence_transformers-3.2.0-py3-none-any.whl

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
import re
import os
import gc

from sentence_transformers import SentenceTransformer

In [None]:
# Configuration
# CUDA allocator option for PyTorch.
# NOTE(review): torch has already been imported by the time this runs — confirm
# the setting is still picked up before the first CUDA allocation.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Shared batch size: default for embedding generation, batch size for the Phi
# pipeline call, and the number of test questions processed per loop iteration.
BATCH_SIZE = 8
# Generation cap handed to the text-generation pipeline.
MAX_NEW_TOKENS = 55
# First CUDA device when available, otherwise CPU; only the BGE model is moved
# here explicitly (Phi is placed via device_map="auto").
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Load models
# Both models are read from local Kaggle input directories.
phi_model_name = '/kaggle/input/phi-3.5-mini-instruct/pytorch/default/1'

# Sentence-embedding model used for the vector (semantic) half of the search.
BGE_MODEL_PATH = '/kaggle/input/train-bge-synthetic-data/trained_model'
bge_model = SentenceTransformer(BGE_MODEL_PATH)
bge_model = bge_model.to(DEVICE)

# Causal LM used to describe the misconception in words; loaded in bfloat16
# with device placement delegated to device_map="auto".
phi_tokenizer = AutoTokenizer.from_pretrained(phi_model_name)
phi_model = AutoModelForCausalLM.from_pretrained(
    phi_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)
# Chat-style generation pipeline; every call is capped at MAX_NEW_TOKENS new tokens.
phi_pipe = pipeline("text-generation", model=phi_model, tokenizer=phi_tokenizer, trust_remote_code=True, max_new_tokens=MAX_NEW_TOKENS)

In [None]:
# Load data
# train: source of the few-shot examples placed in the Phi prompts.
train = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
# test: the questions that predictions are generated for.
test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
# misconception_mapping: its 'MisconceptionName' column is the retrieval corpus.
# NOTE(review): the rest of the notebook treats the row position in this frame
# as the MisconceptionId — confirm the CSV is ordered that way.
misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

In [None]:
def generate_filtered_df(df, question, min_rows=5, max_rows=7, random_state=None):
    """Select rows of ``df`` that are related to ``question``.

    Rows are gathered in up to three stages until at least ``min_rows`` are
    available: rows with the same ``ConstructId``, then rows with the same
    ``SubjectId`` (and a different construct), then randomly drawn rows that
    have not been selected yet. The result is finally shuffled and capped at
    ``max_rows``.

    Args:
        df: DataFrame with ``ConstructId`` and ``SubjectId`` columns.
        question: Mapping (dict or Series) providing ``ConstructId`` and
            ``SubjectId`` for the question of interest.
        min_rows: Number of rows to aim for before stopping the fallbacks.
        max_rows: Upper bound on the number of rows returned.
        random_state: Optional seed (or random state object) forwarded to
            ``DataFrame.sample`` so the selection can be reproduced. ``None``
            keeps the previous unseeded behaviour.

    Returns:
        A DataFrame with at most ``max_rows`` rows taken from ``df``, in
        random order. It holds fewer than ``min_rows`` rows only when ``df``
        itself does not contain enough rows.
    """
    construct_id = question['ConstructId']
    subject_id = question['SubjectId']

    # Stage 1: rows testing the same construct.
    filtered_df = df[df['ConstructId'] == construct_id]

    # Stage 2: widen to the same subject when the construct alone is too sparse.
    if len(filtered_df) < min_rows:
        same_subject = df[(df['SubjectId'] == subject_id) & (df['ConstructId'] != construct_id)]
        filtered_df = pd.concat([filtered_df, same_subject])

    # Stage 3: top up with random rows that are not selected yet.
    shortfall = min_rows - len(filtered_df)
    if shortfall > 0:
        unused = df[~df.index.isin(filtered_df.index)]
        # Size the draw from the real pool so it can never exceed what is left.
        random_df = unused.sample(n=min(shortfall, len(unused)), random_state=random_state)
        filtered_df = pd.concat([filtered_df, random_df])

    # Shuffle and limit to max_rows.
    return filtered_df.sample(n=min(max_rows, len(filtered_df)), random_state=random_state)

# Build few-shot examples from already-filtered training rows
def get_example_sequences(filtered_train_df, num_examples=3):
    """Turn a random sample of training rows into few-shot example dicts.

    Up to ``num_examples`` rows are sampled. For each sampled row, the first
    incorrect answer (checked in A, B, C, D order) that has a misconception id
    is used; rows without any labelled incorrect answer contribute nothing.

    Returns a list of dicts with the keys ``question``, ``correct_answer``,
    ``incorrect_answer`` and ``misconception``. The misconception name is
    looked up in the module-level ``misconception_mapping`` by index label.
    """
    sample_size = min(num_examples, len(filtered_train_df))
    chosen_rows = filtered_train_df.sample(n=sample_size)

    examples = []
    for _, row in chosen_rows.iterrows():
        for answer_choice in 'ABCD':
            if answer_choice == row['CorrectAnswer']:
                continue
            misconception_id = row[f'Misconception{answer_choice}Id']
            if pd.isna(misconception_id):
                continue
            examples.append({
                'question': f"{row['ConstructName']}: {row['QuestionText']}",
                'correct_answer': row[f'Answer{row["CorrectAnswer"]}Text'],
                'incorrect_answer': row[f'Answer{answer_choice}Text'],
                'misconception': misconception_mapping.loc[int(misconception_id), 'MisconceptionName']
            })
            # One labelled incorrect answer per question is enough.
            break
    return examples

In [None]:
def preprocess_text(x):
    """Normalise text: lowercase, drop punctuation, collapse whitespace.

    Characters that are neither word characters nor whitespace are removed,
    runs of whitespace become a single space, and leading/trailing
    whitespace is trimmed.
    """
    without_punctuation = re.sub(r"[^\w\s]", '', x.lower())
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip()

In [None]:
def generate_embeddings(texts, model, batch_size=BATCH_SIZE):
    """Encode ``texts`` with ``model`` after running ``preprocess_text`` on each.

    The model's ``encode`` method is called with a progress bar enabled and
    ``normalize_embeddings=True``; its return value is passed through as-is.
    """
    cleaned_texts = list(map(preprocess_text, texts))
    encode_options = {
        'batch_size': batch_size,
        'show_progress_bar': True,
        'normalize_embeddings': True,
    }
    return model.encode(cleaned_texts, **encode_options)

In [None]:
def generate_question_embeddings(questions, bge_model):
    """Embed one text per (question, incorrect answer) pair.

    Each text is "<ConstructName>: <QuestionText> <incorrect answer text>".
    Pairs are emitted question by question, with answer choices in A-D order
    and the correct answer skipped, so the embeddings come back in that order.
    """
    texts = [
        f"{q['ConstructName']}: {q['QuestionText']} {q[f'Answer{answer_choice}Text']}"
        for q in questions
        for answer_choice in ['A', 'B', 'C', 'D']
        if answer_choice != q['CorrectAnswer']
    ]
    return generate_embeddings(texts, bge_model)

In [None]:
def _extract_misconception(response):
    """Pull the misconception sentence out of a single pipeline response.

    Falls back to the string form of whatever unexpected structure is found,
    so that exactly one string is produced per response.
    """
    if not (isinstance(response, list) and len(response) > 0):
        return str(response)

    generated_text = response[0].get('generated_text', [])
    if not (isinstance(generated_text, list) and len(generated_text) > 0):
        return str(generated_text)

    # The last chat message is the model's newly generated reply.
    last_message = generated_text[-1]
    if not (isinstance(last_message, dict) and 'content' in last_message):
        return str(last_message)

    content = last_message['content'].strip()
    prefix = "Misconception for incorrect answer:"
    start_index = content.find(prefix)
    if start_index == -1:
        return content

    misconception = content[start_index + len(prefix):].strip()
    # Keep only the first sentence. The full stop must be followed by
    # whitespace or the end of the text so that decimals such as "0.5" are
    # not mistaken for a sentence boundary.
    sentence_end = re.search(r"\.(?=\s|$)", misconception)
    if sentence_end:
        misconception = misconception[:sentence_end.end()].strip()
    return misconception


def predict_misconception(questions, phi_pipe):
    """Ask the LLM to name the misconception behind every incorrect answer.

    For each question a few-shot chat is assembled from related training
    questions (``generate_filtered_df`` + ``get_example_sequences`` on the
    module-level ``train`` frame), followed by the question itself and its
    correct answer. One prompt is then created per incorrect answer choice.

    Args:
        questions: Iterable of dict-like records providing ``ConstructId``,
            ``SubjectId``, ``ConstructName``, ``QuestionText``,
            ``CorrectAnswer`` and ``Answer{A..D}Text``.
        phi_pipe: Callable taking a list of chat-message lists plus a
            ``batch_size`` keyword and returning one response per prompt.

    Returns:
        A list of strings, one per (question, incorrect answer) pair, ordered
        by question and then by answer choice A-D. Empty when there is
        nothing to prompt for.
    """
    ask_for_correct = {"role": "assistant", "content": "Provide me with the correct answer for a baseline."}
    ask_for_incorrect = {"role": "assistant", "content": "Now - provide the incorrect answer and I will analyze the difference to infer the misconception."}

    all_prompts = []
    for q in questions:
        correct_answer = q[f"Answer{q['CorrectAnswer']}Text"]

        filtered_df = generate_filtered_df(train, q)
        examples = get_example_sequences(filtered_df)

        messages = []

        # Few-shot demonstrations: a complete dialogue ending in the known misconception.
        for example in examples:
            messages.extend([
                {"role": "user", "content": f"Question: {example['question']}"},
                ask_for_correct,
                {"role": "user", "content": f"Correct Answer: {example['correct_answer']}"},
                ask_for_incorrect,
                {"role": "user", "content": f"Incorrect Answer: {example['incorrect_answer']}"},
                {"role": "assistant", "content": f"Misconception for incorrect answer: {example['misconception']}"}
            ])

        # The current question, up to the point where the incorrect answer is requested.
        messages.extend([
            {"role": "user", "content": f"Question: {q['ConstructName']}: {q['QuestionText']}"},
            ask_for_correct,
            {"role": "user", "content": f"Correct Answer: {correct_answer}"},
            ask_for_incorrect,
        ])

        # One prompt per incorrect answer, sharing the common prefix above.
        for answer_choice in ['A', 'B', 'C', 'D']:
            if answer_choice != q['CorrectAnswer']:
                incorrect_answer = q[f"Answer{answer_choice}Text"]
                all_prompts.append(
                    messages + [{"role": "user", "content": f"Incorrect Answer: {incorrect_answer}"}]
                )

    # Nothing to generate: avoid calling the pipeline with an empty batch.
    if not all_prompts:
        return []

    responses = phi_pipe(all_prompts, batch_size=BATCH_SIZE)
    return [_extract_misconception(response) for response in responses]

In [None]:
def bm25_search(queries, documents, top_k=50):
    """Rank ``documents`` against each query with BM25.

    Documents and queries are lowercased and tokenised with ``word_tokenize``.
    Returns two parallel lists with one entry per query: the indices of the
    ``top_k`` best-scoring documents and the matching scores, both in
    descending score order.
    """
    index = BM25Okapi([word_tokenize(document.lower()) for document in documents])

    results, scores = [], []
    for query in queries:
        doc_scores = index.get_scores(word_tokenize(query.lower()))
        best_first = np.argsort(doc_scores)[::-1]
        results.append(best_first[:top_k])
        scores.append(np.sort(doc_scores)[::-1][:top_k])
    return results, scores

def semantic_search(embeddings, misc_embeddings, top_k=50):
    """Rank misconception embeddings by cosine similarity to each query row.

    Returns ``(top_indices, top_scores)``: for every row of ``embeddings``,
    the column indices of the ``top_k`` most similar ``misc_embeddings`` rows
    and the corresponding similarities in descending order.
    """
    similarities = cosine_similarity(embeddings, misc_embeddings)

    # Negating the matrix makes an ascending argsort yield best-first indices.
    ranked_indices = np.argsort(-similarities, axis=1)
    ranked_scores = np.flip(np.sort(similarities, axis=1), axis=1)

    return ranked_indices[:, :top_k], ranked_scores[:, :top_k]

def _min_max_normalize(scores):
    """Scale ``scores`` to the range [0, 1]; constant or empty input yields zeros."""
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        return scores
    lowest = scores.min()
    spread = scores.max() - lowest
    # When every score is equal (e.g. BM25 found no matching token and
    # returned all zeros) the usual formula is 0/0 = NaN, and NaN sorts to the
    # front of the final ranking. Such a list carries no ranking information,
    # so it contributes nothing. `not spread > 0` also covers a NaN spread.
    if not spread > 0:
        return np.zeros_like(scores)
    return (scores - lowest) / spread


def combined_search(semantic_results, semantic_scores, keyword_results, keyword_scores, top_k=25, alpha=0.5, num_candidates=None):
    """Fuse semantic and keyword rankings into one ranking per query.

    For every query, both score lists are min-max normalised independently and
    accumulated into a single score vector: semantic scores weighted by
    ``alpha``, keyword scores by ``1 - alpha``. Candidates missing from a
    list receive no contribution from it.

    Args:
        semantic_results: Per query, the candidate indices from the vector search.
        semantic_scores: Per query, the scores aligned with ``semantic_results``.
        keyword_results: Per query, the candidate indices from the keyword search.
        keyword_scores: Per query, the scores aligned with ``keyword_results``.
        top_k: Number of candidates to keep per query.
        alpha: Weight of the semantic scores; the keyword weight is ``1 - alpha``.
        num_candidates: Total number of candidates that indices may refer to.
            Defaults to ``len(misconception_mapping)``.

    Returns:
        A list with one array per query holding the ``top_k`` candidate
        indices in descending combined-score order.
    """
    if num_candidates is None:
        num_candidates = len(misconception_mapping)

    combined_results = []
    for sem_res, sem_scores, key_res, key_scores in zip(semantic_results, semantic_scores, keyword_results, keyword_scores):
        combined_scores = np.zeros(num_candidates)

        # Weighted sum of the two normalised score lists.
        for idx, score in zip(sem_res, _min_max_normalize(sem_scores)):
            combined_scores[idx] += alpha * score

        for idx, score in zip(key_res, _min_max_normalize(key_scores)):
            combined_scores[idx] += (1 - alpha) * score

        # Best combined candidates first.
        top_combined = np.argsort(combined_scores)[::-1][:top_k]
        combined_results.append(top_combined)

    return combined_results

In [None]:
def process_questions_batch(questions, misc_embeddings, bge_model, phi_pipe):
    """Run the hybrid search for one batch of questions.

    The LLM descriptions feed the keyword search, the question embeddings feed
    the semantic search, and the two rankings are merged by
    ``combined_search``. Returns a list of dicts, one per
    (question, incorrect answer) pair, with the keys ``QuestionId_Answer`` and
    ``MisconceptionId`` (space-separated ranked ids).
    """
    llm_responses = predict_misconception(questions, phi_pipe)
    question_embeddings = generate_question_embeddings(questions, bge_model)

    misconception_names = misconception_mapping['MisconceptionName'].tolist()
    semantic_results, semantic_scores = semantic_search(question_embeddings, misc_embeddings)
    keyword_results, keyword_scores = bm25_search(llm_responses, misconception_names)

    combined_results = combined_search(semantic_results, semantic_scores, keyword_results, keyword_scores)

    # Same (question, answer) order that the prompts and embeddings were built in.
    incorrect_pairs = (
        (question, answer_choice)
        for question in questions
        for answer_choice in ['A', 'B', 'C', 'D']
        if answer_choice != question['CorrectAnswer']
    )

    results = []
    for result_index, (question, answer_choice) in enumerate(incorrect_pairs):
        top_misconceptions = combined_results[result_index]
        results.append({
            'QuestionId_Answer': f"{question['QuestionId']}_{answer_choice}",
            'MisconceptionId': ' '.join(map(str, top_misconceptions))
        })

    return results

In [None]:
# Generate embeddings for all misconceptions
# (computed once here and reused for every batch in the loop below)
misc_embeddings = generate_embeddings(misconception_mapping['MisconceptionName'].tolist(), bge_model)

# Collects one dict per (question, incorrect answer) pair across all batches
results = []
# Walk the test set in chunks of BATCH_SIZE questions
for i in range(0, len(test), BATCH_SIZE):
    # Each row becomes a plain dict keyed by column name
    batch = test.iloc[i:i+BATCH_SIZE].to_dict('records')
    batch_results = process_questions_batch(batch, misc_embeddings, bge_model, phi_pipe)
    if batch_results:  # Add a check to ensure batch_results is not None
        results.extend(batch_results)
    else:
        # Also reached when the batch produced an empty list
        print(f"Warning: No results for batch starting at index {i}")
    
    # Clear cache and collect garbage
    torch.cuda.empty_cache()
    gc.collect()
    
    print(f"Processed {i+len(batch)} out of {len(test)} questions")

# Write the accumulated predictions (columns: QuestionId_Answer, MisconceptionId)
submission_df = pd.DataFrame(results)
submission_df.to_csv("submission.csv", index=False)
print("Submission file created successfully!")