In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Set explicitly because a warning message appeared otherwise

# Qwen requirements, installed quietly (-q) through the notebook's shell escape
# NOTE(review): transformers is installed by the first command and again by the second.
!pip install torch transformers datasets -q
!pip install transformers -q 
!pip install transformers_stream_generator einops -q
!pip install tiktoken -q
!pip install gradio-client -q

In [None]:
import pandas as pd
import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from transformers.generation import GenerationConfig

import torch
import re

from gradio_client import Client, handle_file
import json

from scipy.spatial.distance import cosine

In [None]:
# Directory holding the competition CSV files; edit this one constant to run elsewhere.
DATA_DIR = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Table of misconceptions read from misconception_mapping.csv
misconceptions = pd.read_csv(f'{DATA_DIR}/misconception_mapping.csv')

# Work on copies so the frames as loaded stay untouched
X = train_df.copy()
X_test = test_df.copy()

In [None]:
# Preview the first rows of the training copy
X.head()

In [None]:
# Preview the first rows of the misconception table
misconceptions.head()

# Example use of LLMs on a sample

[OpenAI's gpt-4o](https://chatgpt.com/share/66f257f5-1a74-800a-8f87-e3ffc91bfd62)

[Qwen2.5 7B and MetaMath Mistral 7B](https://colab.research.google.com/drive/1zGwcMO5cSmKylPwIUkBQnEXJ1WMTOCpn?usp=sharing)

# Outline
---
1. Get embeddings for each misconception
2. Store them in a vector database
3. Let the model describe the student's mistake in one sentence
4. Query the vector database and output the 25 most similar misconceptions


#### Qwen2.5 14B 
    @article{yang2024qwen25mathtechnicalreportmathematical,
    title={Qwen2.5-Math Technical Report: Toward Mathematical Expert Model via Self-Improvement}, 
    author={An Yang and Beichen Zhang and Binyuan Hui and Bofei Gao and Bowen Yu and Chengpeng Li and Dayiheng Liu and Jianhong Tu and Jingren Zhou and Junyang Lin and Keming Lu and Mingfeng Xue and Runji Lin and Tianyu Liu and Xingzhang Ren and Zhenru Zhang},
    journal={arXiv preprint arXiv:2409.12122},
    year={2024}
    }

# Instantiate the base model

In [None]:
# Use the hosted demo through its API, since that requires less compute
client = Client("Qwen/Qwen2.5-Math-Demo")

# Tokenizer loaded by name from "Qwen/Qwen2.5-Math-72B"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-72B")

# Embeddings for each misconception

In [None]:
# small helper: the tokenizer returns several fields, only one of which is needed here
def return_only_vector_embeddings(text):
    """Tokenize ``text`` and return only the ``input_ids`` entry as a PyTorch tensor.

    NOTE(review): despite the name, the value returned is the tokenizer's
    ``input_ids`` (token ids), not a vector produced by a model - confirm this
    is what the similarity search is meant to compare.
    """
    return tokenizer(text, return_tensors='pt')['input_ids']

# Map each misconception id to the tensor produced for its name.
# Every row of `misconceptions` is unpacked as an (id, name) pair.
embeddings = {
    idx: return_only_vector_embeddings(name)
    for idx, name in misconceptions.values
}

# Qwen2.5 to Describe Mistakes

In [None]:
# Worked example on the row labelled 0: option D is used as the incorrect
# answer and option A as the correct one.
question = X.loc[0, 'QuestionText']
answer = X.loc[0, 'AnswerDText']
correct_answer = X.loc[0, 'AnswerAText']

prompt = f"""
    Given a question and an incorrect answer to that question. 
    Describe the nature of the mistake the incorrect answer makes.
    question: ({question}).
    incorrect answer: ({answer}). 
    correct answer: ({correct_answer}).
    Put your final explanation in one simple sentence (for example: "Does not know that angles in a triangle sum to 180 degrees") and in parentheses().
"""

# Text-only request: no image and no sketchpad input are sent
result = client.predict(question=prompt, image=None, sketchpad=None, api_name="/math_chat_bot")
print(result)


# helper wrapping the prompt -> predict -> extract steps
def ask_the_model(question, answer, correct_answer):
    """Ask the hosted model to explain an incorrect answer.

    Parameters:
    question: text of the question.
    answer: the incorrect answer to explain.
    correct_answer: the correct answer to the question.

    Returns:
    The value produced by ``extract_boxed_text`` for the model's reply.

    NOTE(review): the prompt asks for the sentence "in parentheses()" while the
    reply is parsed for ``\\boxed{...}`` groups - confirm the model's reply
    format matches the parser.
    """
    prompt = f"""
        Given a question and an incorrect answer to that question. 
        Describe the nature of the mistake the incorrect answer makes.
        question: ({question}).
        incorrect answer: ({answer}). 
        correct answer: ({correct_answer}).
        Put your final explanation in one simple sentence (for example: "Does not know that angles in a triangle sum to 180 degrees") and in parentheses().
    """
    # Text-only request: no image and no sketchpad input are sent
    raw_reply = client.predict(
        question=prompt,
        image=None,
        sketchpad=None,
        api_name="/math_chat_bot",
    )

    # Keep only the boxed part of the reply
    return extract_boxed_text(raw_reply)

# a function to extract the final sentence from the model
def extract_boxed_text(text):
    r"""Return the contents of every ``\boxed{...}`` group found in ``text``.

    Braces are matched by nesting depth, so a group such as
    ``\boxed{\text{Adds instead of multiplying}}`` is returned whole instead of
    being cut at the first closing brace. Groups may span several lines.

    Parameters:
    text (str): The model's reply.

    Returns:
    List[str]: Group contents in order of appearance; an empty list when the
    reply holds no complete ``\boxed{...}`` group.
    """
    marker = '\\boxed{'
    found = []
    cursor = 0

    while True:
        start = text.find(marker, cursor)
        if start == -1:
            break

        body_start = start + len(marker)
        depth = 1
        pos = body_start
        # Walk forward until the brace that opened the group is closed again
        while pos < len(text) and depth:
            char = text[pos]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            pos += 1

        if depth:
            # This group never closes; resume just after its marker so that a
            # complete group nested further along is still picked up.
            cursor = body_start
            continue

        found.append(text[body_start:pos - 1])
        cursor = pos

    return found

In [None]:
# Run the helper on the example question / answer pair defined above
ask_the_model(question, answer, correct_answer)

## Create a dataset like the submission.csv to see zero-shot model performance

In [None]:
import pandas as pd

def get_answer(letter, row):
    """Return the answer text stored in ``row`` for the option ``letter``.

    The value is read from the key 'Answer' + letter + 'Text'.
    """
    return row['Answer' + letter + 'Text']

# One record per (question, answer option) pair
rows = []

for _, row in X.iterrows():
    q_id = row['QuestionId']

    # Text of the option that the row marks as correct
    correct_answer = get_answer(row['CorrectAnswer'], row)

    # Emit the four options A-D of this question in order
    rows.extend(
        {
            'q_id': q_id,
            'q_letter': f"{q_id}_{letter}",
            'm_id': row['Misconception' + letter + 'Id'],
            'question': row['QuestionText'],
            'answer': get_answer(letter, row),
            'correct_answer': correct_answer,
        }
        for letter in ('A', 'B', 'C', 'D')
    )

# Build the frame in one go and keep only the pairs that have a misconception id
training_data = pd.DataFrame(rows).dropna(subset=['m_id'])

In [None]:
# Preview the first rows of the per-option table
training_data.head()

## Evaluation Metric (MAP@25)

In [None]:
def rel(prediction_25, m_id, k=25):
    """Reciprocal rank of ``m_id`` within the first ``k`` predictions.

    Args:
        prediction_25: ranked sequence of predicted misconception ids, best first
        m_id: the true misconception id
        k: cutoff; predictions ranked after position ``k`` are ignored

    Returns:
        1 / rank (rank counted from 1) when ``m_id`` is among the first ``k``
        predictions, otherwise 0
    """
    # Enforce the cutoff so that a hit beyond position k scores nothing
    top_k = list(prediction_25)[:k]
    try:
        return 1 / (top_k.index(m_id) + 1)
    except ValueError:
        # m_id is not among the first k predictions
        return 0


def map_at_25(predictions, logits, k=25):
    """
    Calculate the MAP@25 score for ranked predictions against the true ids

    Each sample has a single true id, so its average precision equals the
    reciprocal rank of that id within the first ``k`` predictions.

    Args:
        predictions: one ranked list of predicted ids per sample
        logits: the true misconception id of each sample (one per sample)
        k: cutoff applied to every ranked list (25 by default)

    Returns:
        float: The Mean Average Precision at the cutoff [0-1]; 0.0 when there
        are no samples

    Raises:
        ValueError: if ``predictions`` and ``logits`` differ in length
    """
    if len(predictions) != len(logits):
        # zip() would silently drop the surplus and skew the mean
        raise ValueError(
            f'predictions and logits differ in length: {len(predictions)} != {len(logits)}'
        )

    if len(logits) == 0:
        return 0.0

    total = sum(rel(prediction_25, m_id, k) for prediction_25, m_id in zip(predictions, logits))

    return total / len(logits)

## Eternity is a long time, so I will be using the first 50 samples to evaluate the model

In [None]:
# Number of (question, answer) pairs sent to the hosted model; raise it for a fuller evaluation
N_EVAL_SAMPLES = 50

rows = []
for position, (_, row) in enumerate(training_data[:N_EVAL_SAMPLES].iterrows(), start=1):
    model_response = ask_the_model(row['question'], row['answer'], row['correct_answer'])

    rows.append({
        'q_letter': row['q_letter'],
        'model_response': model_response
    })

    # Count completed calls by position: training_data keeps its index labels
    # after dropna, so the labels have gaps and are not a reliable progress counter
    if position % 10 == 0:
        print(f'processed {position} question_letter pairs')

# Create a DataFrame from the list of rows
question_response_df = pd.DataFrame(rows)

question_response_df.head()

## Tokenize the model's outputs

In [None]:
def _response_to_text(model_response):
    """Reduce a stored model response to the single string that gets tokenized.

    ask_the_model stores the list returned by extract_boxed_text, which can hold
    zero, one or several strings. The last entry is used (the final boxed
    sentence); an empty list becomes an empty string. Plain strings pass through.
    """
    if isinstance(model_response, str):
        return model_response
    return model_response[-1] if len(model_response) > 0 else ""


# Tokenize exactly one string per response: the similarity step reads
# `embedding.shape[1]` and flattens, i.e. it expects one sequence per response.
question_response_df['embedding'] = question_response_df['model_response'].apply(
    lambda response: return_only_vector_embeddings(_response_to_text(response))
)

## For each explanation (embedding of the model's output), get the 25 most similar misconception embeddings

In [None]:
def normalize_token_length(embedding, token_length=100):
    """
    Bring a 2-D tensor to exactly ``token_length`` columns.

    Shorter inputs are right-padded with zeros, longer inputs are cut after
    ``token_length`` columns, and inputs of exactly that width are returned as is.

    Parameters:
    embedding (torch.Tensor): Input tensor; its second dimension is the token length.
    token_length (int): The desired token length.

    Returns:
    torch.Tensor: A tensor with ``token_length`` columns.

    Note: the zero block comes from ``torch.zeros`` without an explicit dtype,
    so a padded result may not keep the dtype of the input.
    """
    n_rows, n_tokens = embedding.shape[0], embedding.shape[1]
    shortfall = token_length - n_tokens

    # Too short: append a block of zeros on the right
    if shortfall > 0:
        filler = torch.zeros((n_rows, shortfall))
        return torch.cat((embedding, filler), dim=1)

    # Too long: keep only the leading columns
    if shortfall < 0:
        return embedding[:, :token_length]

    return embedding


def get_25_most_similar(query_embedding, embeddings, token_length=100, k=25):
    """
    Find the ``k`` most similar vectors based on cosine similarity.

    Parameters:
    query_embedding (torch.Tensor): The embedding to compare against.
    embeddings (dict): A dictionary of index and their corresponding embedding tensors.
    token_length (int): The fixed length every tensor is padded / truncated to
        before comparison (100 by default).
    k (int): How many indices to return (25 by default).

    Returns:
    List[int]: The indices of the ``k`` most similar entries, most similar first.
    """
    def as_vector(tensor):
        # normalize_token_length may return a different dtype for padded and
        # unpadded tensors; casting to float64 gives cosine() one common
        # floating dtype for both of its arguments.
        fixed = normalize_token_length(tensor, token_length).flatten()
        return fixed.detach().numpy().astype(np.float64)

    # The query does not change inside the loop, so convert it once
    query_vector = as_vector(query_embedding)

    similarities = []
    for idx, embedding in embeddings.items():
        # Cosine similarity is 1 - cosine distance
        similarity = 1 - cosine(query_vector, as_vector(embedding))
        similarities.append((idx, similarity))

    # Highest similarity first
    sorted_similarities = sorted(similarities, key=lambda pair: pair[1], reverse=True)

    return [idx for idx, _ in sorted_similarities[:k]]


In [None]:
# Try the lookup on the response stored under index label 1
get_25_most_similar(question_response_df['embedding'][1], embeddings)

In [None]:
# Rank the misconceptions for every response and store each ranked id list
question_response_df['top_25'] = question_response_df['embedding'].apply(
    lambda response_embedding: get_25_most_similar(response_embedding, embeddings)
)

## Model Performance Evaluation

In [None]:
# Attach the true misconception id ('m_id') to each response, matching on 'q_letter'.
# An 'm_id' column left over from an earlier run of this cell is dropped first, so
# re-running the cell does not create suffixed 'm_id_x' / 'm_id_y' duplicates.
question_response_df = (
    question_response_df
    .drop(columns=['m_id'], errors='ignore')
    .merge(training_data[['q_letter', 'm_id']], on='q_letter', how='left')
)


In [None]:
# List the columns now present after the merge
question_response_df.columns

In [None]:
# Ranked id lists and the true misconception id of every evaluated pair
predictions = question_response_df['top_25'].to_numpy()
logits = question_response_df['m_id'].to_numpy()

# Score the rankings with the MAP@25 helper
performance = map_at_25(predictions, logits)

print(f'Zero-shot model performance: {performance:.5f}')