In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test question tables from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')
test_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')

In [None]:
# Load the misconception lookup table; later cells read its 'MisconceptionName'
# and 'MisconceptionId' columns.
misconception_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv')

In [None]:
def cleanTrainData(df):
    """Reshape the wide training table into one row per (question, answer option).

    For every input row and every option in A-D one output row is produced whose
    ``QuestionId`` is ``"<QuestionId>_<option>"``, whose ``AnswerText`` is that
    option's answer text and whose ``MisconceptionId`` is that option's
    misconception id. The per-option answer/misconception columns and the
    ``ConstructId`` / ``SubjectId`` columns are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``QuestionId``, ``ConstructId``, ``SubjectId``,
        ``Answer{A..D}Text`` and ``Misconception{A..D}Id``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The long-format frame. Rows containing any missing value are dropped,
        so options without a misconception id do not appear. The index is the
        position of each row before that drop (it is not renumbered).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    options = ['A', 'B', 'C', 'D']
    newCols = list(df.columns) + ['AnswerText', 'MisconceptionId']

    # Collect plain dicts and build the frame once. Concatenating inside the
    # loop re-copies the whole accumulated frame on every iteration.
    # to_dict('records') also reads each value with its own column's type
    # instead of going through a mixed-type row Series.
    records = []
    for record in df.to_dict('records'):
        for option in options:
            new_row = dict(record)
            new_row['QuestionId'] = f"{record['QuestionId']}_{option}"
            new_row['AnswerText'] = record[f"Answer{option}Text"]
            new_row['MisconceptionId'] = record[f"Misconception{option}Id"]
            records.append(new_row)

    # Passing ``columns`` keeps the expected schema even when ``df`` has no rows.
    newDf = pd.DataFrame(records, columns=newCols)

    newDf = newDf.drop(['ConstructId','SubjectId','AnswerAText','AnswerBText','AnswerCText','AnswerDText','MisconceptionAId','MisconceptionBId','MisconceptionCId','MisconceptionDId'],axis=1)

    return newDf.dropna()
            
            
def cleanTestData(df):
    """Reshape the wide test table into one prompt row per incorrect answer option.

    For every input row and every option in A-D except the one equal to
    ``CorrectAnswer``, one output row is produced with ``QuestionId`` set to
    ``"<QuestionId>_<option>"``, ``AnswerText`` set to that option's text and
    ``All`` set to the prompt string built from ``ConstructName``,
    ``SubjectName``, ``QuestionText`` and ``AnswerText``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``QuestionId``, ``ConstructId``, ``ConstructName``,
        ``SubjectId``, ``SubjectName``, ``CorrectAnswer``, ``QuestionText``
        and ``Answer{A..D}Text``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The long-format frame without ``ConstructId``, ``SubjectId`` and the
        per-option answer columns. Each output row keeps the index label of
        the input row it came from, so labels repeat. Rows containing any
        missing value are dropped.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    options = ['A', 'B', 'C', 'D']

    # Prepare a list to collect new rows
    new_rows = []

    for i in range(df.shape[0]):
        # Look the row up once instead of once per option.
        row = df.iloc[i]
        for option in options:
            if row['CorrectAnswer'] == option:
                continue

            new_row = row.copy()
            new_row['QuestionId'] = f"{new_row['QuestionId']}_{option}"
            new_row['AnswerText'] = new_row[f"Answer{option}Text"]
            new_row['All'] = f"Describe the misconception in this answer. Question : {new_row['ConstructName']} {new_row['SubjectName']} {new_row['QuestionText']}, Answer: {new_row['AnswerText']}."

            # Append the new_row to the list
            new_rows.append(new_row)

    # Create the DataFrame once. Passing ``columns`` keeps the schema when no
    # rows were produced; otherwise the drop below would raise KeyError on an
    # empty, column-less frame.
    newDf = pd.DataFrame(new_rows, columns=list(df.columns) + ['AnswerText', 'All'])

    # Drop unnecessary columns
    newDf = newDf.drop(['ConstructId', 'SubjectId', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText'], axis=1)

    # NOTE: a row with any missing field is removed here and therefore produces
    # no prompt downstream.
    return newDf.dropna()

In [None]:
# Reshape both tables to one row per (question, answer option) using the helpers above.
clean_train_df = cleanTrainData(train_df)
clean_test_df = cleanTestData(test_df)


In [None]:
# Display the reshaped training frame for a quick visual check.
clean_train_df

In [None]:
# Display the reshaped test frame (including the generated 'All' prompt column).
clean_test_df

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Local directory of the causal language model used to describe misconceptions.
model_id = "/kaggle/input/phi-3.5-mini-instruct/pytorch/default/1"
tokenizer = AutoTokenizer.from_pretrained(model_id)


# device_map="auto" leaves device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# Text-generation pipeline wrapping the model and tokenizer loaded above.
# max_new_tokens=256 is the pipeline-level cap; get_response passes the same value per call.
# NOTE(review): device_map is passed again here although `model` is already
# loaded — presumably redundant; confirm against the transformers pipeline docs.
pipe = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    max_new_tokens=256, 
    device_map="auto"
)

In [None]:
def get_response(pipeline,message):
    """Generate a completion for ``message`` and return it stripped of whitespace.

    Parameters
    ----------
    pipeline : callable
        A text-generation pipeline object. It must expose ``.tokenizer`` (with
        ``eos_token_id`` and ``convert_tokens_to_ids``) and, when called,
        return a list whose first item has a ``"generated_text"`` entry.
    message : str
        The prompt passed to the pipeline.

    Returns
    -------
    str
        The generated text only (the prompt is not echoed), stripped.
    """
    tokenizer = pipeline.tokenizer
    unk_id = getattr(tokenizer, "unk_token_id", None)

    # Stop ids: the tokenizer's own EOS plus any end-of-turn marker that really
    # exists in this vocabulary. "<|eot_id|>" is the Llama-3 marker and
    # "<|end|>" the Phi-3 one. A marker missing from the vocabulary resolves to
    # None or to the unknown-token id, and must not be used as a stop id.
    terminators = []
    if tokenizer.eos_token_id is not None:
        terminators.append(tokenizer.eos_token_id)
    for token in ("<|eot_id|>", "<|end|>"):
        token_id = tokenizer.convert_tokens_to_ids(token)
        if token_id is None or token_id == unk_id:
            continue
        if token_id not in terminators:
            terminators.append(token_id)

    outputs = pipeline(
        message,
        max_new_tokens=256,
        do_sample=False,  # Greedy decoding: the same prompt gives the same text
        eos_token_id=terminators,  # Ensure EOS token is used for stopping generation
        return_full_text=False  # Prevent the model from repeating the input prompt
    )
    # Extract and return the generated text
    return outputs[0]["generated_text"].strip()

In [None]:
# Generate one free-text misconception description per cleaned test row, in
# row order, by sending each prompt from the 'All' column to the LLM.
test_result = []

for prompt in clean_test_df['All']:
    test_result.append(get_response(pipe, prompt))

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and encoder are both loaded from the same local directory.
bge_model_dir = '/kaggle/input/bge-small-en-v1.5/transformers/bge/2'
tokenizer_mis = AutoTokenizer.from_pretrained(bge_model_dir)
model_mis = AutoModel.from_pretrained(bge_model_dir).to(device)
model_mis.eval()

In [None]:
# Prepare inputs
def prepare_inputs(texts, tokenizer, device, max_length=520):
    """Tokenize a batch of texts and move the resulting tensors to ``device``.

    Parameters
    ----------
    texts : list of str
        The batch to encode.
    tokenizer
        Object exposing ``batch_encode_plus``. Its ``model_max_length``
        attribute, when present, is used as an upper bound on the length.
    device : str or torch.device
        Target device for every returned tensor.
    max_length : int, default 520
        Requested truncation length. It is capped at the tokenizer's
        ``model_max_length``, so truncated sequences never exceed what the
        tokenizer reports as the model limit.

    Returns
    -------
    dict
        Maps each encoding key to its padded tensor on ``device``.
    """
    # Cap the requested length by the tokenizer's declared limit: the old fixed
    # value of 520 could let through sequences longer than that limit.
    tokenizer_limit = getattr(tokenizer, "model_max_length", None)
    if tokenizer_limit:
        max_length = min(max_length, tokenizer_limit)

    encoded = tokenizer.batch_encode_plus(
        texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=max_length
    )
    return {key: value.to(device) for key, value in encoded.items()}

In [None]:
from tqdm import tqdm

# Compute embeddings
def compute_embeddings(texts, tokenizer, model, device, batch_size):
    """Embed ``texts`` in batches and return one L2-normalized vector per text.

    Each batch is tokenized with ``prepare_inputs``, run through ``model``
    without gradient tracking, and represented by the hidden state at the
    first token position (presumably the [CLS] token — depends on the model).

    Parameters
    ----------
    texts : sequence of str
        Texts to embed; sliced in steps of ``batch_size``.
    tokenizer, model
        Tokenizer and encoder; the model output must expose ``last_hidden_state``.
    device : str or torch.device
        Device the tokenized inputs are moved to.
    batch_size : int
        Number of texts per forward pass.

    Returns
    -------
    numpy.ndarray
        The per-batch arrays concatenated along axis 0, in input order.
    """
    chunks = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts):
        stop = start + batch_size
        model_inputs = prepare_inputs(texts[start:stop], tokenizer, device)
        with torch.no_grad():
            first_token_states = model(**model_inputs).last_hidden_state[:, 0]
            unit_vectors = torch.nn.functional.normalize(first_token_states, p=2, dim=1)
        chunks.append(unit_vectors.cpu().numpy())
    return np.concatenate(chunks, axis=0)

In [None]:
# Embed every misconception name once; rows follow the order of misconception_df.
misconception = [name for name in misconception_df['MisconceptionName'].values]

embeddings_misconception = compute_embeddings(
    misconception,
    tokenizer_mis,
    model_mis,
    device,
    8,
);

# test = list(clean_test_df['All'].values)


In [None]:
# Embed the LLM-generated descriptions with the same encoder and batch size
# as the misconception names.
embeddings_test = compute_embeddings(
    test_result,
    tokenizer_mis,
    model_mis,
    device,
    8,
);

In [None]:
# Similarity of every generated description (rows) to every misconception (columns).
cosine_sim = cosine_similarity(
    embeddings_test,
    embeddings_misconception,
)
# Negating before sorting ranks each row's columns from most to least similar.
test_sorted_indices = (-cosine_sim).argsort(axis=1)
test_sorted_indices

In [None]:
top_n = 25

# For each test row, keep the MisconceptionIds of its top_n most similar misconceptions.
top_misconception_ids = []

for similarity_row in cosine_sim:
    # Ascending argsort reversed gives descending similarity; keep the first top_n.
    ranked_positions = np.argsort(similarity_row)[::-1]
    best_positions = ranked_positions[:top_n]

    # Translate row positions in misconception_df into their MisconceptionId values.
    best_ids = misconception_df['MisconceptionId'].iloc[best_positions].tolist()
    top_misconception_ids.append(best_ids)

In [None]:
# Start from an empty two-column frame, then fill the id column from the
# cleaned test rows ('MisconceptionId' is filled in a later cell).
# NOTE(review): assigning a Series into an empty frame presumably makes the
# frame adopt the Series' index (which repeats labels here) — confirm.
submission_df = pd.DataFrame(columns=['QuestionId_Answer','MisconceptionId'])
submission_df['QuestionId_Answer'] = clean_test_df['QuestionId']

In [None]:
# Write each row's ranked misconception ids as one space-separated string.
# top_misconception_ids holds one list per cleaned test row, in the same order
# as the rows of submission_df, so the column is assigned positionally in one
# step. This replaces a per-row boolean-mask lookup, which rescanned the whole
# frame for every row and would give duplicated ids the last match's result.
submission_df['MisconceptionId'] = [
    ' '.join(map(str, ranked_ids)) for ranked_ids in top_misconception_ids
]

In [None]:
# Display the assembled submission table for a final visual check.
submission_df

In [None]:
# Write the submission file to the working directory, without the index column.
submission_df.to_csv('submission.csv',index=False)