### Make dataset for top100 multiple-choice task

In [1]:
import gc
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer

from eedi_metrics import mapk

# Cap the number of rows pandas renders when a frame is displayed.
pd.options.display.max_rows = 60

  from .autonotebook import tqdm as notebook_tqdm


## Constants

In [2]:
# Directory holding the competition data files.
DATA_ROOT = Path("/home/yoku/compe/eedi/input/eedi-mining-misconceptions-in-mathematics")

# Retriever model
# Model weights and training code are available at: https://www.kaggle.com/datasets/yokuyama/eedi-models-stella400m
RETRIEVER_MODEL_ROOT = Path("./stella_400m")

# One retriever checkpoint directory per fold (folds 0 through 4).
second_retrieval_model_paths = [
    {"fold": fold, "path": RETRIEVER_MODEL_ROOT / f"stella_second_v9_fold{fold}"}
    for fold in range(5)
]

# Synthetic data
# Download the required data from https://github.com/wangqihanginthesky/Eedi_kaggle/tree/rihanpiggy/data/retrieve_train
# and place it in the directory specified below.
SYNTH_DATA_ROOT = Path("/home/yoku/compe/eedi/data")

In [3]:
# Load the training table, force the fold column to int, and convert it to polars.
df_train = pl.from_pandas(
    pd.read_csv("train_5folds_with_llm_infer.csv").astype({"fold": int})
)

# Misconception table from the competition data (stays a pandas frame).
df_misconception_mapping = pd.read_csv(DATA_ROOT / "misconception_mapping.csv")


### Synthetic data Gen.2

In [4]:
# Synthetic data gen.2
# Fold ids are drawn from NumPy's global RNG, so seed it before building the frame.
np.random.seed(0)
num_fold = df_train["fold"].max() + 1

df_synth_g2 = pl.from_pandas(
    pd.read_csv(SYNTH_DATA_ROOT / "synthetic-round2-render.csv")
    .rename(columns={"ConstructName-qwen25-72b-instruct": "ConstructName"})
    # Keep only items whose gpt4o-mini quality score is above 2
    .loc[lambda frame: frame["quality-gpt4o-mini"] > 2]
    # Discard rows with any missing value, then take a fixed random subset
    .dropna()
    .sample(n=8000, random_state=1)
    .reset_index(drop=True)
    # Assign every row a random fold in [0, num_fold)
    .assign(fold=lambda frame: np.random.randint(0, num_fold, size=len(frame)))
)

In [5]:
# Peek at one of the generated explanation columns.
df_synth_g2.get_column("p000-qwen25-32b-instruct-cot_misunderstanding")

p000-qwen25-32b-instruct-cot_misunderstanding
str
"""The misunderstanding likely st…"
"""The misunderstanding likely st…"
"""The misunderstanding likely st…"
"""The misunderstanding likely st…"
"""The misunderstanding likely st…"
…
"""The misunderstanding here is t…"
"""The student's wrong answer, \(…"
"""The misunderstanding likely st…"
"""The misunderstanding here is t…"


### Synthetic data Gen.1

In [6]:
# Synthetic data gen.1
# Fold ids are drawn from NumPy's global RNG, so seed it before building the frame.
np.random.seed(0)
num_fold = df_train["fold"].max() + 1

df_synth_g1 = pl.from_pandas(
    pd.read_csv(SYNTH_DATA_ROOT / "synthetic_questions_render_with_answer_render_v1.csv")
    # Keep only items whose gpt4o-mini quality score is above 2
    .loc[lambda frame: frame["quality-gpt4o-mini"] > 2]
    # Discard rows with any missing value, then take a fixed random subset
    .dropna()
    .sample(n=4000, random_state=0)
    .reset_index(drop=True)
    # Assign every row a random fold in [0, num_fold)
    .assign(fold=lambda frame: np.random.randint(0, num_fold, size=len(frame)))
)

### Synthetic data by GPT-4o mini

In [7]:
# Fold ids are drawn from NumPy's global RNG, so seed it before building the frame.
np.random.seed(0)
num_fold = df_train["fold"].max() + 1

df_gpt = pl.from_pandas(
    pd.read_csv(SYNTH_DATA_ROOT / "gpt-4o-mini-q-a_v2_render_v1.csv")
    .rename(columns={"ConstructName-qwen25-72b-instruct": "ConstructName"})
    # Keep only items whose gpt4o-mini quality score is above 2
    .loc[lambda frame: frame["quality-gpt4o-mini"] > 2]
    .reset_index(drop=True)
    # Assign every row a random fold in [0, num_fold)
    .assign(fold=lambda frame: np.random.randint(0, num_fold, size=len(frame)))
)

In [8]:
# Stack the three synthetic sources into a single polars frame.
df_synth = pl.concat(
    (df_synth_g1, df_synth_g2, df_gpt),
    how="diagonal_relaxed",
)

## Make prompts

In [9]:
# Build the retrieval query text for each training row, then convert the frame to pandas.
df_train = df_train.with_columns(
    pl.concat_str(
        [
            pl.lit("ConstructName: "),
            pl.col("ConstructName"),
            pl.lit(" Subject: "),
            pl.col("SubjectName"),
            pl.lit(" Question: "),
            pl.col("QuestionText"),
            pl.lit(" CorrectAnswer: "),
            pl.col("CorrectAnswerText"),
            pl.lit(" Answer: "),
            pl.col("AnswerText"),
        ],
    ).alias("search_prompt"),
).to_pandas()

In [10]:
# Build the retrieval query text for each synthetic row (the subject comes from
# "ThirdSubjectName"), then convert the frame to pandas.
df_synth = df_synth.with_columns(
    pl.concat_str(
        [
            pl.lit("ConstructName: "),
            pl.col("ConstructName"),
            pl.lit(" Subject: "),
            pl.col("ThirdSubjectName"),
            pl.lit(" Question: "),
            pl.col("QuestionText"),
            pl.lit(" CorrectAnswer: "),
            pl.col("CorrectAnswerText"),
            pl.lit(" Answer: "),
            pl.col("AnswerText"),
        ],
    ).alias("search_prompt"),
).to_pandas()

In [11]:
# Data cleansing
# Drop prompts that are excessively long. Length is measured in characters here,
# not in tokens.
synth_len = df_synth["search_prompt"].str.len()
print(synth_len.describe())

# Prompts must be strictly shorter than this many characters to be kept.
q_cut = 1024
df_synth = df_synth.loc[synth_len.lt(q_cut)].reset_index(drop=True)

count    14185.000000
mean       286.449700
std        216.208686
min        121.000000
25%        218.000000
50%        266.000000
75%        323.000000
max       8529.000000
Name: search_prompt, dtype: float64


### Retrieving negative samples

In [12]:
def make_embeddings(
    df: pd.DataFrame,
    df_misconception_mapping: pd.DataFrame,
    top_k: int = 25,
    tfidf_weight: float = 0.2,
) -> list[list[dict[str, int | float]]]:
    """Retrieve the ``top_k`` most similar misconceptions for every row of ``df``.

    Queries and misconceptions are each represented by a TF-IDF vector (scaled
    by ``tfidf_weight``) concatenated with a sentence embedding. The combined
    vectors are L2-normalised and searched with ``util.semantic_search``.

    Query embeddings are out-of-fold: a row is encoded by the model listed in
    ``second_retrieval_model_paths`` whose ``"fold"`` equals the row's ``fold``.
    Misconception embeddings are the element-wise mean over all listed models.

    Args:
        df: Frame with ``search_prompt`` and ``fold`` columns.
        df_misconception_mapping: Frame with a ``MisconceptionName`` column.
        top_k: Number of hits to return per query.
        tfidf_weight: Scale applied to the TF-IDF part before concatenation.

    Returns:
        One list per row of ``df``, in row order, holding the hit dicts returned
        by ``util.semantic_search``. Callers in this notebook read the
        ``"corpus_id"`` entry, which is a position in ``df_misconception_mapping``.
        NOTE(review): callers use it as a MisconceptionId, which presumes the
        mapping rows are ordered by id - confirm.

    Raises:
        ValueError: If some rows of ``df`` have a fold without a configured model.
    """
    num_query = len(df)
    if num_query == 0:
        return []

    misconception_name = df_misconception_mapping["MisconceptionName"]

    # TF-IDF fitted jointly on queries and misconceptions so both share one vocabulary.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(
        np.concatenate([df["search_prompt"], misconception_name]),
    )
    # Densify once and slice, instead of converting the whole matrix twice.
    tfidf_dense = tfidf_matrix.toarray().astype(np.float32)
    embedding_query_tfidf = tfidf_dense[:num_query]
    embedding_misconception_tfidf = tfidf_dense[num_query:]

    # Out-of-fold sentence embeddings. Rows are written back by position, so the
    # result keeps the row order of `df` and does not rely on unique row ids.
    fold_of_row = df["fold"].to_numpy()
    prompts = df["search_prompt"].to_numpy()
    is_encoded = np.zeros(num_query, dtype=bool)
    embedding_query_oof = None
    embedding_misconception_per_fold = []
    for record in second_retrieval_model_paths:
        in_fold = fold_of_row == record["fold"]

        model = SentenceTransformer(str(record["path"]), trust_remote_code=True)

        if in_fold.any():
            embedding_query = model.encode(prompts[in_fold].tolist())
            if embedding_query_oof is None:
                embedding_query_oof = np.zeros(
                    (num_query, embedding_query.shape[1]), dtype=embedding_query.dtype
                )
            embedding_query_oof[in_fold] = embedding_query
            is_encoded |= in_fold

        embedding_misconception_per_fold.append(model.encode(misconception_name.values))

    if not is_encoded.all():
        missing_folds = pd.unique(fold_of_row[~is_encoded]).tolist()
        raise ValueError(f"No retrieval model is configured for fold(s): {missing_folds}")

    # Misconceptions are seen by every fold model, so average their embeddings.
    embedding_misconception_oof = np.mean(embedding_misconception_per_fold, axis=0)

    # search
    def make_norm_tensor(x: np.ndarray) -> torch.Tensor:
        """Convert to a tensor and L2-normalise each row."""
        x = torch.from_numpy(x)
        x = F.normalize(x, p=2, dim=1)
        return x

    x = make_norm_tensor(
        np.concatenate([tfidf_weight * embedding_query_tfidf, embedding_query_oof], axis=1)
    )
    y = make_norm_tensor(
        np.concatenate([tfidf_weight * embedding_misconception_tfidf, embedding_misconception_oof], axis=1)
    )

    return util.semantic_search(x, y, top_k=top_k)

In [13]:
# Retrieve candidate misconceptions for every training row.
top25ids = make_embeddings(df_train, df_misconception_mapping)

# Store each row's retrieved corpus ids as one space-separated string.
df_train["MisconceptionId_pred"] = [
    " ".join(str(hit["corpus_id"]) for hit in hits)
    for hits in top25ids
]

You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.3.0.dev0, however, your versi

In [14]:
# Retrieve candidate misconceptions for every synthetic row.
synth_top25ids = make_embeddings(df_synth, df_misconception_mapping)

# Store each row's retrieved corpus ids as one space-separated string.
df_synth["MisconceptionId_pred"] = [
    " ".join(str(hit["corpus_id"]) for hit in hits)
    for hits in synth_top25ids
]

You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 3.3.0.dev0, however, your version is 3.3.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [16]:
# Write both frames, including the retrieved ids, to parquet files in the working directory.
for frame, out_path in ((df_train, "df_train.parquet"), (df_synth, "df_synth.parquet")):
    frame.to_parquet(out_path)

In [17]:
# Collect unreachable Python objects first, so that any GPU tensors they still
# hold are released, and only then hand the unused cached blocks back to CUDA.
# Emptying the cache before the collection would leave those blocks cached.
gc.collect()
torch.cuda.empty_cache()

0

### Validation

In [18]:
# Score the retrieved candidates for the training rows against the labelled misconception.
predicted = df_train["MisconceptionId_pred"].map(lambda ids: list(map(int, ids.split())))
label = [[misconception_id] for misconception_id in df_train["MisconceptionId"]]
print("Train Validation: ", mapk(label, predicted))

Train Validation:  0.6076630397326385


In [19]:
# Score the retrieved candidates for the synthetic rows against the labelled misconception.
predicted = df_synth["MisconceptionId_pred"].map(lambda ids: list(map(int, ids.split())))
label = [[misconception_id] for misconception_id in df_synth["MisconceptionId"]]
print("Validation: ", mapk(label, predicted))

Validation:  0.4474016139304796
