In [1]:
# --- Run-mode switches -------------------------------------------------------
TRAINING = True
DEBUG = False
WANDB = False
REPORT_TO = "none"

# --- Retrieval / optimisation hyper-parameters -------------------------------
RETRIEVE_NUM = 25
SEED = 985
EPOCH = 10
LR = 4e-5
BS = 32

# --- Data, model and output locations ----------------------------------------
# NOTE(review): MODEL_NAME names a Qwen checkpoint while OUTPUT_PATH is named after
# Linq-Embed-Mistral — confirm which model this run is actually meant to use.
DATA_PATH = "data/synthetic_generation"
MODEL_NAME = "Qwen/QwQ-32B-Preview"
OUTPUT_PATH = "Pipeline1/Linq-Embed-Mistral"
MODEL_OUTPUT_PATH = "/".join([OUTPUT_PATH, "output_retrieval"])

In [2]:
import gc
import os
import random

import datasets
import numpy as np
import pandas as pd
import polars as pl
import sentence_transformers
import wandb
from accelerate.utils import release_memory
from datasets import Dataset, load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    models
)
from sentence_transformers.evaluation import InformationRetrievalEvaluator, TripletEvaluator
from sentence_transformers.losses import CachedMultipleNegativesRankingLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sklearn.metrics.pairwise import cosine_similarity
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    # prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from peft import prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig, AutoModel, AutoTokenizer, AutoModelForCausalLM
import torch
from llm2vec import LLM2Vec

2024-11-29 01:33:35.863192: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732811615.938360 2485954 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732811615.958887 2485954 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-29 01:33:36.095256: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import json
import logging
import os
from fnmatch import fnmatch
from pathlib import Path
from typing import Any, Callable

import huggingface_hub
import torch
from torch import nn
from transformers import AutoConfig, AutoModel, AutoTokenizer, MT5Config, T5Config
from transformers.utils import is_peft_available

def _save_pretrained_wrapper(_save_pretrained_fn: Callable, subfolder: str) -> Callable[..., None]:
    def wrapper(save_directory: str | Path, **kwargs) -> None:
        os.makedirs(Path(save_directory) / subfolder, exist_ok=True)
        return _save_pretrained_fn(Path(save_directory) / subfolder, **kwargs)

    return wrapper

# Logger for the backend-export helpers below, which report through `logger.warning`.
logger = logging.getLogger(__name__)


class CustomTransformer(models.Transformer):
    """Transformer module whose backbone is a 4-bit quantized LLM2Vec model wrapped with LoRA adapters.

    The constructor always builds the backbone with ``LLM2Vec.from_pretrained`` followed by
    ``get_peft_model``. The ``_load_*`` and ``_backend_*`` helpers are kept so the class still
    exposes the same methods as its parent, but this constructor does not call them.
    """

    def __init__(
        self,
        model_name_or_path: str,
        max_seq_length: int | None = None,
        model_args: dict[str, Any] | None = None,
        tokenizer_args: dict[str, Any] | None = None,
        config_args: dict[str, Any] | None = None,
        cache_dir: str | None = None,
        do_lower_case: bool = False,
        tokenizer_name_or_path: str = None,
        backend: str = "torch",
    ) -> None:
        """Load the quantized backbone, attach LoRA adapters and load the tokenizer.

        Args:
            model_name_or_path: Checkpoint passed to ``LLM2Vec.from_pretrained`` and, unless
                ``tokenizer_name_or_path`` is given, to ``AutoTokenizer.from_pretrained``.
            max_seq_length: Truncation length used by :meth:`tokenize`. When ``None`` it is
                inferred from the model config and the tokenizer, if both expose a limit.
            model_args: Accepted for signature compatibility; not used by this class.
            tokenizer_args: Extra keyword arguments for ``AutoTokenizer.from_pretrained``.
            config_args: Accepted for signature compatibility; not used by this class.
            cache_dir: Cache directory forwarded to the tokenizer loader.
            do_lower_case: Whether :meth:`tokenize` lowercases the input texts.
            tokenizer_name_or_path: Optional separate tokenizer checkpoint.
            backend: Stored on ``self.backend``; the backbone is loaded through LLM2Vec
                regardless of this value.
        """
        # Initialise only the ancestors of `models.Transformer`. The parent constructor
        # loads its own model and tokenizer from `model_name_or_path`, and both would be
        # thrown away immediately because they are replaced below.
        super(models.Transformer, self).__init__()
        self.config_keys = ["max_seq_length", "do_lower_case"]
        self.do_lower_case = do_lower_case
        self.backend = backend
        if tokenizer_args is None:
            tokenizer_args = {}

        # 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        # Only the underlying `.model` of the LLM2Vec wrapper is kept as the backbone.
        self.auto_model = LLM2Vec.from_pretrained(
            model_name_or_path,
            device_map="cuda" if torch.cuda.is_available() else "cpu",
            torch_dtype=torch.bfloat16,
            quantization_config=bnb_config,
        ).model
        # NOTE(review): task_type is "CAUSAL_LM" although the model is used to produce
        # embeddings, and `forward` below checks for PeftModelForFeatureExtraction —
        # confirm this task type is intended.
        lora_config = LoraConfig(
            r=64,
            lora_alpha=128,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
            ],
            bias="none",
            lora_dropout=0.05,
            task_type="CAUSAL_LM",
        )

        self.auto_model = get_peft_model(self.auto_model, lora_config)
        # `print_trainable_parameters` prints its own summary; wrapping it in `print`
        # would only add a stray "None" line.
        self.auto_model.print_trainable_parameters()

        if max_seq_length is not None and "model_max_length" not in tokenizer_args:
            tokenizer_args["model_max_length"] = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
            cache_dir=cache_dir,
            **tokenizer_args,
        )

        # No max_seq_length set. Try to infer from model
        if max_seq_length is None:
            if (
                hasattr(self.auto_model, "config")
                and hasattr(self.auto_model.config, "max_position_embeddings")
                and hasattr(self.tokenizer, "model_max_length")
            ):
                max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length)

        self.max_seq_length = max_seq_length

        if tokenizer_name_or_path is not None:
            self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__

    def _load_model(self, model_name_or_path, config, cache_dir, backend, **model_args) -> None:
        """Load the transformer model for the given backend into ``self.auto_model``.

        Not called by this class's constructor, which loads the backbone through LLM2Vec.

        Raises:
            ValueError: If ``backend`` is not ``torch``, ``onnx`` or ``openvino``.
        """
        if backend == "torch":
            if isinstance(config, T5Config):
                self._load_t5_model(model_name_or_path, config, cache_dir, **model_args)
            elif isinstance(config, MT5Config):
                self._load_mt5_model(model_name_or_path, config, cache_dir, **model_args)
            else:
                self.auto_model = AutoModel.from_pretrained(
                    model_name_or_path, config=config, cache_dir=cache_dir, **model_args
                )
        elif backend == "onnx":
            self._load_onnx_model(model_name_or_path, config, cache_dir, **model_args)
        elif backend == "openvino":
            self._load_openvino_model(model_name_or_path, config, cache_dir, **model_args)
        else:
            raise ValueError(f"Unsupported backend '{backend}'. `backend` should be `torch`, `onnx`, or `openvino`.")

    def _load_openvino_model(self, model_name_or_path, config, cache_dir, **model_args) -> None:
        """Load (or export) an OpenVINO feature-extraction model into ``self.auto_model``.

        Raises:
            ValueError: For T5/MT5 configs, or when ``ov_config`` is neither a dict nor an
                existing file path.
            Exception: If the Optimum OpenVINO integration cannot be imported.
        """
        if isinstance(config, T5Config) or isinstance(config, MT5Config):
            raise ValueError("T5 models are not yet supported by the OpenVINO backend.")

        try:
            from optimum.intel import OVModelForFeatureExtraction
            from optimum.intel.openvino import OV_XML_FILE_NAME
        except ModuleNotFoundError:
            raise Exception(
                "Using the OpenVINO backend requires installing Optimum and OpenVINO. "
                "You can install them with pip: `pip install optimum[openvino]`."
            )

        load_path = Path(model_name_or_path)
        is_local = load_path.exists()
        backend_name = "OpenVINO"
        target_file_glob = "openvino*.xml"

        # Determine whether the model should be exported or whether we can load it directly
        export, model_args = self._backend_should_export(
            load_path, is_local, model_args, OV_XML_FILE_NAME, target_file_glob, backend_name
        )

        # If we're exporting, then there's no need for a file_name to load the model from
        if export:
            model_args.pop("file_name", None)

        # ov_config can be either a dictionary, or point to a json file with an OpenVINO config
        if "ov_config" in model_args:
            ov_config = model_args["ov_config"]
            if not isinstance(ov_config, dict):
                if not Path(ov_config).exists():
                    raise ValueError(
                        "ov_config should be a dictionary or a path to a .json file containing an OpenVINO config"
                    )
                with open(ov_config, encoding="utf-8") as f:
                    model_args["ov_config"] = json.load(f)
        else:
            model_args["ov_config"] = {}

        # Either load an exported model, or export the model to OpenVINO
        self.auto_model: OVModelForFeatureExtraction = OVModelForFeatureExtraction.from_pretrained(
            model_name_or_path,
            config=config,
            cache_dir=cache_dir,
            export=export,
            **model_args,
        )
        # Wrap the save_pretrained method to save the model in the correct subfolder
        self.auto_model._save_pretrained = _save_pretrained_wrapper(self.auto_model._save_pretrained, self.backend)

        # Warn the user to save the model if they haven't already
        if export:
            self._backend_warn_to_save(model_name_or_path, is_local, backend_name)

    def _load_onnx_model(self, model_name_or_path, config, cache_dir, **model_args) -> None:
        """Load (or export) an ONNX feature-extraction model into ``self.auto_model``.

        Raises:
            Exception: If Optimum / ONNX Runtime cannot be imported.
        """
        try:
            import onnxruntime as ort
            from optimum.onnxruntime import ONNX_WEIGHTS_NAME, ORTModelForFeatureExtraction
        except ModuleNotFoundError:
            raise Exception(
                "Using the ONNX backend requires installing Optimum and ONNX Runtime. "
                "You can install them with pip: `pip install optimum[onnxruntime]` "
                "or `pip install optimum[onnxruntime-gpu]`"
            )

        # Default to the highest priority available provider if not specified
        # E.g. Tensorrt > CUDA > CPU
        model_args["provider"] = model_args.pop("provider", ort.get_available_providers()[0])

        load_path = Path(model_name_or_path)
        is_local = load_path.exists()
        backend_name = "ONNX"
        target_file_glob = "*.onnx"

        # Determine whether the model should be exported or whether we can load it directly
        export, model_args = self._backend_should_export(
            load_path, is_local, model_args, ONNX_WEIGHTS_NAME, target_file_glob, backend_name
        )

        # If we're exporting, then there's no need for a file_name to load the model from
        if export:
            model_args.pop("file_name", None)

        # Either load an exported model, or export the model to ONNX
        self.auto_model: ORTModelForFeatureExtraction = ORTModelForFeatureExtraction.from_pretrained(
            model_name_or_path,
            config=config,
            cache_dir=cache_dir,
            export=export,
            **model_args,
        )
        # Wrap the save_pretrained method to save the model in the correct subfolder
        self.auto_model._save_pretrained = _save_pretrained_wrapper(self.auto_model._save_pretrained, self.backend)

        # Warn the user to save the model if they haven't already
        if export:
            self._backend_warn_to_save(model_name_or_path, is_local, backend_name)

    def _backend_should_export(
        self,
        load_path: Path,
        is_local: bool,
        model_args: dict[str, Any],
        target_file_name: str,
        target_file_glob: str,
        backend_name: str,
    ) -> tuple[bool, dict[str, Any]]:
        """
        Determines whether the model should be exported to the backend, or if it can be loaded directly.
        Also update the `file_name` and `subfolder` model_args if necessary.

        These are the cases:

        1. If export is set in model_args, just return export
        2. If `<subfolder>/<file_name>` exists; set export to False
        3. If `<backend>/<file_name>` exists; set export to False and set subfolder to the backend (e.g. "onnx")
        4. If `<file_name>` contains a folder, add those folders to the subfolder and set the file_name to the last part

        We will warn if:

        1. The expected file does not exist in the model directory given the optional file_name and subfolder.
           If there are valid files for this backend, but they don't align with file_name, then we give a useful warning.
        2. Multiple files are found in the model directory that match the target file name and the user did not
           specify the desired file name via `model_kwargs={"file_name": "<file_name>"}`

        Args:
            load_path: The model repository or directory, as a Path instance
            is_local: Whether the model is local or remote, i.e. whether load_path is a local directory
            model_args: The model_args dictionary. Notable keys are "export", "file_name", and "subfolder"
            target_file_name: The expected file name in the model directory, e.g. "model.onnx" or "openvino_model.xml"
            target_file_glob: The glob pattern to match the target file name, e.g. "*.onnx" or "openvino*.xml"
            backend_name: The human-readable name of the backend for use in warnings, e.g. "ONNX" or "OpenVINO"

        Returns:
            Tuple[bool, dict[str, Any]]: A tuple of the export boolean and the updated model_args dictionary.
        """

        export = model_args.pop("export", None)
        if export is not None:
            return export, model_args

        file_name = model_args.get("file_name", target_file_name)
        subfolder = model_args.get("subfolder", None)
        primary_full_path = Path(subfolder, file_name).as_posix() if subfolder else Path(file_name).as_posix()
        secondary_full_path = (
            Path(subfolder, self.backend, file_name).as_posix()
            if subfolder
            else Path(self.backend, file_name).as_posix()
        )
        glob_pattern = f"{subfolder}/**/{target_file_glob}" if subfolder else f"**/{target_file_glob}"

        # Get the list of files in the model directory that match the target file name
        if is_local:
            model_file_names = [path.relative_to(load_path).as_posix() for path in load_path.glob(glob_pattern)]
        else:
            all_files = huggingface_hub.list_repo_files(
                load_path.as_posix(),
                repo_type="model",
                revision=model_args.get("revision", None),
                token=model_args.get("token", None),
            )
            model_file_names = [fname for fname in all_files if fnmatch(fname, glob_pattern)]

        # First check if the expected file exists in the root of the model directory
        # If it doesn't, check if it exists in the backend subfolder.
        # If it does, set the subfolder to include the backend
        export = primary_full_path not in model_file_names
        if export and "subfolder" not in model_args:
            export = secondary_full_path not in model_file_names
            if not export:
                if len(model_file_names) > 1 and "file_name" not in model_args:
                    logger.warning(
                        f"Multiple {backend_name} files found in {load_path.as_posix()!r}: {model_file_names}, defaulting to {secondary_full_path!r}. "
                        f'Please specify the desired file name via `model_kwargs={{"file_name": "<file_name>"}}`.'
                    )
                model_args["subfolder"] = self.backend
                model_args["file_name"] = file_name

        # If the file_name contains subfolders, set it as the subfolder instead
        file_name_parts = Path(file_name).parts
        if len(file_name_parts) > 1:
            model_args["file_name"] = file_name_parts[-1]
            model_args["subfolder"] = Path(model_args.get("subfolder", ""), *file_name_parts[:-1]).as_posix()

        if export:
            logger.warning(
                f"No {file_name!r} found in {load_path.as_posix()!r}. Exporting the model to {backend_name}."
            )
            if model_file_names:
                logger.warning(
                    f"If you intended to load one of the {model_file_names} {backend_name} files, "
                    f'please specify the desired file name via `model_kwargs={{"file_name": "{model_file_names[0]}"}}`.'
                )

        return export, model_args

    def _backend_warn_to_save(self, model_name_or_path: str, is_local: bool, backend_name: str) -> None:
        """Log a warning recommending that a freshly exported backend model be saved or pushed."""
        to_log = f"Saving the exported {backend_name} model is heavily recommended to avoid having to export it again."
        if is_local:
            to_log += f" Do so with `model.save_pretrained({model_name_or_path!r})`."
        else:
            to_log += f" Do so with `model.push_to_hub({model_name_or_path!r}, create_pr=True)`."
        logger.warning(to_log)

    def _load_t5_model(self, model_name_or_path, config, cache_dir, **model_args) -> None:
        """Loads the encoder model from T5"""
        from transformers import T5EncoderModel

        # Decoder weights in the checkpoint are ignored when loading the encoder-only class.
        T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
        self.auto_model = T5EncoderModel.from_pretrained(
            model_name_or_path, config=config, cache_dir=cache_dir, **model_args
        )

    def _load_mt5_model(self, model_name_or_path, config, cache_dir, **model_args) -> None:
        """Loads the encoder model from MT5"""
        from transformers import MT5EncoderModel

        # Decoder weights in the checkpoint are ignored when loading the encoder-only class.
        MT5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
        self.auto_model = MT5EncoderModel.from_pretrained(
            model_name_or_path, config=config, cache_dir=cache_dir, **model_args
        )

    def __repr__(self) -> str:
        """Return a short description containing the config dict and the backbone class name."""
        return f"Transformer({self.get_config_dict()}) with Transformer model: {self.auto_model.__class__.__name__} "

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        """Run the backbone and store its first output under ``features["token_embeddings"]``.

        Args:
            features: Must contain ``input_ids`` and ``attention_mask``; ``token_type_ids`` is
                forwarded when present. The dict is updated in place and returned.
            **kwargs: Extra keyword arguments forwarded to the backbone.

        Returns:
            The same ``features`` dict with ``token_embeddings`` added, plus
            ``all_layer_embeddings`` when the config requests hidden states and the backbone
            returns more than two outputs.
        """
        trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]}
        if "token_type_ids" in features:
            trans_features["token_type_ids"] = features["token_type_ids"]

        output_states = self.auto_model(**trans_features, **kwargs, return_dict=False)
        output_tokens = output_states[0]

        # If the AutoModel is wrapped with a PeftModelForFeatureExtraction, then it may have added virtual tokens
        # We need to extend the attention mask to include these virtual tokens, or the pooling will fail
        if is_peft_available():
            from peft import PeftModelForFeatureExtraction

            if (
                isinstance(self.auto_model, PeftModelForFeatureExtraction)
                and self.auto_model.active_peft_config.is_prompt_learning
            ):
                batch_size = output_tokens.size(0)
                attention_mask = features["attention_mask"]
                prefix_attention_mask = torch.ones(
                    batch_size, self.auto_model.active_peft_config.num_virtual_tokens, device=attention_mask.device
                )
                features["attention_mask"] = torch.cat((prefix_attention_mask, attention_mask), dim=1)

        features["token_embeddings"] = output_tokens

        if self.auto_model.config.output_hidden_states and len(output_states) > 2:
            # The guard above already guarantees at least three outputs, so the former
            # "fewer than three outputs" fallback to index 1 could never run.
            # NOTE(review): index 2 is assumed to hold all hidden states — confirm for this backbone.
            features["all_layer_embeddings"] = output_states[2]

        return features

    def get_word_embedding_dimension(self) -> int:
        """Return ``hidden_size`` from the backbone config."""
        return self.auto_model.config.hidden_size

    def tokenize(
        self, texts: list[str] | list[dict] | list[tuple[str, str]], padding: str | bool = True
    ) -> dict[str, torch.Tensor]:
        """Tokenizes a text and maps tokens to token-ids

        Args:
            texts: A non-empty list of strings, of single-item dicts (``{text_key: text}``), or
                of text pairs. The type of the first element decides how the list is handled.
            padding: Padding strategy forwarded to the tokenizer.

        Returns:
            The tokenizer output as PyTorch tensors, plus ``text_keys`` for dict inputs.
        """
        output = {}
        if isinstance(texts[0], str):
            to_tokenize = [texts]
        elif isinstance(texts[0], dict):
            to_tokenize = []
            output["text_keys"] = []
            for lookup in texts:
                text_key, text = next(iter(lookup.items()))
                to_tokenize.append(text)
                output["text_keys"].append(text_key)
            to_tokenize = [to_tokenize]
        else:
            batch1, batch2 = [], []
            for text_tuple in texts:
                batch1.append(text_tuple[0])
                batch2.append(text_tuple[1])
            to_tokenize = [batch1, batch2]

        # strip
        to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize]

        # Lowercase
        if self.do_lower_case:
            to_tokenize = [[s.lower() for s in col] for col in to_tokenize]

        output.update(
            self.tokenizer(
                *to_tokenize,
                padding=padding,
                truncation="longest_first",
                return_tensors="pt",
                max_length=self.max_seq_length,
            )
        )
        return output

    def get_config_dict(self) -> dict[str, Any]:
        """Return the instance attributes listed in ``self.config_keys`` as a dict."""
        return {key: self.__dict__[key] for key in self.config_keys}

    def save(self, output_path: str, safe_serialization: bool = True) -> None:
        """Save the backbone, the tokenizer and ``sentence_bert_config.json`` into ``output_path``."""
        self.auto_model.save_pretrained(output_path, safe_serialization=safe_serialization)
        self.tokenizer.save_pretrained(output_path)

        with open(os.path.join(output_path, "sentence_bert_config.json"), "w") as fOut:
            json.dump(self.get_config_dict(), fOut, indent=2)

    @classmethod
    def load(cls, input_path: str):
        """Re-create the module from a directory written by :meth:`save`.

        Raises:
            FileNotFoundError: If none of the known config file names exists in ``input_path``.
        """
        # Old classes used other config names than 'sentence_bert_config.json'
        for config_name in [
            "sentence_bert_config.json",
            "sentence_roberta_config.json",
            "sentence_distilbert_config.json",
            "sentence_camembert_config.json",
            "sentence_albert_config.json",
            "sentence_xlm-roberta_config.json",
            "sentence_xlnet_config.json",
        ]:
            sbert_config_path = os.path.join(input_path, config_name)
            if os.path.exists(sbert_config_path):
                break
        else:
            # Without this, `open` below would fail on the last candidate name, which is misleading.
            raise FileNotFoundError(
                f"No sentence-transformers config file (e.g. 'sentence_bert_config.json') found in {input_path!r}."
            )

        with open(sbert_config_path) as fIn:
            config = json.load(fIn)
        # Don't allow configs to set trust_remote_code
        if "model_args" in config and "trust_remote_code" in config["model_args"]:
            config["model_args"].pop("trust_remote_code")
        if "tokenizer_args" in config and "trust_remote_code" in config["tokenizer_args"]:
            config["tokenizer_args"].pop("trust_remote_code")
        if "config_args" in config and "trust_remote_code" in config["config_args"]:
            config["config_args"].pop("trust_remote_code")
        return cls(model_name_or_path=input_path, **config)

In [4]:
# Worker-process count; presumably consumed by later data-processing cells — TODO confirm.
NUM_PROC = 16
print(NUM_PROC)


def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable, NumPy's
    global RNG, Polars and PyTorch.

    Args:
        seed: The seed value applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker subprocesses).
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    pl.set_random_seed(seed)
    # torch drives adapter initialisation and dropout; without this the run is not repeatable.
    torch.manual_seed(seed)
    # No-op when CUDA is unavailable; makes the GPU generators explicit.
    torch.cuda.manual_seed_all(seed)


seed_everything(SEED)

16


In [5]:
# Load the training table from DATA_PATH and flag every row as non-synthetic.
df = pd.read_csv(f"{DATA_PATH}/train_5folds_with_llm_infer.csv").assign(is_synthetic=False)
print(df.shape)

(4370, 33)


In [7]:
# Synthetic questions (v1): drop incomplete rows, keep quality > 2, then take a fixed
# 4000-row sample and align the column names with the training table.
df_synth = (
    pd.read_csv(f"{DATA_PATH}/synthetic_questions_render_with_answer_render_v1.csv")
    .dropna()
    .loc[lambda frame: frame["quality-gpt4o-mini"] > 2]
    .sample(n=4000, random_state=0)
    .reset_index(drop=True)
    # The third-level subject becomes SubjectName.
    .rename(columns={"ThirdSubjectName": "SubjectName", "MisconceptionName": "Misconception"})
    # fold = -1 tags the rows coming from this file.
    .assign(is_synthetic=True, fold=-1)
)
print(df_synth.shape)

(16312, 26)


Unnamed: 0,MisconceptionId,MisconceptionName,ThirdSubjectName,QuestionText,author,ThirdSubjectId,SecondSubjectId,SecondSubjectName,FirstSubjectId,FirstSubjectName,...,AnswerText-qwen-answer-final-seed42,CorrectAnswerText,AnswerText,ConstructName-qwen25-72b-instruct-seed99,ConstructName,QuestionId,QuestionId_Answer,quality-gpt4o-mini,p000-qwen25-32b-instruct-cot_misunderstanding,p000-qwen25-32b-instruct-cot-v2_misunderstanding
0,2,Believes there are 100 degrees in a full turn,"Basic Angle Facts (straight line, opposite, ar...","If Sally spins around in a full circle, how ma...",llama,181,74.0,Angles,71.0,Geometry and Measure,...,100,360,100,Understand the measure of a full turn in degrees,Understand the measure of a full turn in degrees,10000,10000_A,5,**Explanation:**\n\n**Step-by-Step Brief Expla...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...
1,2,Believes there are 100 degrees in a full turn,"Basic Angle Facts (straight line, opposite, ar...",What is the sum of the angles around a point?,llama,181,74.0,Angles,71.0,Geometry and Measure,...,100,360,100,Understand the sum of angles around a point,Understand the sum of angles around a point,10001,10001_A,5,**Explanation:**\n\n**Step-by-Step Brief Expla...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...
2,2,Believes there are 100 degrees in a full turn,Angles-Others,"A car wheel makes one full rotation, how many ...",llama,1177,74.0,Angles,71.0,Geometry and Measure,...,100,360,100,Understand that a full rotation is 360 degrees,Understand that a full rotation is 360 degrees,10002,10002_A,5,Explanation:\nMisunderstanding: The students s...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...
4,2,Believes there are 100 degrees in a full turn,Angles-Others,If a person turns 180 degrees and then another...,llama,1177,74.0,Angles,71.0,Geometry and Measure,...,100,360,100,Calculate the total degrees turned after multi...,Calculate the total degrees turned after multi...,10004,10004_A,5,It seems that the students have made a signifi...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...
5,3,Thinks a quadratic without a non variable term...,Factorising into a Double Bracket,Can the expression x^2 - 3x be factorised?,llama,53,153.0,Factorising,49.0,Algebra,...,No,x(x-3),No,Factorise a quadratic expression in the form x...,Factorise a quadratic expression in the form x...,10005,10005_A,5,### Explanation of the Mistake\n\n**Step-by-St...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16303,1103,"When reading integers on a number line, assume...",Place Value,"On the number line below, what number does the...",llama,202,144.0,Basic Arithmetic,32.0,Number,...,6,3.5,6,Read an integer on a number line where the req...,Read an integer on a number line where the req...,26352,26352_A,4,It seems that the students have misunderstood ...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...
16304,1237,Believes that 1 part of a bar model will alway...,Written Division,Tom has 18 pencils in a box. He wants to share...,llama,208,144.0,Basic Arithmetic,32.0,Number,...,1,3,1,Interpret and create bar models for equal shar...,Interpret and create bar models for equal shar...,26353,26353_A,5,### Explanation of the Mistake\n\n**Step-by-St...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...
16305,1474,Forgets to add on the starting value when iden...,Basic Arithmetic-Others,"If a bookshelf has 8 shelves, and a book is mo...",llama,1203,144.0,Basic Arithmetic,32.0,Number,...,-1,1,-1,Understand and apply the concept of position a...,Understand and apply the concept of position a...,26354,26354_A,3,The misunderstanding likely stems from a conce...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...
16307,2053,"When reading integers on a number line, assume...",Mental Addition and Subtraction,On a number line with dashes marked at every 5...,llama,203,144.0,Basic Arithmetic,32.0,Number,...,13,25 or -5,13,Interpret and use number lines with intervals,Interpret and use number lines with intervals,26356,26356_A,5,**Misunderstanding:**\n\nThe students likely m...,STEP-BY-STEP PROCESS OF GETTING THE WRONG ANSW...


In [None]:
# Display the synthetic frame prepared in the previous cell for a visual check.
df_synth

In [7]:
# Synthetic data, round 1: align column names with the training table,
# then filter by quality (keep quality > 2).
df_gpt = (
    pd.read_csv(f"{DATA_PATH}/synthetic-round1-render.csv")
    .rename(
        columns={
            "ConstructName-qwen25-72b-instruct": "ConstructName",
            "MisconceptionName": "Misconception",
        }
    )
    .loc[lambda frame: frame["quality-gpt4o-mini"] > 2]
    .reset_index(drop=True)
    # fold = -2 tags the rows coming from this file.
    .assign(is_synthetic=True, fold=-2)
)
print(df_gpt.shape)

(2185, 24)


In [8]:
# Synthetic data, round 2: drop incomplete rows, align column names with the
# training table and keep quality > 2.
df_synth2 = (
    pd.read_csv(f"{DATA_PATH}/synthetic-round2-render.csv")
    .dropna()
    .rename(
        columns={
            "ConstructName-qwen25-72b-instruct": "ConstructName",
            "MisconceptionName": "Misconception",
        }
    )
    .loc[lambda frame: frame["quality-gpt4o-mini"] > 2]
    .reset_index(drop=True)
    # fold = -3 tags the rows coming from this file.
    .assign(is_synthetic=True, fold=-3)
)
print(df_synth2.shape)

(31868, 27)


In [9]:
# Synthetic data, round 3: drop incomplete rows, rename the misconception column
# and keep quality > 2.
df_synth3 = (
    pd.read_csv(f"{DATA_PATH}/synthetic-round3-render.csv")
    .dropna()
    .rename(columns={"MisconceptionName": "Misconception"})
    .loc[lambda frame: frame["quality-gpt4o-mini"] > 2]
    .reset_index(drop=True)
    # fold = -4 tags the rows coming from this file.
    .assign(is_synthetic=True, fold=-4)
)
print(df_synth3.shape)

(29916, 25)


In [10]:
# Stack the training rows with all synthetic frames. `df` is overwritten here, so only
# its non-synthetic rows are used as the base: on a first run that is every row, and on
# a re-run it stops the synthetic frames from being appended a second time.
df = pd.concat(
    [df.loc[df["is_synthetic"].eq(False)], df_synth, df_gpt, df_synth2, df_synth3],
    axis=0,
).reset_index(drop=True)
print(df.shape)

(72339, 52)


In [11]:
# Peek at the first rows of the combined frame.
df.head()

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,QuestionText,Answer,AnswerText,Correct,CorrectAnswer,...,quality-gpt4o-mini,p000-qwen25-32b-instruct-cot-v2_misunderstanding,Answer_CoT,CorrectAnswer_CoT,SubjectNameLLM,wrong_answers,correct_answers,AnswerText-qwen,CorrectAnswerText-qwen,ConstructName-qwen25-72b-instruct
0,0,856.0,Use the order of operations to carry out calcu...,33.0,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,D,Does not need brackets,0.0,A,...,,,,,,,,,,
1,1,1612.0,Simplify an algebraic fraction by factorising ...,1077.0,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",A,\( m+1 \),0.0,D,...,,,,,,,,,,
2,1,1612.0,Simplify an algebraic fraction by factorising ...,1077.0,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",B,\( m+2 \),0.0,D,...,,,,,,,,,,
3,1,1612.0,Simplify an algebraic fraction by factorising ...,1077.0,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",C,\( m-1 \),0.0,D,...,,,,,,,,,,
4,2,2774.0,Calculate the range from a list of data,339.0,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,A,Only\nTom,0.0,B,...,,,,,,,,,,


In [12]:
# Count missing values per column for the key text fields of the combined frame.
df[["SubjectName", "ConstructName", "QuestionText", "CorrectAnswerText", "AnswerText", "Misconception"]].isnull().sum(0)

SubjectName          0
ConstructName        0
QuestionText         0
CorrectAnswerText    0
AnswerText           0
Misconception        0
dtype: int64

In [13]:
def get_query_text(row):
    """Render the retrieval prompt ("Instruct: ...\\nQuery: ...") for one row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the three
            subject-name fields, ``ConstructName``, ``QuestionText``,
            ``CorrectAnswerText``, ``AnswerText`` and
            ``p000-qwen25-32b-instruct-cot_misunderstanding``.

    Returns:
        A single string: the task instruction, which embeds the lesson path
        ``first-second-third-construct``, followed by the query with the
        question, correct answer, wrong answer and analysis sections.
    """
    lesson_keys = ("FirstSubjectName", "SecondSubjectName", "ThirdSubjectName", "ConstructName")
    lesson = "-".join(f"{row[key]}" for key in lesson_keys)
    instruction = (
        "You are an excellent math teacher about to teach students of year group 1 to 14. "
        f"Here is the detail of your lesson: {lesson}. "
        "You will be provided a question with a wrong answer from your student. "
        "Please retrieve the most relevant misconception behind the wrong answer."
    )

    # (section header, source column) pairs, in the order they appear in the query.
    sections = (
        ("###Question###", "QuestionText"),
        ("###Correct Answer###", "CorrectAnswerText"),
        ("###Misconcept Wrong answer###", "AnswerText"),
        ("###Analysis###", "p000-qwen25-32b-instruct-cot_misunderstanding"),
    )
    query = "\n".join(f"{header}: {row[column]}" for header, column in sections)
    return f"Instruct: {instruction}\nQuery: {query}"

In [14]:
# Build the retrieval prompt for every row (row-wise apply).
df['InputText'] = df.apply(get_query_text, axis=1)

In [15]:
print(df['InputText'].values[0])

Instruct: You are an excellent math teacher about to teach students of year group 1 to 14. Here is the detail of your lesson: Number-Basic Arithmetic-BIDMAS-Use the order of operations to carry out calculations involving powers. You will be provided a question with a wrong answer from your student. Please retrieve the most relevant misconception behind the wrong answer.
Query: ###question###: \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ?
###Correct Answer###: \( 3 \times(2+4)-5 \)
###Misconcept Wrong answer###: Does not need brackets
###Analysis###: The students' misunderstanding lies in their lack of recognition of the importance of the order of operations, which is governed by the BIDMAS rule (Brackets, Indices, Division/Multiplication, Addition/Subtraction). BIDMAS helps determine the sequence in which arithmetic operations should be performed to ensure the correct result.

In the expression \( 3 \times 2 + 4 - 5 \), the students might hav

In [16]:
df["InputText"].map(len).describe()

count    72339.000000
mean      2011.721063
std        404.317849
min        687.000000
25%       1748.000000
50%       1980.000000
75%       2242.000000
max      10347.000000
Name: InputText, dtype: float64

In [17]:
# Drop only rows that are BOTH synthetic AND have a prompt of 3000+ characters;
# non-synthetic rows are always kept regardless of length.
df = df.loc[
    ~(df['is_synthetic'] & (df["InputText"].map(len) >= 3000))
].reset_index(drop=True)
len(df)

71769

In [18]:
# Attach the generated explanation paragraph to every row's misconception text and
# wrap the result as 'boxed{...}'.
# NOTE(review): this cell mutates `df['Misconception']` in place, so running it twice
# appends the paragraph and the 'boxed{' wrapper a second time.
# NOTE(review): a MisconceptionId missing from the mapping yields NaN after `.map`,
# and the string concatenation inside the lambda would then raise a TypeError —
# confirm every id is covered by the CSV.
df_mis = pd.read_csv(f"{DATA_PATH}/misconception_mapping_with_paragraph_v3.csv")
# MisconceptionId -> explanation paragraph
mis_map = df_mis.set_index("MisconceptionId")['a000-llama3-mega-misconception-aug-seed201_misunderstanding'].to_dict()
df['a000-llama3-mega-misconception-aug-seed201_misunderstanding'] = df['MisconceptionId'].map(mis_map)
df['Misconception'] = df['Misconception'] + ' ' + df['a000-llama3-mega-misconception-aug-seed201_misunderstanding']
df["Misconception"] = df["Misconception"].apply(lambda x: 'boxed{' + x + '}')

In [19]:
# Partition the frame on the boolean `is_synthetic` flag; both parts get a fresh
# 0..n-1 index.
df_synthetic = df.loc[df['is_synthetic']].reset_index(drop=True)
df_not_synthetic = df.loc[~df['is_synthetic']].reset_index(drop=True)
len(df_not_synthetic), len(df_synthetic)

(4370, 67399)

In [20]:
def _sample_synthetic(df_synthetic):
    mis_idx_map = {}
    for i in range(len(df_synthetic)):
        row = df_synthetic.iloc[i]
        misconception_id = row['MisconceptionId']
        if misconception_id not in mis_idx_map:
            mis_idx_map[misconception_id] = []
        mis_idx_map[misconception_id].append(i)
    sampled_idxs = []
    for misconception_id, idx_list in mis_idx_map.items():
        sampled_idx = np.random.choice(idx_list, 1, replace=False)
        sampled_idxs.append(sampled_idx[0])
    sampled_df = df_synthetic.iloc[sampled_idxs].reset_index(drop=True)
    other_df = df_synthetic.drop(sampled_idxs).reset_index(drop=True)
    return sampled_df, other_df

def sample_synthetic(df_synthetic, num_synthetic):
    """Collect synthetic rows in rounds that each cover every misconception once.

    Each round takes one random row per remaining ``MisconceptionId`` (via
    ``_sample_synthetic``) and removes those rows from the pool. Rounds are kept
    whole, so the result may contain more than ``num_synthetic`` rows.

    Args:
        df_synthetic: Pool of synthetic rows with a ``MisconceptionId`` column.
        num_synthetic: Minimum number of rows to collect.

    Returns:
        DataFrame with the concatenated rounds and a fresh 0..n-1 index. If the
        pool runs out before ``num_synthetic`` is reached, all rows drawn so far
        are returned; if nothing is drawn, an empty frame with the pool's
        columns is returned.
    """
    count = 0
    dfs = []
    # Stop on an exhausted pool as well: an empty pool produces empty rounds,
    # so `count` would never grow and the loop would not terminate.
    while count < num_synthetic and len(df_synthetic) > 0:
        print('rest',len(df_synthetic))
        sampled_df, df_synthetic = _sample_synthetic(df_synthetic)
        dfs.append(sampled_df)
        count += len(sampled_df)
        print('sampled_df_len',len(sampled_df))
        print('sampled_mis',len(set(sampled_df['MisconceptionId'])))

    if not dfs:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df_synthetic.iloc[0:0].reset_index(drop=True)
    return pd.concat(dfs, axis=0).reset_index(drop=True)

In [27]:
# Draw rounds of one-row-per-misconception until at least 36000 synthetic rows are
# collected; whole rounds are kept, so the result can exceed the target.
# NOTE(review): sampling uses the global NumPy RNG — confirm it was seeded (e.g. with
# SEED) before this cell, otherwise the sample changes on every run.
df_synthetic_sampled = sample_synthetic(df_synthetic, 36000)
len(df_synthetic_sampled)

rest 67399
sampled_df_len 2587
sampled_mis 2587
rest 64812
sampled_df_len 2586
sampled_mis 2586
rest 62226
sampled_df_len 2576
sampled_mis 2576
rest 59650
sampled_df_len 2562
sampled_mis 2562
rest 57088
sampled_df_len 2545
sampled_mis 2545
rest 54543
sampled_df_len 2526
sampled_mis 2526
rest 52017
sampled_df_len 2499
sampled_mis 2499
rest 49518
sampled_df_len 2470
sampled_mis 2470
rest 47048
sampled_df_len 2437
sampled_mis 2437
rest 44611
sampled_df_len 2392
sampled_mis 2392
rest 42219
sampled_df_len 2337
sampled_mis 2337
rest 39882
sampled_df_len 2288
sampled_mis 2288
rest 37594
sampled_df_len 2226
sampled_mis 2226
rest 35368
sampled_df_len 2173
sampled_mis 2173
rest 33195
sampled_df_len 2110
sampled_mis 2110


36314

In [28]:
# Expected size of the combined training frame: sampled synthetic rows plus all
# non-synthetic rows (computed from the frame instead of a hard-coded count).
len(df_synthetic_sampled) + len(df_not_synthetic)

40684

In [23]:
# Final training pool: all non-synthetic rows followed by the sampled synthetic rows.
# NOTE(review): `df` is rebound here, so the earlier cells that filter or split `df`
# must not be re-run after this one without rebuilding `df` first.
df = pd.concat([df_not_synthetic, df_synthetic_sampled], axis=0).reset_index(drop=True)
len(df)

42731

# Dataset

In [23]:
# Train on the whole frame; no split is made here. This is an alias, not a copy.
df_train = df

In [24]:
# Convert to a Hugging Face Dataset; the two training columns are selected later,
# when the trainer is constructed.
train_ds = Dataset.from_pandas(df_train)

In [26]:
# Retrieval corpus: each misconception name is extended with its generated
# explanation paragraph and wrapped as 'boxed{...}', the same formatting applied to
# the training targets. `ir_corpus` maps MisconceptionId -> formatted text.
df_courpus = pd.read_csv(f"{DATA_PATH}/misconception_mapping_with_paragraph_v3.csv")
df_courpus["MisconceptionName"] = (
    df_courpus["MisconceptionName"]
    + ' '
    + df_courpus['a000-llama3-mega-misconception-aug-seed201_misunderstanding']
).map(lambda text: 'boxed{' + text + '}')
ir_corpus = (
    df_courpus.drop_duplicates(['MisconceptionId'])
    .set_index("MisconceptionId")["MisconceptionName"]
    .to_dict()
)

In [27]:
print(df_courpus["MisconceptionName"].values[0])

boxed{Does not know that angles in a triangle sum to 180 degrees Explanation: This misconception arises when students are not aware of or have not fully understood the fundamental property of triangles that the sum of the interior angles of a triangle is always 180 degrees. They may believe that the sum of the angles can be any value or may think that it varies depending on the type of triangle (e.g., equilateral, isosceles, scalene). 

This misconception can lead to errors when students are asked to find missing angles in a triangle, determine if a set of angles can form a triangle, or solve problems involving the interior angles of triangles.

Short cases where this misconception may occur:

1. When given two angles of a triangle and asked to find the third angle, a student with this misconception may not know how to proceed or may give an incorrect answer.
2. When asked to determine if a set of three angles (e.g., 60°, 80°, 100°) can form a triangle, a student with this misconceptio

# Model

In [30]:
# bitsandbytes quantization settings used when loading the base model:
# 4-bit weights in NF4 format with double quantization, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

In [31]:
# Load MODEL_NAME through LLM2Vec with the 4-bit quantization config.
# NOTE(review): the "cpu" fallback is presumably unusable together with a 4-bit
# bitsandbytes config — confirm, or fail fast when CUDA is unavailable.
base_model = LLM2Vec.from_pretrained(
    MODEL_NAME,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    # attn_implementation="flash_attention_2"
)

Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

In [32]:
# Grab the tokenizer while `base_model` is still the LLM2Vec wrapper; a later cell
# rebinds `base_model` to the PEFT-wrapped inner model.
base_tokenizer = base_model.tokenizer

In [33]:
base_model.model

Qwen2BiModel(
  (embed_tokens): Embedding(152064, 5120)
  (layers): ModuleList(
    (0-63): 64 x ModifiedQwen2DecoderLayer(
      (self_attn): ModifiedQwen2SdpaAttention(
        (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=True)
        (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=True)
        (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=True)
        (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
        (rotary_emb): Qwen2RotaryEmbedding()
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear4bit(in_features=5120, out_features=27648, bias=False)
        (up_proj): Linear4bit(in_features=5120, out_features=27648, bias=False)
        (down_proj): Linear4bit(in_features=27648, out_features=5120, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): Qwen2RMSNorm((5120,), eps=1e-05)
      (post_attention_layernorm): Qwen2RMSNorm((5120,), eps=1e-05)
    )
  )
  (norm): Qwen2RMSNorm((5120,), 

In [34]:
# LoRA adapter settings: rank 64, alpha 128, dropout 0.05, no bias training.
# Adapters are attached to the four attention projections and the three MLP
# projections listed below.
config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="FEATURE_EXTRACTION",
)

In [35]:
# Take the inner transformer out of the LLM2Vec wrapper, prepare it for k-bit
# training, and attach the LoRA adapters described by `config`.
# NOTE: `base_model` is rebound from the LLM2Vec wrapper to the PEFT model, so this
# cell is not idempotent — a second run would operate on the PEFT model instead.
base_model = prepare_model_for_kbit_training(base_model.model)
base_model = get_peft_model(base_model, config)
# base_model = get_peft_model(base_model.model, config)
base_model.print_trainable_parameters()

trainable params: 536,870,912 || all params: 32,522,179,584 || trainable%: 1.6508


In [None]:
# Build a SentenceTransformer (Transformer + Pooling modules) from a small checkpoint.
# This appears to serve only as a container: a later cell replaces its transformer
# and tokenizer with the LoRA-wrapped model and that model's tokenizer.
model = SentenceTransformer("Qwen/Qwen2.5-0.5B-Instruct", trust_remote_code=True)

No sentence-transformers model found with name Qwen/Qwen2.5-0.5B-Instruct. Creating a new one with mean pooling.


In [37]:
model._first_module().tokenizer

Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-0.5B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=Fals

In [38]:
# Replace the transformer and tokenizer inside the first SentenceTransformer module
# with the LoRA-wrapped model and its matching tokenizer.
# NOTE(review): `_first_module()` is a private helper of SentenceTransformer — confirm
# it exists in the pinned library version.
model._first_module().auto_model = base_model
model._first_module().tokenizer = base_tokenizer

In [39]:
# Switch the pooling module from mean pooling to last-token pooling.
# NOTE(review): the printed model repr still reports word_embedding_dimension=896,
# whereas the swapped-in transformer has hidden size 5120 — confirm nothing
# downstream (e.g. the saved pooling config) relies on the reported dimension.
model[1].pooling_mode_mean_tokens = False
model[1].pooling_mode_lasttoken = True

In [40]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 32768, 'do_lower_case': False}) with Transformer model: PeftModelForFeatureExtraction 
  (1): Pooling({'word_embedding_dimension': 896, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': True, 'include_prompt': True})
)

In [41]:
# Remove the notebook-level name and release cached CUDA memory blocks.
# The model itself stays alive: the first module of `model` still references it.
del base_model
torch.cuda.empty_cache()

# Training

In [42]:
# In-batch-negatives ranking loss in its gradient-cached variant; mini_batch_size
# sets the chunk size used for the forward/backward passes.
# NOTE(review): 16 is a hard-coded value — consider moving it to the config cell.
loss = CachedMultipleNegativesRankingLoss(model, mini_batch_size=16)
# loss = MultipleNegativesRankingLoss(model)

In [43]:
# Training configuration: bf16, 8-bit paged AdamW, one checkpoint per epoch, no evaluation.
# NOTE(review): SEED is defined in the config cell but not passed here (`seed=`) — confirm
# whether the trainer's default seed is intended.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=MODEL_OUTPUT_PATH,
    # Optional training parameters:
    optim="paged_adamw_8bit",
    num_train_epochs=EPOCH,
    # NOTE(review): NUM_PROC is not defined in the config cell at the top of the
    # notebook — confirm it is set before this cell runs.
    dataloader_num_workers=NUM_PROC,
    per_device_train_batch_size=BS,
#    gradient_accumulation_steps = 2,
#    per_device_eval_batch_size=BS,
    # NOTE(review): learning_rate is commented out, so the LR constant from the config
    # cell is unused and the library default learning rate applies.
    # learning_rate=LR,
    warmup_ratio=0.0,
    fp16=False,
    bf16=True,
    # NOTE(review): the training set contains several rows per misconception, so
    # without NO_DUPLICATES a batch may contain the same positive text twice.
    # batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    # lr_scheduler_type="cosine_with_restarts",
#    eval_strategy="epoch",
#    eval_steps=8,
    save_strategy="epoch",
    save_steps=1,  # presumably ignored while save_strategy="epoch" — verify
    save_total_limit=10,
    logging_steps=1000,
    report_to=REPORT_TO,  # REPORT_TO is "none" in the config cell, so no tracker is used
#    metric_for_best_model="eval_cosine_map@25", # eval_cosine_recall@25
    do_eval=False,
    push_to_hub=False,
#    load_best_model_at_end=True,
    # gradient_checkpointing_kwargs=True
)

In [44]:
# dev_evaluator = InformationRetrievalEvaluator(
#    ir_queries, ir_corpus, ir_relevant_docs,
#    accuracy_at_k=[25],
#    precision_recall_at_k=[25, 50, 100],
#    map_at_k=[25])

In [45]:
# Trainer over (InputText, Misconception) pairs with the ranking loss; no eval set
# or evaluator is attached.
# NOTE(review): sentence-transformers presumably assigns roles by column order
# (first = anchor, second = positive) — keep this order; confirm against the docs.
trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_ds.select_columns(
            ["InputText", "Misconception"]
        ),
#        eval_dataset=test_ds.select_columns(
#            ["InputText", "Misconception"]
#        ),
        loss=loss,
#        evaluator=dev_evaluator,
    )

In [None]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
