In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
from tqdm import tqdm, trange
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifier handed to AutoTokenizer.from_pretrained in the next cell.
MODEL_PATH = "Qwen/Qwen2.5-14B-Instruct-AWQ"

In [3]:
# Load the tokenizer that belongs to MODEL_PATH.
# NOTE(review): none of the data-preparation cells visible here reference it —
# presumably it is used further down; confirm or remove.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [4]:
# Labelled questions used for both the training folds and the hold-out set.
full_train_df = pd.read_csv('../comp_data/train.csv')

# Misconception table; names are stripped of surrounding whitespace on load.
mis_map = pd.read_csv('../comp_data/misconception_mapping.csv').assign(
    MisconceptionName=lambda frame: frame['MisconceptionName'].str.strip()
)

In [5]:
# Positional split: the first 373 rows are held out for evaluation and every
# remaining row goes to training (no shuffling happens at this stage).
eval_df, train_df = full_train_df.iloc[:373], full_train_df.iloc[373:]

print(f"train_df.shape: {train_df.shape}, eval_df.shape: {eval_df.shape}")

train_df.shape: (1496, 15), eval_df.shape: (373, 15)


In [6]:
# Five shuffled folds over the training rows; the fixed seed keeps the
# assignment of rows to folds reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# One {'train', 'val'} pair of DataFrame slices per fold, in fold order.
fold_data = [
    {'train': train_df.iloc[train_idx], 'val': train_df.iloc[val_idx]}
    for train_idx, val_idx in kfold.split(train_df)
]

In [7]:
def get_query_template():
    """Return the ``str.format`` template used to render one question as a query.

    The template names seven placeholders: ``SubjectName``, ``ConstructName``,
    ``QuestionText``, ``CorrectAnswerText`` and ``IncorrectAnswer1Text`` to
    ``IncorrectAnswer3Text``. Sections are separated by one blank line.
    """
    sections = (
        "Subject: {SubjectName}",
        "Construct: {ConstructName}",
        "Question:\n{QuestionText}",
        "Correct Option:\n{CorrectAnswerText}",
        "Incorrect Option 1:\n{IncorrectAnswer1Text}",
        "Incorrect Option 2:\n{IncorrectAnswer2Text}",
        "Incorrect Option 3:\n{IncorrectAnswer3Text}",
    )
    return "\n\n".join(sections)

def get_task_description():
    """Return the fixed instruction text stored as the ``prompt`` of every record."""
    task_description = 'Given a math question with options, retrieve the most relevant misconceptions for the incorrect answers.'
    return task_description

def process_data(df, mis_map):
    """Turn question rows into query/positive records for retrieval training.

    For every row of ``df`` a query is rendered from ``get_query_template()``
    using the subject, construct, question text, the correct answer text and
    the three remaining answer texts (kept in A-D order). The misconception
    ids found in the ``Misconception{A..D}Id`` columns are de-duplicated and
    resolved to their names through ``mis_map``.

    Args:
        df: DataFrame with the columns ``SubjectName``, ``ConstructName``,
            ``QuestionText``, ``CorrectAnswer`` (one of ``'A'``-``'D'``),
            ``Answer{A..D}Text`` and ``Misconception{A..D}Id``.
        mis_map: DataFrame with a ``MisconceptionName`` column; misconception
            ids are looked up against its index labels.

    Returns:
        A list with one dict per row of ``df``:
        ``query`` (the rendered template), ``pos`` (misconception names in
        order of first occurrence across options A-D), ``neg`` (always an
        empty list) and ``prompt`` (the task description).

    Raises:
        KeyError: if a misconception id is not an index label of ``mis_map``.
    """
    option_letters = ['A', 'B', 'C', 'D']

    # Loop-invariant pieces, resolved once instead of once per row.
    query_template = get_query_template()
    task_description = get_task_description()

    # Index label -> misconception name. A plain dict avoids building a whole
    # row Series for every lookup, and numerically equal int/float keys
    # (1672 == 1672.0) resolve to the same entry.
    mis_name_by_id = mis_map['MisconceptionName'].to_dict()

    train_data = []
    # iterrows() has no length, so pass `total` to get a real progress bar.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        correct_option = row['CorrectAnswer']
        correct_option_text = row[f'Answer{correct_option}Text']

        incorrect_options_texts = [
            row[f'Answer{option}Text']
            for option in option_letters
            if option != correct_option
        ]

        query_text = query_template.format(
            SubjectName=row['SubjectName'],
            ConstructName=row['ConstructName'],
            QuestionText=row['QuestionText'],
            CorrectAnswerText=correct_option_text,
            IncorrectAnswer1Text=incorrect_options_texts[0],
            IncorrectAnswer2Text=incorrect_options_texts[1],
            IncorrectAnswer3Text=incorrect_options_texts[2],
        )

        # Options without a misconception id hold a missing value; skip those.
        related_mis_ids = [
            mis_id
            for mis_id in (row[f'Misconception{option}Id'] for option in option_letters)
            if pd.notna(mis_id)
        ]
        # dict.fromkeys drops duplicates while keeping A-D order, so the
        # positives come out in a stable, explainable order (a set would
        # order them by hash instead).
        related_mis_ids = list(dict.fromkeys(related_mis_ids))
        related_mis_texts = [mis_name_by_id[mis_id] for mis_id in related_mis_ids]

        train_data.append({
            "query": query_text,
            "pos": related_mis_texts,
            "neg": [],
            "prompt": task_description,
        })
    return train_data

# Build the query/positive records for the train and val halves of every fold.
fold_train_data = [
    {
        'train': process_data(fold_dict['train'], mis_map),
        'val': process_data(fold_dict['val'], mis_map),
    }
    for fold_dict in fold_data
]


1196it [00:00, 7059.22it/s]
300it [00:00, 7014.12it/s]
1197it [00:00, 7170.06it/s]
299it [00:00, 7186.47it/s]
1197it [00:00, 7088.10it/s]
299it [00:00, 7250.50it/s]
1197it [00:00, 7147.82it/s]
299it [00:00, 7086.29it/s]
1197it [00:00, 4645.58it/s]
299it [00:00, 7258.64it/s]


In [8]:
# Records for the whole of train_df, i.e. all folds together (eval_df rows are not included).
all_fold_train_data = process_data(train_df, mis_map)

1496it [00:00, 7063.03it/s]


In [9]:
SAVE_DIR = 'stage1_data'
os.makedirs(SAVE_DIR, exist_ok=True)

# Per-fold files, one JSON object per line; train and val are written separately.
for fold, fold_dict in enumerate(fold_train_data):
    with open(f'{SAVE_DIR}/fold_{fold}_train.jsonl', 'w') as f:
        f.writelines(json.dumps(item) + '\n' for item in fold_dict['train'])
    with open(f'{SAVE_DIR}/fold_{fold}_val.jsonl', 'w') as f:
        f.writelines(json.dumps(item) + '\n' for item in fold_dict['val'])

# Same line-per-record format for the records built from the full training frame.
with open(f'{SAVE_DIR}/train_all_folds.jsonl', 'w') as f:
    f.writelines(json.dumps(item) + '\n' for item in all_fold_train_data)

In [10]:
# Every misconception name becomes one {"text": ...} candidate, in mis_map row order.
candidate_pool = [{"text": name} for name in mis_map['MisconceptionName']]

# One JSON object per line, next to the fold files.
with open(f'{SAVE_DIR}/candidate_pool.jsonl', 'w') as f:
    f.writelines(json.dumps(data) + '\n' for data in candidate_pool)

In [15]:
# Hard-negative mining, one run per fold. scripts/hn_mine.py is launched
# through the IPython `!` shell escape, and {i_fold} is filled in from the
# Python loop variable before the command runs.
# Each run reads stage1_data/fold_<i>_train.jsonl plus the shared candidate
# pool and writes stage1_data/fold_<i>_train_minedHN.jsonl, embedding with
# BAAI/bge-en-icl.
# NOTE(review): presumably 25 negatives per query are sampled from retrieval
# ranks 2-150 (--negative_number / --range_for_sampling) — confirm against
# hn_mine.py. The directory and the fold count (5) are hard-coded here rather
# than taken from SAVE_DIR / the KFold configuration; keep them in sync.
for i_fold in range(5):
    ! CUDA_VISIBLE_DEVICES=0 python scripts/hn_mine.py \
    --embedder_name_or_path BAAI/bge-en-icl \
    --input_file stage1_data/fold_{i_fold}_train.jsonl \
    --output_file stage1_data/fold_{i_fold}_train_minedHN.jsonl \
    --candidate_pool stage1_data/candidate_pool.jsonl \
    --range_for_sampling 2-150 \
    --negative_number 25

Loading checkpoint shards: 100%|██████████████████| 3/3 [00:00<00:00,  7.57it/s]
inferencing embedding for corpus (number=2587)--------------
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|███████████████████████| 3/3 [00:07<00:00,  2.39s/it]
inferencing embedding for queries (number=1196)--------------
Inference Embeddings: 100%|█████████████████████| 13/13 [00:23<00:00,  1.81s/it]
create index and search------------------
Batches: 100%|██████████████████████████████████| 19/19 [00:00<00:00, 20.04it/s]
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:00<00:00,  9.67it/s]
inferencing embedding for corpus (number=2587)--------------
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method