# Clone the repository if it doesn't exist

In [None]:
# Please set this to True if you want to clone the repository
SHOULD_CLONE = False
project_name = "Small-Qwen-Coding-Multiple-Choice"

import os, subprocess

# Re-running this cell must be harmless, so two situations are treated as
# "nothing to do": the kernel is already inside the project folder, or the
# checkout already exists next to the notebook.
if SHOULD_CLONE and os.path.basename(os.getcwd()) != project_name:
    if not os.path.isdir(project_name):
        # `git clone` refuses to write into an existing non-empty directory and
        # check=True turns that refusal into CalledProcessError, hence the guard.
        subprocess.run(
            ["git", "clone", "https://github.com/tuandung222/Small-Qwen-Coding-Multiple-Choice.git"],
            check=True,
        )
    os.chdir(project_name)  # Change directory to the cloned project

# If this notebook is running from the `notebooks` folder, move up to the project root

In [9]:
import os

# A working directory ending in "notebooks" means the kernel was started next
# to this file; step up one level in that case.
if os.getcwd().endswith("notebooks"):
    os.chdir(os.pardir)

# Show where the kernel ended up.
print(os.getcwd())

/workspace/Small-Qwen-Coding-Multiple-Choice


# Import the necessary libraries

In [33]:
import os
import datasets
from pprint import pprint
import os
import torch
import random
from datasets import Dataset
from huggingface_hub import login, HfApi
import os
from getpass import getpass
from tqdm import tqdm
import torch
import re
import pandas as pd
from peft import PeftModel
import unsloth, transformers
from unsloth.models import FastLanguageModel

# Import the necessary modules from the src folder

In [17]:
from src.model.qwen_handler import QwenModelHandler
from src.testing.tester import MultipleChoiceTester
from src.prompt_processors.prompt_creator import PromptCreator
from src.testing.tester import MultipleChoiceTester

# Load the test data

In [32]:
# Test CSV location, expressed relative to the current working directory.
test_path = os.path.join("src", "data", "b6_test_data.csv")

if os.path.exists(test_path):
    # Keep only the "train" split from what load_dataset returns.
    test_data = datasets.load_dataset("csv", data_files=test_path)["train"]
else:
    raise FileNotFoundError(f"Data file not found at: {test_path}")

# Other views that are handy while exploring:
#   pprint(test_data[0]); pprint(test_data.features); pprint(test_data.info)
pprint(test_data)

Dataset({
    features: ['task_id', 'question', 'choices'],
    num_rows: 1253
})


# Load the latest model checkpoint from HuggingFace Hub

In [13]:
# Checkpoint to evaluate (repository ID on the HuggingFace Hub)
hub_model_id = "tuandunghcmut/Qwen25_Coder_MultipleChoice_v4"

# Hub token from the environment, if any. It is only read here; nothing below
# in this cell passes it on.
hf_token = os.environ.get("HF_TOKEN")

print(f"Loading model from HuggingFace Hub: {hub_model_id}")

# Load the checkpoint through the project's handler.
lastest_model_handler = QwenModelHandler(
    model_name=hub_model_id,
    model_source="unsloth",
    quantization="4bit",
    max_seq_length=2048,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# Prompt format used for every question in this notebook
prompt_creator = PromptCreator(PromptCreator.YAML_REASONING)

# Hand the wrapped model to Unsloth's for_inference before generating
FastLanguageModel.for_inference(lastest_model_handler.model)

# Tester bound to the loaded model and the prompt format above
latest_tester = MultipleChoiceTester(lastest_model_handler, prompt_creator=prompt_creator)

print("Successfully loaded model from HuggingFace Hub!")

Loading model from HuggingFace Hub: tuandunghcmut/Qwen25_Coder_MultipleChoice_v4
2025-04-04 18:14:08 - src.model.qwen_handler - INFO - Loading tuandunghcmut/Qwen25_Coder_MultipleChoice_v4 from unsloth, max_seq_length=2048


INFO:src.model.qwen_handler:Loading tuandunghcmut/Qwen25_Coder_MultipleChoice_v4 from unsloth, max_seq_length=2048


2025-04-04 18:14:08 - src.model.qwen_handler - INFO - Flash Attention 2 is available (package flash-attn detected)


INFO:src.model.qwen_handler:Flash Attention 2 is available (package flash-attn detected)


2025-04-04 18:14:08 - src.model.qwen_handler - INFO - Flash Attention 2 version: 2.7.4.post1


INFO:src.model.qwen_handler:Flash Attention 2 version: 2.7.4.post1


2025-04-04 18:14:08 - src.model.qwen_handler - INFO - xFormers is available (version: 0.0.29.post3)


INFO:src.model.qwen_handler:xFormers is available (version: 0.0.29.post3)


2025-04-04 18:14:08 - src.model.qwen_handler - INFO - CUDA is available (version: 12.4)


INFO:src.model.qwen_handler:CUDA is available (version: 12.4)


2025-04-04 18:14:08 - src.model.qwen_handler - INFO - Using attention implementation: flash_attention_2


INFO:src.model.qwen_handler:Using attention implementation: flash_attention_2


2025-04-04 18:14:08 - src.model.qwen_handler - INFO - Setting max memory: {0: '27620MiB'}


INFO:src.model.qwen_handler:Setting max memory: {0: '27620MiB'}


2025-04-04 18:14:08 - src.model.qwen_handler - INFO - Using attention implementation: flash_attention_2


INFO:src.model.qwen_handler:Using attention implementation: flash_attention_2


==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.3.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.19 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


2025-04-04 18:14:19 - src.model.qwen_handler - INFO - Model loaded successfully: tuandunghcmut/Qwen25_Coder_MultipleChoice_v4


INFO:src.model.qwen_handler:Model loaded successfully: tuandunghcmut/Qwen25_Coder_MultipleChoice_v4


2025-04-04 18:14:19 - src.model.qwen_handler - INFO - Model type: qwen2


INFO:src.model.qwen_handler:Model type: qwen2


2025-04-04 18:14:19 - src.model.qwen_handler - INFO - hidden_size: 1536


INFO:src.model.qwen_handler:hidden_size: 1536


2025-04-04 18:14:19 - src.model.qwen_handler - INFO - intermediate_size: 8960


INFO:src.model.qwen_handler:intermediate_size: 8960


2025-04-04 18:14:19 - src.model.qwen_handler - INFO - num_hidden_layers: 28


INFO:src.model.qwen_handler:num_hidden_layers: 28


2025-04-04 18:14:19 - src.model.qwen_handler - INFO - num_attention_heads: 12


INFO:src.model.qwen_handler:num_attention_heads: 12


2025-04-04 18:14:19 - src.model.qwen_handler - INFO - torch_dtype: float16


INFO:src.model.qwen_handler:torch_dtype: float16


Successfully loaded model from HuggingFace Hub!


In [14]:
# Tokenizer settings relevant to batched generation, one value per line:
# padding side, EOS token id, PAD token id.
print(
    lastest_model_handler.tokenizer.padding_side,
    lastest_model_handler.tokenizer.eos_token_id,
    lastest_model_handler.tokenizer.pad_token_id,
    sep="\n",
)

left
151645
151665


# Define a function to perform batch inference, generate answers for multiple-choice questions and debug results

In [34]:
def _extract_predicted_answer(output, letters="ABCDEFGHI", default="A"):
    """
    Pull the predicted choice letter out of a model completion.

    Args:
        output: Decoded completion text
        letters: Valid choice letters, also the priority order of the fallback scan
        default: Letter returned when nothing in the text looks like an answer

    Returns:
        A single upper-case letter taken from `letters`, or `default`.
    """
    import re

    # Primary pattern: "answer", optionally followed by "is" or ":", then a letter.
    # The trailing \b is essential: the match is case-insensitive, so without it
    # phrases such as "answer is correct" or "answer choices" are read as "C".
    match = re.search(rf"answer\s*(?:is|:)?\s*([{letters}])\b", output, re.IGNORECASE)
    if match:
        return match.group(1).upper()

    # Fallback: first letter (in `letters` order) written as "(X)" or "X."
    for letter in letters:
        if f"({letter})" in output or f"{letter}." in output:
            return letter

    return default


def generate_answer_csv_with_batch_inference(
    test_dataset,
    model_handler,
    prompt_creator,
    tester,
    output_folder="./outputs/inference_batch",
    batch_size=64,
    temperature=0.001,
    max_new_tokens=1024,
    do_sample=True,
    top_p=0.95,
    top_k=90,
    debug_samples=10,
):
    """
    Generate answers for multiple-choice questions in batch mode.

    For every batch the prompts are built, tokenized with left padding and passed
    to `model_handler.model.generate`. Each completion is reduced to one answer
    letter and written to a per-batch text file. After the last batch a
    submission CSV, a debug CSV and the full completions (CSV and JSON) are
    saved to `output_folder`.

    Args:
        test_dataset: Iterable of examples with "task_id", "question" and
            "choices" (a Python-literal list stored as a string, or a list);
            an optional "answer" field is recorded as ground truth
        model_handler: Model handler instance exposing `model`, `tokenizer`
            and `max_seq_length`
        prompt_creator: Prompt creator instance providing `create_inference_prompt`
        tester: Tester instance (not used by this function; kept so existing
            calls keep working)
        output_folder: Folder to save all output files
        batch_size: Number of examples to process at once
        temperature: Sampling temperature for generation
        max_new_tokens: Maximum number of tokens to generate
        do_sample: Whether to use sampling for generation
        top_p: Top-p sampling parameter
        top_k: Top-k sampling parameter
        debug_samples: Number of random samples to debug

    Returns:
        Dict with the four output paths ("submission_file", "debug_file",
        "all_completions_file", "all_completions_json") plus the in-memory
        "results" (task_id -> letter), "debug_results" and "all_completions".

    Raises:
        ValueError / SyntaxError: If a "choices" string is not a valid Python literal.
    """
    import ast
    import json
    import os
    import random
    from datetime import datetime

    import pandas as pd
    import torch
    from tqdm import tqdm

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # One timestamp per run so files from different runs never collide
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    results_dict = {}      # task_id -> predicted letter (submission)
    all_completions = []   # one full record per example
    debug_results = []     # records for the randomly chosen debug examples

    # Select a few random examples for detailed debugging
    all_examples = list(test_dataset)
    num_examples = len(all_examples)
    debug_indices = set(
        random.sample(range(num_examples), min(debug_samples, num_examples))
    )

    tokenizer = model_handler.tokenizer
    # Left padding keeps every prompt flush against its generated continuation
    tokenizer.padding_side = "left"

    # Process examples in batches
    for batch_start in tqdm(range(0, num_examples, batch_size), desc="Processing batches"):
        batch = all_examples[batch_start:batch_start + batch_size]
        batch_number = batch_start // batch_size + 1

        # Build one chat-formatted prompt per example
        batch_inputs = []
        for example in batch:
            choices = example["choices"]
            if isinstance(choices, str):
                # The CSV stores the list as text. literal_eval parses literals
                # only, so dataset content can never execute code (eval could).
                choices = ast.literal_eval(choices)

            prompt = prompt_creator.create_inference_prompt(example["question"], choices)
            messages = [{"role": "user", "content": prompt}]
            batch_inputs.append(
                tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            )

        # Tokenize all inputs at once
        tokenized_inputs = tokenizer(
            batch_inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=model_handler.max_seq_length,
        ).to(model_handler.model.device)

        # Perform batch inference using the model directly
        with torch.inference_mode():
            generated_ids = model_handler.model.generate(
                input_ids=tokenized_inputs.input_ids,
                attention_mask=tokenized_inputs.attention_mask,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                top_p=top_p,
                top_k=top_k,
                use_cache=True,
                pad_token_id=tokenizer.pad_token_id,
            )

            # All rows are padded to the same width, so one prompt length is
            # enough to cut the input off every generated sequence.
            prompt_length = tokenized_inputs.input_ids.shape[1]
            batch_outputs = [
                tokenizer.decode(gen_ids[prompt_length:], skip_special_tokens=True)
                for gen_ids in generated_ids
            ]

        # Process batch results and write to batch file
        batch_file = os.path.join(output_folder, f"batch_{batch_number}_{timestamp}.txt")
        with open(batch_file, "w", encoding="utf-8") as batch_fh:
            for i, (example, output) in enumerate(zip(batch, batch_outputs)):
                task_id = example["task_id"]
                predicted = _extract_predicted_answer(output)

                # Store all original example data along with completions
                completion_record = {
                    "task_id": task_id,
                    "question": example["question"],
                    "choices": example["choices"],
                    "predicted_answer": predicted,
                    "completion": output,
                }
                if "answer" in example:
                    completion_record["ground_truth"] = example["answer"]
                all_completions.append(completion_record)

                # Write to batch file
                batch_fh.write(f"Task ID: {task_id}\n")
                batch_fh.write(f"Question: {example['question']}\n")
                batch_fh.write(f"Choices: {example['choices']}\n")
                batch_fh.write(f"Predicted Answer: {predicted}\n")
                batch_fh.write(f"Completion: {output}\n")
                if "answer" in example:
                    batch_fh.write(f"Ground Truth: {example['answer']}\n")
                batch_fh.write("\n" + "-" * 80 + "\n\n")  # Separator

                # Store the result for submission
                results_dict[task_id] = predicted

                # Print progress update for every 10th example or last in batch
                if i % 10 == 0 or i == len(batch) - 1:
                    print(f"Batch {batch_number}: Processed {i+1}/{len(batch)} examples")

                # For selected examples, save detailed results for debugging
                if batch_start + i in debug_indices:
                    debug_results.append(
                        {
                            "task_id": task_id,
                            "question": example["question"],
                            "choices": example["choices"],
                            "predicted_answer": predicted,
                            "reasoning": output,
                        }
                    )

                    # Print detailed debug information for these examples
                    print(f"\n--- DETAILED DEBUG FOR TASK {task_id} ---")
                    print(f"Question: {example['question'][:100]}...")
                    print(f"Choices: {example['choices']}")
                    print(f"Predicted: {predicted}")
                    print(f"Reasoning snippet: {str(output)[:200]}...")
                    print("-----------------------------------\n")

    # Create a DataFrame from the results dictionary for submission
    submission_df = pd.DataFrame(list(results_dict.items()), columns=["task_id", "answer"])

    # Create a DataFrame with all completions
    all_completions_df = pd.DataFrame(all_completions)

    # Save all the files to the output folder
    submission_file = os.path.join(output_folder, f"submission_{timestamp}.csv")
    debug_file = os.path.join(output_folder, f"debug_results_{timestamp}.csv")
    all_completions_file = os.path.join(output_folder, f"all_completions_{timestamp}.csv")
    all_completions_json = os.path.join(output_folder, f"all_completions_{timestamp}.json")

    submission_df.to_csv(submission_file, index=False)
    pd.DataFrame(debug_results).to_csv(debug_file, index=False)
    all_completions_df.to_csv(all_completions_file, index=False)

    # Save as JSON for better inspection of large text fields
    with open(all_completions_json, "w", encoding="utf-8") as json_fh:
        json.dump(all_completions, json_fh, indent=2, ensure_ascii=False)

    print(f"Submission saved to {submission_file}")
    print(f"Debug results saved to {debug_file}")
    print(f"All completions saved to {all_completions_file} and {all_completions_json}")

    return {
        "submission_file": submission_file,
        "debug_file": debug_file,
        "all_completions_file": all_completions_file,
        "all_completions_json": all_completions_json,
        "results": results_dict,
        "debug_results": debug_results,
        "all_completions": all_completions,
    }

# Run inference on the test dataset, save the results to CSV files, then push the results to Hugging Face

In [None]:
# Run the whole test set through the loaded model with the function's default
# generation settings and output folder.
dict_results = generate_answer_csv_with_batch_inference(
    test_dataset=test_data,
    model_handler=lastest_model_handler,
    prompt_creator=prompt_creator,
    tester=latest_tester,
)

Processing batches:   5%|▌         | 1/20 [01:07<21:13, 67.01s/it]

Batch 1: Processed 1/64 examples
Batch 1: Processed 11/64 examples

--- DETAILED DEBUG FOR TASK k10254 ---
Question: Question: What is the output of this program?
   #! /usr/bin/awk -f   BEGIN {       a=5       while ...
Choices: ['nothing will print', '“sanfoundry” will print 5 times', 'program will generate syntax error', 'none of the mentioned']
Predicted: C
Reasoning snippet: understanding: |
  The question asks for the output of an AWK script that prints "hello_world" repeatedly until a condition is met. Key concepts include understanding AWK's control structures and synt...
-----------------------------------

Batch 1: Processed 21/64 examples
Batch 1: Processed 31/64 examples
Batch 1: Processed 41/64 examples
Batch 1: Processed 51/64 examples
Batch 1: Processed 61/64 examples
Batch 1: Processed 64/64 examples


In [None]:
# Post-processing and pushing the test dataset with completions to Hugging Face

# Create a dataset from the completions
def _split_yaml_sections(
    completion, keys=("reasoning", "understanding", "analysis", "conclusion")
):
    """
    Best-effort split of a YAML-style completion into its top-level text sections.

    Assumes the `key: |` layout with indented body lines that the debug output
    of the inference run shows (e.g. "understanding: |"). Text that does not
    follow that layout simply yields empty sections.

    Args:
        completion: Raw completion text
        keys: Section names to extract

    Returns:
        Dict mapping every name in `keys` to its text ("" when absent).
    """
    import re
    import textwrap

    block_markers = ("", "|", ">", "|-", ">-", "|+", ">+")
    collected = {}
    current = None
    for line in str(completion).splitlines():
        # Only unindented "name: ..." lines start a section; body lines are indented.
        header = re.match(r"([A-Za-z_]+):\s*(.*)$", line)
        if header:
            current = header.group(1).lower()
            inline = header.group(2).strip()
            collected[current] = [] if inline in block_markers else [inline]
        elif current is not None:
            collected[current].append(line)

    return {
        key: textwrap.dedent("\n".join(collected.get(key, []))).strip()
        for key in keys
    }


def prepare_dataset_for_upload(completions=None):
    """
    Build the Hugging Face dataset that holds the evaluated test set.

    Args:
        completions: List of completion records (dicts with "task_id",
            "question", "choices", "predicted_answer" and "completion").
            Defaults to the notebook-level `dict_results["all_completions"]`.

    Returns:
        The `Dataset` created from the records, with the columns task_id,
        question, choices, predicted_answer, completion, reasoning,
        understanding, analysis and conclusion.
    """
    if completions is None:
        completions = dict_results["all_completions"]

    section_keys = ("reasoning", "understanding", "analysis", "conclusion")
    upload_data = []

    for item in completions:
        # The completion records carry no per-section fields, so these columns
        # used to be empty for every row; recover them from the completion text.
        sections = _split_yaml_sections(item["completion"], section_keys)
        entry = {
            "task_id": item["task_id"],
            "question": item["question"],
            "choices": item["choices"],
            "predicted_answer": item["predicted_answer"],
            "completion": item["completion"],
        }
        for key in section_keys:
            # A value already present on the record wins over the parsed one
            entry[key] = item.get(key) or sections[key]
        upload_data.append(entry)

    # Convert to Hugging Face dataset
    dataset = Dataset.from_list(upload_data)
    return dataset

# Function to push the dataset to Hugging Face
def push_to_huggingface(dataset, repo_id="tuandunghcmut/coding-mcq-reasoning_evaluated_on_test_set"):
    """
    Upload `dataset` to the Hugging Face Hub as a public dataset repository.

    A `whoami` call decides whether a login is needed: if it raises, the user
    is prompted for a token and logged in before the push.

    Args:
        dataset: Object exposing `push_to_hub`
        repo_id: Target repository on the Hub
    """
    try:
        # Raises when there is no usable login
        HfApi().whoami()
    except Exception:
        # No usable login: ask for a token interactively
        print("Please enter your Hugging Face token:")
        login(token=getpass())
    else:
        print("Already logged in to Hugging Face")

    # Publish the dataset
    dataset.push_to_hub(
        repo_id,
        private=False,
        commit_message="Add model evaluation results on test set",
    )

    print(f"Dataset successfully pushed to {repo_id}")

# Build the upload payload first so its size can be reported before asking
print("Preparing dataset for upload...")
dataset = prepare_dataset_for_upload()
print(f"Dataset prepared with {len(dataset)} entries")

# Pushing is opt-in: any reply other than "yes"/"y" (case-insensitive) cancels
push_confirmation = input("Do you want to push this dataset to Hugging Face? (yes/no): ")
if push_confirmation.lower() not in ("yes", "y"):
    print("Dataset push cancelled")
else:
    push_to_huggingface(dataset)

# Show the first two rows regardless of whether the push happened
print("\nSample of the dataset that would be pushed:")
print(dataset[:2])




