# Clone the repository if it doesn't exist

In [1]:
# Please set this to True if you want to clone the repository
SHOULD_CLONE = False
project_name = "Small-Qwen-Coding-Multiple-Choice"

import os, subprocess

if SHOULD_CLONE:
    subprocess.run(
        ["git", "clone", "https://github.com/tuandung222/Small-Qwen-Coding-Multiple-Choice.git"],
        check=True,
    )
    os.chdir(project_name)  # Change directory to the cloned project

# If running this notebook in the notebooks folder, go outside

In [2]:
import os

# Check if in the notebooks folder and go outside
if os.getcwd().endswith("notebooks"):
    os.chdir("..")  # Change directory to the parent folder

print(os.getcwd())  # Print the current working directory

/workspace/Small-Qwen-Coding-Multiple-Choice


# Import the necessary libraries

In [3]:
import os
import datasets
from pprint import pprint
import os
import torch
import random
from datasets import Dataset
from huggingface_hub import login, HfApi
import os
from getpass import getpass
from tqdm import tqdm
import torch
import re
import pandas as pd
from peft import PeftModel
import unsloth, transformers
from unsloth.models import FastLanguageModel


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth, transformers


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Import the necessary modules from the src folder

In [4]:
from src.model.qwen_handler import QwenModelHandler
from src.testing.tester import MultipleChoiceTester
from src.prompt_processors.prompt_creator import PromptCreator
from src.testing.tester import MultipleChoiceTester

# Load the test data

In [5]:
test_path = os.path.join("src", "data", "b6_test_data.csv")
if not os.path.exists(test_path):
    raise FileNotFoundError(f"Data file not found at: {test_path}")

test_data = datasets.load_dataset("csv", data_files=test_path)["train"]

# pprint(test_data[0])
# pprint(test_data.features)
# pprint(test_data.info)
pprint(test_data)

Dataset({
    features: ['task_id', 'question', 'choices'],
    num_rows: 1253
})


# Load the lastest model checkpoint from HuggingFace Hub

In [6]:
# Set HuggingFace Hub credentials if available
hf_token = os.environ.get("HF_TOKEN")

# Model ID on HuggingFace Hub
hub_model_id = "tuandunghcmut/Qwen25_Coder_MultipleChoice_v4"

print(f"Loading model from HuggingFace Hub: {hub_model_id}")


lastest_model_handler = QwenModelHandler(
    model_name=hub_model_id,
    max_seq_length=2048,
    quantization="4bit",
    model_source="unsloth",
    device_map="auto",
    attn_implementation="flash_attention_2",
)

# Use FastLanguageModel

FastLanguageModel.for_inference(lastest_model_handler.model)
prompt_creator = PromptCreator(PromptCreator.YAML_REASONING)
# Create a tester with the loaded model
latest_tester = MultipleChoiceTester(lastest_model_handler, prompt_creator=prompt_creator)

print("Successfully loaded model from HuggingFace Hub!")

Loading model from HuggingFace Hub: tuandunghcmut/Qwen25_Coder_MultipleChoice_v4
2025-04-04 19:18:25 - src.model.qwen_handler - INFO - Loading tuandunghcmut/Qwen25_Coder_MultipleChoice_v4 from unsloth, max_seq_length=2048


INFO:src.model.qwen_handler:Loading tuandunghcmut/Qwen25_Coder_MultipleChoice_v4 from unsloth, max_seq_length=2048


2025-04-04 19:18:25 - src.model.qwen_handler - INFO - Flash Attention 2 is available (package flash-attn detected)


INFO:src.model.qwen_handler:Flash Attention 2 is available (package flash-attn detected)


2025-04-04 19:18:25 - src.model.qwen_handler - INFO - Flash Attention 2 version: 2.7.4.post1


INFO:src.model.qwen_handler:Flash Attention 2 version: 2.7.4.post1


2025-04-04 19:18:25 - src.model.qwen_handler - INFO - xFormers is available (version: 0.0.29.post3)


INFO:src.model.qwen_handler:xFormers is available (version: 0.0.29.post3)


2025-04-04 19:18:25 - src.model.qwen_handler - INFO - CUDA is available (version: 12.4)


INFO:src.model.qwen_handler:CUDA is available (version: 12.4)


2025-04-04 19:18:25 - src.model.qwen_handler - INFO - Using attention implementation: flash_attention_2


INFO:src.model.qwen_handler:Using attention implementation: flash_attention_2


2025-04-04 19:18:25 - src.model.qwen_handler - INFO - Setting max memory: {0: '27620MiB'}


INFO:src.model.qwen_handler:Setting max memory: {0: '27620MiB'}


2025-04-04 19:18:25 - src.model.qwen_handler - INFO - Using attention implementation: flash_attention_2


INFO:src.model.qwen_handler:Using attention implementation: flash_attention_2


==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.3.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.19 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


2025-04-04 19:18:34 - src.model.qwen_handler - INFO - Model loaded successfully: tuandunghcmut/Qwen25_Coder_MultipleChoice_v4


INFO:src.model.qwen_handler:Model loaded successfully: tuandunghcmut/Qwen25_Coder_MultipleChoice_v4


2025-04-04 19:18:34 - src.model.qwen_handler - INFO - Model type: qwen2


INFO:src.model.qwen_handler:Model type: qwen2


2025-04-04 19:18:34 - src.model.qwen_handler - INFO - hidden_size: 1536


INFO:src.model.qwen_handler:hidden_size: 1536


2025-04-04 19:18:34 - src.model.qwen_handler - INFO - intermediate_size: 8960


INFO:src.model.qwen_handler:intermediate_size: 8960


2025-04-04 19:18:34 - src.model.qwen_handler - INFO - num_hidden_layers: 28


INFO:src.model.qwen_handler:num_hidden_layers: 28


2025-04-04 19:18:34 - src.model.qwen_handler - INFO - num_attention_heads: 12


INFO:src.model.qwen_handler:num_attention_heads: 12


2025-04-04 19:18:34 - src.model.qwen_handler - INFO - torch_dtype: float16


INFO:src.model.qwen_handler:torch_dtype: float16


Successfully loaded model from HuggingFace Hub!


In [7]:
print(lastest_model_handler.tokenizer.padding_side)
print(lastest_model_handler.tokenizer.eos_token_id)
print(lastest_model_handler.tokenizer.pad_token_id)

left
151645
151665


# Define a function to perform batch inference
* Generate answers for multiple-choice questions 
* Save results to a CSV file
* Track the results for each example
* Display the results as yaml format
* Display a sample of the results
* Push the results to Hugging Face


In [8]:
def generate_answer_csv_with_batch_inference(
    test_dataset,
    model_handler,
    prompt_creator,
    tester,
    output_folder="./outputs/inference_batch",
    batch_size=16,
    temperature=0.001,
    max_new_tokens=512,
    do_sample=True,
    top_p=0.95,
    top_k=80,
    debug_samples=10,
):
    """
    Generate answers for multiple-choice questions in batch mode.

    Args:
        test_dataset: Dataset containing test examples
        model_handler: Model handler instance
        prompt_creator: Prompt creator instance
        tester: Tester instance
        output_folder: Folder to save all output files
        batch_size: Number of examples to process at once
        temperature: Sampling temperature for generation
        max_new_tokens: Maximum number of tokens to generate
        do_sample: Whether to use sampling for generation
        top_p: Top-p sampling parameter
        top_k: Top-k sampling parameter
        debug_samples: Number of random samples to debug
    """
    import os
    import re
    import random
    import pandas as pd
    import torch
    from tqdm import tqdm
    from datetime import datetime

    
    # Create timestamp for the run
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    output_folder = os.path.join(output_folder, timestamp)
    os.makedirs(output_folder, exist_ok=True)
    
    # Create a dictionary to store results
    results_dict = {}
    
    # Create a list to store all detailed results for each example
    all_completions = []

    # Create a list to store detailed results for debugging
    debug_results = []

    # Select a few random examples for detailed debugging
    all_examples = list(test_dataset)
    num_examples = len(all_examples)
    debug_indices = random.sample(range(num_examples), min(debug_samples, num_examples))

    # Process examples in batches
    for batch_start in tqdm(range(0, num_examples, batch_size), desc="Processing batches"):
        # Get the current batch
        batch_end = min(batch_start + batch_size, num_examples)
        batch = all_examples[batch_start:batch_end]

        # Prepare batch inputs
        batch_inputs = []
        batch_task_ids = []
        batch_prompts = []
        batch_examples = []

        for example in batch:
            # Create prompt for each example
            prompt = prompt_creator.create_inference_prompt(
                example["question"], eval(example["choices"])
            )

            # Format as chat
            messages = [{"role": "user", "content": prompt}]
            chat_text = model_handler.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            batch_inputs.append(chat_text)
            batch_task_ids.append(example["task_id"])
            batch_prompts.append(prompt)
            batch_examples.append(example)

        # Tokenize all inputs at once
        model_handler.tokenizer.padding_side = "left"
        tokenized_inputs = model_handler.tokenizer(
            batch_inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=model_handler.max_seq_length,
        ).to(model_handler.model.device)

        # Perform batch inference using the model directly
        with torch.inference_mode():
            generated_ids = model_handler.model.generate(
                input_ids=tokenized_inputs.input_ids,
                attention_mask=tokenized_inputs.attention_mask,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                top_p=top_p,
                top_k=top_k,
                use_cache=True,
                pad_token_id=model_handler.tokenizer.pad_token_id,
            )

            # Extract only the generated part (not the input)
            batch_outputs = []
            for i, gen_ids in enumerate(generated_ids):
                # Get the length of the input
                input_length = tokenized_inputs.input_ids[i].shape[0]
                # Decode only the generated part
                output_text = model_handler.tokenizer.decode(
                    gen_ids[input_length:], skip_special_tokens=True
                )
                batch_outputs.append(output_text)

        # Process batch results and write to batch file
        batch_file = os.path.join(output_folder, f"batch_{batch_start//batch_size + 1}_{timestamp}.txt")
        batch_results = []
        
        with open(batch_file, "w", encoding="utf-8") as f:
            for i, (task_id, output, example) in enumerate(zip(batch_task_ids, batch_outputs, batch_examples)):
                # Extract predicted answer from model output
                result = {}
                
                # Simple regex to find the answer (A, B, C, or D)
                answer_match = re.search(r"answer\s*(?:is|:)?\s*([ABCD])", output, re.IGNORECASE)
                if answer_match:
                    result["predicted_answer"] = answer_match.group(1).upper()
                else:
                    # Fallback: look for the first occurrence of A, B, C, or D
                    for letter in ["A", "B", "C", "D", "E", "F", "G", "H", "I"]:
                        if (
                            f"({letter})" in output
                            or f"{letter}." in output
                            or f"answer {letter}" in output.lower()
                        ):
                            result["predicted_answer"] = letter
                            break
                    else:
                        # If no answer found, default to A
                        result["predicted_answer"] = "A"

                # Extract reasoning if available
                result["reasoning"] = output
                result["task_id"] = task_id
                
                # Store all original example data along with completions
                completion_record = {
                    "task_id": task_id,
                    "question": example["question"],
                    "choices": example["choices"],
                    "predicted_answer": result["predicted_answer"],
                    "completion": output,
                }
                if "answer" in example:
                    completion_record["ground_truth"] = example["answer"]
                
                all_completions.append(completion_record)
                
                # Write to batch file
                f.write(f"Task ID: {task_id}\n")
                f.write(f"Question: {example['question']}\n")
                f.write(f"Choices: {example['choices']}\n")
                f.write(f"Predicted Answer: {result['predicted_answer']}\n")
                f.write(f"Completion: {output}\n")
                if "answer" in example:
                    f.write(f"Ground Truth: {example['answer']}\n")
                f.write("\n" + "-"*80 + "\n\n")  # Separator
                
                # Store the result for submission
                results_dict[task_id] = result["predicted_answer"]
                batch_results.append(result)

                # Print progress update for every 10th example or last in batch
                if i % 10 == 0 or i == len(batch) - 1:
                    print(f"Batch {batch_start//batch_size + 1}: Processed {i+1}/{len(batch)} examples")

                # For selected examples, save detailed results for debugging
                global_index = batch_start + i
                if global_index in debug_indices:
                    example = batch[i]
                    debug_results.append(
                        {
                            "task_id": task_id,
                            "question": example["question"],
                            "choices": example["choices"],
                            "predicted_answer": result["predicted_answer"],
                            "reasoning": result.get("reasoning", "No reasoning provided"),
                        }
                    )

                    # Print detailed debug information for these examples
                    print(f"\n--- DETAILED DEBUG FOR TASK {task_id} ---")
                    print(f"Question: {example['question'][:100]}...")
                    print(f"Choices: {example['choices']}")
                    print(f"Predicted: {result['predicted_answer']}")
                    print(f"Reasoning snippet: {str(result.get('reasoning', 'No reasoning'))[:200]}...")
                    print("-----------------------------------\n")

    # Create a DataFrame from the results dictionary for submission
    submission_df = pd.DataFrame(list(results_dict.items()), columns=["task_id", "answer"])

    # Create a DataFrame with all completions
    all_completions_df = pd.DataFrame(all_completions)

    # Save all the files to the output folder
    submission_file = os.path.join(output_folder, f"submission_{timestamp}.csv")
    debug_file = os.path.join(output_folder, f"debug_results_{timestamp}.csv")
    all_completions_file = os.path.join(output_folder, f"all_completions_{timestamp}.csv")
    all_completions_json = os.path.join(output_folder, f"all_completions_{timestamp}.json")

    # Save files
    submission_df.to_csv(submission_file, index=False)
    pd.DataFrame(debug_results).to_csv(debug_file, index=False)
    all_completions_df.to_csv(all_completions_file, index=False)
    
    # Save as JSON for better inspection of large text fields
    import json
    with open(all_completions_json, 'w', encoding='utf-8') as f:
        json.dump(all_completions, f, indent=2, ensure_ascii=False)

    print(f"Submission saved to {submission_file}")
    print(f"Debug results saved to {debug_file}")
    print(f"All completions saved to {all_completions_file} and {all_completions_json}")
    
    return {
        "submission_file": submission_file,
        "debug_file": debug_file, 
        "all_completions_file": all_completions_file,
        "all_completions_json": all_completions_json,
        "results": results_dict,
        "debug_results": debug_results,
        "all_completions": all_completions, 
        "output_folder": output_folder
    }

# Inference on the test dataset and save the results to a CSV file

In [9]:
dict_results = generate_answer_csv_with_batch_inference(
    test_data,
    lastest_model_handler,
    prompt_creator,
    latest_tester
)

Processing batches:   0%|          | 0/79 [00:00<?, ?it/s]

Processing batches:   1%|▏         | 1/79 [00:34<44:35, 34.30s/it]

Batch 1: Processed 1/16 examples
Batch 1: Processed 11/16 examples
Batch 1: Processed 16/16 examples


Processing batches:   3%|▎         | 2/79 [00:56<34:56, 27.23s/it]

Batch 2: Processed 1/16 examples
Batch 2: Processed 11/16 examples
Batch 2: Processed 16/16 examples


Processing batches:   4%|▍         | 3/79 [01:18<31:41, 25.02s/it]

Batch 3: Processed 1/16 examples

--- DETAILED DEBUG FOR TASK k10466 ---
Question: Question: Leapfrog migration strategy allows faster development of new features with less risk that ...
Choices: ['True', 'False']
Predicted: A
Reasoning snippet: understanding: |
  The question is evaluating the distinction between leapfrog and strangler migration strategies in software development. It asks whether one strategy facilitates faster feature devel...
-----------------------------------

Batch 3: Processed 11/16 examples
Batch 3: Processed 16/16 examples


Processing batches:   5%|▌         | 4/79 [01:47<32:50, 26.28s/it]

Batch 4: Processed 1/16 examples
Batch 4: Processed 11/16 examples
Batch 4: Processed 16/16 examples


Processing batches:   6%|▋         | 5/79 [02:16<33:49, 27.42s/it]

Batch 5: Processed 1/16 examples
Batch 5: Processed 11/16 examples
Batch 5: Processed 16/16 examples


Processing batches:   8%|▊         | 6/79 [02:52<36:55, 30.35s/it]

Batch 6: Processed 1/16 examples
Batch 6: Processed 11/16 examples
Batch 6: Processed 16/16 examples


Processing batches:   9%|▉         | 7/79 [03:48<46:26, 38.70s/it]

Batch 7: Processed 1/16 examples

--- DETAILED DEBUG FOR TASK k10869 ---
Question: Question: The distance between two stations M and N is L kilometers. All frames are K bits long. The...
Choices: ['A', 'B', 'C', 'D']
Predicted: C
Reasoning snippet: understanding: |
  The question asks for the minimum number of bits needed for the sequence number field in a frame using the sliding window protocol to maximize channel utilization. Key factors inclu...
-----------------------------------

Batch 7: Processed 11/16 examples
Batch 7: Processed 16/16 examples


Processing batches:  10%|█         | 8/79 [04:45<52:47, 44.61s/it]

Batch 8: Processed 1/16 examples
Batch 8: Processed 11/16 examples
Batch 8: Processed 16/16 examples


Processing batches:  11%|█▏        | 9/79 [05:34<53:37, 45.96s/it]

Batch 9: Processed 1/16 examples
Batch 9: Processed 11/16 examples
Batch 9: Processed 16/16 examples


Processing batches:  13%|█▎        | 10/79 [06:09<48:50, 42.48s/it]

Batch 10: Processed 1/16 examples
Batch 10: Processed 11/16 examples
Batch 10: Processed 16/16 examples


Processing batches:  14%|█▍        | 11/79 [06:48<47:03, 41.52s/it]

Batch 11: Processed 1/16 examples
Batch 11: Processed 11/16 examples
Batch 11: Processed 16/16 examples


Processing batches:  15%|█▌        | 12/79 [07:24<44:29, 39.84s/it]

Batch 12: Processed 1/16 examples
Batch 12: Processed 11/16 examples
Batch 12: Processed 16/16 examples


Processing batches:  16%|█▋        | 13/79 [07:41<36:09, 32.87s/it]

Batch 13: Processed 1/16 examples
Batch 13: Processed 11/16 examples
Batch 13: Processed 16/16 examples


Processing batches:  18%|█▊        | 14/79 [08:01<31:28, 29.05s/it]

Batch 14: Processed 1/16 examples
Batch 14: Processed 11/16 examples
Batch 14: Processed 16/16 examples


Processing batches:  19%|█▉        | 15/79 [08:23<28:29, 26.71s/it]

Batch 15: Processed 1/16 examples
Batch 15: Processed 11/16 examples
Batch 15: Processed 16/16 examples


Processing batches:  20%|██        | 16/79 [08:45<26:33, 25.29s/it]

Batch 16: Processed 1/16 examples

--- DETAILED DEBUG FOR TASK rt00563 ---
Question: Question: Look at the problem below, the solution is missing a part, which option is the most likely...
Choices: ["    d = {3: 'Fizz', 5: 'Buzz'}", "    d = {4: 'Quux', 9: 'Quuz'}", "    d = {3: 'F', 5: 'B'}", "    d = {2: 'Foo', 7: 'Bar'}"]
Predicted: A
Reasoning snippet: understanding: |
  The question asks for the completion of a function that generates a list of strings representing numbers from 1 to n, replacing multiples of 3 with "Fizz", multiples of 5 with "Buzz...
-----------------------------------

Batch 16: Processed 11/16 examples
Batch 16: Processed 16/16 examples


Processing batches:  22%|██▏       | 17/79 [09:07<25:15, 24.44s/it]

Batch 17: Processed 1/16 examples
Batch 17: Processed 11/16 examples
Batch 17: Processed 16/16 examples


Processing batches:  23%|██▎       | 18/79 [09:29<24:07, 23.72s/it]

Batch 18: Processed 1/16 examples
Batch 18: Processed 11/16 examples
Batch 18: Processed 16/16 examples


Processing batches:  24%|██▍       | 19/79 [09:51<23:06, 23.11s/it]

Batch 19: Processed 1/16 examples
Batch 19: Processed 11/16 examples
Batch 19: Processed 16/16 examples


Processing batches:  25%|██▌       | 20/79 [10:10<21:31, 21.89s/it]

Batch 20: Processed 1/16 examples
Batch 20: Processed 11/16 examples

--- DETAILED DEBUG FOR TASK rt01153 ---
Question: Question: Look at the problem below, the solution is missing a part, which option is the most likely...
Choices: ['        prefix += 1 << index', '        prefix ^= 1 << index', '        prefix |= 1 << index', '        prefix |= (1 << index) - 1']
Predicted: C
Reasoning snippet: understanding: |
  The problem requires finding the length of the longest substring where each vowel appears an even number of times. The solution involves using bitwise operations to track the parity...
-----------------------------------

Batch 20: Processed 16/16 examples


Processing batches:  27%|██▋       | 21/79 [10:35<22:06, 22.87s/it]

Batch 21: Processed 1/16 examples
Batch 21: Processed 11/16 examples
Batch 21: Processed 16/16 examples

--- DETAILED DEBUG FOR TASK rt01288 ---
Question: Question: Look at the problem below, the solution is missing a part, which option is the most likely...
Choices: ['    return self.minimumOneBitOperations(n & (x | x >> 1)) + 1 + x - 1', '    return self.minimumOneBitOperations(n ^ (x | x >> 1)) + 1 + x - 1', '    return self.minimumOneBitOperations(n ^ (x | x << 1)) + 1 + x - 1', '    return self.minimumOneBitOperations(n | (x | x >> 1)) + 1 + x - 1']
Predicted: B
Reasoning snippet: understanding: |
  The question asks for the completion of a recursive function that calculates the minimum number of operations needed to convert an integer `n` to zero using specific bitwise operati...
-----------------------------------



Processing batches:  28%|██▊       | 22/79 [10:56<21:19, 22.44s/it]

Batch 22: Processed 1/16 examples
Batch 22: Processed 11/16 examples
Batch 22: Processed 16/16 examples


Processing batches:  29%|██▉       | 23/79 [11:21<21:38, 23.20s/it]

Batch 23: Processed 1/16 examples
Batch 23: Processed 11/16 examples
Batch 23: Processed 16/16 examples


Processing batches:  30%|███       | 24/79 [11:47<21:55, 23.92s/it]

Batch 24: Processed 1/16 examples
Batch 24: Processed 11/16 examples
Batch 24: Processed 16/16 examples


Processing batches:  32%|███▏      | 25/79 [12:08<20:46, 23.08s/it]

Batch 25: Processed 1/16 examples
Batch 25: Processed 11/16 examples
Batch 25: Processed 16/16 examples


Processing batches:  33%|███▎      | 26/79 [12:31<20:26, 23.15s/it]

Batch 26: Processed 1/16 examples
Batch 26: Processed 11/16 examples
Batch 26: Processed 16/16 examples


Processing batches:  34%|███▍      | 27/79 [12:59<21:10, 24.44s/it]

Batch 27: Processed 1/16 examples
Batch 27: Processed 11/16 examples
Batch 27: Processed 16/16 examples


Processing batches:  35%|███▌      | 28/79 [13:21<20:08, 23.70s/it]

Batch 28: Processed 1/16 examples

--- DETAILED DEBUG FOR TASK rt02309 ---
Question: Question: Look at the problem below, the solution is missing a part, which option is the most likely...
Choices: ['    return l if l <= len(changeIndices) else -1', '    return min(l, len(changeIndices)) if l > 0 else -1', '    return l if l <= len(changeIndices) and l > 0 else -1', '    return l if 1 <= l <= len(changeIndices) else -1']
Predicted: A
Reasoning snippet: understanding: |
  The problem involves determining the earliest second when all indices in an array can be marked using specific operations. The solution requires checking if marking can occur up to ...
-----------------------------------

Batch 28: Processed 11/16 examples
Batch 28: Processed 16/16 examples


Processing batches:  37%|███▋      | 29/79 [13:41<18:56, 22.73s/it]

Batch 29: Processed 1/16 examples
Batch 29: Processed 11/16 examples
Batch 29: Processed 16/16 examples


Processing batches:  38%|███▊      | 30/79 [14:07<19:13, 23.55s/it]

Batch 30: Processed 1/16 examples
Batch 30: Processed 11/16 examples
Batch 30: Processed 16/16 examples


Processing batches:  39%|███▉      | 31/79 [14:29<18:37, 23.29s/it]

Batch 31: Processed 1/16 examples
Batch 31: Processed 11/16 examples
Batch 31: Processed 16/16 examples


Processing batches:  41%|████      | 32/79 [14:48<17:04, 21.80s/it]

Batch 32: Processed 1/16 examples
Batch 32: Processed 11/16 examples
Batch 32: Processed 16/16 examples


Processing batches:  42%|████▏     | 33/79 [15:07<16:05, 20.99s/it]

Batch 33: Processed 1/16 examples
Batch 33: Processed 11/16 examples
Batch 33: Processed 16/16 examples


Processing batches:  43%|████▎     | 34/79 [15:32<16:36, 22.15s/it]

Batch 34: Processed 1/16 examples
Batch 34: Processed 11/16 examples
Batch 34: Processed 16/16 examples


Processing batches:  44%|████▍     | 35/79 [16:04<18:26, 25.14s/it]

Batch 35: Processed 1/16 examples
Batch 35: Processed 11/16 examples
Batch 35: Processed 16/16 examples


Processing batches:  46%|████▌     | 36/79 [16:30<18:17, 25.52s/it]

Batch 36: Processed 1/16 examples
Batch 36: Processed 11/16 examples
Batch 36: Processed 16/16 examples


Processing batches:  47%|████▋     | 37/79 [16:59<18:30, 26.44s/it]

Batch 37: Processed 1/16 examples
Batch 37: Processed 11/16 examples
Batch 37: Processed 16/16 examples


Processing batches:  48%|████▊     | 38/79 [17:20<17:01, 24.92s/it]

Batch 38: Processed 1/16 examples
Batch 38: Processed 11/16 examples

--- DETAILED DEBUG FOR TASK rt06381 ---
Question: Question: Given a code snippet below, which behavior most likely to occur when execute it?
A,B=map(i...
Choices: ['Compile Error', 'Memory Limit Exceeded', 'Runtime Error', 'Internal error']
Predicted: A
Reasoning snippet: understanding: |
  The question asks about the expected behavior when executing a Python code snippet that reads two integers from input, performs arithmetic operations, and prints the maximum result....
-----------------------------------

Batch 38: Processed 16/16 examples


Processing batches:  49%|████▉     | 39/79 [18:11<21:51, 32.78s/it]

Batch 39: Processed 1/16 examples
Batch 39: Processed 11/16 examples
Batch 39: Processed 16/16 examples


Processing batches:  51%|█████     | 40/79 [18:45<21:28, 33.04s/it]

Batch 40: Processed 1/16 examples
Batch 40: Processed 11/16 examples
Batch 40: Processed 16/16 examples


Processing batches:  52%|█████▏    | 41/79 [19:14<20:09, 31.83s/it]

Batch 41: Processed 1/16 examples
Batch 41: Processed 11/16 examples
Batch 41: Processed 16/16 examples


Processing batches:  53%|█████▎    | 42/79 [19:51<20:37, 33.45s/it]

Batch 42: Processed 1/16 examples
Batch 42: Processed 11/16 examples
Batch 42: Processed 16/16 examples


Processing batches:  54%|█████▍    | 43/79 [20:16<18:30, 30.86s/it]

Batch 43: Processed 1/16 examples
Batch 43: Processed 11/16 examples
Batch 43: Processed 16/16 examples


Processing batches:  56%|█████▌    | 44/79 [20:46<17:49, 30.56s/it]

Batch 44: Processed 1/16 examples
Batch 44: Processed 11/16 examples
Batch 44: Processed 16/16 examples


Processing batches:  57%|█████▋    | 45/79 [21:02<14:50, 26.19s/it]

Batch 45: Processed 1/16 examples
Batch 45: Processed 11/16 examples
Batch 45: Processed 16/16 examples


Processing batches:  58%|█████▊    | 46/79 [21:23<13:30, 24.57s/it]

Batch 46: Processed 1/16 examples
Batch 46: Processed 11/16 examples
Batch 46: Processed 16/16 examples


Processing batches:  59%|█████▉    | 47/79 [21:44<12:32, 23.50s/it]

Batch 47: Processed 1/16 examples
Batch 47: Processed 11/16 examples
Batch 47: Processed 16/16 examples


Processing batches:  61%|██████    | 48/79 [22:07<12:03, 23.34s/it]

Batch 48: Processed 1/16 examples
Batch 48: Processed 11/16 examples
Batch 48: Processed 16/16 examples


Processing batches:  62%|██████▏   | 49/79 [22:30<11:36, 23.22s/it]

Batch 49: Processed 1/16 examples
Batch 49: Processed 11/16 examples

--- DETAILED DEBUG FOR TASK k08745 ---
Question: Question: Consider a double hashing scheme in which the primary hash function is h1(k) = k mod 23, a...
Choices: ['13', '15', '21', '23']
Predicted: A
Reasoning snippet: understanding: |
  The question involves using double hashing to determine the address where a key should be placed in a hash table. The primary hash function uses modular arithmetic, and the secondar...
-----------------------------------

Batch 49: Processed 16/16 examples


Processing batches:  63%|██████▎   | 50/79 [22:49<10:39, 22.04s/it]

Batch 50: Processed 1/16 examples
Batch 50: Processed 11/16 examples
Batch 50: Processed 16/16 examples


Processing batches:  65%|██████▍   | 51/79 [23:12<10:27, 22.40s/it]

Batch 51: Processed 1/16 examples
Batch 51: Processed 11/16 examples
Batch 51: Processed 16/16 examples


Processing batches:  66%|██████▌   | 52/79 [23:36<10:14, 22.78s/it]

Batch 52: Processed 1/16 examples
Batch 52: Processed 11/16 examples
Batch 52: Processed 16/16 examples


Processing batches:  67%|██████▋   | 53/79 [23:55<09:24, 21.72s/it]

Batch 53: Processed 1/16 examples
Batch 53: Processed 11/16 examples
Batch 53: Processed 16/16 examples


Processing batches:  68%|██████▊   | 54/79 [24:17<09:02, 21.68s/it]

Batch 54: Processed 1/16 examples
Batch 54: Processed 11/16 examples
Batch 54: Processed 16/16 examples


Processing batches:  70%|██████▉   | 55/79 [24:38<08:34, 21.43s/it]

Batch 55: Processed 1/16 examples
Batch 55: Processed 11/16 examples
Batch 55: Processed 16/16 examples


Processing batches:  71%|███████   | 56/79 [25:02<08:32, 22.28s/it]

Batch 56: Processed 1/16 examples
Batch 56: Processed 11/16 examples
Batch 56: Processed 16/16 examples


Processing batches:  72%|███████▏  | 57/79 [25:20<07:43, 21.07s/it]

Batch 57: Processed 1/16 examples
Batch 57: Processed 11/16 examples
Batch 57: Processed 16/16 examples


Processing batches:  73%|███████▎  | 58/79 [25:43<07:36, 21.74s/it]

Batch 58: Processed 1/16 examples
Batch 58: Processed 11/16 examples
Batch 58: Processed 16/16 examples


Processing batches:  75%|███████▍  | 59/79 [26:06<07:18, 21.95s/it]

Batch 59: Processed 1/16 examples
Batch 59: Processed 11/16 examples
Batch 59: Processed 16/16 examples


Processing batches:  76%|███████▌  | 60/79 [26:29<07:05, 22.38s/it]

Batch 60: Processed 1/16 examples
Batch 60: Processed 11/16 examples
Batch 60: Processed 16/16 examples


Processing batches:  77%|███████▋  | 61/79 [26:50<06:33, 21.88s/it]

Batch 61: Processed 1/16 examples
Batch 61: Processed 11/16 examples
Batch 61: Processed 16/16 examples


Processing batches:  78%|███████▊  | 62/79 [27:11<06:06, 21.54s/it]

Batch 62: Processed 1/16 examples
Batch 62: Processed 11/16 examples
Batch 62: Processed 16/16 examples


Processing batches:  80%|███████▉  | 63/79 [27:30<05:33, 20.83s/it]

Batch 63: Processed 1/16 examples
Batch 63: Processed 11/16 examples
Batch 63: Processed 16/16 examples


Processing batches:  81%|████████  | 64/79 [27:48<05:00, 20.02s/it]

Batch 64: Processed 1/16 examples
Batch 64: Processed 11/16 examples
Batch 64: Processed 16/16 examples


Processing batches:  82%|████████▏ | 65/79 [28:10<04:50, 20.78s/it]

Batch 65: Processed 1/16 examples
Batch 65: Processed 11/16 examples
Batch 65: Processed 16/16 examples


Processing batches:  84%|████████▎ | 66/79 [28:32<04:32, 20.93s/it]

Batch 66: Processed 1/16 examples
Batch 66: Processed 11/16 examples
Batch 66: Processed 16/16 examples


Processing batches:  85%|████████▍ | 67/79 [28:53<04:12, 21.03s/it]

Batch 67: Processed 1/16 examples

--- DETAILED DEBUG FOR TASK k05597 ---
Question: Question: The value printed by the following program is 


void f(int* p, int m)
{
    m = m + 5;
  ...
Choices: ['10', '20', '30', '40']
Predicted: B
Reasoning snippet: understanding: |
  The question tests understanding of pointer usage and function parameters in C. It involves modifying variables through pointers and observing the effect on the original variables.
...
-----------------------------------

Batch 67: Processed 11/16 examples
Batch 67: Processed 16/16 examples


Processing batches:  86%|████████▌ | 68/79 [29:14<03:49, 20.88s/it]

Batch 68: Processed 1/16 examples
Batch 68: Processed 11/16 examples
Batch 68: Processed 16/16 examples


Processing batches:  87%|████████▋ | 69/79 [29:33<03:25, 20.55s/it]

Batch 69: Processed 1/16 examples
Batch 69: Processed 11/16 examples
Batch 69: Processed 16/16 examples


Processing batches:  89%|████████▊ | 70/79 [29:50<02:54, 19.44s/it]

Batch 70: Processed 1/16 examples
Batch 70: Processed 11/16 examples
Batch 70: Processed 16/16 examples


Processing batches:  90%|████████▉ | 71/79 [30:06<02:27, 18.39s/it]

Batch 71: Processed 1/16 examples
Batch 71: Processed 11/16 examples
Batch 71: Processed 16/16 examples


Processing batches:  91%|█████████ | 72/79 [30:22<02:04, 17.71s/it]

Batch 72: Processed 1/16 examples
Batch 72: Processed 11/16 examples
Batch 72: Processed 16/16 examples


Processing batches:  92%|█████████▏| 73/79 [30:38<01:42, 17.16s/it]

Batch 73: Processed 1/16 examples
Batch 73: Processed 11/16 examples
Batch 73: Processed 16/16 examples


Processing batches:  94%|█████████▎| 74/79 [30:55<01:25, 17.17s/it]

Batch 74: Processed 1/16 examples
Batch 74: Processed 11/16 examples
Batch 74: Processed 16/16 examples


Processing batches:  95%|█████████▍| 75/79 [31:14<01:10, 17.59s/it]

Batch 75: Processed 1/16 examples
Batch 75: Processed 11/16 examples
Batch 75: Processed 16/16 examples


Processing batches:  96%|█████████▌| 76/79 [31:30<00:51, 17.28s/it]

Batch 76: Processed 1/16 examples

--- DETAILED DEBUG FOR TASK k00476 ---
Question: Question: What is the purpose of the np.clip() function in NumPy?...
Choices: ['Clips the array values to be within a specified range', 'Clips the array to its maximum value', 'Clips the array to its minimum value', 'Clips the array based on a boolean condition']
Predicted: A
Reasoning snippet: understanding: |
  The question asks about the functionality of the np.clip() function in the NumPy library, which is used for clipping array elements.
analysis: |
  A. Correct. np.clip() clips array ...
-----------------------------------

Batch 76: Processed 11/16 examples
Batch 76: Processed 16/16 examples


Processing batches:  97%|█████████▋| 77/79 [31:46<00:33, 16.89s/it]

Batch 77: Processed 1/16 examples
Batch 77: Processed 11/16 examples
Batch 77: Processed 16/16 examples


Processing batches:  99%|█████████▊| 78/79 [32:03<00:16, 16.92s/it]

Batch 78: Processed 1/16 examples
Batch 78: Processed 11/16 examples
Batch 78: Processed 16/16 examples


Processing batches: 100%|██████████| 79/79 [32:19<00:00, 24.55s/it]

Batch 79: Processed 1/5 examples
Batch 79: Processed 5/5 examples
Submission saved to ./outputs/inference_batch/20250404_191836/submission_20250404_191836.csv
Debug results saved to ./outputs/inference_batch/20250404_191836/debug_results_20250404_191836.csv
All completions saved to ./outputs/inference_batch/20250404_191836/all_completions_20250404_191836.csv and ./outputs/inference_batch/20250404_191836/all_completions_20250404_191836.json





In [None]:
# dump dict_results to a json file
try:
    import json
    with open("dict_results.json", "w") as f:
        json.dump(dict_results, f, indent=2)
except:
    # write text to dict_results.txt
    with open("dict_results.txt", "w") as f:
        f.write(str(dict_results))


# Display the results as yaml format

In [12]:
import yaml
from IPython.display import display, HTML

# Function to format the completion as HTML with proper formatting
def format_completion_as_html(completion):
    # Extract components from the completion
    understanding = completion.get("understanding", "")
    analysis = completion.get("analysis", "")
    reasoning = completion.get("reasoning", "")
    conclusion = completion.get("conclusion", "")
    answer = completion.get("answer", "")
    
    # Create HTML with proper formatting and styling
    html = f"""
    <div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; margin-bottom: 20px;">
        <h3 style="color: #2c3e50; border-bottom: 1px solid #eee; padding-bottom: 10px;">Completion Analysis</h3>
        
        <div style="margin-top: 10px;">
            <h4 style="color: #3498db;">Understanding</h4>
            <p style="white-space: pre-wrap;">{understanding}</p>
        </div>
        
        <div style="margin-top: 10px;">
            <h4 style="color: #3498db;">Analysis</h4>
            <p style="white-space: pre-wrap;">{analysis}</p>
        </div>
        
        <div style="margin-top: 10px;">
            <h4 style="color: #3498db;">Reasoning</h4>
            <p style="white-space: pre-wrap;">{reasoning}</p>
        </div>
        
        <div style="margin-top: 10px;">
            <h4 style="color: #3498db;">Conclusion</h4>
            <p style="white-space: pre-wrap;">{conclusion}</p>
        </div>
        
        <div style="margin-top: 10px; font-weight: bold;">
            <h4 style="color: #e74c3c;">Answer</h4>
            <p>{answer}</p>
        </div>
    </div>
    """
    return html

# Display a sample of the results
def display_random_example(dict_results):
    """
    Display a random example from the completions with formatted output.
    
    Args:
        dict_results (dict): Dictionary containing all completion results
    """
    if dict_results["all_completions"]:
        # Get a random completion instead of always the first one
        import random
        sample_completion = random.choice(dict_results["all_completions"])
        
        print("Sample Task ID:", sample_completion["task_id"])
        print("Sample Question:", sample_completion["question"][:400] + "..." if len(sample_completion["question"]) > 400 else sample_completion["question"])
        print("Sample Choices:", sample_completion["choices"])
        print("Predicted Answer:", sample_completion["predicted_answer"])
        
        # Display the completion in YAML format for better readability
        print("\nCompletion in YAML format:")
        print(yaml.dump(sample_completion["completion"], default_flow_style=False, sort_keys=False))
        
        # Display the completion in a nicely formatted HTML
        print("\nFormatted Completion:")
        # Parse the completion string into a dictionary if it's a string
        completion_dict = {}
        if isinstance(sample_completion["completion"], str):
            # Extract sections from the completion string
            completion_text = sample_completion["completion"]
            
            sections = ["understanding", "analysis", "reasoning", "conclusion", "answer"]
            current_section = None
            section_content = {}
            
            for line in completion_text.split('\n'):
                line = line.strip()
                if not line:
                    continue
                    
                for section in sections:
                    if line.lower().startswith(f"{section}:") or line.lower() == f"{section}: |":
                        current_section = section
                        section_content[current_section] = ""
                        break
                        
                if current_section and not any(line.lower().startswith(f"{s}:") for s in sections):
                    if line.startswith('|'):
                        line = line[1:].strip()
                    section_content[current_section] += line + "\n"
            
            # Clean up the extracted content
            completion_dict = {k: v.strip() for k, v in section_content.items()}
        else:
            completion_dict = sample_completion["completion"]
            
        display(HTML(format_completion_as_html(completion_dict)))
        
        # Display some statistics
        print(f"\nTotal completions: {len(dict_results['all_completions'])}")
    else:
        print("No completions available in the results.")

In [14]:
# Call the function to display a random example,
# Each time you run the cell, it will display a different example
display_random_example(dict_results)

Sample Task ID: k02198
Sample Question: Question: What will be the output of the following C++ code?
#include <iostream> 
#include <string>
#include <cstring>
using namespace std; 
int main(int argc, char const *argv[])
{
	string s("a");
	cout<<s;
	return 0;
}
Sample Choices: ['a', 'empty string', 'Error', 'Segmentation fault']
Predicted Answer: C

Completion in YAML format:
"understanding: |\n  The question tests knowledge of how C++ handles string objects\
  \ and their default behavior when initialized without arguments.\nanalysis: |\n\
  \  A. \"a\" - This would be correct if the string object was properly constructed\
  \ with an initial value. However, the code does not initialize it correctly.\n \
  \ B. empty string - This is incorrect because the code attempts to print the string\
  \ object `s`, which is not empty.\n  C. Error - This is correct because the code\
  \ uses `#include <cstring>` instead of `<cstring>`, leading to a compilation error\
  \ due to undefined behavior


Total completions: 1253


In [15]:
# Post-processing and pushing the test dataset with completions to Hugging Face

# Create a dataset from the completions
def prepare_dataset_for_upload():
    # Extract the necessary data from all_completions
    upload_data = []
    
    for item in dict_results["all_completions"]:
        entry = {
            "task_id": item["task_id"],
            "question": item["question"],
            "choices": item["choices"],
            "predicted_answer": item["predicted_answer"],
            "completion": item["completion"],
            "reasoning": item.get("reasoning", ""),
            "understanding": item.get("understanding", ""),
            "analysis": item.get("analysis", ""),
            "conclusion": item.get("conclusion", "")
        }
        upload_data.append(entry)
    
    # Convert to Hugging Face dataset
    dataset = Dataset.from_list(upload_data)
    return dataset

# Function to push the dataset to Hugging Face
def push_to_huggingface(dataset, repo_id="tuandunghcmut/coding-mcq-reasoning_evaluated_on_test_set"):
    # Ask for Hugging Face token if not already logged in
    try:
        # Check if already logged in
        api = HfApi()
        api.whoami()
        print("Already logged in to Hugging Face")
    except Exception:
        # If not logged in, ask for token
        print("Please enter your Hugging Face token:")
        token = getpass()
        login(token=token)
    
    # Push the dataset to Hugging Face
    dataset.push_to_hub(
        repo_id,
        commit_message="Add model evaluation results on test set",
        private=False
    )
    
    print(f"Dataset successfully pushed to {repo_id}")

# Execute the functions
print("Preparing dataset for upload...")
dataset = prepare_dataset_for_upload()
print(f"Dataset prepared with {len(dataset)} entries")
push_to_huggingface(dataset)

Preparing dataset for upload...
Dataset prepared with 1253 entries
Already logged in to Hugging Face


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Dataset successfully pushed to tuandunghcmut/coding-mcq-reasoning_evaluated_on_test_set
