### Installing packages on collab

In [1]:
!pip install -q "transformers[torch]" trl datasets peft accelerate bitsandbytes pandasql python-Levenshtein
!pip install -U bitsandbytes
!pip install -U transformers accelerate peft
!pip install -q gdown
!apt-get install -y p7zip-full

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m110.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pandasql (setup.py) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
p7zip-full is already the newest version (16.02+dfsg-8).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


### Necessary imports

In [2]:
# @title Imports & global config

import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
import ast
import re
import shutil
import base64
from IPython.display import HTML, display
from datasets import Dataset

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import Levenshtein as lev
import os

#MODEL_ID = "Qwen/Qwen2.5-Coder-7B"
MODEL_ID = "Qwen/Qwen3-1.7B"
# Subset pour tester rapidement
TRAIN_NUM_SHARDS = 10
TRAIN_SHARD_INDEX = 0
TEST_NUM_SHARDS = 5
TEST_SHARD_INDEX = 0

MAX_EVAL_SAMPLES = 200  # nb d'exemples d'évaluation
DO_TRAIN = True
      
 

### Loading and formatting dataset

In [3]:
from google.colab import files
import zipfile
import os
import json
import gdown

def folder_to_dataframe(folder_path):
    """
    Reads all JSON files from a specified folder and converts them into a pandas DataFrame.
    
    Args:
    folder_path (str): The path to the folder containing JSON files.
    
    Returns:
    pd.DataFrame: A DataFrame containing the combined data from all JSON files.
    """
    data_list = []

    if not os.path.exists(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return pd.DataFrame()

    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    
                    if isinstance(data, list):
                        data_list.extend(data)
                    else:
                        data_list.append(data)
                        
            except json.JSONDecodeError:
                print(f"Warning: Could not decode {filename}. Skipping.")
            except Exception as e:
                print(f"Error reading {filename}: {e}")

    df = pd.DataFrame(data_list)
    return df

def clean_format_dataframe(df):
    """
    Formats the DataFrame by ensuring consistent column names, data types,
    and one-hot encoding the tags column.
    """
    columns_to_keep = ['tags', 'source_code', 'difficulty','prob_desc_input_spec','prob_desc_output_spec','prob_desc_description','prob_desc_notes']
    df_cleaned = df[columns_to_keep].copy()
    
    distinct_tuples = df_cleaned['tags'].apply(tuple).unique()
    distinct_lists = [list(x) for x in distinct_tuples]
    print(f"Distinct tag combinations found: {len(distinct_lists)}")
    
    tags_to_keep = ['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities']
    
    def filter_tags(tag_list):
        return [tag for tag in tag_list if tag in tags_to_keep]
    
    df_cleaned['tags'] = df_cleaned['tags'].apply(filter_tags)

    for tag in tags_to_keep:
        col_name = f"tag_{tag.replace(' ', '_')}" 
        df_cleaned[col_name] = df_cleaned['tags'].apply(lambda x: 1 if tag in x else 0)
        
    return df_cleaned


### This part is dedicated to downloading and extracting the dataset
### Uncomment and run this section if you need to download the dataset on collab extension in vscode

file_id = '1x-6BoOuGfY3HQnpixDiu8wWQHWHwyrfi'
output_file = 'code_classification_dataset.zip'

print(f"Downloading file with ID: {file_id}...")
gdown.download(id=file_id, output=output_file, quiet=False)

file_size = os.path.getsize(output_file)
print(f"Downloaded file size: {file_size / 1024:.2f} KB")

filename = "code_classification_dataset.zip"

print("Extracting 7z archive...")
!7z x code_classification_dataset.zip -o/content/code_classification_dataset/


df = folder_to_dataframe("/content/code_classification_dataset/code_classification_dataset/")
df = clean_format_dataframe(df)
dataset = Dataset.from_pandas(df)

Downloading file with ID: 1x-6BoOuGfY3HQnpixDiu8wWQHWHwyrfi...


Downloading...
From: https://drive.google.com/uc?id=1x-6BoOuGfY3HQnpixDiu8wWQHWHwyrfi
To: /content/code_classification_dataset.zip
100%|██████████| 3.66M/3.66M [00:00<00:00, 20.9MB/s]


Downloaded file size: 3572.02 KB
Extracting 7z archive...

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Sca        1 file, 3657745 bytes (3573 KiB)

Extracting archive: code_classification_dataset.zip
code_classification_dataset.zip
Can not open the file as [zip] archive
The file is open as [7z] archive

--
Path = code_classification_dataset.zip
Type = 7z
Physical Size = 3657745
Headers Size = 42550
Method = LZMA2:24
Solid = +
Blocks = 1

      0% 849 - code_classification_dataset/sample_1761.jso                                                       26% 1240 - code_classification_dataset/sample_2112.js                                                       26% 1667 - code_classification_dataset/sample_2498.js                                                       26% 2015 - code_classification_dataset

### Spliting train, test and valiation data

In [4]:
train_testvalid = dataset.train_test_split(test_size=0.5, seed=42)
train_dataset = train_testvalid['train']
test_valid_dataset = train_testvalid['test']

valid_test = test_valid_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset = valid_test['train']
test_dataset = valid_test['test']

train_small = train_dataset

print(f"Train size: {len(train_dataset)} ({len(train_dataset)/len(dataset):.1%})")
print(f"Val size:   {len(val_dataset)} ({len(val_dataset)/len(dataset):.1%})")
print(f"Test size:  {len(test_dataset)} ({len(test_dataset)/len(dataset):.1%})")

Train size: 2491 (50.0%)
Val size:   1245 (25.0%)
Test size:  1246 (25.0%)


### Formating prompts for train and validation

In [5]:
def format_prompt_train(example):
    return f"""### INSTRUCTION
Given the following problem description, classify it to the following categories:['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities'].


### PROB DESCRIPTION NOTES
{example['prob_desc_notes']}

### PROB DESCRIPTION OUTPUT SPEC
{example['prob_desc_output_spec']}

### PROB DESCRIPTION INPUT SPEC
{example['prob_desc_input_spec']}

### PROB DESCRIPTION 
{example['prob_desc_description']}

### CATEGORY
{example['tags']}
"""

def format_prompt_eval(example):
    return f"""### INSTRUCTION
Given the following problem description, classify it to the following categories:['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities'].


### PROB DESCRIPTION NOTES
{example['prob_desc_notes']}

### PROB DESCRIPTION OUTPUT SPEC
{example['prob_desc_output_spec']}

### PROB DESCRIPTION INPUT SPEC
{example['prob_desc_input_spec']}

### PROB DESCRIPTION 
{example['prob_desc_description']}

### CATEGORY
[
""".strip()

In [6]:
def build_eval_prompt(row):
    return format_prompt_eval(row)

### Loading the llm model and the tokenizer

In [7]:
from transformers import BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.model_max_length = 2048
tokenizer.truncation_side = "left"

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True,
# )

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,     
    device_map="auto",          
    trust_remote_code=True,
)

base_model.config.use_cache = False
base_model.gradient_checkpointing_enable()
base_model.enable_input_require_grads()

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

### Lora configuration and setup of training hyperparameters

In [8]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir="./sql_finetune_results",
    per_device_train_batch_size=2,     
    gradient_accumulation_steps=4,     
    learning_rate=2e-4,
    num_train_epochs=1,               
    logging_steps=50,
    save_strategy="epoch",

    fp16=False,
    bf16=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    report_to="none",
)

### Finetuning launch

In [9]:
trainer = SFTTrainer(
    model=base_model,
    processing_class=tokenizer,
    train_dataset=train_small,
    formatting_func=format_prompt_train,
    peft_config=peft_config,
    args=training_args,
)


if DO_TRAIN:
    gc.collect()
    torch.cuda.empty_cache()
    print("=== Start training ===")
    trainer.train()
    adapter_save_path = "./llm_challenge_adapter"
    trainer.save_model(adapter_save_path)
    print("LoRA adapter saved to", adapter_save_path)
else:
    adapter_save_path = "./llm_challenge_adapter"
    print("Training skipped, expecting existing adapter at", adapter_save_path)

Applying formatting function to train dataset:   0%|          | 0/2491 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/2491 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2491 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2491 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


=== Start training ===


KeyboardInterrupt: 

### Save and download the model after fine tuning from colab (do not launch if you didn't train the model)

In [9]:

def download_directory(path, filename=None):
    """
    Zips a directory and creates a clickable download link for the zip file.
    
    path: full path to the directory (e.g. '/content/my_model')
    filename: optional name for the output zip (without .zip extension)
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Path not found: {path}")

    folder_name = os.path.basename(path.rstrip(os.sep))
    filename = filename or folder_name

    print(f"Zipping directory '{path}'...")
    zip_path = shutil.make_archive(filename, 'zip', path)
    
    with open(zip_path, "rb") as f:
        data = f.read()

    b64 = base64.b64encode(data).decode("utf-8")
    download_filename = os.path.basename(zip_path)
    href = f'data:application/zip;base64,{b64}'
    html = f'<a download="{download_filename}" href="{href}">Download {download_filename}</a>'

    display(HTML(html))
    

download_directory(adapter_save_path)

NameError: name 'adapter_save_path' is not defined

### reload the model for evaluation (avoid retraining the model by downloading weights)

In [10]:

### This part is dedicated to downloading and extracting the model
### Uncomment and run this section if you need to download the model on collab extension in vscode

file_id = '1fEvoMixqRp3svBscQWCWrVQBU-UACmlw'
output_file = 'llm_challenge_adapter.zip'

print(f"Downloading file with ID: {file_id}...")
gdown.download(id=file_id, output=output_file, quiet=False)

file_size = os.path.getsize(output_file)
print(f"Downloaded file size: {file_size / 1024:.2f} KB")

filename = "llm_challenge_adapter.zip"

print("Extracting 7z archive...")
!7z x llm_challenge_adapter.zip -o/content/llm_challenge_adapter/

zip_file_path = "llm_challenge_adapter.zip" 
extract_path = "./llm_challenge_adapter"

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Extracted adapter to: {extract_path}")

Downloading file with ID: 1fEvoMixqRp3svBscQWCWrVQBU-UACmlw...


Downloading...
From: https://drive.google.com/uc?id=1fEvoMixqRp3svBscQWCWrVQBU-UACmlw
To: /content/llm_challenge_adapter.zip
100%|██████████| 15.8M/15.8M [00:00<00:00, 49.7MB/s]


Downloaded file size: 15436.95 KB
Extracting 7z archive...

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Sca        1 file, 15807440 bytes (16 MiB)

Extracting archive: llm_challenge_adapter.zip
--
Path = llm_challenge_adapter.zip
Type = zip
Physical Size = 15807440

     90% 10 - vocab.js                  Everything is Ok

Files: 11
Size:       28754604
Compressed: 15807440
Extracted adapter to: ./llm_challenge_adapter


### Initializing the finetuned LLM

In [11]:
ADAPTER_PATH = "./llm_challenge_adapter"


tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" 

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

print(f"Loading adapter from {ADAPTER_PATH}...")
try:
    model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    print("Adapter loaded successfully.")
except Exception as e:
    print(f"Error loading adapter. Check if path exists and contains adapter_config.json.\nError: {e}")

model.eval()

Loading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading adapter from ./llm_challenge_adapter...
Adapter loaded successfully.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2048)
        (layers): ModuleList(
          (0-27): 28 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_f

### Evaluation setup and metrics definition

In [12]:
LABELS = ['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities']

def robust_extract_tags(text):
    """
    Scans the text for the presence of valid labels.
    """
    text = text.lower()
    found_tags = []
    
    # Use the global LABELS variable
    for label in LABELS:
        if label in text:
            found_tags.append(label)
            
    return found_tags

def evaluate_model(model, tokenizer, dataset, num_samples=None):
    model.eval()
    
    true_labels = []
    pred_labels = []
    
    eval_data = dataset.select(range(num_samples)) if num_samples else dataset

    print(f"Starting evaluation on {len(eval_data)} samples...")

    for i, row in tqdm(enumerate(eval_data), total=len(eval_data)):
        prompt = format_prompt_eval(row)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs, 
                max_new_tokens=50, 
                do_sample=False, 
                pad_token_id=tokenizer.eos_token_id
            )
        
        output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        answer_only = output_text[len(prompt):]
        
        prediction = robust_extract_tags(answer_only)
        
        truth = row['tags'] 
        
        pred_labels.append(prediction)
        true_labels.append(truth)

    return true_labels, pred_labels

### Example of an output of the finetuned LLM

In [13]:
example_index = 5
example = test_dataset[example_index]

prompt = format_prompt_eval(example)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs, 
        max_new_tokens=100,      
        do_sample=False,         
        pad_token_id=tokenizer.eos_token_id
    )

full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

answer_start_index = len(prompt)
raw_answer = full_output[answer_start_index:] 

print(f"=== GROUND TRUTH ===")
print(example['tags'])

print(f"\n=== RAW MODEL ANSWER (After Prompt) ===")
print(f"'{raw_answer}'")  

print(f"\n=== FULL CONTEXT (Prompt + Answer) ===")
print(full_output)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== GROUND TRUTH ===
[]

=== RAW MODEL ANSWER (After Prompt) ===
' ]
'

=== FULL CONTEXT (Prompt + Answer) ===
### INSTRUCTION
Given the following problem description, classify it to the following categories:['math', 'graphs', 'strings', 'number theory', 'trees', 'geometry', 'games', 'probabilities'].


### PROB DESCRIPTION NOTES
NoteIn the first test case,   The substring is $$$\texttt{101}$$$, so we can do one operation to make the substring empty.  The substring is $$$\texttt{11011}$$$, so we can do one operation on $$$s[2, 4]$$$ to make $$$\texttt{11}$$$, then use two more operations to make the substring empty.  The substring is $$$\texttt{011}$$$, so we can do one operation on $$$s[1, 2]$$$ to make $$$\texttt{1}$$$, then use one more operation to make the substring empty. 

### PROB DESCRIPTION OUTPUT SPEC
Print $$$q$$$ lines, the $$$i$$$-th line representing the minimum number of operations needed for the $$$i$$$-th query.

### PROB DESCRIPTION INPUT SPEC
The first line contains

### Evaluate the llm with finetuning

In [14]:
y_true_lists, y_pred_lists = evaluate_model(base_model, tokenizer, test_dataset, num_samples=1200)

mlb = MultiLabelBinarizer(classes=LABELS)
y_true_bin = mlb.fit_transform(y_true_lists)
y_pred_bin = mlb.transform(y_pred_lists)

micro_f1 = f1_score(y_true_bin, y_pred_bin, average='micro')
macro_f1 = f1_score(y_true_bin, y_pred_bin, average='macro')
precision = precision_score(y_true_bin, y_pred_bin, average='micro')
recall = recall_score(y_true_bin, y_pred_bin, average='micro')

print("\n=== Evaluation Results ===")
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Precision:      {precision:.4f}")
print(f"Recall:         {recall:.4f}")

print("\n=== Detailed Classification Report ===")
print(classification_report(y_true_bin, y_pred_bin, target_names=LABELS, zero_division=0))

Starting evaluation on 1200 samples...


100%|██████████| 1200/1200 [08:34<00:00,  2.33it/s]



=== Evaluation Results ===
Micro F1 Score: 0.2442
Macro F1 Score: 0.1183
Precision:      0.3763
Recall:         0.1808

=== Detailed Classification Report ===
               precision    recall  f1-score   support

         math       0.41      0.37      0.39       328
       graphs       0.22      0.03      0.05       137
      strings       0.53      0.11      0.18        84
number theory       0.12      0.01      0.02        83
        trees       0.40      0.03      0.05        77
     geometry       0.12      0.05      0.08        37
        games       1.00      0.04      0.07        26
probabilities       0.11      0.11      0.11        19

    micro avg       0.38      0.18      0.24       791
    macro avg       0.36      0.09      0.12       791
 weighted avg       0.36      0.18      0.21       791
  samples avg       0.11      0.10      0.10       791



### Evaluate the llm without finetuning

In [15]:
with model.disable_adapter():
    y_true_lists_base, y_pred_lists_base = evaluate_model(model, tokenizer, test_dataset, num_samples=1200)


Starting evaluation on 1200 samples...


100%|██████████| 1200/1200 [52:58<00:00,  2.65s/it]


In [16]:
mlb = MultiLabelBinarizer(classes=LABELS)
y_true_bin = mlb.fit_transform(y_true_lists_base)
y_pred_bin = mlb.transform(y_pred_lists_base)

micro_f1 = f1_score(y_true_bin, y_pred_bin, average='micro')
macro_f1 = f1_score(y_true_bin, y_pred_bin, average='macro')
precision = precision_score(y_true_bin, y_pred_bin, average='micro')
recall = recall_score(y_true_bin, y_pred_bin, average='micro')

print("\n=== Evaluation Results ===")
print(f"Micro F1 Score: {micro_f1:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Precision:      {precision:.4f}")
print(f"Recall:         {recall:.4f}")

print("\n=== Detailed Classification Report ===")
print(classification_report(y_true_bin, y_pred_bin, target_names=LABELS, zero_division=0))


=== Evaluation Results ===
Micro F1 Score: 0.1473
Macro F1 Score: 0.1069
Precision:      0.1409
Recall:         0.1542

=== Detailed Classification Report ===
               precision    recall  f1-score   support

         math       0.39      0.11      0.17       328
       graphs       0.11      0.38      0.18       137
      strings       0.36      0.30      0.33        84
number theory       0.06      0.04      0.05        83
        trees       0.16      0.06      0.09        77
     geometry       0.04      0.03      0.03        37
        games       0.01      0.04      0.01        26
probabilities       0.00      0.00      0.00        19

    micro avg       0.14      0.15      0.15       791
    macro avg       0.14      0.12      0.11       791
 weighted avg       0.24      0.15      0.15       791
  samples avg       0.08      0.08      0.07       791

