In [79]:
import numpy as np
import json
import os
from mistralai import Mistral
import tqdm


import sys
sys.path.append('..')

In [46]:
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

# Data Loading

In [47]:
from src.processing import load_processed_data

df = load_processed_data()
df

Unnamed: 0_level_0,source_code,tags,full_description
src_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bb3fc45f903588baf131016bea175a9f,# calculate convex of polygon v.\n# v is list ...,[geometry],Problem Description:\nIahub has drawn a set of...
7d6faccc88a6839822fa0c0ec8c00251,s = input().strip();N = len(s)\nif len(s) == 1...,[strings],Problem Description:\nSome time ago Lesha foun...
891fabbb6ee8a4969b6f413120f672a8,"n = int(input())\nfor _ in range(n):\n k,x = m...","[number theory, math]",Problem Description:\nToday at the lesson of m...
9d46ae53e6dc8dc54f732ec93a82ded3,temp = list(input())\nm = int(input())\ntrans ...,"[math, strings]",Problem Description:\nPasha got a very beautif...
0e0f30521f9f5eb5cff2549cd391da3c,"N, B, E = input(), [], 0\nfor a in map(int, ra...",[math],Problem Description:\nYou are given an array $...
...,...,...,...
981e9991fb5dbd085db8a408c29564d4,import sys\nsys.setrecursionlimit(10000000)\na...,[graphs],Problem Description:\nYou are given a connecte...
ba27ac62b84705d80fa580567ab64c3b,"mas = list(map(int, input().split()))\r\nt = m...","[geometry, math]",Problem Description:\nDiamond Miner is a game ...
28b7e9de0eb583642526c077aa56daba,"def main():\n f= [1]\n for i in range(1,...",[math],Problem Description:\nYou are given an array a...
47129977694cb371c7647cfd0db63d29,def main():\n from sys import stdin\n fr...,[trees],Problem Description:\nWriting light novels is ...


In [61]:
def get_labels(df):
    """ Return 8-length binary vectors representing the labels """

    focus_tags = ['math', 'graphs', 'strings', 'number theory',
              'trees', 'geometry', 'games', 'probabilities']

    
    def encode_tags(tag_list):
        return [1 if t in tag_list else 0 for t in focus_tags]

    labels_vector = df["tags"].apply(encode_tags)

    return np.vstack(labels_vector.values)


# To be able to decode the labels later
label_mapping = {
    'math': 0,
    'graphs': 1,
    'strings': 2,
    'number theory': 3,
    'trees': 4,
    'geometry': 5,
    'games': 6,
    'probabilities': 7
}


Y = get_labels(df)

# Mistral Call Config

In [48]:
client = Mistral(api_key=MISTRAL_API_KEY)

In [49]:
# We can do a quick test 
response = client.chat.complete(
        model="mistral-small-latest",
        messages=[{"role": "user", "content": "Hello, world!"}]
    )
print(response.choices[0].message.content)

Hello! ðŸ˜Š How can I assist you today?


In [50]:
PROMPT_TEMPLATE = """
You are an expert in algorithmic problem classification.

Your task is to return a JSON list containing ONLY tags from the following set:

["math", "graphs", "strings", "number theory", "trees", "geometry", "games", "probabilities"]

Rules:
- Output MUST be a JSON list of strings.
- Only include tags from the allowed list.
- If no tag applies, return an empty list [].
- Do NOT include any explanation.
- Do NOT create new tags.

Problem Description:
{description}

User Code:
{source_code}

Return ONLY the JSON list.
"""

In [51]:
focus_tags = [
    "math", "graphs", "strings", "number theory",
    "trees", "geometry", "games", "probabilities"
]

def llm_predict_single(description, source_code):
    prompt = PROMPT_TEMPLATE.format(
        description=description,
        source_code=source_code
    )

    response = client.chat.complete(
        model="codestral-latest",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    raw_json = response.choices[0].message.content

    try:
        tags = json.loads(raw_json)
    except:
        # fallback minimal : empty list
        tags = []

    # Force allowed tags only
    tags = [t for t in tags if t in focus_tags]

    return tags

In [52]:
# We test it on a quick example
pred = llm_predict_single(
    description="Calculate the number of ways to arrange n distinct objects.",
    source_code="")

pred

['math']

In [53]:
def tags_to_vector(tags):
    return [1 if t in tags else 0 for t in focus_tags]

In [80]:
def predict_llm_for_df(df):
    vectors = []

    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):

        # To limit cost, we clip to first 10k characters (in case of very long descriptions/code)
        row["full_description"] = row["full_description"][:10000]
        row["source_code"] = row["source_code"][:10000]
        
        tags = llm_predict_single(
            row["full_description"],
            row["source_code"]
        )
        vectors.append(tags_to_vector(tags))

    return np.array(vectors)

In [81]:
pred = predict_llm_for_df(df.iloc[[0]])
pred

  0%|          | 0/1 [00:00<?, ?it/s]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  3.53it/s]


array([[1, 0, 0, 0, 0, 1, 0, 0]])

In [78]:
# Compare with true labels
Y[0:1]

array([[0, 0, 0, 0, 0, 1, 0, 0]])

# Evaluation

In [83]:
# To save cost and time, we won't run the whole dataset

# 100 random indexes
import random
random_indexes = random.sample(range(len(df)), 100)

df_sample = df.iloc[random_indexes]
Y_sample = Y[random_indexes]

In [84]:
Y_pred = predict_llm_for_df(df_sample)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:30<00:00,  3.32it/s]


In [85]:
from sklearn.metrics import classification_report

print(classification_report(Y_sample, Y_pred, target_names=focus_tags))

               precision    recall  f1-score   support

         math       0.83      0.83      0.83        53
       graphs       0.74      0.77      0.76        22
      strings       0.82      1.00      0.90        14
number theory       0.44      0.79      0.56        14
        trees       1.00      0.69      0.81        16
     geometry       0.62      0.71      0.67         7
        games       0.43      1.00      0.60         3
probabilities       1.00      0.50      0.67         4

    micro avg       0.73      0.80      0.77       133
    macro avg       0.74      0.79      0.73       133
 weighted avg       0.78      0.80      0.78       133
  samples avg       0.78      0.83      0.77       133



In [None]:
# That's the best results we ever had on any approch

# Monitoring

In [94]:
# We can adjust our function to be able to monitor cost

def predict_llm_for_df(df, monitor_cost=False):
    vectors = []
    total_cost = 0.0

    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):

        # To limit cost, we clip to first 10k characters (in case of very long descriptions/code)
        row["full_description"] = row["full_description"][:10000]
        row["source_code"] = row["source_code"][:10000]
        
        response = client.chat.complete(
            model="codestral-latest",
            messages=[{"role": "user", "content": PROMPT_TEMPLATE.format(
                description=row["full_description"],
                source_code=row["source_code"]
            )}],
            response_format={"type": "json_object"},
        )

        if monitor_cost:
            usage = response.usage
            prompt_tokens = usage.prompt_tokens
            completion_tokens = usage.completion_tokens

            cost = (prompt_tokens * 0.3 + completion_tokens * 0.9) / 10**6  # Codestral Pricing
            total_cost += cost


        raw_json = response.choices[0].message.content

        try:
            tags = json.loads(raw_json)
        except:
            # fallback minimal : empty list
            tags = []

        # Force allowed tags only
        tags = [t for t in tags if t in focus_tags]

        vectors.append(tags_to_vector(tags))

    if monitor_cost:
        print(f"Total cost for predictions: ${total_cost:.4f}")

    return np.array(vectors)

In [98]:
# Test it on 2 samples

# New sample test
indexes = random.sample(range(len(df)), 100)

Xtest = df.iloc[indexes]
Ytest = Y[indexes]

Y_pred_test = predict_llm_for_df(Xtest, monitor_cost=True)

print(classification_report(Ytest, Y_pred_test, target_names=focus_tags))

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [00:30<00:00,  3.29it/s]

Total cost for predictions: $0.0368
               precision    recall  f1-score   support

         math       0.88      0.85      0.86        59
       graphs       0.74      0.88      0.80        16
      strings       0.76      0.87      0.81        15
number theory       0.38      0.91      0.54        11
        trees       1.00      0.83      0.91        12
     geometry       0.83      0.71      0.77         7
        games       0.70      1.00      0.82         7
probabilities       0.50      0.25      0.33         4

    micro avg       0.75      0.84      0.79       131
    macro avg       0.72      0.79      0.73       131
 weighted avg       0.79      0.84      0.80       131
  samples avg       0.78      0.86      0.79       131






In [None]:
# Here again, it's the bests results we ever had on any approch