In [1]:
import os
import sys
from typing import Dict, Any

import numpy as np

# Resolve the directory two levels above the current working directory and
# make it importable, so project-level packages can be imported from this notebook.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)

# Guarded so that re-running the cell does not add duplicate sys.path entries.
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)

In [2]:
from datasets_preprocessing import load_json_data, make_pipeline
import pandas as pd

# Build the 'math' preprocessing pipeline and run it over the raw JSON dump.
# The value returned by fit_transform is not used in this cell.
math_pipeline = make_pipeline('math')
X_json_raw = load_json_data('datasets/math')
math_pipeline.fit_transform(X_json_raw)

# Read the math question table from disk.
# NOTE(review): presumably math.csv is written by the pipeline above as a
# side effect -- confirm in datasets_preprocessing.
math_csv_path = os.path.join('csv_question_files', 'math.csv')
math_df = pd.read_csv(math_csv_path)
math_df.head(10)

Unnamed: 0,question,tags_str,math,bio,code
0,Function on the unit circle and exponential,"complex-analysis,continuity",1,0,0
1,What does $\sum_{n=0}^\infty 1/n^n$ converge to?,"sequences-and-series,number-theory,limits",1,0,0
2,Random variable measurable with respect to sto...,"measure-theory,random-variables,stopping-times...",1,0,0
3,What is the distribution of 2 consecutive Bino...,"probability,binomial-coefficients,binomial-dis...",1,0,0
4,Determine lines intersecting four skew lines i...,"projective-geometry,projective-space,cross-ratio",1,0,0
5,How do you deal with absolute values in a func...,"calculus,solid-of-revolution",1,0,0
6,Aren't $ f’(xy) $ and $ f’(x/y)$ ambiguous not...,"multivariable-calculus,functions",1,0,0
7,Why do counits go that way?,"soft-question,category-theory,education,adjoin...",1,0,0
8,Not understanding a proof about coherent sheav...,"algebraic-geometry,proof-explanation,schemes,s...",1,0,0
9,Model theory of the naturals with a multiplica...,"model-theory,first-order-logic,nonstandard-models",1,0,0


In [3]:
# Build the 'bio' preprocessing pipeline and run it over the raw JSON dump.
# The value returned by fit_transform is not used in this cell.
bio_pipeline = make_pipeline('bio')
X_json_raw = load_json_data('datasets/bio')
bio_pipeline.fit_transform(X_json_raw)

# Read the bio question table from disk.
# NOTE(review): presumably bio.csv is written by the pipeline above as a
# side effect -- confirm in datasets_preprocessing.
bio_csv_path = os.path.join('csv_question_files', 'bio.csv')
bio_df = pd.read_csv(bio_csv_path)
bio_df.head(10)

Unnamed: 0,question,tags_str,math,bio,code
0,How many kg of seed can one expect from 230 kg...,agriculture,0,1,0
1,Adaptive Optics in Microscopy: what are the fa...,"biophysics,microscopy,fluorescent-microscopy,o...",0,1,0
2,Which part of the reflex arc takes the longest...,"human-biology,reflexes",0,1,0
3,Is wiping with RNAse Zap enough to destroy RNA...,"molecular-biology,lab-techniques,rna,lab-reagents",0,1,0
4,When there is incomplete dominance of one alle...,"genetics,terminology",0,1,0
5,Does drinking dry water have same effect as dr...,"human-biology,food",0,1,0
6,Do non-migratory canada geese still exhibit mi...,"ornithology,migration",0,1,0
7,"If life is discovered on another planet, will ...","taxonomy,astrobiology",0,1,0
8,Why do toenails grow much slower than fingerna...,"human-biology,human-anatomy",0,1,0
9,"Why do, humans, like many birds,tend to stand ...","brain,muscles,balance",0,1,0


In [4]:
# Build the 'code' preprocessing pipeline and run it over the raw JSON dump.
# The value returned by fit_transform is not used in this cell.
code_pipeline = make_pipeline('code')
X_json_raw = load_json_data('datasets/code')
code_pipeline.fit_transform(X_json_raw)

# Read the code question table from disk.
# NOTE(review): presumably code.csv is written by the pipeline above as a
# side effect -- confirm in datasets_preprocessing.
code_csv_path = os.path.join('csv_question_files', 'code.csv')
code_df = pd.read_csv(code_csv_path)
code_df.head(10)

Unnamed: 0,question,tags_str,math,bio,code
0,WorkGroup Data Service with JSON / Web based API,"c#,linq,json",0,0,1
1,Design pattern for logger implementation,"object-oriented,design-patterns,salesforce-apex",0,0,1
2,Temperature calculator in Rust,"beginner,rust,unit-conversion",0,0,1
3,Read binary serial data and parse integers,"c#,serial-port",0,0,1
4,Brain-flak interpreter,"parsing,go,interpreter",0,0,1
5,Implement bash auto completion in Python,"python,python-3.x,bash,autocomplete",0,0,1
6,How well or poorly structured are my routes in...,"javascript,node.js",0,0,1
7,“Proper” Asynchronous implementation,"c#,asynchronous",0,0,1
8,Determining whether a loop iterated at least o...,"python,python-3.x,generator",0,0,1
9,Output JavaScript object into HTML table of ke...,"javascript,jquery",0,0,1


In [5]:
# Draw n_samples math rows and half that many rows from each of bio and code,
# so math and non-math questions are represented in equal numbers.
n_samples = 7000
half_samples = n_samples // 2

# (source frame, number of rows to draw); list order fixes the row order of full_df.
sample_plan = [
    (math_df, n_samples),
    (bio_df, half_samples),
    (code_df, half_samples),
]

# A fixed random_state keeps each draw reproducible; ignore_index renumbers
# the stacked rows from 0.
full_df = pd.concat(
    [frame.sample(n=count, random_state=42) for frame, count in sample_plan],
    ignore_index=True,
    axis=0,
)
full_df

Unnamed: 0,question,tags_str,math,bio,code
0,How does rounding affect subsequent calculations?,rounding-error,1,0,0
1,Complex Hopf Fibration,"algebraic-geometry,differential-geometry,compl...",1,0,0
2,"Circular arrangement, probability of winning a...","probability,permutations",1,0,0
3,How many of the integers that satisfy the ineq...,"algebra-precalculus,inequality",1,0,0
4,A weaker Axiom of Infinity?,"set-theory,axioms",1,0,0
...,...,...,...,...,...
13995,Concurrent task pool,"concurrency,go",0,0,1
13996,Route finding on a graph that must go through ...,"algorithms,graphs,graph-traversal,traveling-sa...",0,0,1
13997,REST call helper function returns empty respon...,"python,python-3.x,rest,wrapper",0,0,1
13998,Vending machine change function,"c++,interview-questions,change-making-problem",0,0,1


# Math Model

## SVM + TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from train.reporting.model_interface import ModelInterface
from sklearn.svm import SVC

class TextSVMWrapper(ModelInterface):
    """Text classifier wrapping a scikit-learn Pipeline: vectorizer + linear-kernel SVC.

    NOTE(review): the vectorizer step is registered under the name ``'tfidf'``
    but is a ``CountVectorizer`` (raw token counts, no IDF weighting). Confirm
    which vectorizer is intended; swapping it would change reported results.

    Args:
        C: Regularization parameter forwarded to ``SVC``.

    Attributes:
        model: The underlying ``Pipeline`` with steps ``'tfidf'`` and ``'svm'``.
        C: The value of ``C`` this instance was created with.
        is_fitted: ``True`` once ``fit`` has completed.
    """

    def __init__(self, C=1.0):
        # Pipeline: first turn text into a numeric token-count matrix, then
        # classify with an SVM. probability=True enables predict_proba.
        self.model = Pipeline([
            ('tfidf', CountVectorizer(stop_words='english')),
            ('svm', SVC(C=C, random_state=42, probability=True, kernel='linear'))
        ])
        self.C = C
        self.is_fitted = False

    def get_params(self) -> Dict[str, Any]:
        """Return the parameters of the underlying pipeline."""
        return self.model.get_params()

    def fit(self, X, y, X_val=None, y_val=None):
        """Fit the pipeline on ``X``/``y``.

        ``X_val`` and ``y_val`` are accepted but not used.
        """
        # sklearn's SVM does not offer a simple way to track per-epoch loss,
        # so the model is simply trained.
        self.model.fit(X, y)
        self.is_fitted = True

    def predict(self, X):
        """Return the pipeline's class predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the pipeline's class-probability estimates for ``X``."""
        return self.model.predict_proba(X)

    def get_loss_history(self):
        """Return an empty dict: no training loss history is available."""
        # sklearn's SVM does not expose the loss over the course of training.
        # An empty dict tells the Reporter to skip the loss plot.
        return {}

    def get_new_instance(self):
        """Return a fresh, unfitted wrapper configured with the same ``C``."""
        # A clean instance is needed for cross-validation in the Reporter.
        return TextSVMWrapper(C=self.C)

    def get_feature_importance(self):
        """Map each vocabulary token to its mean absolute SVM coefficient.

        For text, "feature importance" means the words that most strongly
        indicate a given category.

        Returns:
            ``{token: weight}``; an empty dict if the model has not been
            fitted or the coefficients could not be extracted.
        """
        if not self.is_fitted:
            return {}

        try:
            # Vocabulary tokens from the vectorizer step.
            feature_names = self.model.named_steps['tfidf'].get_feature_names_out()
            # Weights from the SVM step.
            coefs = self.model.named_steps['svm'].coef_

            # coef_ may be a scipy sparse matrix when the model was trained on
            # sparse input (as the vectorizer produces); densify explicitly so
            # the reductions below work on a plain ndarray.
            if hasattr(coefs, 'toarray'):
                coefs = coefs.toarray()

            # Binary SVM has a single row of weights; multiclass has several
            # rows. For simplicity take the mean absolute weight of each token
            # over all rows, which surfaces the tokens that are most
            # "decisive" overall.
            avg_coefs = np.ravel(np.mean(np.abs(coefs), axis=0))

            # Build the {token: weight} dictionary.
            return dict(zip(feature_names, avg_coefs))
        except Exception as e:
            # Deliberate best-effort: report the problem and fall back to an
            # empty result so report generation can continue.
            print(f"Nie udało się pobrać ważności cech: {e}")
            return {}

In [7]:
from train.reporting.model_reporter import ModelReporter

# Report on the combined dataset: question text as the input and the
# `math` column as the target.
report_df = full_df

svm_model = TextSVMWrapper(C=1.0)
reporter = ModelReporter(svm_model, report_df['question'], report_df['math'])

reporter.generate_report()


[INFO] Raport zostanie zapisany w: reports/2026-01-10_13-02-23_TextSVMWrapper
Raport wygenerowany: 2026-01-10 13:02:23.087625
Model Wrapper: TextSVMWrapper
------------------------------
--- Rozpoczynanie treningu ---
Trening zakończony.
[INFO] Brak historii funkcji straty (Loss History). Pomijam wykres.
Zapisano macierz pomyłek: reports/2026-01-10_13-02-23_TextSVMWrapper/confusion_matrix.png

--- Uruchamianie 5-krotnej walidacji krzyżowej ---
Fold 5: 0.8864
Wyniki CV: [np.float64(0.89), np.float64(0.895), np.float64(0.8928571428571429), np.float64(0.88), np.float64(0.8864285714285715)]
Średnia dokładność: 0.8889 (+/- 0.0053)

--- Top 10 Ważność cech ---
np: 1.9026
master: 1.8296
dna: 1.6112
bar: 1.5819
dedekind: 1.5597
runtime: 1.5517
biology: 1.5276
efficient: 1.5077
python: 1.5014
quadratic: 1.4966
Generowanie wykresu 2D dla topowych cech (wersja uniwersalna)...
Wybrano cechy do wykresu: 'np' oraz 'master'
Zapisano wykres 2D: reports/2026-01-10_13-02-23_TextSVMWrapper/decision_bound