In [1]:
import os
import sys
from typing import Dict, Any

import numpy as np


# Directories to expose on sys.path so project packages can be imported from
# this notebook. After normalisation, project_root is two levels above the
# current working directory and app_root is two levels above project_root.
# NOTE(review): app_root landing above project_root looks unusual — confirm
# these relative segments point at the intended directories.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../train', '..'))
app_root = os.path.abspath(os.path.join(project_root, '../../app', '..'))

# Register each root on its own: guarding both appends with a single check on
# project_root would skip app_root whenever project_root is already present
# (e.g. when the cell is re-run or the path was configured beforehand).
for _root in (project_root, app_root):
    if _root not in sys.path:
        sys.path.append(_root)

In [2]:
from datasets_preprocessing import load_json_data, make_pipeline
import pandas as pd

# Build the preprocessing pipeline for the math dataset and run it on the raw JSON.
math_pipeline = make_pipeline('math')
X_json_raw = load_json_data('datasets/math')
math_pipeline.fit_transform(X_json_raw)

# Load the math question CSV and preview its first ten rows.
# NOTE(review): presumably this CSV is produced by the pipeline run above — confirm.
math_csv_path = os.path.join('csv_question_files', 'math.csv')
math_df = pd.read_csv(math_csv_path)
math_df.head(10)

KeyboardInterrupt: 

In [None]:
# Build the preprocessing pipeline for the bio dataset and run it on the raw JSON.
bio_pipeline = make_pipeline('bio')
X_json_raw = load_json_data('datasets/bio')
bio_pipeline.fit_transform(X_json_raw)

# Load the bio question CSV and preview its first ten rows.
# NOTE(review): presumably this CSV is produced by the pipeline run above — confirm.
bio_csv_path = os.path.join('csv_question_files', 'bio.csv')
bio_df = pd.read_csv(bio_csv_path)
bio_df.head(10)

In [None]:
# Build the preprocessing pipeline for the code dataset and run it on the raw JSON.
code_pipeline = make_pipeline('code')
X_json_raw = load_json_data('datasets/code')
code_pipeline.fit_transform(X_json_raw)

# Load the code question CSV and preview its first ten rows.
# NOTE(review): presumably this CSV is produced by the pipeline run above — confirm.
code_csv_path = os.path.join('csv_question_files', 'code.csv')
code_df = pd.read_csv(code_csv_path)
code_df.head(10)

In [None]:
# Stack the three per-subject frames row-wise under a fresh 0..n-1 index, then
# keep only the first row seen for each distinct question text.
subject_frames = [math_df, bio_df, code_df]
full_df = (
    pd.concat(subject_frames, axis=0, ignore_index=True)
    .drop_duplicates(subset=["question"], keep="first")
)
full_df

In [None]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
from typing import Dict, Any
from train.reporting.model_interface import ModelInterface


class TextSVMWrapper(ModelInterface):
    """Text classifier: a TF-IDF vectorizer feeding a linear-kernel SVC.

    The two steps live in a single sklearn ``Pipeline`` stored on ``self.model``.
    """

    def __init__(self, C=1.0):
        """Build the unfitted pipeline; ``C`` is passed to the SVC."""
        # Text is first turned into numbers (TF-IDF), then classified by the SVM.
        tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
        svm_step = ('svm', SVC(C=C, random_state=42, probability=True, kernel='linear'))
        self.model = Pipeline([tfidf_step, svm_step])
        self.C = C
        self.is_fitted = False

    def get_new_instance(self):
        """Return a fresh, unfitted wrapper with the same ``C``."""
        # A clean instance is needed for cross-validation in the Reporter
        # (per the original author's note).
        return TextSVMWrapper(C=self.C)

    def get_params(self) -> Dict[str, Any]:
        """Return the parameters reported by the underlying pipeline."""
        return self.model.get_params()

    def get_vectorizer(self):
        """Return the estimator of the first pipeline step (the TF-IDF vectorizer)."""
        return self.model.steps[0][1]

    def fit(self, X, y, X_val=None, y_val=None):
        """Fit the pipeline on ``X``/``y``; ``X_val`` and ``y_val`` are accepted but unused."""
        # sklearn's SVM offers no simple way to track a per-epoch loss history,
        # so the model is simply trained.
        self.model.fit(X, y)
        self.is_fitted = True

    def predict(self, X):
        """Return the pipeline's class predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the pipeline's class-probability estimates for ``X``."""
        return self.model.predict_proba(X)

    def get_loss_history(self):
        """Return an empty dict: no loss history is recorded for this model."""
        # sklearn's SVM does not expose the loss over the course of training.
        # The empty dict signals the Reporter to skip the loss plot
        # (per the original author's note).
        return {}

    def get_feature_importance(self):
        """Map each TF-IDF feature name to its mean absolute SVM coefficient.

        Returns an empty dict when the model has not been fitted, or when the
        coefficients cannot be read (the error is printed, not raised).
        """
        if not self.is_fitted:
            return {}

        try:
            steps = self.model.named_steps
            vocabulary = steps['tfidf'].get_feature_names_out()
            weights = steps['svm'].coef_.copy()

            # Average |coefficient| over the rows of coef_, flattened to 1-D.
            mean_abs_weights = np.ravel(np.mean(np.abs(weights), axis=0))

            # Build the {word: weight} dictionary.
            return dict(zip(vocabulary, mean_abs_weights))
        except Exception as e:
            print(f"Nie udało się pobrać ważności cech: {e}")
            return {}

In [None]:
from train.reporting.model_interface import ModelInterface
from typing import Tuple
import pickle

import pandas as pd

def import_model_and_its_test_set(path: str) -> Tuple[ModelInterface, pd.DataFrame]:
    """Load a saved model and its test set from a run directory.

    Reads ``<path>/model.pkl`` with ``pickle`` and ``<path>/test_set.csv`` with
    pandas, using the CSV's first column as the index.

    NOTE: ``pickle.load`` can execute arbitrary code from the file, so only
    point this at run directories you trust.

    Returns:
        A ``(model, test_set)`` tuple.
    """
    model_file = path + "/model.pkl"
    test_set_file = path + "/test_set.csv"

    with open(model_file, "rb") as handle:
        loaded_model = pickle.load(handle)

    held_out = pd.read_csv(test_set_file, index_col=0)
    return loaded_model, held_out



# For each subject: load the saved model with its test set, then give the
# generic "target" column a subject-specific name.
math_model, math_test_set = import_model_and_its_test_set("reports/runs_with_model_saved/math")
math_test_set = math_test_set.rename(columns={"target": "math_target"})

bio_model, bio_test_set = import_model_and_its_test_set("reports/runs_with_model_saved/bio")
bio_test_set = bio_test_set.rename(columns={"target": "bio_target"})

code_model, code_test_set = import_model_and_its_test_set("reports/runs_with_model_saved/code")
code_test_set = code_test_set.rename(columns={"target": "code_target"})

In [None]:
# Display the math test set (its "target" column is now named "math_target").
math_test_set

In [None]:
# Display the bio test set (its "target" column is now named "bio_target").
bio_test_set

In [None]:
# Display the code test set (its "target" column is now named "code_target").
code_test_set

In [None]:
# Pool the question text from the three per-model test sets into one
# single-column frame with a fresh 0..n-1 index.
test_questions = pd.concat(
    [math_test_set["question"], bio_test_set["question"], code_test_set["question"]],
    ignore_index=True,
)
test_df = pd.DataFrame(test_questions)

# drop_duplicates returns a new frame rather than modifying in place, so the
# result has to be assigned back for the de-duplication to take effect.
test_df = test_df.drop_duplicates(subset=["question"], keep="first")
test_df

In [None]:
# Attach the columns of full_df (minus "tags_str") to every test question.
# The left join keeps each test row; questions missing from full_df get NaN.
label_columns = full_df.drop(columns="tags_str")
test_df_with_labels = test_df.merge(label_columns, how="left", on="question")

test_df_with_labels

In [None]:
# Encode the true class as a single integer id: math -> 0, bio -> 1, code -> 2.
# NOTE(review): assumes the math/bio/code columns are 0/1 flags with exactly one
# set per row — a row with several flags set would sum to a different id
# (e.g. bio + code -> 3). Confirm against the data.
math_term = test_df_with_labels["math"] * 0
bio_term = test_df_with_labels["bio"] * 1
code_term = test_df_with_labels["code"] * 2
test_df_with_labels["real_class"] = math_term + bio_term + code_term
test_df_with_labels

In [None]:
# Score every question with each model and store column 1 of predict_proba
# under a per-model column.
# NOTE(review): presumably column 1 is the positive class — confirm against each model's classes_.
questions = test_df_with_labels["question"]
for preds_column, subject_model in (
    ("math_preds", math_model),
    ("bio_preds", bio_model),
    ("code_preds", code_model),
):
    test_df_with_labels[preds_column] = subject_model.predict_proba(questions)[:, 1]

In [None]:
# Display the frame now that the three per-model probability columns are present.
test_df_with_labels

In [None]:
# Probability columns and the class id each one stands for.
cols = ['math_preds', 'bio_preds', 'code_preds']
class_mapping = {'math_preds': 0, 'bio_preds': 1, 'code_preds': 2}

# Per row: the highest probability and the column it came from.
probabilities = test_df_with_labels[cols]
max_values = probabilities.max(axis=1)
max_names = probabilities.idxmax(axis=1)
predicted_class = max_names.map(class_mapping)

# Accept the winning class only when its probability exceeds 0.5;
# otherwise mark the row as -1 (no class).
is_confident = max_values > 0.5
test_df_with_labels['predicted_class'] = np.where(is_confident, predicted_class, -1)
test_df_with_labels


In [None]:
from sklearn.metrics import accuracy_score


# Share of rows whose predicted class id equals the true class id.
# Rows predicted as -1 can never match a true id of 0/1/2, so they count as misses.
accuracy_score(
    y_true=test_df_with_labels["real_class"],
    y_pred=test_df_with_labels["predicted_class"],
)

In [None]:
# Spot-check up to ten random rows. The seed makes the same rows appear on
# every run, and the min() keeps sample() from raising when the frame has
# fewer than ten rows.
test_df_with_labels.sample(n=min(10, len(test_df_with_labels)), random_state=42)