# Testowanie działania zapisanego modelu

## Utworzenie ramki danych

In [1]:
from typing import Dict, List, Any, Self, Optional

import numpy as np
# Concatenate the generated CSV datasets into a single frame and collapse the math, bio and code columns
# into one `category` column whose only values 0, 1, 2 stand for the math, bio and code categories respectively.

import pandas as pd
import glob
import sys
import os

from matplotlib.pyplot import title
# Resolve the directory two levels above the notebook's working directory and
# make it importable, so the project-local `train.*` packages can be loaded.
parent_hops = (os.pardir, os.pardir)
project_root = os.path.abspath(os.path.join(os.getcwd(), *parent_hops))

# Append only once, so re-running the cell does not pile up duplicate entries.
already_registered = project_root in sys.path
if not already_registered:
    sys.path.append(project_root)

# Build one frame holding an equal-sized random sample from every question CSV.
SAMPLE_SIZE = 5000  # rows drawn from each source file
SAMPLE_SEED = 42    # fixed seed so Restart & Run All reproduces the same rows

# glob returns matches in arbitrary OS order; sort so the concatenation order is stable.
csv_files = sorted(glob.glob('../datasets_preprocessing/csv_question_files/*.csv'))
if not csv_files:
    # Fail with an explicit message instead of the opaque error pd.concat raises on an empty list.
    raise FileNotFoundError(
        "No CSV files found in '../datasets_preprocessing/csv_question_files/' "
        "(the path is resolved relative to the current working directory)."
    )

dfs = []
for csv_path in csv_files:
    source_df = pd.read_csv(csv_path)
    # DataFrame.sample raises when n exceeds the number of rows (sampling without
    # replacement), so cap n at the file length.
    rows_to_draw = min(SAMPLE_SIZE, len(source_df))
    dfs.append(source_df.sample(n=rows_to_draw, random_state=SAMPLE_SEED))

df = pd.concat(dfs, ignore_index=True)

def transform_categories(math, bio, code):
    """Collapse three one-hot category flags into a single integer label.

    Args:
        math: Flag compared against 1; a match yields label 0.
        bio: Flag compared against 1; a match yields label 1.
        code: Flag compared against 1; a match yields label 2.

    Returns:
        The label of the first flag (checked in the order math, bio, code)
        that equals 1. When no flag equals 1 the result is also 2, i.e. such
        rows are indistinguishable from the code category.
    """
    for label, flag in enumerate((math, bio, code)):
        if flag == 1:
            return label
    # No flag set: fall back to the same label as the code category.
    return 2

# Derive a single integer label per row from the three one-hot flag columns.
df['category'] = df.apply(
    lambda record: transform_categories(record['math'], record['bio'], record['code']),
    axis=1,
)

# The math, bio and code columns are now redundant — `category` replaces them.
df.drop(columns=['math', 'bio', 'code'], inplace=True)

## Funkcja do załadowania zapisanego modelu i jego zbioru testowego

In [2]:
from train.reporting.model_interface import ModelInterface
from train.reporting.svm_model_wrapper import SVMModelWrapper
from typing import Tuple
import pickle

def import_model_and_test_set(path: str) -> Tuple[ModelInterface, pd.DataFrame]:
    """Load a pickled model and the test set stored next to it.

    Args:
        path: Directory containing ``model.pkl`` and ``test_set.csv``.

    Returns:
        A ``(model, test_set)`` pair: the unpickled object and the CSV
        contents read with the first column as the index.
    """
    model_file = "/".join((path, "model.pkl"))
    test_set_file = "/".join((path, "test_set.csv"))

    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file — only point this at model artifacts from a trusted source.
    with open(model_file, 'rb') as handle:
        model = pickle.load(handle)

    return model, pd.read_csv(test_set_file, index_col=0)

# Only the model is needed here; the test set stored alongside it is discarded
# because the evaluation data is read from a separate file further down.
model, _ = import_model_and_test_set("saved_model/1model-saved")

In [3]:
# Number of the evaluation file to use; it is interpolated into the
# `test_{test_set_number}.csv` path when the test frame is loaded.
test_set_number = 0

In [4]:
# Load the selected evaluation set; the first CSV column is used as the index.
test_df = pd.read_csv(f"../datasets_preprocessing/datasets/test_all_models/test_{test_set_number}.csv", index_col=0)

# drop_duplicates returns a new frame rather than modifying in place, so the
# result has to be assigned back — otherwise the de-duplication is silently lost.
test_df = test_df.drop_duplicates(subset=["question"], keep="first")
test_df

Unnamed: 0,question,tags_str,math,bio,code
459409,"Uniform convergence of $x^n$ on interval $[0,b...",uniform-convergence,1,0,0
1239932,Multivariate Weierstrass theorem?,"real-analysis,continuity,uniform-continuity,ap...",1,0,0
1335383,Proof that if $f$ and $g$ are uniformly contin...,real-analysis,1,0,0
838774,Is the pointwise maximum of two Riemann integr...,"real-analysis,integration",1,0,0
1321517,Functionally structured spaces and manifolds,"general-topology,category-theory,differential-...",1,0,0
...,...,...,...,...,...
54787,Clustered index is dense or sparse?,"terminology,database-theory",0,0,1
37650,Displaying nested arithmetic expressions as a ...,"beginner,scala,formatting",0,0,1
25173,OpenGL VertexArrayObject class,"c#,object-oriented,opengl",0,0,1
18969,Responsive Foundation Orbit Image Gallery,"javascript,jquery",0,0,1


## Dodanie kolumny określającej kategorię pytania

In [5]:
# Encode the one-hot flags as one label: math -> 0, bio -> 1, code -> 2
# (each flag column is multiplied by its label and the products are summed).
# NOTE(review): assumes exactly one flag is set per row — a row with no flag
# set evaluates to 0 here, whereas transform_categories maps it to 2. Confirm.
test_df['real_class'] = sum(
    test_df[flag_column] * label
    for label, flag_column in enumerate(('math', 'bio', 'code'))
)

## Badanie predykcji modelu dla każdej kategorii i zapisanie ich w ramce danych

In [6]:
# Per-class probabilities for every question in the evaluation set.
proba_arr = model.predict_proba(test_df["question"])

# Store each probability column under a readable name.
# NOTE(review): assumes predict_proba orders its columns math, bio, code — confirm against the model wrapper.
for position, pred_column in enumerate(("math_preds", "bio_preds", "code_preds")):
    test_df[pred_column] = proba_arr[:, position]

In [7]:
# Show the frame, now extended with real_class and the three per-class probability columns.
test_df

Unnamed: 0,question,tags_str,math,bio,code,real_class,math_preds,bio_preds,code_preds
459409,"Uniform convergence of $x^n$ on interval $[0,b...",uniform-convergence,1,0,0,0,9.855423e-01,1.267935e-04,0.014331
1239932,Multivariate Weierstrass theorem?,"real-analysis,continuity,uniform-continuity,ap...",1,0,0,0,9.571631e-01,2.053880e-03,0.040783
1335383,Proof that if $f$ and $g$ are uniformly contin...,real-analysis,1,0,0,0,9.456182e-01,5.987089e-03,0.048395
838774,Is the pointwise maximum of two Riemann integr...,"real-analysis,integration",1,0,0,0,8.244155e-01,1.116257e-02,0.164422
1321517,Functionally structured spaces and manifolds,"general-topology,category-theory,differential-...",1,0,0,0,8.931539e-01,1.608491e-02,0.090761
...,...,...,...,...,...,...,...,...,...
54787,Clustered index is dense or sparse?,"terminology,database-theory",0,0,1,2,8.109284e-01,1.063090e-02,0.178441
37650,Displaying nested arithmetic expressions as a ...,"beginner,scala,formatting",0,0,1,2,6.106899e-08,3.457788e-08,1.000000
25173,OpenGL VertexArrayObject class,"c#,object-oriented,opengl",0,0,1,2,1.452720e-02,4.816106e-03,0.980657
18969,Responsive Foundation Orbit Image Gallery,"javascript,jquery",0,0,1,2,1.461785e-01,3.457869e-02,0.819243


## Zapisanie przewidzianych klas w ramce danych (-1, gdy maksymalne prawdopodobieństwo nie przekracza 0.5)

In [8]:
import numpy as np

# Probability columns, listed in label order (position == class label).
cols = ['math_preds', 'bio_preds', 'code_preds']
class_mapping = {pred_column: label for label, pred_column in enumerate(cols)}

# Per row: the highest probability and the name of the column holding it.
probabilities = test_df[cols]
max_vals = probabilities.max(axis=1)
max_names = probabilities.idxmax(axis=1)

# Translate the winning column name into its integer class label.
predicted_classes = max_names.map(class_mapping)

# Keep the prediction only when the model is confident enough; otherwise mark
# the row with -1 (top probability not strictly above 0.5).
is_confident = max_vals > 0.5
test_df['predicted_class'] = np.where(is_confident, predicted_classes, -1)
test_df

Unnamed: 0,question,tags_str,math,bio,code,real_class,math_preds,bio_preds,code_preds,predicted_class
459409,"Uniform convergence of $x^n$ on interval $[0,b...",uniform-convergence,1,0,0,0,9.855423e-01,1.267935e-04,0.014331,0
1239932,Multivariate Weierstrass theorem?,"real-analysis,continuity,uniform-continuity,ap...",1,0,0,0,9.571631e-01,2.053880e-03,0.040783,0
1335383,Proof that if $f$ and $g$ are uniformly contin...,real-analysis,1,0,0,0,9.456182e-01,5.987089e-03,0.048395,0
838774,Is the pointwise maximum of two Riemann integr...,"real-analysis,integration",1,0,0,0,8.244155e-01,1.116257e-02,0.164422,0
1321517,Functionally structured spaces and manifolds,"general-topology,category-theory,differential-...",1,0,0,0,8.931539e-01,1.608491e-02,0.090761,0
...,...,...,...,...,...,...,...,...,...,...
54787,Clustered index is dense or sparse?,"terminology,database-theory",0,0,1,2,8.109284e-01,1.063090e-02,0.178441,0
37650,Displaying nested arithmetic expressions as a ...,"beginner,scala,formatting",0,0,1,2,6.106899e-08,3.457788e-08,1.000000,2
25173,OpenGL VertexArrayObject class,"c#,object-oriented,opengl",0,0,1,2,1.452720e-02,4.816106e-03,0.980657,2
18969,Responsive Foundation Orbit Image Gallery,"javascript,jquery",0,0,1,2,1.461785e-01,3.457869e-02,0.819243,2


## Efektywność modelu

In [9]:
from sklearn.metrics import accuracy_score

# Fraction of rows whose predicted_class equals real_class.
# Rows labelled -1 (no class above the 0.5 threshold) presumably never match a
# real_class value, so they are counted as misses.
accuracy_score(test_df['real_class'], test_df['predicted_class'])

0.8828333333333334

## Przykładowe pytania, dla których model niepoprawnie przewidział kategorię

In [10]:
# Rows where the predicted label (including the -1 "not confident" marker)
# differs from the true label.
misclassified = test_df[test_df['real_class'] != test_df['predicted_class']]

# Draw at most 10 examples: DataFrame.sample raises when n exceeds the number
# of rows, and the fixed seed keeps the displayed examples reproducible.
wrong_questions_sample = misclassified.sample(n=min(10, len(misclassified)), random_state=42)

wrong_questions_sample

Unnamed: 0,question,tags_str,math,bio,code,real_class,math_preds,bio_preds,code_preds,predicted_class
72350,How to represent circles in x-y coordinates,"algorithms,computational-geometry,modelling",0,0,1,2,0.690539,0.148956,0.160505,0
60368,$\mathbb{NEXP\subseteq(NEXP\cap coNEXP)/poly}\...,"complexity-theory,complexity-classes",0,0,1,2,0.821704,0.039862,0.138433,0
639438,"Is there a solution to $a^n=c,b^n=d$, where a ...",number-theory,1,0,0,0,0.156424,0.025377,0.818199,2
20501,how many times can a spitting cobra spit its v...,"zoology,snake",0,1,0,1,0.691459,0.068247,0.240294,0
1707,Write the haplotypes of the family,"human-biology,genetics,dna,dna-sequencing",0,1,0,1,0.720896,0.063864,0.21524,0
81290,Finding the kth element of a permutation,"permutations,randomness",0,0,1,2,0.838078,0.000704,0.161217,0
67135,Relational algebra and indexes,relational-algebra,0,0,1,2,0.754879,0.069919,0.175203,0
191950,What if my PDF is proportional to some known PDF?,"probability,probability-distributions",1,0,0,0,0.474289,0.163099,0.362612,-1
760825,Is '=' antisymmetric?,"elementary-set-theory,relations,equivalence-re...",1,0,0,0,0.1689,0.60522,0.225881,1
2984,What is the number of influenza strains occurr...,"virus,vaccination",0,1,0,1,0.051878,0.10601,0.842112,2


In [11]:
# Print every sampled question in full — the rendered table truncates long text.
sampled_questions = wrong_questions_sample['question']
for question_text in sampled_questions:
    print(question_text)

How to represent circles in x-y coordinates
$\mathbb{NEXP\subseteq(NEXP\cap coNEXP)/poly}\implies \mathbb{NEXP=NEXP\cap coNEXP}$
Is there a solution to $a^n=c,b^n=d$, where a shares no factors with b, and n isn't an integer
how many times can a spitting cobra spit its venom?
Write the haplotypes of the family
Finding the kth element of a permutation
Relational algebra and indexes
What if my PDF is proportional to some known PDF?
Is '=' antisymmetric?
What is the number of influenza strains occurring at a given time?
