In [None]:
import json
import os
import pickle

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.svm import SVC

from constants import FORCE_EXTRACT, RANDOM_STATE
from extractor import get_features_from_path
from utils import NumpyEncoder, load_data

In [None]:
# Output artifacts written by this notebook (paths are relative to the working directory).
# Pickled (data, labels) tuple used as the feature cache.
DATA_FILENAME = "data_train.pkl"
# Pickled classifier after it has been fitted on the full data set.
MODEL_FILENAME = "model_train.pkl"
# Cross-validation results serialized as JSON.
SCORES_FILENAME = "scores_train.json"

In [None]:
# Snapshot of whether the feature cache file is present; the following cell
# uses it to decide between re-extracting and loading, and whether there is
# an old cache file to back up first.
data_exists = os.path.exists(DATA_FILENAME)

In [None]:
# Obtain `data` and `labels`: re-extract when forced or when no cache file
# was found, otherwise reuse the pickled (data, labels) tuple from disk.
if FORCE_EXTRACT or not data_exists:
    if data_exists:
        # Keep the previous cache as a ".bak" copy. os.replace overwrites an
        # existing backup on every platform, whereas os.rename raises
        # FileExistsError on Windows when the target already exists.
        os.replace(DATA_FILENAME, DATA_FILENAME + ".bak")

    data, labels = load_data(get_features_from_path)

    # Dump to a temporary name and swap it into place afterwards, so an
    # interrupted write never leaves a truncated file under DATA_FILENAME
    # that a later run would mistake for a valid cache.
    tmp_filename = DATA_FILENAME + ".tmp"
    with open(tmp_filename, "wb") as f:
        pickle.dump((data, labels), f)
    os.replace(tmp_filename, DATA_FILENAME)
else:
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file; only load cache files that this notebook produced itself.
    with open(DATA_FILENAME, "rb") as f:
        data, labels = pickle.load(f)

In [None]:
# Support-vector classifier with a degree-4 polynomial kernel and C=1,
# constructed with the project-wide RANDOM_STATE seed.
clf = SVC(
    C=1,
    kernel="poly",
    degree=4,
    random_state=RANDOM_STATE,
)

In [None]:
# Score the classifier with shuffled, stratified 5-fold cross-validation,
# collecting both plain and balanced accuracy for every fold.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=RANDOM_STATE)
scores = cross_validate(
    estimator=clf,
    X=data,  # type: ignore
    y=labels,
    scoring=("accuracy", "balanced_accuracy"),
    cv=cv,
)

# Persist the raw cross-validation results; NumpyEncoder is the project's
# JSON encoder class passed through to json.dump.
with open(SCORES_FILENAME, mode="w") as scores_file:
    json.dump(scores, scores_file, indent=4, cls=NumpyEncoder)

In [None]:
# Train the classifier on the complete data set, then pickle the fitted
# estimator to MODEL_FILENAME.
clf.fit(X=data, y=labels)  # type: ignore

with open(MODEL_FILENAME, mode="wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Display the cross-validation results returned by cross_validate as the cell output.
scores