Import basic libraries

In [4]:

# %%
import numpy as np
from sklearn.pipeline import Pipeline
from FileHandling import *
from AnalysisFunctions import *
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.dists_kernels import FlatDist, ScipyDist
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV,StratifiedKFold
from time import time
from sklearn import svm
import json
from sklearn.metrics import accuracy_score
import ast
from sklearn.metrics import RocCurveDisplay

# from hyperparam_tuning import tune_with_grid_search

# Directory (under the current working directory) that receives the tuning CSVs.
result_dir = Path.cwd() / 'ML_results'
# exist_ok=True replaces the racy "check, then create" pair. It also raises
# FileExistsError right here if 'ML_results' exists but is not a directory,
# instead of failing later on the first CSV write.
result_dir.mkdir(exist_ok=True)


def tune_with_halving_grid_search(x_train, y_train, param_grid, suffix):
    """Tune a class-balanced SVC with a successive-halving grid search.

    The search runs with 5-fold cross-validation, ``factor=3`` and
    ``min_resources='exhaust'``. A trimmed, sorted summary of the
    cross-validation results is written to
    ``result_dir / 'halving_svc_results<suffix>.csv'``.

    Args:
        x_train: Training features, passed unchanged to ``fit``.
        y_train: Training labels, passed unchanged to ``fit``.
        param_grid: Parameter grid handed to ``HalvingGridSearchCV``.
        suffix: Text inserted into the CSV file name just before ``.csv``.

    Returns:
        A ``(results, duration)`` tuple. ``results`` is a DataFrame holding
        the columns ``iter``, ``rank_test_score``, ``mean_test_score``
        (multiplied by 100) and ``params``, ordered by descending ``iter``
        and then ascending ``rank_test_score``. ``duration`` is the elapsed
        time of building and fitting the search, in seconds.
    """
    # Local import: a monotonic clock is the appropriate tool for measuring
    # elapsed time; wall-clock time() can jump if the system clock is adjusted.
    from time import perf_counter

    svc = svm.SVC(class_weight='balanced', random_state=42)

    start = perf_counter()
    search = HalvingGridSearchCV(
        svc,
        param_grid,
        cv=5,
        factor=3,
        min_resources='exhaust',
        # Seed the search as well as the estimator so that the sample
        # subsets drawn for the halving iterations are repeatable.
        random_state=42,
    ).fit(x_train, y_train)
    duration = perf_counter() - start

    results = pd.DataFrame(search.cv_results_)
    results.loc[:, 'mean_test_score'] *= 100

    # Keep only the most relevant columns and sort for readability. Sorting on
    # 'iter' first (descending) puts the candidates that were evaluated with
    # the most training data at the top.
    columns = ['iter', 'rank_test_score', 'mean_test_score', 'params']
    results = results.loc[:, columns].sort_values(
        by=['iter', 'rank_test_score'],
        ascending=[False, True],
    )

    results.to_csv(result_dir / f'halving_svc_results{suffix}.csv')
    return results, duration


def tune_with_grid_search(x_train, y_train, param_grid, suffix=''):
    """Tune a class-balanced RBF-kernel SVC with an exhaustive grid search.

    The search runs with 5-fold cross-validation. The complete
    cross-validation table (all columns, with ``mean_test_score`` multiplied
    by 100) is written to ``result_dir / 'svc_results<suffix>.csv'`` before
    the returned summary is trimmed.

    Args:
        x_train: Training features, passed unchanged to ``fit``.
        y_train: Training labels, passed unchanged to ``fit``.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        suffix: Optional text inserted into the CSV file name just before
            ``.csv``, so that separate runs do not overwrite each other.
            The default (empty string) keeps the file name
            ``svc_results.csv``.

    Returns:
        A ``(results, duration)`` tuple. ``results`` is a DataFrame holding
        the columns ``rank_test_score``, ``mean_test_score`` (multiplied by
        100) and ``params``, ordered by ascending ``rank_test_score``.
        ``duration`` is the elapsed time of building and fitting the search,
        in seconds.
    """
    # Local import: a monotonic clock is the appropriate tool for measuring
    # elapsed time; wall-clock time() can jump if the system clock is adjusted.
    from time import perf_counter

    svc = svm.SVC(kernel='rbf', class_weight='balanced',
                  random_state=42)

    start = perf_counter()
    search = GridSearchCV(svc, param_grid, cv=5).fit(x_train, y_train)
    duration = perf_counter() - start

    results = pd.DataFrame(search.cv_results_)
    results.loc[:, 'mean_test_score'] *= 100
    # The full table is saved on purpose; only the returned frame is trimmed.
    results.to_csv(result_dir / f'svc_results{suffix}.csv')

    # Keep only the most relevant columns and sort for readability.
    columns = ['rank_test_score', 'mean_test_score', 'params']
    results = results.loc[:, columns].sort_values(
        by='rank_test_score',
        ascending=True,
    )

    return results, duration
