In [76]:
import pandas as pd

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the pickled dataset (presumably a DataFrame; later cells read its
# `vector` and `label` attributes).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
data = pd.read_pickle(r'./dataset/test_set_vector.pickle')

In [3]:
def split_vector_to_columns(data):
    """
    Expand the list-like `vector` column of `data` into one column per component.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose a `vector` column whose entries are sequences (expected to
        all have the same length) and a `label` column.

    Returns
    -------
    X : pandas.DataFrame
        One row per row of `data`, columns named 'feature0', 'feature1', ...
        The frame gets a fresh 0..n-1 index, whatever the index of `data` is.
    y : numpy.ndarray
        The values of the `label` column, in row order.
    """
    rows = data.vector.tolist()
    # Take the width from the first row *by position*. `data.vector[0]` is a
    # label lookup and raises KeyError when the index does not contain 0
    # (e.g. after filtering or shuffling). An empty frame gives zero columns.
    n_features = len(rows[0]) if rows else 0
    col_names = ['feature' + str(i) for i in range(n_features)]
    X = pd.DataFrame(rows, columns=col_names)
    return X, data.label.to_numpy()

In [4]:
# X: feature table with one column per vector component; y: array of labels.
X, y = split_vector_to_columns(data)

In [105]:
# Fixed seed for the estimators that draw random numbers while fitting, so that
# re-running the notebook reproduces the same cross-validation scores.
RANDOM_STATE = 42

# Baseline classifiers: every hyper-parameter is left at its scikit-learn
# default; only the seed of the randomised ensembles is pinned.
list_models_default_params = [
    svm.SVC(),
    RidgeClassifier(),
    GaussianNB(),
    BaggingClassifier(random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    KNeighborsClassifier(),
]

# Metrics reported by cross-validation: result-column name -> scikit-learn
# scorer name. The precision/recall/F1 scorers are the "weighted" variants.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_weighted',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted',
}

In [99]:
# Hyper-parameter grids for the grid search, keyed by estimator class name.
# An empty grid means the estimator is not tuned (compute_GS then builds it
# with its default parameters).
list_parameters = {
    estimator.__name__: grid
    for estimator, grid in [
        (svm.SVC, {}),
        (RidgeClassifier, {'alpha': [0.01, 0.05, 0.1, 0.5, 1, 5]}),
        (GaussianNB, {}),
        (BaggingClassifier, {'n_estimators': [5, 10, 20]}),
        (AdaBoostClassifier, {'n_estimators': [10, 25, 50, 75],
                              'learning_rate': [0.1, 0.5, 1, 5]}),
        (RandomForestClassifier, {}),
        (GradientBoostingClassifier, {}),
        (KNeighborsClassifier, {}),
    ]
}

In [70]:
def compute_model_performance(list_models_GS, X, y, scoring, train=True, cv=5):
    """
    Cross-validate every model and tabulate the mean of each reported quantity.

    Parameters
    ----------
    list_models_GS : iterable of estimator instances
        Each one is passed as-is to `cross_validate`.
    X, y
        Data and targets, forwarded unchanged to `cross_validate`.
    scoring
        Forwarded to `cross_validate` as `scoring`.
    train : bool, default True
        Forwarded as `return_train_score`.
    cv : default 5
        Forwarded to `cross_validate` as `cv`. Exposed as a parameter (it used
        to be hard-coded) so it can be kept in step with `compute_GS`.

    Returns
    -------
    pandas.DataFrame
        One row per model, labelled with the model's class name, and one column
        per key of the `cross_validate` result. Each cell is the mean of that
        key's per-fold values.
    """
    records = []
    idx_names = []
    for model in list_models_GS:
        idx_names.append(type(model).__name__)
        scores = cross_validate(model, X, y, scoring=scoring, cv=cv, return_train_score=train)
        # Collapse the per-fold arrays into one mean value per reported key.
        records.append({key: values.mean() for key, values in scores.items()})
    return pd.DataFrame(records, index=idx_names)

In [100]:
# Estimator classes (not instances) handed to the grid search below.
list_models = [RidgeClassifier, BaggingClassifier]
def compute_GS(list_models, X, y, list_parameters, scoring = "accuracy", cv=5):
    """
    Build one estimator per class in `list_models`, tuned by grid search when a
    parameter grid is available for it.

    Parameters
    ----------
    list_models : iterable of estimator classes
        Classes, not instances; each must be constructible without arguments.
    X, y
        Data and targets handed to the grid search.
    list_parameters : dict
        Maps a class name (`cls.__name__`) to its parameter grid. A missing
        entry is treated like an empty grid: the class is not tuned.
    scoring : default "accuracy"
        Forwarded to the grid search.
    cv : default 5
        Forwarded to the grid search.

    Returns
    -------
    list
        New, unfitted instances in the order of `list_models`: `cls(**best)`
        for tuned classes, `cls()` for the others.
    """
    list_models_GS = []
    for model in list_models:
        # `.get` so that a class without a grid falls back to its defaults
        # instead of raising KeyError.
        params = list_parameters.get(model.__name__, {})
        if params:
            best = compute_GS_one_model(model(), X, y, params, scoring=scoring, cv=cv)
            list_models_GS.append(model(**best))
        else:
            list_models_GS.append(model())
    return list_models_GS

def compute_GS_one_model(model, X, y, params, scoring = "accuracy", cv=5):
    """
    Grid-search a single estimator and return its best parameter setting.

    A GridSearchCV is built for `model` over `params` with the given `scoring`
    and `cv`, fitted on (X, y), and its `best_params_` is returned.
    """
    search = GridSearchCV(model, params, scoring = scoring, cv=cv)
    # fit() hands back the fitted search object, so the lookup can be chained.
    return search.fit(X, y).best_params_

In [101]:
# Grid-search the classes in list_models on (X, y) with compute_GS's default
# scoring ("accuracy") and cv (5); the result is a list of new estimator
# instances built with the best parameters found.
list_models_GS = compute_GS(list_models, X, y, list_parameters)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [106]:
# Cross-validate the default-parameter models; train=False leaves the
# training-set scores out of the result.
# NOTE(review): this evaluates list_models_default_params, not the tuned
# list_models_GS produced by the grid search. Confirm that is intended.
r = compute_model_performance(list_models_default_params, X, y, scoring, train=False)



In [107]:
# Show the per-model table of mean cross-validation results.
r

Unnamed: 0,fit_time,score_time,test_accuracy,test_f1_score,test_precision,test_recall
SVC,3.097833,0.85935,0.815667,0.815527,0.816385,0.815667
RidgeClassifier,0.069549,0.017782,0.860667,0.86062,0.861034,0.860667
GaussianNB,0.03145,0.020432,0.677,0.676525,0.678399,0.677
BaggingClassifier,17.091623,0.021128,0.685667,0.684003,0.690434,0.685667
AdaBoostClassifier,13.793446,0.039683,0.724667,0.724596,0.724939,0.724667
RandomForestClassifier,6.37714,0.037848,0.738667,0.738522,0.739269,0.738667
GradientBoostingClassifier,79.242125,0.019601,0.76,0.759875,0.760452,0.76
KNeighborsClassifier,0.026877,0.167947,0.640667,0.638894,0.6441,0.640667
