Tune
=====
***

## Importing modules
This notebook trains and tests a model and dataset of your choice with various parameters using scikit-learn's `RandomizedSearchCV` and `GridSearchCV` functions to attempt to optimize the hyperparameters of the model. It stores the results into a separate file.

The first step is to import the modules needed for calculation and data processing.
* `numpy` is necessary for loading the dataset chosen
* `sklearn.model_selection.GridSearchCV` and `RandomizedSearchCV` are the functions from scikit-learn that test a classifier with various parameters and returns the set of best parameters.
* `time` is used for timing the train and test time for a classifier
* `sklearn.metrics.accuracy_score`, `precision_score`, and `f1_score` are used to evalutate how well the optimized classifier performs
* `scipy` is used to generate some ranges for the parameters in some classifiers

In [24]:
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time
from sklearn.metrics import accuracy_score, precision_score, f1_score
import scipy

Decide which dataset to use

In [25]:
import ipynb.fs.full.TrainTest as TrainTest

p = 29
a = [2]
w = 4
cw = 9

path = './data/created_UniMiB-SHAR/nperseg=sqrt/'
file = 'p' + str(p) + '_a' + str(a) + '_w' + str(w) + '_cw' + str(cw)

# path = './data/created_collected/'
# file = 'w' + str(w) + '_cw' + str(cw)
ext = '.npy'

data = np.load(path + file + ext)
print(data)

print('datapoints: {}'.format(len(data)))
true_count = 0
for d in data:
    if d[1]:
        true_count += 1
print('num true datapoints: {}'.format(true_count))

x_train, y_train, x_test, y_test = TrainTest.get_train_test(data, 0.8)

[[list([0.043889873807057334, 0.02790274654327562, 0.056384859069108645, 0.028407378540714956, 0.029909518877672837, 0.06268918870596274, 0.06268918870596274])
  True]
 [list([0.038555549796496724, 0.034047958172445165, 0.07140860072408488, 0.030144429004604824, 0.023216501502549575, 0.048193072266976236, 0.048193072266976236])
  True]
 [list([0.016550405734261166, 0.0412423178052609, 0.07633630255419359, 0.037685888869649437, 0.03076987826785081, 0.07096448790532905, 0.07096448790532905])
  True]
 ...
 [list([0.21525735417382894, 0.07533349550830877, 0.03617445176176007, 0.07787616616811044, 0.07281928942714669, 0.0384134186245445, 0.0384134186245445])
  True]
 [list([0.22803877724256935, 0.07675640546227899, 0.0458651909639872, 0.07879921538939309, 0.06205415748936275, 0.050775977163007815, 0.050775977163007815])
  True]
 [list([0.219261816034031, 0.0898163263768038, 0.041367862295759075, 0.08885118735403907, 0.07084578181871606, 0.05663263087053835, 0.05663263087053835])
  True]]
da

Function for getting the best parameters for a classifier

In [26]:
def getBestParams(clf, param_dict, search='random', n_iter=100, cv=3):
    if search == 'random':
        clf_search = RandomizedSearchCV(
            estimator = clf, 
            param_distributions = param_dict, 
            n_iter = n_iter, 
            cv = cv,
            verbose = 1,
            n_jobs = -1
        )
    elif search == 'grid':
        clf_search = GridSearchCV(
            estimator = clf,
            param_grid = param_dict,
            cv = cv,
            verbose = 1,
            n_jobs = -1
        )
    clf_search.fit(x_train, y_train)
    return clf_search.best_params_

Function to record the results of classifier with best parameters

In [27]:
def record_results(clf, clf_name, trials=10):
    print('testing {} through {} trials'.format(clf_name, trials))
    avg_acc = 0
    avg_prec = 0
    avg_f1 = 0
    avg_train_time = 0
    avg_test_time = 0
    
    for i in range(trials):
        train_start = time.clock()
        clf.fit(x_train, y_train)
        train_end = time.clock()
        
        test_start = time.clock()
        y_pred = clf.predict(x_test)
        test_end = time.clock()
        
        avg_acc += accuracy_score(y_test, y_pred)
        avg_prec += precision_score(y_test, y_pred)
        avg_f1 += f1_score(y_test, y_pred)
        avg_train_time += (train_end - train_start)
        avg_test_time += (test_end - test_start)
        print('trial {} / {} finished\n'.format(i + 1, trials), end='\r')
    
    avg_acc /= trials
    avg_prec /= trials
    avg_f1 /= trials
    avg_train_time /= trials
    avg_test_time /= trials
    
    with open('./results/collected/' + clf_name + '/' + file + '.txt', 'a+') as f:
        f.write('(best = {})\n'.format(best))
        f.write('trials         : {}\n'.format(trials))
        f.write('avg acc        : {}\n'.format(avg_acc))
        f.write('avg prec       : {}\n'.format(avg_prec))
        f.write('avg f1         : {}\n'.format(avg_f1))
        f.write('avg_train_time : {}\n'.format(avg_train_time))
        f.write('avg_test_time  : {}\n'.format(avg_test_time))
        f.write('-----\n\n')

Define classifier and parameters to search through, then call functions

In [28]:
## NEAREST NEIGHBORS
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf_name = 'knn'
clf_options = {
    'n_neighbors' : range(1, 6),
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['auto'],
    'leaf_size' : [10, 20, 30, 40, 50],
    'p' : [1, 2, 3, 4],
    'metric' : ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
#     'metric_params' : [],
#     'n_jobs' : []
}

# ## RANDOM FOREST
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier()
# clf_name = 'rf'
# clf_options = {
#     'n_estimators' : range(100, 2001, 100),
#     # 'criterion' : [],
#     'max_features' : ['auto', 3, 4, 5],
#     'max_depth' : [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#     'min_samples_split' : [2, 5, 10],
#     'min_samples_leaf' : [1, 2, 4],
#     # 'min_weight_fraction_leaf' : [],
#     # 'max_leaf_nodes' : [],
#     # 'min_impurity_split' : [],
#     # 'min_impurity_decrease' : [],
#     'bootstrap' : [True, False]
#     # 'oob_score' : [],
# }


# ## SVM
# from sklearn.svm import SVC
# clf = SVC()
# clf_name = 'svm'
# clf_options = {
#     'C' : scipy.stats.expon(scale=100),
#     'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree' : [2, 3, 4, 5],
#     'gamma' : scipy.stats.expon(scale=.1),
#     'coef0' : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#     'probability' : [True, False],
#     'shrinking' : [True, False],
#     'tol' : [1e-3, 1e-4],
# #     'cache_size' : [],
#     'class_weight' : [None, 'balanced'],
# #     'verbose' : [],
# #     'max_iter' : [],
# #     'decision_function_shape' : []
# #     'random_state' : []
# }

### DECISION TREE
# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier()
# clf_name = 'dt'
# clf_options = {
#     'criterion' : ['gini', 'entropy'],
#     'splitter' : ['best', 'random'],
#     'max_depth' : [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#     'min_samples_split' : [2, 5, 10],
#     'min_samples_leaf' : [1, 2, 4],
# #     'min_weight_fraction_leaf' : [],
#     'max_features' : ['auto', 'log2', 3, 4, 5, None],
# #     'random_state' : [],
# #     'max_leaf_nodes' : [],
# #     'min_impurity_decrease' : [],
# #     'min_impurity_split' : [],
#     'class_weight' : [None, 'balanced'],
#     'presort' : [True, False]
# }


best = getBestParams(clf, clf_options, search='grid', cv=10)
print('done')

Fitting 10 folds for each of 800 candidates, totalling 8000 fits


[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 1024 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 2524 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 4624 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 6037 tasks      | elapsed:   56.6s
[Parallel(n_jobs=-1)]: Done 7687 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 7985 out of 8000 | elapsed:  1.5min remaining:    0.2s


done


[Parallel(n_jobs=-1)]: Done 8000 out of 8000 | elapsed:  1.5min finished


In [29]:
best.best_params_

AttributeError: 'dict' object has no attribute 'best_params_'

In [31]:
best_clf = KNeighborsClassifier(**best)
record_results(best_clf, clf_name, trials=25)
print('done')

testing knn through 25 trials
trial 1 / 25 finished
trial 2 / 25 finished
trial 3 / 25 finished
trial 4 / 25 finished
trial 5 / 25 finished
trial 6 / 25 finished
trial 7 / 25 finished
trial 8 / 25 finished
trial 9 / 25 finished
trial 10 / 25 finished
trial 11 / 25 finished
trial 12 / 25 finished
trial 13 / 25 finished
trial 14 / 25 finished
trial 15 / 25 finished
trial 16 / 25 finished
trial 17 / 25 finished
trial 18 / 25 finished
trial 19 / 25 finished
trial 20 / 25 finished
trial 21 / 25 finished
trial 22 / 25 finished
trial 23 / 25 finished
trial 24 / 25 finished
trial 25 / 25 finished
done
