In [41]:
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time
from sklearn.metrics import accuracy_score, precision_score, f1_score

Decide which dataset to use

In [52]:
import ipynb.fs.full.TrainTest as TrainTest

# Values encoded in the name of the dataset file to load.
w = 4
cw = 9

# Alternative dataset (disabled): UniMiB-SHAR files also carry `p` and `a`.
# p = 29
# a = [2]
# path = './data/created_UniMiB-SHAR/nperseg=sqrt/'
# file = 'p' + str(p) + '_a' + str(a) + '_w' + str(w) + '_cw' + str(cw)

path = './data/created_collected/'
file = 'w{}_cw{}'.format(w, cw)
ext = '.npy'

data = np.load(path + file + ext)

# Report the dataset size and how many entries have a truthy second element.
print('datapoints: {}'.format(len(data)))
true_count = sum(1 for d in data if d[1])
print('num true datapoints: {}'.format(true_count))

# 0.8 is forwarded to the project helper -- presumably the train fraction; confirm in TrainTest.
x_train, y_train, x_test, y_test = TrainTest.get_train_test(data, 0.8)

datapoints: 51118
num true datapoints: 24559


Function for getting the best parameters for a classifier

In [43]:
def getBestParams(clf, param_dict, search='random', n_iter=100, cv=3):
    """Search ``param_dict`` for the best hyper-parameters of ``clf``.

    The search object is fitted on the notebook-level ``x_train`` and
    ``y_train`` variables, which must exist before this is called.

    Parameters
    ----------
    clf : estimator
        Estimator handed to the search object as ``estimator``.
    param_dict : dict
        Maps parameter names to candidate values. Passed as
        ``param_distributions`` for a random search and as ``param_grid``
        for a grid search.
    search : {'random', 'grid'}, default 'random'
        Which search strategy to run.
    n_iter : int, default 100
        Number of candidates to sample; only used when ``search`` is
        'random'.
    cv : default 3
        Forwarded unchanged as the ``cv`` argument of the search object.

    Returns
    -------
    dict
        The ``best_params_`` attribute of the fitted search object.

    Raises
    ------
    ValueError
        If ``search`` is neither 'random' nor 'grid'.
    """
    if search == 'random':
        # n_jobs is intentionally not passed here, so the random search uses
        # the library default rather than n_jobs=-1 like the grid search.
        clf_search = RandomizedSearchCV(
            estimator=clf,
            param_distributions=param_dict,
            n_iter=n_iter,
            cv=cv,
            verbose=1,
        )
    elif search == 'grid':
        clf_search = GridSearchCV(
            estimator=clf,
            param_grid=param_dict,
            cv=cv,
            verbose=1,
            n_jobs=-1,
        )
    else:
        # Without this branch an unknown mode fell through to the fit call
        # and failed with an UnboundLocalError on `clf_search`.
        raise ValueError(
            "search must be 'random' or 'grid', got {!r}".format(search)
        )

    clf_search.fit(x_train, y_train)
    return clf_search.best_params_

Function to record the results of the classifier with the best parameters

In [44]:
def record_results(clf, clf_name, trials=10):
    """Fit and score ``clf`` repeatedly and append the averages to a results file.

    Each trial fits ``clf`` on the notebook-level ``x_train`` / ``y_train``,
    predicts on ``x_test`` and scores the predictions against ``y_test``.
    Accuracy, precision, F1, training time and prediction time are averaged
    over all trials and appended to
    ``./results/collected/<clf_name>/<file>.txt``, where ``file`` is the
    notebook-level dataset name. The notebook-level ``best`` value is written
    as the first line of the record.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    clf_name : str
        Name of the sub-directory of ``./results/collected/`` to write into.
    trials : int, default 10
        Number of fit/predict repetitions to average over; must be >= 1.

    Raises
    ------
    ValueError
        If ``trials`` is less than 1 (the averages would be undefined).
    """
    if trials < 1:
        raise ValueError(
            'trials must be at least 1, got {!r}'.format(trials)
        )

    total_acc = 0
    total_prec = 0
    total_f1 = 0
    total_train_time = 0
    total_test_time = 0

    for _ in range(trials):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # high-resolution replacement for measuring elapsed intervals.
        train_start = time.perf_counter()
        clf.fit(x_train, y_train)
        train_end = time.perf_counter()

        test_start = time.perf_counter()
        y_pred = clf.predict(x_test)
        test_end = time.perf_counter()

        total_acc += accuracy_score(y_test, y_pred)
        total_prec += precision_score(y_test, y_pred)
        total_f1 += f1_score(y_test, y_pred)
        total_train_time += (train_end - train_start)
        total_test_time += (test_end - test_start)

    avg_acc = total_acc / trials
    avg_prec = total_prec / trials
    avg_f1 = total_f1 / trials
    avg_train_time = total_train_time / trials
    avg_test_time = total_test_time / trials

    # Append so that earlier records for the same dataset are preserved.
    with open('./results/collected/' + clf_name + '/' + file + '.txt', 'a+') as f:
        f.write('(best = {})\n'.format(best))
        f.write('trials         : {}\n'.format(trials))
        f.write('avg acc        : {}\n'.format(avg_acc))
        f.write('avg prec       : {}\n'.format(avg_prec))
        f.write('avg f1         : {}\n'.format(avg_f1))
        f.write('avg_train_time : {}\n'.format(avg_train_time))
        f.write('avg_test_time  : {}\n'.format(avg_test_time))
        f.write('-----\n\n')

Define classifier and parameters to search through, then call functions

In [45]:
## NEAREST NEIGHBORS
from sklearn.neighbors import KNeighborsClassifier

clf_name = 'knn'
clf = KNeighborsClassifier()

# Candidate values per KNeighborsClassifier parameter for the search to explore.
clf_options = dict(
    n_neighbors=range(1, 6),
    weights=['uniform', 'distance'],
    algorithm=['auto'],
    leaf_size=[10, 20, 30, 40, 50],
    p=[1, 2, 3, 4],
    metric=['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
    # 'metric_params' and 'n_jobs' are not searched.
)

# ## RANDOM FOREST
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier()
# clf_name = 'rf'
# clf_options = {
#     'n_estimators' : range(100, 2001, 100),
#     # 'criterion' : [],
#     'max_features' : ['auto', 3, 4, 5],
#     'max_depth' : [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#     'min_samples_split' : [2, 5, 10],
#     'min_samples_leaf' : [1, 2, 4],
#     # 'min_weight_fraction_leaf' : [],
#     # 'max_leaf_nodes' : [],
#     # 'min_impurity_split' : [],
#     # 'min_impurity_decrease' : [],
#     'bootstrap' : [True, False]
#     # 'oob_score' : [],
# }


### SVM
# from sklearn.svm import SVC
# clf = SVC()
# clf_name = 'svm'
# clf_options = {
#     'C' : scipy.stats.expon(scale=100),
#     'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree' : [2, 3, 4, 5],
#     'gamma' : scipy.stats.expon(scale=.1),
#     'coef0' : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#     'probability' : [True, False],
#     'shrinking' : [True, False],
#     'tol' : [1e-3, 1e-4],
# #     'cache_size' : [],
#     'class_weight' : [None, 'balanced'],
# #     'verbose' : [],
# #     'max_iter' : [],
# #     'decision_function_shape' : []
# #     'random_state' : []
# }

### DECISION TREE
# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier()
# clf_name = 'dt'
# clf_options = {
#     'criterion' : ['gini', 'entropy'],
#     'splitter' : ['best', 'random'],
#     'max_depth' : [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#     'min_samples_split' : [2, 5, 10],
#     'min_samples_leaf' : [1, 2, 4],
# #     'min_weight_fraction_leaf' : [],
#     'max_features' : ['auto', 'log2', 3, 4, 5, None],
# #     'random_state' : [],
# #     'max_leaf_nodes' : [],
# #     'min_impurity_decrease' : [],
# #     'min_impurity_split' : [],
#     'class_weight' : [None, 'balanced'],
#     'presort' : [True, False]
# }


# Randomised search with 5-fold cross-validation; n_iter keeps its default.
best = getBestParams(clf=clf, param_dict=clf_options, search='random', cv=5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  8.2min finished


In [46]:
# Rebuild the tuned estimator from the class of whichever classifier was
# selected in the configuration cell, rather than hard-coding k-NN, so this
# cell stays correct when `clf` is switched to another model.
best_clf = type(clf)(**best)
record_results(best_clf, clf_name)
print('done')

done
