In [2]:
from __future__ import print_function

import os
import sys
sys.path.append('/home/jogi/git/repository/smart_play_set')
from itertools import compress

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import fft, arange, fftpack

from utils.smart_utils import get_dir_path, tensor_to_pandas, load_hdf5_file
from utils.smart_utils import get_array_filenames, split_on_classes, create_row_mask
from preprocessing.process_data import get_data

from utils.plot_utils import plot_spectra_1axis, plot_spectra_3axis
from utils.plot_utils import plot_3axis_raw_signal_1, plot_3axis_raw_signal_compare
from utils.plot_utils import single_file_plots, load_file_to_pandas

from sklearn.preprocessing import normalize
from sklearn.metrics import r2_score, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation
from sklearn.model_selection import cross_val_predict
from ReliefF import ReliefF
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
from sklearn.pipeline import make_pipeline

%matplotlib inline 


In [2]:
train_data_1, train_labels_1, dta_dict_1 = get_data('20160921', force=False, apply_window_func=True, calc_mag=True,
                                              extra_label="20hz_1axis_f2.01.5_12f_207_12_1", optimal_w_size=False,
                                                   f_type='lowhigh', lowcut=2, highcut=0.1, b_order=5)
                                     
                # calc_mag=True, f_type="lowhigh", lowcut=2, highcut=0.1, b_order=5)
        
# for cross validation train data has to have 2 dim tensor
train_data_1_r = np.reshape(train_data_1, (train_data_1.shape[0], train_data_1.shape[1] * train_data_1.shape[2]))
# for cross validation train labels has to have 1 dim tensor
train_labels_1_r = np.reshape(train_labels_1, (train_labels_1.shape[0]))
print(train_data_1_r.shape)
print(train_labels_1_r.shape)
print(dta_dict_1["features"])
print(dta_dict_1["filter"])
print(dta_dict_1["filter_specs"])
print(dta_dict_1["window_func"])
num_of_features_1 = len(dta_dict_1["features"])
features1 = dta_dict_1["features"]
num_of_windows1 = dta_dict_1["num_of_windows"]

INFO - Used data label 20160921_futurocube_roadrunner_20hz_1axis_f2.01.5_12f_207_12_1
INFO Loading matrices from h5 file /home/jogi/git/repository/smart_play_set/data/futurocube/roadrunner/20160921_futurocube_roadrunner_20hz_1axis_f2.01.5_12f_207_12_1.h5
('INFO - List of arrays in this file: \n', [u'feature_data', u'label_data'])
INFO - Loading data description from json.
(207, 12)
(207,)
[u'minf', u'maxf', u'mean', u'std', u'median', u'range', u'rms', u'int_squared_jerk', u'dc', u'energy', u'power_spec_entropy', u'dxdy_error']
lowhigh
[2, 0.1, 5]
True


In [31]:
train_data_2, train_labels_2, dta_dict_2 = get_data('20160921', force=False, apply_window_func=True, calc_mag=True,
                                              extra_label="20hz_1axis_f2.01.5_10f_45_10_1", optimal_w_size=False,
                                                   f_type='lowhigh', lowcut=2, highcut=0.1, b_order=5)

# for cross validation train data has to have 2 dim tensor
train_data_2_r = np.reshape(train_data_2, (train_data_2.shape[0], train_data_2.shape[1] * train_data_2.shape[2]))
# for cross validation train labels has to have 1 dim tensor
train_labels_2_r = np.reshape(train_labels_2, (train_labels_2.shape[0]))
print(train_data_2_r.shape)
print(train_labels_2_r.shape)

print(dta_dict_2["features"])
print(dta_dict_2["filter"])
print(dta_dict_2["filter_specs"])
print(dta_dict_2["window_func"])
num_of_features_2 = len(dta_dict_2["features"])
features2 = dta_dict_2["features"]
num_of_windows2 = dta_dict_2["num_of_windows"]

INFO - Used data label 20160921_futurocube_roadrunner_20hz_1axis_f2.01.5_10f_45_10_1
INFO Loading matrices from h5 file /home/jogi/git/repository/smart_play_set/data/futurocube/roadrunner/20160921_futurocube_roadrunner_20hz_1axis_f2.01.5_10f_45_10_1.h5
('INFO - List of arrays in this file: \n', [u'feature_data', u'label_data'])
INFO - Loading data description from json.
(45, 10)
(45,)
[u'min', u'max', u'mean', u'std', u'median', u'rms', u'range', u'dc', u'energy', u'power_spec_entropy']
lowhigh
[2, 0.1, 5]
True


In [5]:
use_1 = True
apply_mask = False

if use_1:
    print("Use set train_data_1 !")
    dta_train = train_data_1_r
    lbl_train = train_labels_1_r
    num_of_features = num_of_features_1
    features = features1
    num_of_windows = num_of_windows1
else:
    print("Use set train_data_2 !")
    dta_train = train_data_2_r
    lbl_train = train_labels_2_r
    num_of_features = num_of_features_2
    features = features2
    num_of_windows = num_of_windows2

for d in np.arange(num_of_features):
    idx = d+1
    fs = ReliefF(n_neighbors=10, n_features_to_keep=idx)
    X_train_2 = fs.fit_transform(dta_train, lbl_train)
    X_train_2 = fs.transform(dta_train)
    f_list = [features[i] for i in fs.top_features]
    print("--------------------------------------------------------------")
    print("Keep %d, top feature list: %s" % (idx, ', '.join(f_list[:idx])))

    if apply_mask:
        multiplier = int(X_train_2.shape[0]/num_of_windows)
        b_mask = create_row_mask([False, False, True, False, True], multiplier)
        X_train_2 = X_train_2[b_mask, :]
        lbl_train_subset = lbl_train[b_mask]
    else:
        lbl_train_subset = lbl_train
    
    clf = svm.SVC(kernel='rbf', C=1)
    scores_svm = cross_validation.cross_val_score(clf, X_train_2, lbl_train_subset, cv=12)
    # predicted = cross_val_predict(clf, X_train_f, train_labels_f_r, cv=8)
    # print(predicted)
    # scores = accuracy_score(train_labels_r, predicted) 
    print("SVM - Accuracy: %0.2f (+/- %0.2f)" % (scores_svm.mean(), scores_svm.std() * 2))
    
    rfc = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4)
    scores_rfc = cross_validation.cross_val_score(clf, X_train_2, lbl_train_subset, cv=12)
    print("rfc - Accuracy: %0.2f (+/- %0.2f)" % (scores_rfc.mean(), scores_rfc.std() * 2))
    
    xgb = XGBClassifier(learning_rate=0.1, n_estimators=300, max_depth=5, min_child_weight=2, gamma=0.1,
                         subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4,
                         scale_pos_weight=1, seed=27)
    scores_xgb = cross_validation.cross_val_score(clf, X_train_2, lbl_train_subset, cv=12)
    print("xgb - Accuracy: %0.2f (+/- %0.2f)" % (scores_xgb.mean(), scores_xgb.std() * 2))
    
    gnb = GaussianNB()
    scores_gnb = cross_validation.cross_val_score(gnb, X_train_2, lbl_train_subset, cv=12)
    print("gnb - Accuracy: %0.2f (+/- %0.2f)" % (scores_gnb.mean(), scores_gnb.std() * 2))

Use set train_data_1 !
--------------------------------------------------------------
Keep 1, top feature list: minf
SVM - Accuracy: 0.58 (+/- 0.29)
rfc - Accuracy: 0.58 (+/- 0.29)
xgb - Accuracy: 0.58 (+/- 0.29)
gnb - Accuracy: 0.59 (+/- 0.22)
--------------------------------------------------------------
Keep 2, top feature list: minf, maxf
SVM - Accuracy: 0.60 (+/- 0.32)
rfc - Accuracy: 0.60 (+/- 0.32)
xgb - Accuracy: 0.60 (+/- 0.32)
gnb - Accuracy: 0.55 (+/- 0.26)
--------------------------------------------------------------
Keep 3, top feature list: minf, maxf, range
SVM - Accuracy: 0.61 (+/- 0.32)
rfc - Accuracy: 0.61 (+/- 0.32)
xgb - Accuracy: 0.61 (+/- 0.32)
gnb - Accuracy: 0.56 (+/- 0.23)
--------------------------------------------------------------
Keep 4, top feature list: minf, maxf, range, dxdy_error
SVM - Accuracy: 0.56 (+/- 0.34)
rfc - Accuracy: 0.56 (+/- 0.34)
xgb - Accuracy: 0.56 (+/- 0.34)
gnb - Accuracy: 0.60 (+/- 0.20)
---------------------------------------------

In [5]:
use_1 = True 

if use_1:
    print("Use set train_data_1 !")
    dta_train = train_data_1_r
    lbl_train = train_labels_1_r
    num_of_features = num_of_features_1
    features = features1
    num_of_windows = num_of_windows1
else:
    print("Use set train_data_2 !")
    dta_train = train_data_2_r
    lbl_train = train_labels_2_r
    num_of_features = num_of_features_2
    features = features2
    num_of_windows = num_of_windows2

for d in np.arange(1, num_of_features):
    clf = svm.SVC(kernel='linear', C=1)
    selector = RFE(clf, n_features_to_select=d, step=1)
    selector = selector.fit(dta_train, lbl_train)
    support = selector.support_
    feature_ranks = list(compress(features, support))
    print("Keep feature(s) %s" % ", ".join(feature_ranks))
    print(support)
    ranking = selector.ranking_
    print(ranking)
    dta_train_subset = dta_train[:, support]
    multiplier = int(dta_train_subset.shape[0]/num_of_windows)
    b_mask = create_row_mask([True, False, True, False, True], multiplier)
    dta_train_subset = dta_train_subset[b_mask, :]
    lbl_train_subset = lbl_train[b_mask]
    print(dta_train_subset.shape)
    
    clf = svm.SVC(kernel='rbf', C=1)
    scores_svm = cross_validation.cross_val_score(clf, dta_train_subset, lbl_train_subset, cv=12)
    print("SVM - Accuracy: %0.2f (+/- %0.2f)" % (scores_svm.mean(), scores_svm.std() * 2))
    print("------------------------------------------------------------------")

Use set train_data_1 !
Keep feature(s) std
[False False False  True False False False False False]
[3 2 6 1 9 8 4 7 5]
(27, 1)
SVM - Accuracy: 0.54 (+/- 0.43)
------------------------------------------------------------------
Keep feature(s) maxf, std
[False  True False  True False False False False False]
[2 1 5 1 8 7 3 6 4]
(27, 2)
SVM - Accuracy: 0.61 (+/- 0.55)
------------------------------------------------------------------
Keep feature(s) minf, maxf, std
[ True  True False  True False False False False False]
[1 1 4 1 7 6 2 5 3]
(27, 3)
SVM - Accuracy: 0.60 (+/- 0.40)
------------------------------------------------------------------
Keep feature(s) minf, maxf, std, dc
[ True  True False  True False False  True False False]
[1 1 3 1 6 5 1 4 2]
(27, 4)
SVM - Accuracy: 0.82 (+/- 0.44)
------------------------------------------------------------------
Keep feature(s) minf, maxf, std, dc, power_spec_entropy
[ True  True False  True False False  True False  True]
[1 1 2 1 5 4 1 3 1]

In [11]:


for d in np.arange(1, num_of_features):
    # ANOVA SVM-C
    # 1) anova filter, take X best ranked features, f_classif, mutual_info_classif
    anova_filter = SelectKBest(f_classif, k=d)
    
    dta_train_subset = anova_filter.fit_transform(dta_train, lbl_train)
    print(dta_train_subset.shape)
    # keep selected feature names
    feature_names = [features[i] for i
                             in anova_filter.get_support(indices=True)]
    print("features to keep %s" % ', '.join(feature_names))
    
    # 2) svm
    clf = svm.SVC(kernel='rbf', C=1)
    scores_svm = cross_validation.cross_val_score(clf, dta_train_subset, lbl_train, cv=12)
    print("SVM - Accuracy: %0.2f (+/- %0.2f)" % (scores_svm.mean(), scores_svm.std() * 2))
    print("------------------------------------------------------------------")


(45, 1)
features to keep std
SVM - Accuracy: 0.73 (+/- 0.36)
------------------------------------------------------------------
(45, 2)
features to keep minf, std
SVM - Accuracy: 0.71 (+/- 0.44)
------------------------------------------------------------------
(45, 3)
features to keep minf, std, energy
SVM - Accuracy: 0.75 (+/- 0.33)
------------------------------------------------------------------
(45, 4)
features to keep minf, maxf, std, energy
SVM - Accuracy: 0.75 (+/- 0.26)
------------------------------------------------------------------
(45, 5)
features to keep minf, maxf, std, median, energy
SVM - Accuracy: 0.72 (+/- 0.31)
------------------------------------------------------------------
(45, 6)
features to keep minf, maxf, std, median, energy, power_spec_entropy
SVM - Accuracy: 0.77 (+/- 0.36)
------------------------------------------------------------------
(45, 7)
features to keep minf, maxf, mean, std, median, energy, power_spec_entropy
SVM - Accuracy: 0.73 (+/- 0.42)
-