In [9]:
import numpy as np
import DataProcess as DP
import FeatureCalculate as FC
import DataProcessForPCA as DPpca
import pandas as pd
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

### Prepare data:

In [2]:
# Use StandardScaler:
specchunks=[]
for i in DP.protocol_list:
    X_std, specchunk = DPpca.preprocess(i, 1000, rescale=True, scaler='standard')
    specchunks.append(np.array(specchunk))
    
chunks=[]
for i in range(len(specchunks)):
    chunks.append(DPpca.segmentation(specchunks[i],T=512,stride=512))


In [3]:
# Shuffle data:
data = np.vstack(chunks)
np.random.shuffle(data)

X = data[:,:-1]
y = data[:,-1]

# Calculate class_weight of activities:
actID, counts = np.unique(y, return_counts=True)
weight = counts/np.sum(counts)
act_weight = dict(zip(actID, weight))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Set up pipeline and gridsearch:
**Search for optimal n_components of PCA and parameters for svm classifier**

In [6]:
from tempfile import mkdtemp
from shutil import rmtree

In [12]:
# Enable caching
cachedir = mkdtemp()

estimators = [('reduce_dim', PCA()), ('clf', SVC())]
param_grid = dict(reduce_dim__n_components=[20,30,50,70,100,130,160,200], 
                  clf__C=[0.01, 0.05, 0.1, 0.3], 
                  clf__kernel=["linear", "poly"], 
                  clf__class_weight=['balanced', act_weight], 
                  clf__tol=[1, 5e-1, 3e-1, 1e-1])

logreg_estimators = [('reduce_dim', PCA()), ('clf', LogisticRegression())]
logreg_param_grid = dict(reduce_dim__n_components=[20,30,50,70,100,130,160,200],
                         clf__C=[0.1, 0.3, 1, 3, 10], 
                         clf__penalty=["l1", "l2"], 
                         clf__class_weight=["balanced", act_weight])

pipe = Pipeline(estimators, memory=cachedir)
logreg_pipe = Pipeline(logreg_estimators, memory=cachedir)

grid = GridSearchCV(pipe, cv=3, param_grid=param_grid)
logreg_grid = GridSearchCV(logreg_pipe, cv=3, param_grid=logreg_param_grid)

### Train and search for optimal parameters:

**SVM:**

In [18]:
grid.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory='C:\\Users\\ELITE8~1\\AppData\\Local\\Temp\\tmp8wv_iddq',
     steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'reduce_dim__n_components': [20, 30, 50, 70, 100, 130, 160, 200], 'clf__C': [0.01, 0.05, 0.1, 0.3], 'clf__kernel': ['linear', 'poly'], 'clf__class_weight': ['balanced', {1.0: 0.10270270270270271, 2.0: 0.097897897897897893, 3.0: 0.1012012012012012, 4.0: 0.12912912912912913, 5.0: 0.0498498...2192192199, 17.0: 0.12912912912912913, 24.0: 0.022222222222222223}], 'clf__tol': [1, 0.5, 0.3, 0.1]},
       pre_dispatch='2*n_jo

In [19]:
print("Best: %f using %s\n" % (grid.best_score_, grid.best_params_))

# means = grid.cv_results_['mean_test_score']
# stds = grid.cv_results_['std_test_score']
# params = grid.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.983483 using {'clf__C': 0.01, 'clf__class_weight': 'balanced', 'clf__kernel': 'linear', 'clf__tol': 0.5, 'reduce_dim__n_components': 100}



**Logistic Regression:**

In [13]:
logreg_grid.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory='C:\\Users\\ELITE8~1\\AppData\\Local\\Temp\\tmp8wv_iddq',
     steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fi...y='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'reduce_dim__n_components': [20, 30, 50, 70, 100, 130, 160, 200], 'clf__C': [0.1, 0.3, 1, 3, 10], 'clf__penalty': ['l1', 'l2'], 'clf__class_weight': ['balanced', {1.0: 0.10270270270270271, 2.0: 0.097897897897897893, 3.0: 0.1012012012012012, 4.0: 0.12912912912912913, 5.0: 0.049849849849849852, 6.0: 0.087687687687687685, 7.0: 0.1012012012012012, 12.0: 0.047447447447447451, 13.0: 0.039339339339339342, 16.0: 0.092192192192192199, 17.0: 0.12912912912912913, 24.0: 0.022222222222222223}]

In [15]:
print("Best: %f using %s\n" % (logreg_grid.best_score_, logreg_grid.best_params_))

Best: 0.964715 using {'clf__C': 0.3, 'clf__class_weight': 'balanced', 'clf__penalty': 'l1', 'reduce_dim__n_components': 160}



### Test with testing data:

**SVM:**

In [20]:
grid.score(X_test, y_test)

0.98648648648648651

**Logistic Regression:**

In [16]:
logreg_grid.score(X_test, y_test)

0.96846846846846846

In [21]:
# Clear the cache directory
rmtree(cachedir)