## Prepare work

In [1]:
import os
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# our own pipeline
from pipelines.data_prapare import read_power_band_txt,read_features_table, read_signal_data
from pipelines.ml_functions import prepare_signals,set_seed, clean_all_feature_table, print_performance

In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import precision_recall_fscore_support


import warnings
warnings.filterwarnings("ignore")

In [3]:
set_seed(42)

Read data

In [4]:
aat_vis, aat_img, asl_vis, asl_img = read_features_table()
# bp_data_dict = read_power_band_txt()

# 26 * 32 = 832 data
labels_1 = np.array(aat_vis['label_index'])
# 26 * 32 * 2= 1664 data
labels_2 = np.concatenate((labels_1, labels_1), axis=0)


# for the feature analyse
col_name = list(asl_img.columns)[2:]
# col_name

In [5]:
# read and clean data
aat_vis = clean_all_feature_table(aat_vis.iloc[:, 2:])
aat_img = clean_all_feature_table(aat_img.iloc[:, 2:])
asl_vis = clean_all_feature_table(asl_vis.iloc[:, 2:])
asl_img = clean_all_feature_table(asl_img.iloc[:, 2:])

redefine our functions

In [6]:
def model_evaluation_dict(x, y, model, model_name, params, Test_size):
    """
    Perform 10 fold crossvalidation, fit model with train data and evaluate its performance
    return performance dict

    """

    clf = GridSearchCV(model, params, cv=10)

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=Test_size)
    clf.fit(X_train, y_train)
    params = clf.best_params_

    Training_score = clf.score(X_train, y_train)
    Score = clf.score(X_test, y_test)
    Whole_score = clf.score(x, y)
    cvres = clf.cv_results_
    cvscore = cvres['mean_test_score'][clf.best_index_]
    macro_precision, macro_recall, macro_f1_score, macro_support = \
        precision_recall_fscore_support(y_test, clf.predict(X_test), average='macro')
    micro_precision, micro_recall, micro_f1_score, micro_support = \
        precision_recall_fscore_support(y_test, clf.predict(X_test), average='micro')
    if not params:
        # empty params dict
        params = 'default'
    # return a dictionary
    d_info = {'Classifier': model_name, 'param': params, 'Traing score': Training_score, ' Test Score': Score,
              'Whole score': Whole_score, 'CV Score': cvscore,
              'Precision(Macro)': macro_precision, 'Precision(Micro)': micro_precision,
              'Recall(Macro)': macro_recall, 'Recall(Micro)': micro_recall,
              'F1 Score(Macro)': macro_f1_score, 'F1 Score(Micro)': micro_f1_score}

    return d_info

In [7]:
def init_classifiers():
    """
    Initialize our machine learning classifier ---
    where catboost and NN (neural network classification) are not initialized,
    and most hyperparameters will take default values

    """

    model_names = ['SVM', 'LR', 'KNN', 'GBDT', 'DT', 'AdaB', 'RF', 'XGB', 'LGB', 'Catboost', 'NN']

    # the training parameters of each model
    param_grid_svc = [{}]
    param_grid_logistic = [{'C': [0.1], 'penalty': ['l1', 'l2']}]
    param_grid_knn = [{}, {'n_neighbors': list(range(3, 8))}]
    param_grid_gbdt = [{}]
    param_grid_tree = [{}]
    param_grid_boost = [{}]
    param_grid_rf = [{}]
    param_grid_xgb = [{}]
    param_grid_lgb = [{}]

    return ([(SVC(), model_names[0], param_grid_svc),
             (LogisticRegression(), model_names[1], param_grid_logistic),
             (KNeighborsClassifier(), model_names[2], param_grid_knn),
             (RandomForestClassifier(), model_names[6], param_grid_rf)])

## Validation LDA

If we summarize the LDA idea in one sentence, that is, "the intra-class variance is the smallest after projection, and the inter-class variance is the largest".

How to validate LDA are a real working method?

* In fact, in the previous notebook, we train our LDA by all the data and then split this new features into training set and test set. But for a real word scenario, we should just use training set to train LDA machine, and when we have a new unfamiliar data, we use our LDA machine project this new data, and send it to the classifier!

In [29]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# print(random_grid)

In [39]:
lda = LinearDiscriminantAnalysis(n_components=24)

data_allfeature = np.concatenate((aat_img, aat_vis), axis=0) 
x_train,x_test,y_train,y_test = train_test_split(data_allfeature,labels_2,test_size=0.3)

lda.fit(x_train, y_train)
x_train_new = lda.transform(x_train)

In [40]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, 
                               cv = 5, verbose=2, random_state=13, n_jobs = -1)
rf_random.fit(x_train_new, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 21, 32, 43, 54, 65,
                                                      76, 87, 98, 110, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 3, 4],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=13, verbose=2)

In [41]:
# print(rf_random.best_params_)
best_random_rf = rf_random.best_estimator_
x_test_new = lda.transform(x_test)
best_random_rf.score(x_test_new, y_test)

0.046

In [42]:
accuracy, f1_w, recall_w, precision_w,  f1, recall, precision = print_performance(y_test, best_random_rf.predict(x_test_new))

accuracy: 0.046
f1 score macro av: 0.028
recall score macro av: 0.052
precision score macro av: 0.024
f1 score for every class:  [0.         0.07142857 0.17647059 0.07272727 0.         0.
 0.04       0.         0.         0.         0.         0.
 0.         0.         0.07142857 0.12698413 0.         0.
 0.         0.         0.         0.         0.         0.05084746
 0.12631579 0.        ]
recall:  [0.         0.05555556 0.15789474 0.11764706 0.         0.
 0.11764706 0.         0.         0.         0.         0.
 0.         0.         0.11111111 0.17391304 0.         0.
 0.         0.         0.         0.         0.         0.21428571
 0.4        0.        ]
precision:  [0.         0.1        0.2        0.05263158 0.         0.
 0.02409639 0.         0.         0.         0.         0.
 0.         0.         0.05263158 0.1        0.         0.
 0.         0.         0.         0.         0.         0.02884615
 0.075      0.        ]
 


In [43]:
feat_importance = best_random_rf.feature_importances_

In [44]:
feat_importance

array([0.06929422, 0.06237054, 0.05761907, 0.04796602, 0.05130749,
       0.05843308, 0.04654299, 0.05012194, 0.04249871, 0.04592949,
       0.04656026, 0.03818365, 0.03620287, 0.03880467, 0.04350782,
       0.03897015, 0.02787916, 0.03388112, 0.02378562, 0.03990698,
       0.02457999, 0.02900164, 0.02424741, 0.02240507])

So we can find this is still a bad method!!!

We didn't get anything in the previous notebook....

Let's check if this results will be good for just one type data!

In [45]:
lda = LinearDiscriminantAnalysis(n_components=24)

# data_allfeature = np.concatenate((aat_img, aat_vis), axis=0) 
x_train,x_test,y_train,y_test = train_test_split(aat_vis,labels_1,test_size=0.3)

lda.fit(x_train, y_train)
x_train_new = lda.transform(x_train)

In [46]:
rf = RandomForestClassifier() 
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, 
                               cv = 5, verbose=2, random_state=13, n_jobs = -1)
rf_random.fit(x_train_new, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 21, 32, 43, 54, 65,
                                                      76, 87, 98, 110, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 3, 4],
                                        'n_estimators': [20, 240, 460, 680, 900,
                                                         1120, 1340, 1560, 1780,
                                                         2000]},
                   random_state=13, verbose=2)

In [47]:
# print(rf_random.best_params_)
best_random_rf = rf_random.best_estimator_

x_test_new = lda.transform(x_test)
best_random_rf.score(x_test_new, y_test)

0.048

In [48]:
accuracy, f1_w, recall_w, precision_w,  f1, recall, precision = print_performance(y_test, best_random_rf.predict(x_test_new))

accuracy: 0.048
f1 score macro av: 0.043
recall score macro av: 0.047
precision score macro av: 0.048
f1 score for every class:  [0.         0.2        0.         0.28571429 0.         0.
 0.         0.16666667 0.         0.         0.         0.
 0.04651163 0.04651163 0.         0.         0.         0.
 0.         0.1        0.         0.         0.         0.19047619
 0.07407407 0.        ]
recall:  [0.         0.15384615 0.         0.22222222 0.         0.
 0.         0.15384615 0.         0.         0.         0.
 0.09090909 0.125      0.         0.         0.         0.
 0.         0.2        0.         0.         0.         0.18181818
 0.09090909 0.        ]
precision:  [0.         0.28571429 0.         0.4        0.         0.
 0.         0.18181818 0.         0.         0.         0.
 0.03125    0.02857143 0.         0.         0.         0.
 0.         0.06666667 0.         0.         0.         0.2
 0.0625     0.        ]
 


### After check and validation, this is not a real good and correct method!!!

But I have an assumption, if we let our DL models directly learn those LDA features, will this let our learning process more easy?