In [1]:
##############################################################
#                                                            #
#    Mark Hoogendoorn and Burkhardt Funk (2017)              #
#    Machine Learning for the Quantified Self                #
#    Springer                                                #
#    Chapter 7                                               #
#                                                            #
##############################################################

import os
import copy
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import time
start = time.time()

from sklearn.model_selection import train_test_split

from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning
from Chapter7.LearningAlgorithms import ClassificationAlgorithms
from Chapter7.LearningAlgorithms import RegressionAlgorithms
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.Evaluation import RegressionEvaluation
from Chapter7.FeatureSelection import FeatureSelectionClassification
from Chapter7.FeatureSelection import FeatureSelectionRegression
from util import util
from util.VisualizeDataset import VisualizeDataset
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Read the result from the previous chapter, and make sure the index is of the type datetime.
DATA_PATH = Path('./datasets/group47/dataset/intermediate_datafiles/')
DATASET_FNAME = 'chapter5_group47_result.csv'
RESULT_FNAME = 'chapter7_group47_classification_result.csv'
EXPORT_TREE_PATH = Path('./figures/crowdsignals_ch7_group47_classification/')

# Next, we declare the parameters we'll use in the algorithms.
N_FORWARD_SELECTION = 50

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset()#__file__)

# Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task.

# We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data
# for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove
# cases where we do not know the label.

prepare = PrepareDatasetForLearning()

# Check if your dataset is empty
print("Shape of the dataset: ", dataset.shape)

# Check if your dataset is empty
if dataset.empty:
    print("Dataset is empty.")
else:
    print("Dataset is not empty.")

# Check the shape of your dataset
print("Shape of the dataset: ", dataset.shape)

# Check the number of instances in each class
class_counts = dataset['cluster'].value_counts()
print("Class counts: ", class_counts)

# Identify the least populated class
least_populated_class = class_counts.idxmin()
print("Least populated class: ", least_populated_class)

# Check if the least populated class has less than 2 members
if class_counts.min() < 2:
    print(f"The class '{least_populated_class}' has only {class_counts.min()} member, which is too few. The minimum number of groups for any class cannot be less than 2.")
    exit()




# Now, try the split
try:
    labels = ['labelCycling', 'labelStairs', 'labelWalking', 'labelSitting', 'labelOther']
    # Check the number of instances for each label
    for label in labels:
        label_counts = dataset[label].value_counts()
        print(f"Counts for {label}: ", label_counts)

    # binary_labels = mlb.fit_transform(dataset[labels])
    # dataset[labels] = binary_labels
    

    
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(dataset, labels, 'cluster', 0.7, filter=True, temporal=False)

    # Check if labels are in the correct format
    def check_label_format(y):
        unique_values = np.unique(y)
        if len(unique_values) > 2 or not all(i in [0, 1] for i in unique_values):
            print("Error: Labels are not in binary format.")
            return False
        return True

    # Check the format of train_y and test_y
    if not check_label_format(train_y):
        print("train_y is not in the correct format.")
        exit()
    else:
        print("train_y is in the correct format.")

    if not check_label_format(test_y):
        print("test_y is not in the correct format.")
        exit()
    else:
        print("test_y is in the correct format.")

    print("Shape of train_X: ", train_X.shape)
    print("Shape of train_y: ", train_y.shape)
    print("Shape of test_x: ",test_X.shape)
    print("Shape of test_y: ", test_y.shape)
    print("Shape of train_y values: ", train_y.values.ravel().shape)
    print("First 10 values of train_y: ", train_y.values.ravel()[:10])

    lb = LabelBinarizer()

    # Fit the LabelBinarizer and transform the labels
    train_y = lb.fit_transform(train_y)
    test_y = lb.transform(test_y)

except ValueError as e:
    print("Error during split: ", str(e))
    exit()

Shape of the dataset:  (3623, 163)
Dataset is not empty.
Shape of the dataset:  (3623, 163)
Class counts:  2    692
3    683
4    675
0    565
5    515
1    493
Name: cluster, dtype: int64
Least populated class:  1
Counts for labelCycling:  0    2915
1     708
Name: labelCycling, dtype: int64
Counts for labelStairs:  0    3413
1     210
Name: labelStairs, dtype: int64
Counts for labelWalking:  0    2984
1     639
Name: labelWalking, dtype: int64
Counts for labelSitting:  1    1929
0    1694
Name: labelSitting, dtype: int64
Counts for labelOther:  0    3474
1     149
Name: labelOther, dtype: int64
Features:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102

  res_values = method(rvalues)


In [4]:
print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:

basic_features = ['acc_phone_x','acc_phone_y','acc_phone_z','lin_acc_phone_x','lin_acc_phone_y','lin_acc_phone_z','mag_phone_x','mag_phone_y','mag_phone_z','pca_1','pca_2','pca_3','pca_4','pca_5','pca_6','pca_7']
pca_features = ['pca_1','pca_2','pca_3','pca_4','pca_5','pca_6','pca_7']
time_features = [name for name in dataset.columns if '_temp_' in name]
freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
print('#basic features: ', len(basic_features))
print('#PCA features: ', len(pca_features))
print('#time features: ', len(time_features))
print('#frequency features: ', len(freq_features))
cluster_features = ['cluster']
print('#cluster features: ', len(cluster_features))
features_after_chapter_3 = list(set().union(basic_features, pca_features))
features_after_chapter_4 = list(set().union(basic_features, pca_features, time_features, freq_features))
features_after_chapter_5 = list(set().union(basic_features, pca_features, time_features, freq_features, cluster_features))


selected_features = {'chapter_3': features_after_chapter_3,
                        'chapter_4': features_after_chapter_4,
                        'chapter_5': features_after_chapter_5}

Training set length is:  2525
Test set length is:  1083
#basic features:  16
#PCA features:  7
#time features:  32
#frequency features:  81
#cluster features:  1


In [5]:
# First, let us consider the performance over a selection of features:

fs = FeatureSelectionClassification()

features, ordered_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
                                                                  train_X[selected_features['chapter_5']],
                                                                  test_X[selected_features['chapter_5']],
                                                                  train_y,
                                                                  test_y,
                                                                  gridsearch=False)

Added feature0
Added feature1
Added feature2
Added feature3
Added feature4
Added feature5
Added feature6
Added feature7
Added feature8
Added feature9
Added feature10
Added feature11
Added feature12
Added feature13
Added feature14
Added feature15
Added feature16
Added feature17
Added feature18
Added feature19
Added feature20
Added feature21
Added feature22
Added feature23
Added feature24
Added feature25
Added feature26
Added feature27
Added feature28
Added feature29
Added feature30
Added feature31
Added feature32
Added feature33
Added feature34
Added feature35
Added feature36
Added feature37
Added feature38
Added feature39
Added feature40
Added feature41
Added feature42
Added feature43
Added feature44
Added feature45
Added feature46
Added feature47
Added feature48
Added feature49


In [1]:
DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION+1)], y=[ordered_scores],
                xlabel='number of features', ylabel='accuracy')

NameError: name 'DataViz' is not defined

In [6]:
# Let us first study the impact of regularization and model complexity: does regularization prevent overfitting?

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()
start = time.time()

reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
performance_training = []
performance_test = []
N_REPEATS_NN = 3

In [7]:
for reg_param in reg_parameters:
    performance_tr = 0
    performance_te = 0
    for i in range(0, N_REPEATS_NN):

        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
            train_X, train_y,
            test_X, hidden_layer_sizes=(250, ), alpha=reg_param, max_iter=500,
            gridsearch=False
        )

        performance_tr += eval.accuracy(train_y, class_train_y)
        performance_te += eval.accuracy(test_y, class_test_y)
    performance_training.append(performance_tr/N_REPEATS_NN)
    performance_test.append(performance_te/N_REPEATS_NN)

In [None]:
DataViz.plot_xy(x=[reg_parameters, reg_parameters], y=[performance_training, performance_test], method='semilogx',
                xlabel='regularization parameter value', ylabel='accuracy', ylim=[0.95, 1.01],
                names=['training', 'test'], line_styles=['r-', 'b:'])

In [8]:
# Second, let us consider the influence of certain parameter settings for the tree model.

leaf_settings = [1,2,5,10]
performance_training = []
performance_test = []

# Convert train_y to a DataFrame
train_y_df = pd.DataFrame(train_y, columns=['labelCycling', 'labelStairs', 'labelWalking', 'labelSitting', 'labelOther'])

for no_points_leaf in leaf_settings:

    # class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
    #     train_X[selected_features['chapter_5']], train_y, test_X[selected_features['chapter_5']], min_samples_leaf=no_points_leaf,
    #     gridsearch=False, print_model_details=False)

    # Then pass train_y_df to the decision_tree method
    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
        train_X[selected_features['chapter_5']], train_y_df, test_X[selected_features['chapter_5']],
        gridsearch=True,
        print_model_details=True, export_tree_path=EXPORT_TREE_PATH)

    # performance_training.append(eval.accuracy(train_y, class_train_y))
    #new
    print("Unique values in train_y:", np.unique(train_y))
    print("Shape of train_y:", train_y.shape)

    print("Unique values in class_train_y:", np.unique(class_train_y))
    print("Shape of class_train_y:", class_train_y.shape)
    #performance_training.append(jaccard_score(train_y, class_train_y, average='samples'))
    mlb = MultiLabelBinarizer(classes=np.unique(train_y))
    #class_train_y_binarized = mlb.fit_transform(class_train_y.reshape(-1, 1))
    # Create an empty binary matrix with the same shape as train_y
    class_train_y_binarized = np.zeros_like(train_y)

    # Set the corresponding columns in class_train_y_binarized to 1 based on the labels in class_train_y
    for i, label in enumerate(class_train_y):
        class_train_y_binarized[i, label] = 1


{'criterion': 'entropy', 'min_samples_leaf': 2}
Feature importance decision tree:
mag_phone_y_temp_mean_ws_30 & 0.5198219934700646
pca_3_temp_mean_ws_30 & 0.08658929130040623
pca_1_temp_mean_ws_30 & 0.07847455011848531
acc_phone_y_temp_mean_ws_30 & 0.04940059371595363
acc_phone_y_temp_std_ws_30 & 0.04298300033071299
mag_phone_y_temp_std_ws_30 & 0.03428718681449926
pca_7_temp_std_ws_30 & 0.03170752296206501
acc_phone_z_freq_0.0_Hz_ws_10 & 0.027213952399858674
mag_phone_y & 0.025184148913247318
acc_phone_z_temp_mean_ws_30 & 0.021640190222891983
mag_phone_x & 0.02160720758548066
mag_phone_x_freq_0.0_Hz_ws_10 & 0.014983737684304301
acc_phone_x_freq_0.0_Hz_ws_10 & 0.013709057374630454
mag_phone_y_max_freq & 0.01193296072398955
pca_6_temp_std_ws_30 & 0.011140815261354485
pca_4_temp_mean_ws_30 & 0.00932379112205554
lin_acc_phone_y_freq_weighted & 0.0
lin_acc_phone_x_freq_0.3_Hz_ws_10 & 0.0
lin_acc_phone_y_freq_0.0_Hz_ws_10 & 0.0
mag_phone_y_freq_0.4_Hz_ws_10 & 0.0
acc_phone_y_freq_weighted & 

In [12]:
print("Shape of train_y: ", train_y.shape)
print("Shape of class_train_y_binarized: ", class_train_y_binarized.shape)
print("Unique labels in class_train_y: ", np.unique(class_train_y))
print("Shape of test_y: ", test_y.shape)
print("Shape of class_test_y_binarized: ", class_test_y_binarized.shape)



Shape of train_y:  (2525, 5)
Shape of class_train_y_binarized:  (2525, 5)
Unique labels in class_train_y:  [0 1]
Shape of test_y:  (1083, 5)
Shape of class_test_y_binarized:  (1083, 2)


In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
class_test_y_binarized = mlb.fit_transform(class_test_y.reshape(-1, 1))

performance_test.append(eval.accuracy(test_y, class_test_y_binarized))


ValueError: inconsistent shapes

In [10]:
performance_training.append(jaccard_score(train_y, class_train_y_binarized, average='samples'))

performance_test.append(eval.accuracy(test_y, class_test_y))

ValueError: Classification metrics can't handle a mix of multilabel-indicator and binary targets

In [None]:
DataViz.plot_xy(x=[leaf_settings, leaf_settings], y=[performance_training, performance_test],
                xlabel='minimum number of points per leaf', ylabel='accuracy',
                names=['training', 'test'], line_styles=['r-', 'b:'])

In [17]:
# So yes, it is important :) Therefore we perform grid searches over the most important parameters, and do so by means
# of cross validation upon the training set.

possible_feature_sets = [basic_features] + list(selected_features.values())
feature_names = ['initial set'] + list(selected_features.keys())
N_KCV_REPEATS = 5

print('Preprocessing took', time.time()-start, 'seconds.')

Preprocessing took 1787.6657936573029 seconds.


In [20]:
scores_over_all_algs = []

for i in range(0, len(possible_feature_sets)):
    selected_train_X = train_X[possible_feature_sets[i]]
    selected_test_X = test_X[possible_feature_sets[i]]

    # First we run our non deterministic classifiers a number of times to average their score.

    performance_tr_nn = 0
    performance_tr_rf = 0
    performance_tr_svm = 0
    performance_te_nn = 0
    performance_te_rf = 0
    performance_te_svm = 0

    for repeat in range(0, N_KCV_REPEATS):
        print("Training NeuralNetwork run {} / {} ... ".format(repeat, N_KCV_REPEATS, feature_names[i]))
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
            selected_train_X, train_y, selected_test_X, gridsearch=True
        )
        print("Training RandomForest run {} / {} ... ".format(repeat, N_KCV_REPEATS, feature_names[i]))
        performance_tr_nn += eval.accuracy(train_y, class_train_y)
        performance_te_nn += eval.accuracy(test_y, class_test_y)
        
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
            selected_train_X, train_y, selected_test_X, gridsearch=True
        )
        
        performance_tr_rf += eval.accuracy(train_y, class_train_y)
        performance_te_rf += eval.accuracy(test_y, class_test_y)

        print("Training SVM run {} / {}, featureset: {}... ".format(repeat, N_KCV_REPEATS, feature_names[i]))
      
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.support_vector_machine_with_kernel(
            selected_train_X, train_y, selected_test_X, gridsearch=True
        )
        performance_tr_svm += eval.accuracy(train_y, class_train_y)
        performance_te_svm += eval.accuracy(test_y, class_test_y)

    overall_performance_tr_nn = performance_tr_nn/N_KCV_REPEATS
    overall_performance_te_nn = performance_te_nn/N_KCV_REPEATS
    overall_performance_tr_rf = performance_tr_rf/N_KCV_REPEATS
    overall_performance_te_rf = performance_te_rf/N_KCV_REPEATS
    overall_performance_tr_svm = performance_tr_svm/N_KCV_REPEATS
    overall_performance_te_svm = performance_te_svm/N_KCV_REPEATS

    #     #And we run our deterministic classifiers:
    print("Determenistic Classifiers:")

    print("Training Nearest Neighbor run 1 / 1, featureset {}:".format(feature_names[i]))
    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
        selected_train_X, train_y, selected_test_X, gridsearch=True
    )
    performance_tr_knn = eval.accuracy(train_y, class_train_y)
    performance_te_knn = eval.accuracy(test_y, class_test_y)
    print("Training Descision Tree run 1 / 1  featureset {}:".format(feature_names[i]))
    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
        selected_train_X, train_y, selected_test_X, gridsearch=True
    )
    
    performance_tr_dt = eval.accuracy(train_y, class_train_y)
    performance_te_dt = eval.accuracy(test_y, class_test_y)
    print("Training Naive Bayes run 1/1 featureset {}:".format(feature_names[i]))
    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
        selected_train_X, train_y, selected_test_X
    )
   
    performance_tr_nb = eval.accuracy(train_y, class_train_y)
    performance_te_nb = eval.accuracy(test_y, class_test_y)

    scores_with_sd = util.print_table_row_performances(feature_names[i], len(selected_train_X.index), len(selected_test_X.index), [
                                                                                                (overall_performance_tr_nn, overall_performance_te_nn),
                                                                                                (overall_performance_tr_rf, overall_performance_te_rf),
                                                                                                (overall_performance_tr_svm, overall_performance_te_svm),
                                                                                                (performance_tr_knn, performance_te_knn),
                                                                                                (performance_tr_dt, performance_te_dt),
                                                                                                (performance_tr_nb, performance_te_nb)])
    scores_over_all_algs.append(scores_with_sd)

Training NeuralNetwork run 0 / 5 ... 
Training RandomForest run 0 / 5 ... 


AttributeError: 'numpy.ndarray' object has no attribute 'values'

: 

In [None]:
DataViz.plot_performances_classification(['NN', 'RF','SVM', 'KNN', 'DT', 'NB'], feature_names, scores_over_all_algs)

In [None]:
# # And we study two promising ones in more detail. First, let us consider the decision tree, which works best with the
# # selected features.

class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(train_X[selected_features], train_y, test_X[selected_features],
                                                                                           gridsearch=True,
                                                                                           print_model_details=True, export_tree_path=EXPORT_TREE_PATH)

class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
    train_X[selected_features], train_y, test_X[selected_features],
    gridsearch=True, print_model_details=True)

test_cm = eval.confusion_matrix(test_y, class_test_y, class_train_prob_y.columns)


In [None]:
DataViz.plot_confusion_matrix(test_cm, class_train_prob_y.columns, normalize=False)