In [2]:
import os
!pip install pandas
!pip install matplotlib



In [32]:
import pandas as pd
import matplotlib
import ipympl
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import discriminant_analysis
from sklearn import neighbors
from sklearn import tree
from sklearn import naive_bayes
from sklearn import svm
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from mpl_toolkits import mplot3d


# Importing the dataset (Smartphone-Based Recognition of Human Activities and Postural Transitions)

In [4]:
with open('resources/HAPT Data Set/features.txt', 'r') as f:
    names = f.readlines()
columns_names = []
for name in names: # removes " \n" at the end of the column's name
    name = name.strip()
    columns_names.append(name)


## Cleaning the columns' names

When I tried to add the columns_name array as a parameter of the X_train DataFrame, an error occurred. "ValueError: Duplicate names are not allowed." So I made a for loop that would fill a "pairs" array with the columns' names that appear more than once, to know which column's name needs to be modified.

In [5]:
names_array = []
pairs = []

for name in columns_names:
    if name not in names_array:
        names_array.append(name)
    elif name not in pairs:
        pairs.append(name)
    else:
        pass

def change_end(end_of_name, keep):
    i = 2
    name_before = ''
    beginning_of_name = ''
    for j in range(len(columns_names)):
        if columns_names[j].endswith(end_of_name):
            beginning_of_name = columns_names[j].split("-")[0]
            if beginning_of_name == name_before:
                columns_names[j] = columns_names[j].replace(end_of_name, keep + str(i))
                i += 1
                name_before = beginning_of_name
            else:
                name_before = beginning_of_name
                i = 2
    
def change_end_bis(entire_name):
    i = 1
    while i < 4:
        for j in range(len(columns_names)):
            if columns_names[j] == entire_name:
                columns_names[j] = columns_names[j].replace(columns_names[j][-1], str(i))
                i += 1

change_end('ropy-1', 'ropy-')
change_end_bis('fBodyAcc-Skewness-1')
change_end_bis('fBodyAcc-Kurtosis-1')
change_end_bis('fBodyAccJerk-Skewness-1')
change_end_bis('fBodyAccJerk-Kurtosis-1')
change_end_bis('fBodyGyro-Skewness-1')
change_end_bis('fBodyGyro-Kurtosis-1')

## Importing the dataset (the end)

In [6]:
X_train = pd.read_table("resources/HAPT Data Set/Train/X_train.txt", delimiter=" ", names=columns_names)
Y_train = pd.read_table("resources/HAPT Data Set/Train/y_train.txt", names=['Posture'])

X_test = pd.read_table("resources/HAPT Data Set/Test/X_test.txt", delimiter=" ", names=columns_names)
Y_test = pd.read_table("resources/HAPT Data Set/Test/y_test.txt", names=['Posture'])
# train_df = pd.read_table("resources/HAPT Data Set/Train/X_train.txt", delimiter=" ")
# train_df.columns = [columns_names]
# train_df["Posture"] = pd.read_table("resources/HAPT Data Set/Train/y_train.txt")


## Looking at the data

In [7]:
print("Head of X_train:\n{}".format(X_train.head())) # shows the first 5 rows of the DataFrame
print("Shape of X_train: {}".format(X_train.shape)) # gives the dimensions of the DataFrame
print("Description of X_train:\n{}".format(X_train.describe())) # gives a statistical description of the DataFrame
print("Null values are contained in the DataFrame? {}".format(X_train.isnull().values.any())) # checks if the DataFrame contains null values.

Head of X_train:
   tBodyAcc-Mean-1  tBodyAcc-Mean-2  tBodyAcc-Mean-3  tBodyAcc-STD-1  \
0         0.043580        -0.005970        -0.035054       -0.995381   
1         0.039480        -0.002131        -0.029067       -0.998348   
2         0.039978        -0.005153        -0.022651       -0.995482   
3         0.039785        -0.011809        -0.028916       -0.996194   
4         0.038758        -0.002289        -0.023863       -0.998241   

   tBodyAcc-STD-2  tBodyAcc-STD-3  tBodyAcc-Mad-1  tBodyAcc-Mad-2  \
0       -0.988366       -0.937382       -0.995007       -0.988816   
1       -0.982945       -0.971273       -0.998702       -0.983315   
2       -0.977314       -0.984760       -0.996415       -0.975835   
3       -0.988569       -0.993256       -0.996994       -0.988526   
4       -0.986774       -0.993115       -0.998216       -0.986479   

   tBodyAcc-Mad-3  tBodyAcc-Max-1  ...  fBodyGyroJerkMag-MeanFreq-1  \
0       -0.953325       -0.794796  ...                    -0.012

# Data visualisation

In [33]:
%matplotlib widget
test_df = pd.DataFrame(X_train['tBodyAcc-AngleWRTGravity-1'])
test_df['Posture'] = Y_train['Posture']

fig = matplotlib.pyplot.figure(figsize=[18, 18])
ax = matplotlib.pyplot.axes(projection='3d')
ax.scatter3D(X_train['tBodyAcc-STD-1'], X_train['tBodyAcc-STD-2'], X_train['tBodyAcc-STD-3'], c=Y_train['Posture'])
ax.set_xlabel('tBodyAcc-STD-1')
ax.set_ylabel('tBodyAcc-STD-2')
ax.set_zlabel('tBodyAcc-STD-3')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'tBodyAcc-STD-3')

# Models

Here, we are working on the classification of the users' postures.

In [8]:
svc_clf = svm.SVC(gamma=0.001, C=100.) # clf = classifier
svc_clf.fit(X_train, Y_train.values.ravel())
Y_svc_pred = svc_clf.predict(X_test)
print("Accuracy score for SVC: {}".format(accuracy_score(Y_test.values.ravel(), Y_svc_pred)))

Accuracy score for SVC: 0.9535104364326376


In [9]:
knn = neighbors.KNeighborsClassifier()
knn.fit(X_train, Y_train.values.ravel())
Y_knc_pred = knn.predict(X_test)
print("Accuracy score for KNeighborsClassifier: {}".format(accuracy_score(Y_test.values.ravel(), Y_knc_pred)))

Accuracy score for KNeighborsClassifier: 0.8848829854522454


In [10]:
dt_clf = tree.DecisionTreeClassifier()
dt_clf.fit(X_train, Y_train.values.ravel())
Y_dt_clf_pred = dt_clf.predict(X_test)
print("Accuracy score for DecisionTreeClassifier: {}".format(accuracy_score(Y_test.values.ravel(), Y_dt_clf_pred)))


Accuracy score for DecisionTreeClassifier: 0.8032890575585073


In [11]:
gnb_clf = naive_bayes.GaussianNB()
gnb_clf.fit(X_train, Y_train.values.ravel())
Y_gnb_clf_pred = dt_clf.predict(X_test)
print("Accuracy score for GaussianNB: {}".format(accuracy_score(Y_test.values.ravel(), Y_gnb_clf_pred)))


Accuracy score for GaussianNB: 0.8032890575585073


# Cross-validation

In [16]:
results = []
models = [svc_clf, knn, dt_clf, gnb_clf]
for model in models:
    kfold = model_selection.StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train.values.ravel(), cv=kfold, scoring='accuracy')
    print(cv_results)
    results.append(cv_results)
print(results)

[0.97554698 0.98198198 0.97168597 0.97812098 0.97812098 0.98198198
 0.97812098 0.98453608 0.97680412 0.9806701 ]
[0.93822394 0.95495495 0.96267696 0.95881596 0.94723295 0.94723295
 0.95109395 0.96134021 0.95489691 0.95876289]
[0.91377091 0.92535393 0.91505792 0.92792793 0.92149292 0.91505792
 0.91505792 0.91237113 0.93041237 0.93298969]
[0.67310167 0.75804376 0.62290862 0.72458172 0.70785071 0.69240669
 0.74002574 0.71005155 0.7628866  0.70876289]
[array([0.97554698, 0.98198198, 0.97168597, 0.97812098, 0.97812098,
       0.98198198, 0.97812098, 0.98453608, 0.97680412, 0.9806701 ]), array([0.93822394, 0.95495495, 0.96267696, 0.95881596, 0.94723295,
       0.94723295, 0.95109395, 0.96134021, 0.95489691, 0.95876289]), array([0.91377091, 0.92535393, 0.91505792, 0.92792793, 0.92149292,
       0.91505792, 0.91505792, 0.91237113, 0.93041237, 0.93298969]), array([0.67310167, 0.75804376, 0.62290862, 0.72458172, 0.70785071,
       0.69240669, 0.74002574, 0.71005155, 0.7628866 , 0.70876289])]


## Evaluate the models