# First get the data

In [1]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.names
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip

!unzip "UCI HAR Dataset.zip"

--2019-12-20 10:04:59--  https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.names
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6304 (6.2K) [application/x-httpd-php]
Saving to: ‘UCI HAR Dataset.names’


2019-12-20 10:05:00 (131 MB/s) - ‘UCI HAR Dataset.names’ saved [6304/6304]

--2019-12-20 10:05:01--  https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60999314 (58M) [application/x-httpd-php]
Saving to: ‘UCI HAR Dataset.zip’


2019-12-20 10:05:06 (15.7 MB/s) - ‘UCI HAR Dataset.zip’ saved [60999314/60999314]

Archive:  UCI HAR Dataset.zip
  

# Get some helper functions ready

In [0]:
import itertools
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [0]:
# Activity class names, listed in alphabetical order.
# NOTE(review): presumably passed as `class_labels` to perform_model in later
# cells — confirm against the callers.
labels=['LAYING', 'SITTING','STANDING','WALKING','WALKING_DOWNSTAIRS','WALKING_UPSTAIRS']

In [0]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per row/column, in the same order as the matrix.
    normalize : bool, default False
        When True every row is divided by its row sum before plotting and the
        cells are annotated with two decimals; otherwise the raw counts are
        annotated as integers.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.

    Returns
    -------
    None. The plot is drawn on the current pyplot figure; the caller is
    responsible for creating and showing it.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a row sum of zero; leave that row
        # at 0.0 instead of letting the division fill it with NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so the annotation
    # stays readable. An empty matrix has no maximum, so fall back to 0.
    thresh = cm.max() / 2. if cm.size else 0.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Compute the layout last so the axis labels are taken into account.
    plt.tight_layout()

In [0]:
def perform_model(model, X_train, y_train, X_test, y_test, class_labels, cm_normalize=True,
                  print_cm=True, cm_cmap=plt.cm.Greens):
    """Fit ``model``, evaluate it on the test split and report the results.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and targets passed to ``model.fit``.
    X_test, y_test : test features and targets used for evaluation.
    class_labels : sequence
        Tick labels for the confusion-matrix plot. They must be in the same
        order as the rows of the matrix returned by
        ``metrics.confusion_matrix`` (see the scikit-learn documentation for
        how labels are ordered when none are passed).
    cm_normalize : bool, default True
        Plot the row-normalised confusion matrix when True, raw counts
        otherwise.
    print_cm : bool, default True
        Also print the raw confusion matrix as text.
    cm_cmap : matplotlib colormap
        Colormap for the confusion-matrix plot.

    Returns
    -------
    dict with the keys ``training_time``, ``testing_time`` (timedelta),
    ``predicted``, ``accuracy``, ``confusion_matrix``,
    ``classification_report`` and ``model`` (the fitted estimator).
    """
    # to store results at various phases
    results = dict()

    # time the training phase (only the fit call itself is measured)
    print('training the model..')
    train_start_time = datetime.now()
    model.fit(X_train, y_train)
    train_end_time = datetime.now()
    print('Done \n \n')
    results['training_time'] = train_end_time - train_start_time
    print('training_time(HH:MM:SS.ms) - {}\n\n'.format(results['training_time']))

    # predict test data
    print('Predicting test data')
    test_start_time = datetime.now()
    y_pred = model.predict(X_test)
    test_end_time = datetime.now()
    print('Done \n \n')
    results['testing_time'] = test_end_time - test_start_time
    print('testing time(HH:MM:SS:ms) - {}\n\n'.format(results['testing_time']))
    results['predicted'] = y_pred

    # calculate overall accuracy of the model
    accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    results['accuracy'] = accuracy
    print('---------------------')
    print('|      Accuracy      |')
    print('---------------------')
    print('\n    {}\n\n'.format(accuracy))

    # confusion matrix (raw counts are what gets stored and printed)
    cm = metrics.confusion_matrix(y_test, y_pred)
    results['confusion_matrix'] = cm
    if print_cm:
        print('--------------------')
        print('| Confusion Matrix |')
        print('--------------------')
        print('\n {}'.format(cm))

    # plot confusion matrix
    plt.figure(figsize=(8, 8))
    # Positional argument: the keyword was renamed from `b` to `visible` in
    # newer matplotlib releases, so passing it by position works on both.
    plt.grid(False)
    # Honour cm_normalize instead of always normalising.
    cm_title = 'Normalized confusion matrix' if cm_normalize else 'Confusion matrix'
    plot_confusion_matrix(cm, classes=class_labels, normalize=cm_normalize,
                          title=cm_title, cmap=cm_cmap)
    plt.show()

    # get classification report
    print('-------------------------')
    print('| Classifiction Report |')
    print('-------------------------')
    classification_report = metrics.classification_report(y_test, y_pred)
    results['classification_report'] = classification_report
    print(classification_report)

    # add the trained model to the results
    results['model'] = model

    return results

In [0]:
# Activities are the class labels (6-class classification).
# Keys are zero-based class indices: the confusion-matrix helpers in this cell
# look names up with the argmax of each row of Y_true / Y_pred.
# The names appear in the same order as the 1..6 codes mapped when the
# y_*.txt labels are loaded, shifted down by one.
ACTIVITIES = {
    0: 'WALKING',
    1: 'WALKING_UPSTAIRS',
    2: 'WALKING_DOWNSTAIRS',
    3: 'SITTING',
    4: 'STANDING',
    5: 'LAYING',
}

# Utility function to build a labelled confusion matrix
def confusion_matrix(Y_true, Y_pred):
    """Cross-tabulate true against predicted activity names.

    Both arguments are 2-D arrays with one row per sample; the argmax along
    axis 1 is taken as the class index and translated through ACTIVITIES.

    Returns the ``pd.crosstab`` table with rows named 'True' and columns
    named 'Pred'.

    Note: defining this function rebinds the name ``confusion_matrix`` that
    the imports cell also binds from ``sklearn.metrics``.
    """
    def _as_activity_names(scores):
        class_indices = np.argmax(scores, axis=1)
        return pd.Series([ACTIVITIES[index] for index in class_indices])

    actual = _as_activity_names(Y_true)
    predicted = _as_activity_names(Y_pred)
    return pd.crosstab(actual, predicted, rownames=['True'], colnames=['Pred'])

# Utility function to compute the raw (unlabelled) confusion matrix
def confusion_matrix_raw(Y_true, Y_pred):
    """Return the confusion matrix of activity names as a plain array.

    Both arguments are 2-D arrays with one row per sample; the argmax along
    axis 1 is taken as the class index and translated through ACTIVITIES.

    Returns the result of ``metrics.confusion_matrix`` with one row and one
    column per entry of ACTIVITIES, ordered alphabetically by activity name.
    """
    true_names = pd.Series([ACTIVITIES[y] for y in np.argmax(Y_true, axis=1)])
    pred_names = pd.Series([ACTIVITIES[y] for y in np.argmax(Y_pred, axis=1)])

    # Pin the label set so the matrix keeps one row/column per activity even
    # when some activity is missing from both inputs. Alphabetical order is
    # what the unpinned call produces when every activity is present.
    return metrics.confusion_matrix(true_names, pred_names,
                                    labels=sorted(ACTIVITIES.values()))

In [0]:
# Data directory
# NOTE(review): the loader cells in this notebook spell this path out
# literally instead of referencing DATADIR — keep the two in sync.
DATADIR = 'UCI HAR Dataset'
# Raw data signals
# Signals are from Accelerometer and Gyroscope
# The signals are in x,y,z directions
# body_acc_*  : body acceleration, i.e. the accelerometer signal filtered to
#               exclude the acceleration due to gravity
# body_gyro_* : gyroscope signal
# total_acc_* : total triaxial acceleration from the accelerometer
# The order of this list fixes the order of the last axis of the array
# returned by load_signals.
SIGNALS = [
    "body_acc_x",
    "body_acc_y",
    "body_acc_z",
    "body_gyro_x",
    "body_gyro_y",
    "body_gyro_z",
    "total_acc_x",
    "total_acc_y",
    "total_acc_z"
]

# Load in the data for the baseline model

In [18]:
# Build the feature-table column names from features.txt.
# Each line contributes "<first token>_<second token>".
features = []
with open('UCI HAR Dataset/features.txt') as feature_file:
    features = [tokens[0] + '_' + tokens[1] for tokens in map(str.split, feature_file)]
print('No of Features: {}'.format(len(features)))

No of Features: 561


In [19]:
# get the data from the whitespace-delimited txt file into a pandas dataframe
# (sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True)
X_train = pd.read_csv('UCI HAR Dataset/train/X_train.txt', sep=r'\s+', header=None, names=features)

# add subject column to the dataframe
# read_csv no longer accepts squeeze=True in current pandas; squeezing the
# one-column frame afterwards gives the same Series on every version.
X_train['subject'] = pd.read_csv('UCI HAR Dataset/train/subject_train.txt', header=None).squeeze('columns')

# get y labels from the txt file (integer codes 1..6) and their activity names
y_train = pd.read_csv('UCI HAR Dataset/train/y_train.txt', names=['Activity']).squeeze('columns')
y_train_labels = y_train.map({1: 'WALKING', 2:'WALKING_UPSTAIRS', 3:'WALKING_DOWNSTAIRS', 4:'SITTING', 5:'STANDING', 6:'LAYING'})

# put all columns in a single dataframe
# (train is the same object as X_train, so the two columns below are added to both names)
train = X_train
train['Activity'] = y_train
train['ActivityName'] = y_train_labels
train.sample()

Unnamed: 0,1_tBodyAcc-mean()-X,2_tBodyAcc-mean()-Y,3_tBodyAcc-mean()-Z,4_tBodyAcc-std()-X,5_tBodyAcc-std()-Y,6_tBodyAcc-std()-Z,7_tBodyAcc-mad()-X,8_tBodyAcc-mad()-Y,9_tBodyAcc-mad()-Z,10_tBodyAcc-max()-X,11_tBodyAcc-max()-Y,12_tBodyAcc-max()-Z,13_tBodyAcc-min()-X,14_tBodyAcc-min()-Y,15_tBodyAcc-min()-Z,16_tBodyAcc-sma(),17_tBodyAcc-energy()-X,18_tBodyAcc-energy()-Y,19_tBodyAcc-energy()-Z,20_tBodyAcc-iqr()-X,21_tBodyAcc-iqr()-Y,22_tBodyAcc-iqr()-Z,23_tBodyAcc-entropy()-X,24_tBodyAcc-entropy()-Y,25_tBodyAcc-entropy()-Z,"26_tBodyAcc-arCoeff()-X,1","27_tBodyAcc-arCoeff()-X,2","28_tBodyAcc-arCoeff()-X,3","29_tBodyAcc-arCoeff()-X,4","30_tBodyAcc-arCoeff()-Y,1","31_tBodyAcc-arCoeff()-Y,2","32_tBodyAcc-arCoeff()-Y,3","33_tBodyAcc-arCoeff()-Y,4","34_tBodyAcc-arCoeff()-Z,1","35_tBodyAcc-arCoeff()-Z,2","36_tBodyAcc-arCoeff()-Z,3","37_tBodyAcc-arCoeff()-Z,4","38_tBodyAcc-correlation()-X,Y","39_tBodyAcc-correlation()-X,Z","40_tBodyAcc-correlation()-Y,Z",...,525_fBodyBodyAccJerkMag-maxInds,526_fBodyBodyAccJerkMag-meanFreq(),527_fBodyBodyAccJerkMag-skewness(),528_fBodyBodyAccJerkMag-kurtosis(),529_fBodyBodyGyroMag-mean(),530_fBodyBodyGyroMag-std(),531_fBodyBodyGyroMag-mad(),532_fBodyBodyGyroMag-max(),533_fBodyBodyGyroMag-min(),534_fBodyBodyGyroMag-sma(),535_fBodyBodyGyroMag-energy(),536_fBodyBodyGyroMag-iqr(),537_fBodyBodyGyroMag-entropy(),538_fBodyBodyGyroMag-maxInds,539_fBodyBodyGyroMag-meanFreq(),540_fBodyBodyGyroMag-skewness(),541_fBodyBodyGyroMag-kurtosis(),542_fBodyBodyGyroJerkMag-mean(),543_fBodyBodyGyroJerkMag-std(),544_fBodyBodyGyroJerkMag-mad(),545_fBodyBodyGyroJerkMag-max(),546_fBodyBodyGyroJerkMag-min(),547_fBodyBodyGyroJerkMag-sma(),548_fBodyBodyGyroJerkMag-energy(),549_fBodyBodyGyroJerkMag-iqr(),550_fBodyBodyGyroJerkMag-entropy(),551_fBodyBodyGyroJerkMag-maxInds,552_fBodyBodyGyroJerkMag-meanFreq(),553_fBodyBodyGyroJerkMag-skewness(),554_fBodyBodyGyroJerkMag-kurtosis(),"555_angle(tBodyAccMean,gravity)","556_angle(tBodyAccJerkMean),gravityMean)","557_angle(tBodyGyroMe
an,gravityMean)","558_angle(tBodyGyroJerkMean,gravityMean)","559_angle(X,gravityMean)","560_angle(Y,gravityMean)","561_angle(Z,gravityMean)",subject,Activity,ActivityName
2046,0.245567,-0.022919,-0.111754,-0.307961,-0.130955,-0.152379,-0.348899,-0.120308,-0.126794,-0.158359,-0.207061,-0.078938,0.296522,0.156763,0.579016,-0.146039,-0.759017,-0.853404,-0.676117,-0.521043,-0.295085,-0.081851,0.334088,0.36091,-0.061878,-0.60423,0.4011,-0.022951,-0.042902,-0.208757,0.106921,0.097941,0.104978,-0.719962,0.572389,-0.510932,0.258412,-0.374862,-0.014502,-0.087568,...,-0.904762,-0.023681,0.058203,-0.256384,-0.560808,-0.278531,-0.397612,-0.311572,-0.769317,-0.560808,-0.756559,-0.674005,0.199544,-0.846154,-0.416147,0.057984,-0.371473,-0.79852,-0.828301,-0.786702,-0.856469,-0.968714,-0.79852,-0.980494,-0.738075,0.128673,-0.777778,0.058765,-0.529902,-0.806119,0.527447,-0.675423,0.643123,0.769014,-0.629196,0.329779,-0.092016,11,2,WALKING_UPSTAIRS


In [20]:
# get the data from the whitespace-delimited txt file into a pandas dataframe
# (sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True)
X_test = pd.read_csv('UCI HAR Dataset/test/X_test.txt', sep=r'\s+', header=None, names=features)

# add subject column to the dataframe
# read_csv no longer accepts squeeze=True in current pandas; squeezing the
# one-column frame afterwards gives the same Series on every version.
X_test['subject'] = pd.read_csv('UCI HAR Dataset/test/subject_test.txt', header=None).squeeze('columns')

# get y labels from the txt file (integer codes 1..6) and their activity names
y_test = pd.read_csv('UCI HAR Dataset/test/y_test.txt', names=['Activity']).squeeze('columns')
y_test_labels = y_test.map({1: 'WALKING', 2:'WALKING_UPSTAIRS',3:'WALKING_DOWNSTAIRS',\
                       4:'SITTING', 5:'STANDING',6:'LAYING'})


# put all columns in a single dataframe
# (test is the same object as X_test, so the two columns below are added to both names)
test = X_test
test['Activity'] = y_test
test['ActivityName'] = y_test_labels
test.sample()

Unnamed: 0,1_tBodyAcc-mean()-X,2_tBodyAcc-mean()-Y,3_tBodyAcc-mean()-Z,4_tBodyAcc-std()-X,5_tBodyAcc-std()-Y,6_tBodyAcc-std()-Z,7_tBodyAcc-mad()-X,8_tBodyAcc-mad()-Y,9_tBodyAcc-mad()-Z,10_tBodyAcc-max()-X,11_tBodyAcc-max()-Y,12_tBodyAcc-max()-Z,13_tBodyAcc-min()-X,14_tBodyAcc-min()-Y,15_tBodyAcc-min()-Z,16_tBodyAcc-sma(),17_tBodyAcc-energy()-X,18_tBodyAcc-energy()-Y,19_tBodyAcc-energy()-Z,20_tBodyAcc-iqr()-X,21_tBodyAcc-iqr()-Y,22_tBodyAcc-iqr()-Z,23_tBodyAcc-entropy()-X,24_tBodyAcc-entropy()-Y,25_tBodyAcc-entropy()-Z,"26_tBodyAcc-arCoeff()-X,1","27_tBodyAcc-arCoeff()-X,2","28_tBodyAcc-arCoeff()-X,3","29_tBodyAcc-arCoeff()-X,4","30_tBodyAcc-arCoeff()-Y,1","31_tBodyAcc-arCoeff()-Y,2","32_tBodyAcc-arCoeff()-Y,3","33_tBodyAcc-arCoeff()-Y,4","34_tBodyAcc-arCoeff()-Z,1","35_tBodyAcc-arCoeff()-Z,2","36_tBodyAcc-arCoeff()-Z,3","37_tBodyAcc-arCoeff()-Z,4","38_tBodyAcc-correlation()-X,Y","39_tBodyAcc-correlation()-X,Z","40_tBodyAcc-correlation()-Y,Z",...,525_fBodyBodyAccJerkMag-maxInds,526_fBodyBodyAccJerkMag-meanFreq(),527_fBodyBodyAccJerkMag-skewness(),528_fBodyBodyAccJerkMag-kurtosis(),529_fBodyBodyGyroMag-mean(),530_fBodyBodyGyroMag-std(),531_fBodyBodyGyroMag-mad(),532_fBodyBodyGyroMag-max(),533_fBodyBodyGyroMag-min(),534_fBodyBodyGyroMag-sma(),535_fBodyBodyGyroMag-energy(),536_fBodyBodyGyroMag-iqr(),537_fBodyBodyGyroMag-entropy(),538_fBodyBodyGyroMag-maxInds,539_fBodyBodyGyroMag-meanFreq(),540_fBodyBodyGyroMag-skewness(),541_fBodyBodyGyroMag-kurtosis(),542_fBodyBodyGyroJerkMag-mean(),543_fBodyBodyGyroJerkMag-std(),544_fBodyBodyGyroJerkMag-mad(),545_fBodyBodyGyroJerkMag-max(),546_fBodyBodyGyroJerkMag-min(),547_fBodyBodyGyroJerkMag-sma(),548_fBodyBodyGyroJerkMag-energy(),549_fBodyBodyGyroJerkMag-iqr(),550_fBodyBodyGyroJerkMag-entropy(),551_fBodyBodyGyroJerkMag-maxInds,552_fBodyBodyGyroJerkMag-meanFreq(),553_fBodyBodyGyroJerkMag-skewness(),554_fBodyBodyGyroJerkMag-kurtosis(),"555_angle(tBodyAccMean,gravity)","556_angle(tBodyAccJerkMean),gravityMean)","557_angle(tBodyGyroMe
an,gravityMean)","558_angle(tBodyGyroJerkMean,gravityMean)","559_angle(X,gravityMean)","560_angle(Y,gravityMean)","561_angle(Z,gravityMean)",subject,Activity,ActivityName
1949,0.28271,-0.016633,-0.110989,-0.992354,-0.991923,-0.993821,-0.992461,-0.99124,-0.994951,-0.936639,-0.570515,-0.823865,0.848283,0.691926,0.839352,-0.993887,-0.999933,-0.999956,-0.999891,-0.991938,-0.991322,-0.996053,-0.471846,-0.663894,-0.706222,0.234433,-0.094051,0.152271,0.062948,0.194435,-0.01653,0.098529,0.157763,0.126672,0.027743,0.008753,0.028893,-0.132365,-0.092219,-0.050282,...,-0.650794,0.372277,-0.737509,-0.95394,-0.987835,-0.990439,-0.988476,-0.992518,-0.996827,-0.987835,-0.999889,-0.986804,-0.736521,-0.897436,0.168983,-0.634417,-0.87119,-0.992227,-0.992227,-0.991743,-0.993736,-0.993386,-0.992227,-0.999946,-0.995103,-0.923452,-0.968254,0.132862,-0.406681,-0.793117,-0.081971,0.251379,-0.220999,-0.530747,0.369271,-0.727066,-0.218609,18,6,LAYING


In [0]:
# Split the combined training frame: every column except the bookkeeping ones
# becomes the feature matrix, and the activity name is the target.
X_train = train.drop(columns=['subject', 'Activity', 'ActivityName'])
y_train = train['ActivityName']

In [0]:
# Split the combined test frame: every column except the bookkeeping ones
# becomes the feature matrix, and the activity name is the target.
X_test = test.drop(columns=['subject', 'Activity', 'ActivityName'])
y_test = test['ActivityName']

# Load in the raw signal data

In [0]:
def load_y(subset):
    """
    Load the activity labels of one subset as a one-hot encoded array.

    The objective that we are trying to predict is an integer, from 1 to 6,
    that represents a human activity. We return a binary representation of
    every sample objective as a 6 bits vector using One Hot Encoding
    (https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html)

    Parameters
    ----------
    subset : str
        Name of the subset directory, e.g. 'train' or 'test'.

    Returns
    -------
    numpy.ndarray of dtype uint8 with one row per sample and one column per
    distinct label value found in the file.
    """
    filename = f'UCI HAR Dataset/{subset}/y_{subset}.txt'
    y = _read_csv(filename)[0]

    # DataFrame.as_matrix() no longer exists in current pandas; .values is
    # available on every version. The explicit cast keeps the 0/1 integer
    # encoding regardless of the dummy dtype pandas picks by default.
    return pd.get_dummies(y).values.astype('uint8')

In [0]:
# Utility function to read the data from csv file
def _read_csv(filename):
    return pd.read_csv(filename, delim_whitespace=True, header=None)

# Utility function to load the raw inertial signals of one subset
def load_signals(subset):
    """Load every signal listed in SIGNALS for ``subset`` into one array.

    Parameters
    ----------
    subset : str
        Name of the subset directory, e.g. 'train' or 'test'.

    Returns
    -------
    numpy.ndarray with axes (sample, timestep, signal); the last axis follows
    the order of SIGNALS.
    """
    signals_data = []

    for signal in SIGNALS:
        filename = f'UCI HAR Dataset/{subset}/Inertial Signals/{signal}_{subset}.txt'
        # DataFrame.as_matrix() no longer exists in current pandas; .values
        # returns the same 2-D array on every version.
        signals_data.append(_read_csv(filename).values)

    # Transpose is used to change the dimensionality of the output,
    # aggregating the signals by combination of sample/timestep.
    # Resultant shape is (7352 train/2947 test samples, 128 timesteps, 9 signals)
    return np.transpose(signals_data, (1, 2, 0))

In [0]:
def load_data():
    """
    Obtain the raw-signal dataset from multiple files.

    Returns a 4-tuple in this order: X_train, y_train, X_test, y_test,
    where the X values come from load_signals and the y values from load_y.
    """
    signals_train = load_signals('train')
    signals_test = load_signals('test')
    targets_train = load_y('train')
    targets_test = load_y('test')

    return signals_train, targets_train, signals_test, targets_test

In [0]:
# Utility function to count the number of classes
def _count_classes(y):
    return len(set([tuple(category) for category in y]))

In [62]:
# Loading the train and test data (raw inertial signals + one-hot targets)
# NOTE: this rebinds X_train and X_test, which earlier cells set to the
# engineered-feature tables for the baseline model. The targets use
# capitalised names (Y_train, Y_test), so y_train / y_test still refer to the
# activity-name Series.
X_train, Y_train, X_test,  Y_test = load_data()

  # This is added back by InteractiveShellApp.init_path()
