## Import libraries

In [167]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import math

## Present working directory

In [168]:
# pwd

## New dataframe for consolidating all the data

In [169]:
# Module-level accumulator: load_datasets() appends the rows of every file it reads to this frame.
all_subject_df = pd.DataFrame()

## Function to load all the datasets into a single dataframe

In [170]:
def load_datasets(folderpath):
    """Load every file in ``folderpath`` and append its rows to ``all_subject_df``.

    Each regular file directly inside ``folderpath`` is read as a
    space-separated table without a header row. A ``subject_id`` column holding
    the file name up to its first ``.`` is added to the rows of that file.

    Parameters
    ----------
    folderpath : str
        Directory containing the dataset files.

    Returns
    -------
    None
        The combined rows are stored in the module-level ``all_subject_df``
        with a fresh 0..n-1 index. Rows already held there are kept in front
        of the newly loaded ones. If the folder contains no files,
        ``all_subject_df`` is left untouched.
    """
    global all_subject_df

    files = [f for f in listdir(folderpath) if isfile(join(folderpath, f))]

    loaded_frames = []
    for file in files:
        subject_id = file.split('.')[0]
        full_file_name = join(folderpath, file)
        print ("Loading dataset - {}".format(full_file_name))
        subject_df = pd.read_csv(full_file_name, sep=' ', header=None)
        subject_df['subject_id'] = subject_id
        loaded_frames.append(subject_df)

    if not loaded_frames:
        return

    # Keep whatever earlier calls accumulated; a brand-new placeholder frame
    # (no rows, no columns) contributes nothing and is left out.
    previous = globals().get('all_subject_df')
    if previous is not None and previous.shape != (0, 0):
        loaded_frames.insert(0, previous)

    # One concat at the end: DataFrame.append no longer exists in pandas 2.x,
    # and appending inside the loop re-copied all accumulated rows per file.
    all_subject_df = pd.concat(loaded_frames, ignore_index=True, sort=False)

In [171]:
# Fill all_subject_df with the rows of every file found in the Protocol folder.
load_datasets("./dataset/Protocol") # Provide the folder path containing datasets

Loading dataset - ./dataset/Protocol/subject108.dat
Loading dataset - ./dataset/Protocol/subject109.dat
Loading dataset - ./dataset/Protocol/subject107.dat
Loading dataset - ./dataset/Protocol/subject106.dat
Loading dataset - ./dataset/Protocol/subject104.dat
Loading dataset - ./dataset/Protocol/subject105.dat
Loading dataset - ./dataset/Protocol/subject101.dat
Loading dataset - ./dataset/Protocol/subject102.dat
Loading dataset - ./dataset/Protocol/subject103.dat


## Original dataframe size

In [172]:
all_subject_df.shape

(2872533, 55)

## Loading optional datasets
Commented out to reduce the data size.

In [173]:
# load_datasets("./dataset/Optional") # Provide the folder path containing datasets

## Size after loading optional datasets
Commented out because the optional datasets are not loaded.

In [174]:
# all_subject_df.shape

## Count of records for each subject

In [175]:
all_subject_df['subject_id'].value_counts()

subject102    447000
subject108    408031
subject101    376417
subject105    374783
subject106    361817
subject104    329576
subject107    313599
subject103    252833
subject109      8477
Name: subject_id, dtype: int64

In [176]:
all_subject_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,subject_id
0,5.89,0,,33.3125,-9.7976,-1.4567,1.01438,-9.70702,-1.60337,1.32098,...,0.010364,0.010731,-35.455,38.3829,-17.9083,0.546211,0.370903,0.695334,0.283892,subject108
1,5.9,0,,33.3125,-9.91204,-1.41822,0.97405,-9.78262,-1.60285,1.29085,...,-0.005926,-0.006704,-35.4508,38.2543,-17.7663,0.545806,0.370867,0.69568,0.283872,subject108
2,5.91,0,,33.3125,-9.87531,-1.57027,0.975113,-9.78237,-1.57265,1.29079,...,-0.002265,0.014646,-35.6984,38.2688,-17.058,0.545484,0.371607,0.695437,0.284117,subject108
3,5.92,0,,33.3125,-9.72175,-1.6087,1.05452,-9.70677,-1.60334,1.35118,...,0.035314,0.010983,-34.9492,38.7447,-17.6233,0.545474,0.372077,0.694859,0.284935,subject108
4,5.93,0,,33.3125,-9.7992,-1.49497,0.975999,-9.58657,-1.57408,1.27549,...,-0.001065,-0.00041,-35.2105,38.8857,-17.908,0.54581,0.372349,0.694122,0.285732,subject108


## Statistics for the data

In [177]:
all_subject_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
count,2872533.0,2872533.0,262268.0,2859392.0,2859392.0,2859392.0,2859392.0,2859392.0,2859392.0,2859392.0,...,2860784.0,2860784.0,2860784.0,2860784.0,2860784.0,2860784.0,2860784.0,2860784.0,2860784.0,2860784.0
mean,1834.354,5.466243,109.872508,32.65258,-4.960786,3.587758,3.168417,-4.88942,3.584267,3.349479,...,0.008635143,-0.03450122,0.00775203,-32.72102,1.593304,16.89044,0.3986417,0.02154835,0.3091533,-0.01878725
std,1105.689,6.331333,25.870036,1.844274,5.985029,6.277838,3.843923,5.992726,6.05575,3.84065,...,1.073556,0.5966026,1.842552,18.8786,21.61181,20.30858,0.3034561,0.5691302,0.3237875,0.4731373
min,5.64,0.0,57.0,24.75,-145.367,-104.301,-101.452,-61.4895,-61.868,-61.9347,...,-23.995,-18.1269,-14.0196,-172.865,-137.908,-109.289,-0.253628,-0.956876,-0.876838,-0.997281
25%,893.16,0.0,90.0,31.4375,-9.02842,1.290268,0.9685817,-8.93327,1.28468,1.16404,...,-0.152625,-0.08267092,-0.3084595,-42.8948,-11.4854,3.289347,0.156344,-0.583991,0.01087023,-0.504758
50%,1790.83,3.0,108.0,33.125,-5.788145,3.57083,2.958415,-5.737615,3.61343,3.132855,...,0.004251595,-0.00424985,-0.002216015,-33.9002,1.362615,18.09105,0.3197555,0.0,0.304382,0.0
75%,2710.57,7.0,125.0,34.0,-0.782942,6.60272,6.00293,-0.724992,6.60196,6.257612,...,0.09464213,0.08296868,0.06343258,-19.0592,17.3309,30.8782,0.579442,0.627945,0.6020032,0.4634433
max,4475.63,24.0,202.0,35.5,62.8596,155.699,157.76,52.8214,62.2598,61.9446,...,17.4204,13.5882,16.5288,97.5255,123.306,146.9,1.0,0.959538,0.992354,0.996105


## Renaming the columns with proper names

In [178]:
# Give the 54 positional columns readable names: three leading fields, then the
# same 17 readings for each of the hand, chest and ankle IMUs.
# index=str additionally turns the row labels into strings.
all_subject_df = all_subject_df.rename(
    index=str,
    columns={
        0: "Timestamp",
        1: "ActivityID",
        2: "HeartRate",
        # Hand IMU
        3: "IMU_Hand_temperature",
        4: "IMU_Hand_3D-acc_16g_1",
        5: "IMU_Hand_3D-acc_16g_2",
        6: "IMU_Hand_3D-acc_16g_3",
        7: "IMU_Hand_3D-acc_6g_1",
        8: "IMU_Hand_3D-acc_6g_2",
        9: "IMU_Hand_3D-acc_6g_3",
        10: "IMU_Hand_3D-gyroscope_1",
        11: "IMU_Hand_3D-gyroscope_2",
        12: "IMU_Hand_3D-gyroscope_3",
        13: "IMU_Hand_3D-magnetometer_1",
        14: "IMU_Hand_3D-magnetometer_2",
        15: "IMU_Hand_3D-magnetometer_3",
        16: "IMU_Hand_orientation_1",
        17: "IMU_Hand_orientation_2",
        18: "IMU_Hand_orientation_3",
        19: "IMU_Hand_orientation_4",
        # Chest IMU
        20: "IMU_Chest_temperature",
        21: "IMU_Chest_3D-acc_16g_1",
        22: "IMU_Chest_3D-acc_16g_2",
        23: "IMU_Chest_3D-acc_16g_3",
        24: "IMU_Chest_3D-acc_6g_1",
        25: "IMU_Chest_3D-acc_6g_2",
        26: "IMU_Chest_3D-acc_6g_3",
        27: "IMU_Chest_3D-gyroscope_1",
        28: "IMU_Chest_3D-gyroscope_2",
        29: "IMU_Chest_3D-gyroscope_3",
        30: "IMU_Chest_3D-magnetometer_1",
        31: "IMU_Chest_3D-magnetometer_2",
        32: "IMU_Chest_3D-magnetometer_3",
        33: "IMU_Chest_orientation_1",
        34: "IMU_Chest_orientation_2",
        35: "IMU_Chest_orientation_3",
        36: "IMU_Chest_orientation_4",
        # Ankle IMU
        37: "IMU_Ankle_temperature",
        38: "IMU_Ankle_3D-acc_16g_1",
        39: "IMU_Ankle_3D-acc_16g_2",
        40: "IMU_Ankle_3D-acc_16g_3",
        41: "IMU_Ankle_3D-acc_6g_1",
        42: "IMU_Ankle_3D-acc_6g_2",
        43: "IMU_Ankle_3D-acc_6g_3",
        44: "IMU_Ankle_3D-gyroscope_1",
        45: "IMU_Ankle_3D-gyroscope_2",
        46: "IMU_Ankle_3D-gyroscope_3",
        47: "IMU_Ankle_3D-magnetometer_1",
        48: "IMU_Ankle_3D-magnetometer_2",
        49: "IMU_Ankle_3D-magnetometer_3",
        50: "IMU_Ankle_orientation_1",
        51: "IMU_Ankle_orientation_2",
        52: "IMU_Ankle_orientation_3",
        53: "IMU_Ankle_orientation_4",
    },
)

In [179]:
all_subject_df.head()

Unnamed: 0,Timestamp,ActivityID,HeartRate,IMU_Hand_temperature,IMU_Hand_3D-acc_16g_1,IMU_Hand_3D-acc_16g_2,IMU_Hand_3D-acc_16g_3,IMU_Hand_3D-acc_6g_1,IMU_Hand_3D-acc_6g_2,IMU_Hand_3D-acc_6g_3,...,IMU_Ankle_3D-gyroscope_2,IMU_Ankle_3D-gyroscope_3,IMU_Ankle_3D-magnetometer_1,IMU_Ankle_3D-magnetometer_2,IMU_Ankle_3D-magnetometer_3,IMU_Ankle_orientation_1,IMU_Ankle_orientation_2,IMU_Ankle_orientation_3,IMU_Ankle_orientation_4,subject_id
0,5.89,0,,33.3125,-9.7976,-1.4567,1.01438,-9.70702,-1.60337,1.32098,...,0.010364,0.010731,-35.455,38.3829,-17.9083,0.546211,0.370903,0.695334,0.283892,subject108
1,5.9,0,,33.3125,-9.91204,-1.41822,0.97405,-9.78262,-1.60285,1.29085,...,-0.005926,-0.006704,-35.4508,38.2543,-17.7663,0.545806,0.370867,0.69568,0.283872,subject108
2,5.91,0,,33.3125,-9.87531,-1.57027,0.975113,-9.78237,-1.57265,1.29079,...,-0.002265,0.014646,-35.6984,38.2688,-17.058,0.545484,0.371607,0.695437,0.284117,subject108
3,5.92,0,,33.3125,-9.72175,-1.6087,1.05452,-9.70677,-1.60334,1.35118,...,0.035314,0.010983,-34.9492,38.7447,-17.6233,0.545474,0.372077,0.694859,0.284935,subject108
4,5.93,0,,33.3125,-9.7992,-1.49497,0.975999,-9.58657,-1.57408,1.27549,...,-0.001065,-0.00041,-35.2105,38.8857,-17.908,0.54581,0.372349,0.694122,0.285732,subject108


## Remove rows with ActivityID = 0 (Arbitrary activity)

In [180]:
# Keep only the rows whose ActivityID is not 0.
all_subject_df = all_subject_df.loc[all_subject_df['ActivityID'].ne(0)]

In [182]:
all_subject_df.shape

(1942872, 55)

In [183]:
all_subject_df['ActivityID'].value_counts()

4     238761
17    238690
1     192523
3     189931
7     188107
2     185188
16    175353
6     164600
12    117216
13    104944
5      98199
24     49360
Name: ActivityID, dtype: int64

In [184]:
all_subject_df.describe()

Unnamed: 0,Timestamp,ActivityID,HeartRate,IMU_Hand_temperature,IMU_Hand_3D-acc_16g_1,IMU_Hand_3D-acc_16g_2,IMU_Hand_3D-acc_16g_3,IMU_Hand_3D-acc_6g_1,IMU_Hand_3D-acc_6g_2,IMU_Hand_3D-acc_6g_3,...,IMU_Ankle_3D-gyroscope_1,IMU_Ankle_3D-gyroscope_2,IMU_Ankle_3D-gyroscope_3,IMU_Ankle_3D-magnetometer_1,IMU_Ankle_3D-magnetometer_2,IMU_Ankle_3D-magnetometer_3,IMU_Ankle_orientation_1,IMU_Ankle_orientation_2,IMU_Ankle_orientation_3,IMU_Ankle_orientation_4
count,1942872.0,1942872.0,177408.0,1931748.0,1931748.0,1931748.0,1931748.0,1931748.0,1931748.0,1931748.0,...,1934365.0,1934365.0,1934365.0,1934365.0,1934365.0,1934365.0,1934365.0,1934365.0,1934365.0,1934365.0
mean,1705.202,8.08183,107.468502,32.75752,-4.938311,3.580308,3.609347,-4.871064,3.569088,3.794033,...,0.01062149,-0.0367671,0.007725809,-31.56858,1.414712,17.24289,0.3811172,-0.009125905,0.3022156,-0.05725826
std,1093.463,6.174789,26.977609,1.791983,6.231142,6.887907,3.960176,6.237104,6.586273,3.94633,...,1.126987,0.638079,2.011906,18.35626,21.6885,19.72172,0.3037064,0.5713763,0.332831,0.4801762
min,31.2,1.0,57.0,24.875,-145.367,-104.301,-101.452,-61.2147,-61.8417,-61.9347,...,-23.995,-18.1269,-14.0196,-172.865,-137.908,-102.716,1.52128e-06,-0.956876,-0.876838,-0.997281
25%,744.54,3.0,86.0,31.6875,-8.9558,1.048068,1.161655,-8.854415,1.047087,1.36396,...,-0.206296,-0.106301,-0.436872,-41.7072,-12.4517,3.76173,0.14257,-0.613197,0.00488722,-0.538348
50%,1480.33,6.0,104.0,33.1875,-5.42667,3.523155,3.44133,-5.353635,3.562085,3.67237,...,0.00466678,-0.00397101,-0.0022434,-33.9775,0.800972,18.7576,0.283652,0.0,0.305178,0.0
75%,2663.61,13.0,124.0,34.0625,-0.9430042,6.45432,6.538525,-0.8922058,6.45867,6.78776,...,0.130721,0.115439,0.0916035,-17.8767,17.8561,31.2087,0.56001,0.615105,0.596304,0.438298
max,4245.68,24.0,202.0,35.5,62.8596,155.699,157.76,52.8214,62.2598,61.9234,...,17.4204,13.5882,16.5288,91.5516,94.2478,146.9,1.0,0.959538,0.951482,0.996105


## Filling missing heart-rate values with the average of the two nearest valid readings

In [190]:
def add_missing_heartbeat_values(df=None, column='HeartRate'):
    """Fill the missing heart-rate readings of a data frame in place.

    Every missing value is replaced by the mean of the closest valid reading
    before it and the closest valid reading after it, in row order. Missing
    values before the first valid reading or after the last one have only one
    neighbour and take that neighbour's value. Valid readings keep their
    value, and a column without any valid reading stays entirely missing.

    The computation works on row order only, so it does not depend on the
    index labels of the frame.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to update. Defaults to the module-level ``all_subject_df``.
    column : str, optional
        Name of the column to fill. Defaults to ``'HeartRate'``.

    Returns
    -------
    None
        The column is overwritten on the frame itself.
    """
    frame = all_subject_df if df is None else df

    readings = frame[column]
    # Closest valid reading at-or-before and at-or-after every row.
    previous_valid = readings.ffill()
    next_valid = readings.bfill()

    # NOTE(review): neighbours are looked up across the whole frame, so a gap
    # that spans the boundary between two subjects' recordings is averaged
    # across them - confirm whether filling should be done per subject_id.

    # Where only one side has a valid reading, use it for both sides so the
    # mean collapses to that single neighbour.
    frame[column] = (
        previous_valid.fillna(next_valid) + next_valid.fillna(previous_valid)
    ) / 2

In [191]:
# Fill the gaps in the HeartRate column of all_subject_df; the frame is updated in place and nothing is returned.
add_missing_heartbeat_values()

## Exporting the dataframe to a csv file
Commented out because of the long processing time.

In [188]:
# all_subject_df.to_csv("complete_dataset.csv")

In [192]:
all_subject_df.shape

(1942872, 55)

In [20]:
# Impute Heart Rate

def person_activity_mean(x):
    """Placeholder for the mean of a particular person and activity combination.

    Not implemented yet: ``x`` is ignored and ``None`` is returned.
    """
    return None
    
# TODO: impute each missing HeartRate with the mean value for that subject's particular activity
#new_df = all_subject_df['HeartRate'].fillna((all_subject_df['HeartRate'].apply(lambda x: person_activity_mean(x))))

