# Setup

In [1]:
import os
from pandas import read_csv
from numpy import dstack
import pandas as pd
import numpy as np

In [2]:
# load a single file as a numpy array
def load_file(filepath):
	dataframe = read_csv(filepath, header=None, delim_whitespace=True)
	return dataframe.values

In [3]:
# load a list of files and return as a 3d numpy array
def load_group(filenames, prefix=''):
	loaded = list()
	for name in filenames:
		data = load_file(prefix + name)
		loaded.append(data)
	# stack group so that features are the 3rd dimension
	loaded = dstack(loaded)
	return loaded

In [4]:
# load a dataset group, such as train or test
def load_dataset_group(group, prefix=''):
	filepath = prefix + group + '/Inertial Signals/'
	# load all 9 files as a single array
	filenames = list()
	# total acceleration
	filenames += ['total_acc_x_'+group+'.txt', 'total_acc_y_'+group+'.txt', 'total_acc_z_'+group+'.txt']
	# body acceleration
	filenames += ['body_acc_x_'+group+'.txt', 'body_acc_y_'+group+'.txt', 'body_acc_z_'+group+'.txt']
	# body gyroscope
	filenames += ['body_gyro_x_'+group+'.txt', 'body_gyro_y_'+group+'.txt', 'body_gyro_z_'+group+'.txt']
	# load input data
	X = load_group(filenames, filepath)
	# load class output
	y = load_file(prefix + group + '/y_'+group+'.txt')
	return X, y

In [5]:
path_data = "./UCI HAR Dataset/"

trainX, trainy = load_dataset_group('train', path_data)
print(trainX.shape, trainy.shape)

(7352, 128, 9) (7352, 1)


In [6]:
def load_simple_dataframe(group, prefix=''):
    # Paths to the necessary files
    acc_x_path = prefix + group + '/Inertial Signals/total_acc_x_' + group + '.txt'
    acc_y_path = prefix + group + '/Inertial Signals/total_acc_y_' + group + '.txt'
    acc_z_path = prefix + group + '/Inertial Signals/total_acc_z_' + group + '.txt'
    subjects_path = prefix + group + '/subject_' + group + '.txt'
    y_path = prefix + group + '/y_' + group + '.txt'
    activity_labels_path = prefix + 'activity_labels.txt'
    
    # Load accelerometer data
    acc_x = pd.read_csv(acc_x_path, header=None, delim_whitespace=True).stack().reset_index(drop=True)
    acc_y = pd.read_csv(acc_y_path, header=None, delim_whitespace=True).stack().reset_index(drop=True)
    acc_z = pd.read_csv(acc_z_path, header=None, delim_whitespace=True).stack().reset_index(drop=True)
    
    # Load subjects and activity labels
    subjects = pd.read_csv(subjects_path, header=None).iloc[:, 0]
    activities = pd.read_csv(y_path, header=None).iloc[:, 0]

    # Load activity labels mapping
    activity_labels = pd.read_csv(activity_labels_path, header=None, delim_whitespace=True, index_col=0)
    activity_labels_dict = activity_labels[1].to_dict()

    # Map activity labels
    activities_mapped = activities.map(activity_labels_dict)
    
    # Repeat each subject and activity ID for each time step in their respective sequence
    # Assuming 128 readings per sequence (which is typical in this dataset)
    num_time_steps = 128
    subjects_repeated = np.repeat(subjects, num_time_steps).reset_index(drop=True)
    activities_repeated = np.repeat(activities_mapped, num_time_steps).reset_index(drop=True)

    # Combine into a single DataFrame
    df = pd.DataFrame({
        'subject_id': subjects_repeated,
        'acc_x': acc_x,
        'acc_y': acc_y,
        'acc_z': acc_z,
        'activity': activities_repeated
    })
    
    return df

# Example usage:
prefix = './UCI HAR Dataset/'
train_df = load_simple_dataframe('train', prefix)
print(train_df.head())

   subject_id     acc_x     acc_y     acc_z  activity
0           1  1.012817 -0.123217  0.102934  STANDING
1           1  1.022833 -0.126876  0.105687  STANDING
2           1  1.022028 -0.124004  0.102102  STANDING
3           1  1.017877 -0.124928  0.106553  STANDING
4           1  1.023680 -0.125767  0.102814  STANDING


In [7]:
train_df.shape

(941056, 5)

In [8]:
# get number of subjects
train_subjects = len(train_df['subject_id'].unique())
print(train_subjects)

21


In [9]:
train_df['subject_id'].unique()

array([ 1,  3,  5,  6,  7,  8, 11, 14, 15, 16, 17, 19, 21, 22, 23, 25, 26,
       27, 28, 29, 30], dtype=int64)

In [10]:
test_df = load_simple_dataframe('test', prefix)
print(test_df.head())

   subject_id     acc_x     acc_y     acc_z  activity
0           2  1.041216 -0.269796  0.023780  STANDING
1           2  1.041803 -0.280025  0.076293  STANDING
2           2  1.039086 -0.292663  0.147475  STANDING
3           2  1.054768 -0.292384  0.139906  STANDING
4           2  1.028376 -0.285826  0.119934  STANDING


In [11]:
test_df['subject_id'].unique()

array([ 2,  4,  9, 10, 12, 13, 18, 20, 24], dtype=int64)

In [12]:
# combine train_df and test_df into one dataframe
df = pd.concat([train_df, test_df], ignore_index=True)

In [17]:
# sort by subject_id and reset index
df = df.sort_values('subject_id').reset_index(drop=True)
df.head()

Unnamed: 0,subject_id,acc_x,acc_y,acc_z,activity
0,1,1.012817,-0.123217,0.102934,STANDING
1,1,0.985853,0.130482,0.236301,SITTING
2,1,0.986359,0.132166,0.233987,SITTING
3,1,0.985987,0.133959,0.233613,SITTING
4,1,0.985375,0.137033,0.234184,SITTING


In [18]:
df.shape

(1318272, 5)

In [19]:
df['subject_id'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], dtype=int64)

In [20]:
df['activity'].unique()

array(['STANDING', 'SITTING', 'LAYING', 'WALKING', 'WALKING_UPSTAIRS',
       'WALKING_DOWNSTAIRS'], dtype=object)

In [17]:
# export the dataframe to a CSV file
df.to_csv('./UCI HAR Dataset/uci_har.csv', index=False)