In [1]:
import os
import re
from pathlib import Path as p
import pandas as pd


class DataModule:
    '''Build one DataFrame from per-subject activity CSV files plus subject metadata.

    Attributes:
        data_path: Root directory that holds ``data_subjects_info.csv`` and,
            somewhere below it, leaf folders of per-subject recordings.
        activities: Activity codes (the first three characters of a leaf
            folder name) that should be loaded. ``None`` or an empty
            collection disables the filter and loads every leaf folder.
        df: The combined frame; empty until ``combine_data`` has been called.
    '''

    def __init__(self, data_path, activities):
        self.data_path = data_path
        self.activities = activities
        self.df = pd.DataFrame()

    def combine_data(self):
        '''Read every recording below ``data_path`` and store the result in ``self.df``.

        Only leaf folders (folders without sub-folders) are read. For each CSV
        file whose name contains a number, that number is taken as the subject
        id; the subject's metadata columns (from ``weight`` onwards in
        ``data_subjects_info.csv``) are broadcast onto every row of the
        recording, and ``subject`` / ``target`` columns are appended.

        Files that are not ``.csv`` or carry no number in their name are
        skipped, as are leaf folders whose activity code is not listed in
        ``self.activities``. Folders and files are visited in sorted order so
        the row order is reproducible across file systems.

        ``self.df`` is rebuilt from scratch on every call, so calling this
        method twice does not duplicate rows. Each recording keeps its own
        0..n-1 row labels, so index labels repeat across recordings.

        Raises:
            FileNotFoundError: if ``data_subjects_info.csv`` is missing.
        '''
        subject_data = pd.read_csv(p(self.data_path) / 'data_subjects_info.csv')

        # Normalise the activity filter once; an empty set means "accept everything".
        activities = self.activities
        if isinstance(activities, str):
            activities = activities.split()
        wanted = set(activities) if activities is not None else set()

        frames = []
        for curr, sub_folders, files in os.walk(self.data_path):
            # Sorting in place also fixes the order in which os.walk descends.
            sub_folders.sort()
            if sub_folders:
                continue
            # Extract activity from the folder title
            activity = p(curr).stem[:3]
            if wanted and activity not in wanted:
                # Unrelated leaf folder (e.g. notebook checkpoints).
                continue
            for file in sorted(files):
                id_match = re.search(r"\d+", file)
                if id_match is None or not file.lower().endswith('.csv'):
                    # Stray file that cannot be a per-subject recording.
                    continue
                subject_id = int(id_match.group())
                activity_data = pd.read_csv(p(curr) / file, encoding_errors='ignore')
                # Metadata of the subject whose code matches subject_id
                subject_cols = subject_data.loc[subject_data['code'] == subject_id, 'weight':]
                if len(subject_cols):
                    # Repeat the first matching row once per sample in a single
                    # vectorised take; this keeps the metadata dtypes intact.
                    subject_cols = subject_cols.iloc[[0] * len(activity_data)]
                # An unknown subject leaves subject_cols empty, which yields NaN
                # metadata columns instead of aborting the whole run.
                added_data = pd.concat(
                    [activity_data, subject_cols.reset_index(drop=True)], axis=1)
                # Add the columns
                added_data['subject'] = subject_id
                added_data['target'] = activity
                frames.append(added_data)

        # Concatenate once instead of growing the frame inside the loop.
        combined = pd.concat(frames) if frames else pd.DataFrame()
        # Drop the redundant row-number column written by the CSV export, if present
        self.df = combined.drop(columns=['Unnamed: 0'], errors='ignore')


# Root folder of the raw export and the three-letter activity codes to load
data_path = './Data/Raw'
activities = ['dws', 'jog', 'sit', 'std', 'ups', 'wlk']

# Build the combined frame and preview its first rows
data_module = DataModule(data_path=data_path, activities=activities)
data_module.combine_data()
df = data_module.df
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,weight,height,age,gender,subject,target
0,1.528132,-0.733896,0.696372,0.741895,0.669768,-0.031672,0.316738,0.77818,1.082764,0.294894,-0.184493,0.377542,102,188,46,1,1,dws
1,1.527992,-0.716987,0.677762,0.753099,0.657116,-0.032255,0.842032,0.424446,0.643574,0.219405,0.035846,0.114866,102,188,46,1,1,dws
2,1.527765,-0.706999,0.670951,0.759611,0.649555,-0.032707,-0.138143,-0.040741,0.343563,0.010714,0.134701,-0.167808,102,188,46,1,1,dws
3,1.516768,-0.704678,0.675735,0.760709,0.647788,-0.04114,-0.025005,-1.048717,0.03586,-0.008389,0.136788,0.094958,102,188,46,1,1,dws
4,1.493941,-0.703918,0.672994,0.760062,0.64721,-0.05853,0.114253,-0.91289,0.047341,0.199441,0.353996,-0.044299,102,188,46,1,1,dws


In [2]:
# Add a BMI column: weight divided by the square of (height / 100)
# NOTE(review): presumably weight is in kg and height in cm - confirm against the data source
df['bmi'] = df['weight'].div(df['height'].div(100).pow(2))

# Copy of the data without the raw height and weight columns
df_no_hw = df.drop(['height', 'weight'], axis='columns')

# Split by sensor group: `accelerometer` keeps gravity.* and userAcceleration.*,
# `gyroscope` keeps attitude.* and rotationRate.*; all other columns stay in both.
accelerometer = df_no_hw.drop(
    [
        'attitude.roll',
        'attitude.pitch',
        'attitude.yaw',
        'rotationRate.x',
        'rotationRate.y',
        'rotationRate.z',
    ],
    axis='columns',
)
gyroscope = df_no_hw.drop(
    [
        'userAcceleration.x',
        'userAcceleration.y',
        'userAcceleration.z',
        'gravity.x',
        'gravity.y',
        'gravity.z',
    ],
    axis='columns',
)

In [3]:
# Only activity columns have missing values
# df_no_hw.isna().sum()

In [4]:
# Save the combined data
# Create the output folder first so a fresh checkout can run this cell;
# to_parquet does not create missing parent directories itself.
p('./Data/Engineered').mkdir(parents=True, exist_ok=True)
df.to_parquet('./Data/Engineered/df.parquet', engine='fastparquet', compression='gzip')
df_no_hw.to_parquet('./Data/Engineered/combined_df_no_hw.parquet', engine='fastparquet', compression='gzip')
accelerometer.to_parquet('./Data/Engineered/combined_accelerometer.parquet', engine='fastparquet', compression='gzip')
gyroscope.to_parquet('./Data/Engineered/combined_gyroscope.parquet', engine='fastparquet', compression='gzip')