# Data Cleaning process: 

- I got rid of all the NaNs and other irrelevant data points.
- I changed the data type where appropriate.
- I converted the 'Time' column data type from object to integer.
- I kept the 'Timestamp' column data type as object. As I am not sure how to convert it to an appropriate datetime datatype.
- I removed the 'Mode' column.
- I kept the 'Data' column.

In [1]:
# Import necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 


## Acce_1_26_05

In [2]:
# Load and preview the data.

Accel_1_26_05 = pd.read_csv('accel_1_26_05_2022.csv')

Accel_1_26_05.head()

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2918276,0,-32,0,1.068721,-0.504631,-0.231132,,,,
1,2918374,0,64,64,1.067315,-0.442127,-0.089711,,,,
2,2918471,64,-32,32,1.150048,-0.472716,-0.019123,,,,
3,2918569,-32,160,-96,1.10647,-0.317224,-0.229926,,,,
4,2918666,-32,-32,0,1.062978,-0.347992,-0.229421,,,,


In [3]:
# Data types and data info.
Accel_1_26_05.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Time         236 non-null    object 
 1   X            226 non-null    object 
 2   Y            223 non-null    object 
 3   Z            223 non-null    object 
 4   X_f          222 non-null    object 
 5   Y_f          222 non-null    object 
 6   Z_f          218 non-null    float64
 7   Unnamed: 7   2 non-null      object 
 8   Unnamed: 8   1 non-null      object 
 9   Unnamed: 9   1 non-null      object 
 10  Unnamed: 10  1 non-null      object 
dtypes: float64(1), object(10)
memory usage: 20.4+ KB


In [4]:
# Check for missing values.
Accel_1_26_05.isna().sum()

Time             0
X               10
Y               13
Z               13
X_f             14
Y_f             14
Z_f             18
Unnamed: 7     234
Unnamed: 8     235
Unnamed: 9     235
Unnamed: 10    235
dtype: int64

In [5]:
# have a look at the rows with missing values in Z_f. This contains all the NaNs in the labelled columns

A1_na_check = Accel_1_26_05[Accel_1_26_05['Z_f'].isna()]

A1_na_check.isna().sum()

A1_na_check

# This data all looks fine to be removed

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
36,2921694,gps:fixing,soc:100.00%,volt:4.09 AM1805:06:10:51,wakeup:0s,ext_alive:0s,,,,,
77,2925591,,,,,,,,,,
79,2925688,,,,,,,,,,
85,2926276,,,,,,,###########################,,,
86,2926277,,,,,,,,,,
88,2926,s,,,,,,,,,
89,�,� �7 ����,,,,,,,,,
98,2927248,,,,,,,,,,
100,2927346,,,,,,,,,,
112,2928421,,,,,,,&,PRODUCTION,INFO,###########################


In [6]:
# Create a copy of original data.

Accel_1_26_05_copy =  Accel_1_26_05.copy()

In [7]:
# View dataframe.
Accel_1_26_05_copy.head()

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2918276,0,-32,0,1.068721,-0.504631,-0.231132,,,,
1,2918374,0,64,64,1.067315,-0.442127,-0.089711,,,,
2,2918471,64,-32,32,1.150048,-0.472716,-0.019123,,,,
3,2918569,-32,160,-96,1.10647,-0.317224,-0.229926,,,,
4,2918666,-32,-32,0,1.062978,-0.347992,-0.229421,,,,


In [8]:
# Remove the "Unnamed Columns"

Accel_1_26_05_copy.drop(labels=['Unnamed: 7','Unnamed: 8','Unnamed: 9','Unnamed: 10'], axis=1, inplace=True)

In [9]:
# View Data frame.
Accel_1_26_05_copy.head()

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f
0,2918276,0,-32,0,1.068721,-0.504631,-0.231132
1,2918374,0,64,64,1.067315,-0.442127,-0.089711
2,2918471,64,-32,32,1.150048,-0.472716,-0.019123
3,2918569,-32,160,-96,1.10647,-0.317224,-0.229926
4,2918666,-32,-32,0,1.062978,-0.347992,-0.229421


In [10]:
# Remove NaNs of the data frame.

Accel_1_26_05_copy.dropna(axis=0, inplace=True)

In [11]:
# Check data info, data types and NaNs

print(Accel_1_26_05_copy.info())
print(Accel_1_26_05_copy.isna().sum())



<class 'pandas.core.frame.DataFrame'>
Int64Index: 218 entries, 0 to 233
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    218 non-null    object 
 1   X       218 non-null    object 
 2   Y       218 non-null    object 
 3   Z       218 non-null    object 
 4   X_f     218 non-null    object 
 5   Y_f     218 non-null    object 
 6   Z_f     218 non-null    float64
dtypes: float64(1), object(6)
memory usage: 13.6+ KB
None
Time    0
X       0
Y       0
Z       0
X_f     0
Y_f     0
Z_f     0
dtype: int64


In [12]:
# Check Tail of data.

Accel_1_26_05_copy.head()

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f
0,2918276,0,-32,0,1.068721,-0.504631,-0.231132
1,2918374,0,64,64,1.067315,-0.442127,-0.089711
2,2918471,64,-32,32,1.150048,-0.472716,-0.019123
3,2918569,-32,160,-96,1.10647,-0.317224,-0.229926
4,2918666,-32,-32,0,1.062978,-0.347992,-0.229421


In [13]:
# Data are still labelled as 'Object' data type.
# Thus some data points are not NaNs but are not relevant data.

# Using David's formula to check what isn't an integer in column Time/X/Y/Z/X_f/Y_f.

def check_int(value):
    try:
        int(value)
        return 'int' # if value is an integer, show 'int'
    except ValueError:
        return value # if value is not an integer, show the value
    
# run the function on the X column
X_int_check = Accel_1_26_05_copy['X'].apply(check_int)

X_int_check.loc[lambda x : x != 'int']

    

134    � �7 ����0002930415 [app] INFO: ACCEL:\t
Name: X, dtype: object

In [14]:
# Drop values from column X column using index.

Accel_1_26_05_copy.drop(index=134, inplace=True)

In [15]:
Accel_1_26_05_copy.head()

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f
0,2918276,0,-32,0,1.068721,-0.504631,-0.231132
1,2918374,0,64,64,1.067315,-0.442127,-0.089711
2,2918471,64,-32,32,1.150048,-0.472716,-0.019123
3,2918569,-32,160,-96,1.10647,-0.317224,-0.229926
4,2918666,-32,-32,0,1.062978,-0.347992,-0.229421


In [16]:
# Run the function for Y column

Y_int_check = Accel_1_26_05_copy['Y'].apply(check_int)

Y_int_check.loc[lambda x : x != 'int']

# Observation: No abnormal data

Series([], Name: Y, dtype: object)

In [17]:
# Run the function for Z column

Z_int_check = Accel_1_26_05_copy['Z'].apply(check_int)

Z_int_check.loc[lambda x : x != 'int']

# Observation: No abnormal data.

Series([], Name: Z, dtype: object)

In [18]:
# Run the function for X_f column

X_f_int_check = Accel_1_26_05_copy['X_f'].apply(check_int)

X_f_int_check.loc[lambda x : x != 'int']

# Observation: No abnoraml data.

0        1.068721
1        1.067315
2        1.150048
3         1.10647
4        1.062978
          ...    
229     -15.81442
230    -18.281284
231    -16.682217
232    -17.735546
233    -19.067039
Name: X_f, Length: 217, dtype: object

In [19]:
# Run the function for Y_f column.

Y_f_int_check = Accel_1_26_05_copy['Y_f'].apply(check_int)

Y_f_int_check.loc[lambda x : x != 'int']

# Observation: No abnormal data.

0      -0.504631
1      -0.442127
2      -0.472716
3      -0.317224
4      -0.347992
         ...    
229    -3.397115
230    -1.684881
231    -3.495255
232    -0.738091
233    -1.354533
Name: Y_f, Length: 217, dtype: object

In [20]:
# Run the function for Time column.

Time_int_check = Accel_1_26_05_copy['Time'].apply(check_int)

Time_int_check.loc[lambda x : x != 'int']

# Observation: abnormal data found at row 42,83,161 and 202.


43                �
90     

0002926564
177               �
219               �
Name: Time, dtype: object

In [21]:
# Drop values from Time column using index.

Accel_1_26_05_copy.drop(index=[43,90,177,219], inplace=True)

In [22]:
pd.set_option("display.max_rows", None)

print(Accel_1_26_05_copy)

            Time       X      Y      Z         X_f         Y_f        Z_f
0        2918276       0    -32      0    1.068721   -0.504631  -0.231132
1        2918374       0     64     64    1.067315   -0.442127  -0.089711
2        2918471      64    -32     32    1.150048   -0.472716  -0.019123
3        2918569     -32    160    -96     1.10647   -0.317224  -0.229926
4        2918666     -32    -32      0    1.062978   -0.347992  -0.229421
5        2918764      32     32      0    1.103594   -0.316589  -0.228919
6        2918861      64    -64    -32     1.18611   -0.378393  -0.298374
7        2918959     -96      0    -32    1.058615   -0.378025  -0.367570
8        2919056       0     96      0    1.057225   -0.284554  -0.366769
9        2919154     -32    -64     64    1.013887   -0.346407  -0.226795
10       2919251      32     64    -32    1.054484   -0.283922  -0.295828
11       2919349      32     32      0    1.094999   -0.252563  -0.295187
12       2919446     704   1120   -384

In [23]:
# By looking at the full data frame above. We can see that at Index 87, the Time value is unusually high. 
# Therefore it needs to be dropedd as well 

# Dropv values from Time column using index.

Accel_1_26_05_copy.drop(index=87, inplace=True)

In [24]:
# Get data info.
Accel_1_26_05_copy.info()

# Observation: 
# All abnormal data and NaNs have been removed but the data types for Time/X/Y/Z/X_f/Y_f are still object.
# Thus I update the data type myself.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 0 to 233
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    212 non-null    object 
 1   X       212 non-null    object 
 2   Y       212 non-null    object 
 3   Z       212 non-null    object 
 4   X_f     212 non-null    object 
 5   Y_f     212 non-null    object 
 6   Z_f     212 non-null    float64
dtypes: float64(1), object(6)
memory usage: 13.2+ KB


In [25]:
# Convert the data type of each column from object to integer or float.

Accel_1_26_05_copy = Accel_1_26_05_copy.astype({'Time':int, 'X':int, 'Y':int, 'Z':int, 'X_f':float, 'Y_f':float})

In [26]:
Accel_1_26_05_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 0 to 233
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    212 non-null    int64  
 1   X       212 non-null    int64  
 2   Y       212 non-null    int64  
 3   Z       212 non-null    int64  
 4   X_f     212 non-null    float64
 5   Y_f     212 non-null    float64
 6   Z_f     212 non-null    float64
dtypes: float64(3), int64(4)
memory usage: 13.2 KB


In [27]:
# Save the data frame.
Accel_1_26_05_copy.to_csv('Accel_1_26_05_cleaned.csv')

## Accel_2_26_05

In [28]:
# Load and preview the data.

Accel_2_26_05 = pd.read_csv('accel_2_26_05_2022.csv')

Accel_2_26_05.head()

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f
0,3002645,32,64,64,-25.849033,12.145277,-52.598858
1,3002743,-64,96,96,-25.972031,12.371145,-51.784473
2,3002889,-64,32,64,-26.094332,12.423964,-51.15152
3,3002986,-32,32,-32,-26.113277,12.476519,-51.047153
4,3003084,128,64,64,-25.620377,12.614525,-50.42337


In [29]:
# Check data type and info.

Accel_2_26_05.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    264 non-null    object 
 1   X       264 non-null    int64  
 2   Y       264 non-null    int64  
 3   Z       264 non-null    int64  
 4   X_f     264 non-null    float64
 5   Y_f     264 non-null    float64
 6   Z_f     264 non-null    float64
dtypes: float64(3), int64(3), object(1)
memory usage: 14.6+ KB


In [30]:
# Check for missing values.
Accel_2_26_05.isna().sum()

Time    0
X       0
Y       0
Z       0
X_f     0
Y_f     0
Z_f     0
dtype: int64

In [31]:
# Create a copy of the original dataframe.

Accel_2_26_05_copy = Accel_2_26_05.copy()


In [32]:
# Check for non integer values in Time Column.

Time_int_check = Accel_2_26_05_copy['Time'].apply(check_int)

Time_int_check.loc[lambda x : x != 'int']

# Observation: At row 53, data is abnormal.


53    �0003008
Name: Time, dtype: object

In [33]:
# Remove abnormal data in Time column using index.

Accel_2_26_05_copy.drop(index=53, inplace=True)

In [34]:
pd.set_option('display.max_rows', None)
print(Accel_2_26_05_copy)

        Time     X     Y      Z        X_f        Y_f         Z_f
0    3002645    32    64     64 -25.849033  12.145277  -52.598858
1    3002743   -64    96     96 -25.972031  12.371145  -51.784473
2    3002889   -64    32     64 -26.094332  12.423964  -51.151520
3    3002986   -32    32    -32 -26.113277  12.476519  -51.047153
4    3003084   128    64     64 -25.620377  12.614525  -50.423370
5    3003181   -96   -96     64 -25.845444  12.324079  -49.804901
6    3003328     0   -64   -160 -25.762939  12.120084  -50.398663
7    3003425    64   -64      0 -25.477163  11.916867  -50.127964
8    3003523    32   -32     32 -25.294430  11.799758  -49.688759
9    3003620   -32    64    128 -25.315704  11.938707  -48.742023
10   3003718     0    32      0 -25.235626  11.992022  -48.482788
11   3004010    64    64    -32 -25.076683  12.013341  -48.309109
12   3004108   128     0    -64 -24.595558  11.981594  -48.391384
13   3004205    96    64    -64 -24.216557  12.118731  -48.472824
14   30043

In [35]:
# By looking at the data frame above we can see that in row 151 the Time value is 3. This is an unusual number, 
# that also needs to be removed.


In [36]:
# Remove abnormal value in Time column using index.
Accel_2_26_05_copy.drop(index=151, inplace=True)


In [37]:
pd.set_option('display.max_rows', None)
print(Accel_2_26_05_copy)

        Time     X     Y      Z        X_f        Y_f         Z_f
0    3002645    32    64     64 -25.849033  12.145277  -52.598858
1    3002743   -64    96     96 -25.972031  12.371145  -51.784473
2    3002889   -64    32     64 -26.094332  12.423964  -51.151520
3    3002986   -32    32    -32 -26.113277  12.476519  -51.047153
4    3003084   128    64     64 -25.620377  12.614525  -50.423370
5    3003181   -96   -96     64 -25.845444  12.324079  -49.804901
6    3003328     0   -64   -160 -25.762939  12.120084  -50.398663
7    3003425    64   -64      0 -25.477163  11.916867  -50.127964
8    3003523    32   -32     32 -25.294430  11.799758  -49.688759
9    3003620   -32    64    128 -25.315704  11.938707  -48.742023
10   3003718     0    32      0 -25.235626  11.992022  -48.482788
11   3004010    64    64    -32 -25.076683  12.013341  -48.309109
12   3004108   128     0    -64 -24.595558  11.981594  -48.391384
13   3004205    96    64    -64 -24.216557  12.118731  -48.472824
14   30043

In [38]:
# Reset the index.
Accel_2_26_05_copy.reset_index(inplace=True)

In [39]:
# And drop the index column.
Accel_2_26_05_copy.drop(labels=['index'], axis=1, inplace=True)

In [40]:
Accel_2_26_05_copy.head()

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f
0,3002645,32,64,64,-25.849033,12.145277,-52.598858
1,3002743,-64,96,96,-25.972031,12.371145,-51.784473
2,3002889,-64,32,64,-26.094332,12.423964,-51.15152
3,3002986,-32,32,-32,-26.113277,12.476519,-51.047153
4,3003084,128,64,64,-25.620377,12.614525,-50.42337


In [41]:
# Check data types.
Accel_2_26_05_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    262 non-null    object 
 1   X       262 non-null    int64  
 2   Y       262 non-null    int64  
 3   Z       262 non-null    int64  
 4   X_f     262 non-null    float64
 5   Y_f     262 non-null    float64
 6   Z_f     262 non-null    float64
dtypes: float64(3), int64(3), object(1)
memory usage: 14.5+ KB


In [42]:
# Time Column needs to be updated to integer.
Accel_2_26_05_copy = Accel_2_26_05_copy.astype({'Time':int})


In [43]:
# Re-check data
Accel_2_26_05_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    262 non-null    int64  
 1   X       262 non-null    int64  
 2   Y       262 non-null    int64  
 3   Z       262 non-null    int64  
 4   X_f     262 non-null    float64
 5   Y_f     262 non-null    float64
 6   Z_f     262 non-null    float64
dtypes: float64(3), int64(4)
memory usage: 14.5 KB


In [44]:
# Save the data frame as a csv format.

Accel_2_26_05_copy.to_csv('Accel_2_26_05_cleaned.csv')

## Accel_3_26_05:

In [45]:
# Load and preview the data.

Accel_3_26_05 = pd.read_csv('accel_3_26_05_2022.csv')

Accel_3_26_05.head()

Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f
0,2918276,0,-32,0,1.068721,-0.504631,-0.231132
1,2918374,0,64,64,1.067315,-0.442127,-0.089711
2,2918471,64,-32,32,1.150048,-0.472716,-0.019123
3,2918569,-32,160,-96,1.10647,-0.317224,-0.229926
4,2918666,-32,-32,0,1.062978,-0.347992,-0.229421


In [46]:
# Check data types and info.
Accel_3_26_05.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    217 non-null    int64  
 1   X       217 non-null    int64  
 2   Y       217 non-null    int64  
 3   Z       217 non-null    int64  
 4   X_f     217 non-null    float64
 5   Y_f     217 non-null    float64
 6   Z_f     217 non-null    float64
dtypes: float64(3), int64(4)
memory usage: 12.0 KB


In [47]:
# Check for NaNs.
Accel_3_26_05.isna().sum()

Time    0
X       0
Y       0
Z       0
X_f     0
Y_f     0
Z_f     0
dtype: int64

In [48]:
# Save the file as there are no missing value and all data type are what they should be.

Accel_3_26_05.to_csv('Accel_3_26_05_cleaned.csv')

## Accel_4_26_05:

In [49]:
# Load and preview the data.

Accel_4_26_05 = pd.read_csv('accel_4_26_05_2022.csv')

Accel_4_26_05.head()


Unnamed: 0,Time,X,Y,Z,X_f,Y_f,Z_f
0,3002645,32,64,64,-25.849033,12.145277,-52.598858
1,3002743,-64,96,96,-25.972031,12.371145,-51.784473
2,3002889,-64,32,64,-26.094332,12.423964,-51.15152
3,3002986,-32,32,-32,-26.113277,12.476519,-51.047153
4,3003084,128,64,64,-25.620377,12.614525,-50.42337


In [50]:
# Get info. 
Accel_4_26_05.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542 entries, 0 to 541
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    542 non-null    object 
 1   X       542 non-null    int64  
 2   Y       542 non-null    int64  
 3   Z       542 non-null    int64  
 4   X_f     542 non-null    float64
 5   Y_f     542 non-null    float64
 6   Z_f     542 non-null    float64
dtypes: float64(3), int64(3), object(1)
memory usage: 29.8+ KB


In [51]:
# Check for NaNs. 
Accel_4_26_05.isna().sum()

Time    0
X       0
Y       0
Z       0
X_f     0
Y_f     0
Z_f     0
dtype: int64

In [52]:
# Create a copy of dataframe.
Accel_4_26_05_copy = Accel_4_26_05.copy()

In [53]:
# Check for non integer values in Time Column.

Time_int_check = Accel_4_26_05_copy['Time'].apply(check_int)

Time_int_check.loc[lambda x : x != 'int']



54    �0003008203
Name: Time, dtype: object

In [54]:
# Remove non numeric data from Time column.

Accel_4_26_05_copy.drop(index=54, inplace=True)

In [55]:
# Check data types and info.
Accel_4_26_05_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 541 entries, 0 to 541
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    541 non-null    object 
 1   X       541 non-null    int64  
 2   Y       541 non-null    int64  
 3   Z       541 non-null    int64  
 4   X_f     541 non-null    float64
 5   Y_f     541 non-null    float64
 6   Z_f     541 non-null    float64
dtypes: float64(3), int64(3), object(1)
memory usage: 33.8+ KB


In [56]:
# Convert Time data from object to integer.

Accel_4_26_05_copy = Accel_4_26_05_copy.astype({'Time':int})

In [57]:
# Check data info.

Accel_4_26_05_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 541 entries, 0 to 541
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    541 non-null    int64  
 1   X       541 non-null    int64  
 2   Y       541 non-null    int64  
 3   Z       541 non-null    int64  
 4   X_f     541 non-null    float64
 5   Y_f     541 non-null    float64
 6   Z_f     541 non-null    float64
dtypes: float64(3), int64(4)
memory usage: 33.8 KB


In [58]:
# Save the cleaned data frame

Accel_4_26_05_copy.to_csv('Accel_4_26_05_cleaned.csv')

## Accel_6_curb_up_31_05

In [59]:
# Load and preview the data.


Accel_6_curb_up = pd.read_csv('accel_6_curb_up_31_05_2022.csv')

Accel_6_curb_up.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,AX,AY,AZ,fAX,fAY,fAZ
0,0,18:10:09:027,352,-1120,-1088,-44.076183,55.046425,-97.609222
1,1,18:10:09:118,928,32,768,-35.793819,54.783489,-92.298302
2,2,18:10:09:204,-8512,14016,5888,-108.783569,212.341125,-55.202663
3,3,18:10:09:295,2624,15424,-4928,-81.536301,428.819519,-88.810295
4,5,18:10:09:384,96,512,960,-79.68885,430.335266,-80.931335


In [60]:
# Check data info.
Accel_6_curb_up.isna().sum()

Unnamed: 0    0
Timestamp     0
AX            0
AY            0
AZ            0
fAX           0
fAY           0
fAZ           0
dtype: int64

In [61]:
# Make a copy of the dataframe.

Accel_6_curb_up_copy = Accel_6_curb_up.copy()

In [62]:
# Remove the "Unnamed:0" column. 

Accel_6_curb_up_copy.drop(labels=['Unnamed: 0'], axis=1, inplace=True)

In [63]:
# Preview the data.

Accel_6_curb_up_copy.head()

Unnamed: 0,Timestamp,AX,AY,AZ,fAX,fAY,fAZ
0,18:10:09:027,352,-1120,-1088,-44.076183,55.046425,-97.609222
1,18:10:09:118,928,32,768,-35.793819,54.783489,-92.298302
2,18:10:09:204,-8512,14016,5888,-108.783569,212.341125,-55.202663
3,18:10:09:295,2624,15424,-4928,-81.536301,428.819519,-88.810295
4,18:10:09:384,96,512,960,-79.68885,430.335266,-80.931335


In [64]:
# Check data info.
Accel_6_curb_up_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  297 non-null    object 
 1   AX         297 non-null    int64  
 2   AY         297 non-null    int64  
 3   AZ         297 non-null    int64  
 4   fAX        297 non-null    float64
 5   fAY        297 non-null    float64
 6   fAZ        297 non-null    float64
dtypes: float64(3), int64(3), object(1)
memory usage: 16.4+ KB


In [65]:
# Save data frame as csv format.

Accel_6_curb_up_copy.to_csv('Accel_6_curb_up_31_05_2022_cleaned.csv')

## Accel_7_fast_and_stop_31_05

In [66]:
# Load and preview the data.

Accel_7_fast_and_stop = pd.read_csv('accel_7_fast_and_stop_31_05_2022.csv')

Accel_7_fast_and_stop.head()

Unnamed: 0,Timestamp,Mode,Data,AX,AY,AZ,fAX,fAY,fAZ
0,18:12:21:234,RX,0001462415 [app] INFO: ACCEL:,-704,-480,576,26.766279,168.241196,8.758143
1,18:12:21:325,RX,0001462551 [app] INFO: ACCEL:,-2016,1984,96,-3.032218,195.304382,10.175572
2,18:12:21:416,RX,0001462641 [app] INFO: ACCEL:,-1024,-544,1824,-18.302029,184.058868,39.223648
3,18:12:21:507,RX,0001462731 [app] INFO: ACCEL:,-960,-288,1600,-32.457813,176.883011,64.703102
4,18:12:21:603,RX,0001462821 [app] INFO: ACCEL:,-1216,448,1792,-50.310703,180.980347,93.299789


In [67]:
# Check data info.

Accel_7_fast_and_stop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  130 non-null    object 
 1   Mode       130 non-null    object 
 2   Data       130 non-null    object 
 3   AX         130 non-null    int64  
 4   AY         130 non-null    int64  
 5   AZ         130 non-null    int64  
 6   fAX        130 non-null    float64
 7   fAY        130 non-null    float64
 8   fAZ        130 non-null    float64
dtypes: float64(3), int64(3), object(3)
memory usage: 9.3+ KB


In [68]:
# Check for NaNs.

Accel_7_fast_and_stop.isna().sum()

Timestamp    0
Mode         0
Data         0
AX           0
AY           0
AZ           0
fAX          0
fAY          0
fAZ          0
dtype: int64

In [69]:
# Make copy of dataframe.

Accel_7_fast_and_stop_copy = Accel_7_fast_and_stop.copy()

In [70]:
# Remove the "Mode column".

Accel_7_fast_and_stop_copy.drop(labels=['Mode'], axis=1, inplace=True)

In [71]:
# Preview the data.

Accel_7_fast_and_stop_copy.head()

Unnamed: 0,Timestamp,Data,AX,AY,AZ,fAX,fAY,fAZ
0,18:12:21:234,0001462415 [app] INFO: ACCEL:,-704,-480,576,26.766279,168.241196,8.758143
1,18:12:21:325,0001462551 [app] INFO: ACCEL:,-2016,1984,96,-3.032218,195.304382,10.175572
2,18:12:21:416,0001462641 [app] INFO: ACCEL:,-1024,-544,1824,-18.302029,184.058868,39.223648
3,18:12:21:507,0001462731 [app] INFO: ACCEL:,-960,-288,1600,-32.457813,176.883011,64.703102
4,18:12:21:603,0001462821 [app] INFO: ACCEL:,-1216,448,1792,-50.310703,180.980347,93.299789


In [72]:
# Save the dataframe as csv format.

Accel_7_fast_and_stop_copy.to_csv('Accel_7_fast_and_stop_31_05_2022_cleaned.csv')

## Accel_8_curb_up_31_05:


In [73]:
# Load and preview the data.

Accel_8_curb_up = pd.read_csv('accel_8_curb_up_31_05_2022.csv')

Accel_8_curb_up.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,AX,AY,AZ,fAX,fAY,fAZ
0,0,18:15:17:321,-1984,-1184,-128,10.911464,20.143747,38.438805
1,3,18:15:21:642,1120,1504,-608,5.466331,39.520996,30.415575
2,4,18:15:22:408,-2144,-2048,960,-33.58997,1.751053,42.425961
3,6,18:15:22:720,-3552,-1696,384,-99.028473,-29.657183,46.862812
4,7,18:15:23:036,-2880,-2720,2880,-153.310471,-80.152374,83.436729


In [74]:
# Check data info.

Accel_8_curb_up.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  93 non-null     int64  
 1   Timestamp   93 non-null     object 
 2   AX          93 non-null     int64  
 3   AY          93 non-null     int64  
 4   AZ          93 non-null     int64  
 5   fAX         93 non-null     float64
 6   fAY         93 non-null     float64
 7   fAZ         93 non-null     float64
dtypes: float64(3), int64(4), object(1)
memory usage: 5.9+ KB


In [75]:
# Check for missing values.

Accel_8_curb_up.isna().sum()

Unnamed: 0    0
Timestamp     0
AX            0
AY            0
AZ            0
fAX           0
fAY           0
fAZ           0
dtype: int64

In [76]:
# Create a copy of dataframe.

Accel_8_curb_up_copy = Accel_8_curb_up.copy()

In [77]:
# Remove the "Unnamed: 0" column.

Accel_8_curb_up_copy.drop(labels=['Unnamed: 0'], axis=1, inplace=True)

In [78]:
# Preview the data.

Accel_8_curb_up_copy.head()

Unnamed: 0,Timestamp,AX,AY,AZ,fAX,fAY,fAZ
0,18:15:17:321,-1984,-1184,-128,10.911464,20.143747,38.438805
1,18:15:21:642,1120,1504,-608,5.466331,39.520996,30.415575
2,18:15:22:408,-2144,-2048,960,-33.58997,1.751053,42.425961
3,18:15:22:720,-3552,-1696,384,-99.028473,-29.657183,46.862812
4,18:15:23:036,-2880,-2720,2880,-153.310471,-80.152374,83.436729


In [79]:
# Save the data frame as csv format.

Accel_8_curb_up_copy.to_csv('Accel_8_curb_up_31_05_2022_cleaned.csv')

## Accel_9_curb_up_31_05:

In [80]:
# Load and preview the data.

Accel_9_curb_up = pd.read_csv('accel_9_curb_up_31_05_2022.csv')

Accel_9_curb_up.head()

Unnamed: 0.1,Unnamed: 0,"Timestamp,Mode,Data",Timestamp,mode,Data,AX,AY,AZ,fAX,fAY,fAZ
0,0,"18:16:32:074,RX,0001713110 [app] INFO: ACCEL: ...",18:16:32:074,RX,0001713110 [app] INFO: ACCEL:,768.0,-608.0,352.0,6.215045,4.296308,-7.954152
1,1,"18:16:32:294,RX,0001713380 [app] INFO: ACCEL: ...",18:16:32:294,RX,0001713380 [app] INFO: ACCEL:,704.0,-256.0,320.0,14.198598,2.22199,-5.066761
2,3,"18:16:32:514,RX,0001713605 [app] INFO: ACCEL: ...",18:16:32:514,RX,0001713605 [app] INFO: ACCEL:,-512.0,-704.0,384.0,8.1642,-3.390279,-1.64914
3,4,"18:16:32:606,RX,0001713830 [app] INFO: ACCEL: ...",18:16:32:606,RX,0001713830 [app] INFO: ACCEL:,768.0,192.0,256.0,16.868759,-1.828191,0.611687
4,5,"18:16:32:878,RX,0001713920 [app] INFO: ACCEL: ...",18:16:32:878,RX,0001713920 [app] INFO: ACCEL:,-320.0,-672.0,-448.0,12.996037,-7.164129,-3.310493


In [81]:
# Check data info.

Accel_9_curb_up.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           248 non-null    int64  
 1   Timestamp,Mode,Data  248 non-null    object 
 2   Timestamp            248 non-null    object 
 3   mode                 248 non-null    object 
 4   Data                 248 non-null    object 
 5   AX                   248 non-null    float64
 6   AY                   248 non-null    float64
 7   AZ                   248 non-null    float64
 8   fAX                  248 non-null    float64
 9   fAY                  248 non-null    float64
 10  fAZ                  248 non-null    float64
dtypes: float64(6), int64(1), object(4)
memory usage: 21.4+ KB


In [82]:
# Check for NaNs.

Accel_9_curb_up.isna().sum()

Unnamed: 0             0
Timestamp,Mode,Data    0
Timestamp              0
mode                   0
Data                   0
AX                     0
AY                     0
AZ                     0
fAX                    0
fAY                    0
fAZ                    0
dtype: int64

In [83]:
# Create copy of dataframe.

Accel_9_curb_up_copy = Accel_9_curb_up.copy()

In [84]:
# Remove 'Unnamed: 0' 'Timestamp,Mode,Data' and 'mode' columns.

Accel_9_curb_up_copy.drop(labels=['Unnamed: 0', 'Timestamp,Mode,Data', 'mode'], axis=1, inplace=True)

In [85]:
# Preview data.

Accel_9_curb_up_copy.head()

Unnamed: 0,Timestamp,Data,AX,AY,AZ,fAX,fAY,fAZ
0,18:16:32:074,0001713110 [app] INFO: ACCEL:,768.0,-608.0,352.0,6.215045,4.296308,-7.954152
1,18:16:32:294,0001713380 [app] INFO: ACCEL:,704.0,-256.0,320.0,14.198598,2.22199,-5.066761
2,18:16:32:514,0001713605 [app] INFO: ACCEL:,-512.0,-704.0,384.0,8.1642,-3.390279,-1.64914
3,18:16:32:606,0001713830 [app] INFO: ACCEL:,768.0,192.0,256.0,16.868759,-1.828191,0.611687
4,18:16:32:878,0001713920 [app] INFO: ACCEL:,-320.0,-672.0,-448.0,12.996037,-7.164129,-3.310493


In [86]:
# Save Dataframe as csv format.

Accel_9_curb_up_copy.to_csv('Accel_9_curb_up_31_05_2022_cleaned.csv')

## Accel_10_normal_ride_31_05:

In [87]:
# Load and preview the data.

Accel_10_normal_ride = pd.read_csv('accel_10_normal_ride.csv')

Accel_10_normal_ride.head()

Unnamed: 0.1,Unnamed: 0,"Timestamp,Mode,Data",Timestamp,mode,Data,AX,AY,AZ,fAX,fAY,fAZ
0,0,"18:17:58:418,RX,0001799643 [app] INFO: ACCEL: ...",18:17:58:418,RX,0001799643 [app] INFO: ACCEL:,32.0,-288.0,-224.0,2.453399,-0.454532,2.579186
1,1,"18:17:58:509,RX,0001799734 [app] INFO: ACCEL: ...",18:17:58:509,RX,0001799734 [app] INFO: ACCEL:,-96.0,-32.0,-32.0,2.290591,-0.49844,2.506613
2,2,"18:17:58:599,RX,0001799824 [app] INFO: ACCEL: ...",18:17:58:599,RX,0001799824 [app] INFO: ACCEL:,64.0,-64.0,64.0,2.392669,-0.58676,2.635491
3,3,"18:17:58:688,RX,0001799914 [app] INFO: ACCEL: ...",18:17:58:688,RX,0001799914 [app] INFO: ACCEL:,-32.0,-32.0,-64.0,2.335802,-0.630445,2.495957
4,4,"18:17:58:777,RX,0001800004 [app] INFO: ACCEL: ...",18:17:58:777,RX,0001800004 [app] INFO: ACCEL:,-32.0,-32.0,64.0,2.279083,-0.674036,2.624648


In [88]:
# Check data info.

Accel_10_normal_ride.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539 entries, 0 to 538
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           539 non-null    int64  
 1   Timestamp,Mode,Data  539 non-null    object 
 2   Timestamp            539 non-null    object 
 3   mode                 539 non-null    object 
 4   Data                 539 non-null    object 
 5   AX                   539 non-null    float64
 6   AY                   539 non-null    float64
 7   AZ                   539 non-null    float64
 8   fAX                  539 non-null    float64
 9   fAY                  539 non-null    float64
 10  fAZ                  539 non-null    float64
dtypes: float64(6), int64(1), object(4)
memory usage: 46.4+ KB


In [89]:
# Check for NaNs.

Accel_10_normal_ride.isna().sum()

Unnamed: 0             0
Timestamp,Mode,Data    0
Timestamp              0
mode                   0
Data                   0
AX                     0
AY                     0
AZ                     0
fAX                    0
fAY                    0
fAZ                    0
dtype: int64

In [90]:
# Make copy of dataframe.

Accel_10_normal_ride_copy = Accel_10_normal_ride.copy()

In [91]:
# Remove 'Unnamed: 0' 'Timestamp,Mode,Data' and 'mode' columns.

Accel_10_normal_ride_copy.drop(labels=['Unnamed: 0', 'Timestamp,Mode,Data', 'mode'], axis=1, inplace=True)

In [92]:
# Preview the data.

Accel_10_normal_ride_copy.head()

Unnamed: 0,Timestamp,Data,AX,AY,AZ,fAX,fAY,fAZ
0,18:17:58:418,0001799643 [app] INFO: ACCEL:,32.0,-288.0,-224.0,2.453399,-0.454532,2.579186
1,18:17:58:509,0001799734 [app] INFO: ACCEL:,-96.0,-32.0,-32.0,2.290591,-0.49844,2.506613
2,18:17:58:599,0001799824 [app] INFO: ACCEL:,64.0,-64.0,64.0,2.392669,-0.58676,2.635491
3,18:17:58:688,0001799914 [app] INFO: ACCEL:,-32.0,-32.0,-64.0,2.335802,-0.630445,2.495957
4,18:17:58:777,0001800004 [app] INFO: ACCEL:,-32.0,-32.0,64.0,2.279083,-0.674036,2.624648


In [93]:
# Save Dataframe as csv format.

Accel_10_normal_ride_copy.to_csv('Accel_10_normal_ride_31_05_2022_cleaned.csv')

## Gyro_1_curb_up_31_05:

In [94]:
# Load and preview data.

Gyro_1_curb_up = pd.read_csv('gyro_1_curb_up_31_05_2022.csv')

Gyro_1_curb_up.head()

Unnamed: 0,Timestamp,Mode,Data,X,Y,Z
0,18:53:24:751,RX,0000187888 [app] INFO: GYRO:,7.016,0.871,-2.113
1,18:53:24:841,RX,0000187977 [app] INFO: GYRO:,7.016,0.871,-2.113
2,18:53:24:930,RX,0000188067 [app] INFO: GYRO:,7.016,0.871,-2.113
3,18:53:25:023,RX,0000188157 [app] INFO: GYRO:,7.016,0.871,-2.113
4,18:53:25:113,RX,0000188247 [app] INFO: GYRO:,7.016,0.871,-2.113


In [95]:
# Get data info.

Gyro_1_curb_up.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  201 non-null    object 
 1   Mode       201 non-null    object 
 2   Data       201 non-null    object 
 3   X          201 non-null    float64
 4   Y          201 non-null    float64
 5   Z          201 non-null    float64
dtypes: float64(3), object(3)
memory usage: 9.5+ KB


In [96]:
# Check for missing values.

Gyro_1_curb_up.isna().sum()

Timestamp    0
Mode         0
Data         0
X            0
Y            0
Z            0
dtype: int64

In [97]:
# Create copy of dataframe.

Gyro_1_curb_up_copy = Gyro_1_curb_up.copy()

In [98]:
# Remove 'Mode' column.

Gyro_1_curb_up_copy.drop(labels=['Mode'], axis=1, inplace=True)

In [99]:
# Preview the data. 

Gyro_1_curb_up_copy.head()

Unnamed: 0,Timestamp,Data,X,Y,Z
0,18:53:24:751,0000187888 [app] INFO: GYRO:,7.016,0.871,-2.113
1,18:53:24:841,0000187977 [app] INFO: GYRO:,7.016,0.871,-2.113
2,18:53:24:930,0000188067 [app] INFO: GYRO:,7.016,0.871,-2.113
3,18:53:25:023,0000188157 [app] INFO: GYRO:,7.016,0.871,-2.113
4,18:53:25:113,0000188247 [app] INFO: GYRO:,7.016,0.871,-2.113


In [100]:
# Save the data frame as csv format. 

Gyro_1_curb_up_copy.to_csv('Gyro_1_curb_up_31_05_2022_cleaned.csv')

## Gyro_2_curb_up_31_05:

In [101]:
# Load and preview the data.

Gyro_2_curb_up = pd.read_csv('gyro_2_curb_up_31_05_2022.csv')

Gyro_2_curb_up.head()



Unnamed: 0,Timestamp,Mode,Data,X,Y,Z
0,18:54:03:946,RX,0000227081 [app] INFO: GYRO:,-0.505,0.402,0.167
1,18:54:04:082,RX,0000227172 [app] INFO: GYRO:,-0.505,0.402,0.167
2,18:54:04:176,RX,0000227307 [app] INFO: GYRO:,-0.505,0.402,0.167
3,18:54:04:261,RX,0000227397 [app] INFO: GYRO:,-0.505,0.402,0.167
4,18:54:04:357,RX,0000227487 [app] INFO: GYRO:,-0.505,0.402,0.167


In [102]:
# Check data info.

Gyro_2_curb_up.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  177 non-null    object 
 1   Mode       177 non-null    object 
 2   Data       177 non-null    object 
 3   X          177 non-null    float64
 4   Y          177 non-null    float64
 5   Z          177 non-null    float64
dtypes: float64(3), object(3)
memory usage: 8.4+ KB


In [103]:
# Check for NaNs.

Gyro_2_curb_up.isna().sum()

Timestamp    0
Mode         0
Data         0
X            0
Y            0
Z            0
dtype: int64

In [104]:
# Copy data frame.

Gyro_2_curb_up_copy = Gyro_2_curb_up.copy()

In [105]:
# Remmove "Mode" column

Gyro_2_curb_up_copy.drop(labels=['Mode'], axis=1, inplace=True)

In [106]:
# Preview the data.

Gyro_2_curb_up_copy.head()

Unnamed: 0,Timestamp,Data,X,Y,Z
0,18:54:03:946,0000227081 [app] INFO: GYRO:,-0.505,0.402,0.167
1,18:54:04:082,0000227172 [app] INFO: GYRO:,-0.505,0.402,0.167
2,18:54:04:176,0000227307 [app] INFO: GYRO:,-0.505,0.402,0.167
3,18:54:04:261,0000227397 [app] INFO: GYRO:,-0.505,0.402,0.167
4,18:54:04:357,0000227487 [app] INFO: GYRO:,-0.505,0.402,0.167


In [107]:
# Save data frame as csv format.

Gyro_2_curb_up_copy.to_csv('Gyro_2_curb_up_31_05_2022_cleaned.csv')

## Gyro_3_curb_up_31_05:


In [108]:
# Load and preview the data.

Gyro_3_curb_up = pd.read_csv('gyro_3_curb_up_31_05_2022.csv')

Gyro_3_curb_up.head()

Unnamed: 0,Timestamp,Mode,Data,X,Y,Z
0,18:54:49:306,RX,0000272396 [app] INFO: GYRO:,-7.311,-2.552,52.319
1,18:54:49:395,RX,0000272532 [app] INFO: GYRO:,-7.311,-2.552,52.319
2,18:54:49:577,RX,0000272622 [app] INFO: GYRO:,-7.311,-2.552,52.319
3,18:54:49:666,RX,0000272802 [app] INFO: GYRO:,-7.311,-2.552,52.319
4,18:54:49:803,RX,0000272892 [app] INFO: GYRO:,-7.311,-2.552,52.319


In [109]:
# Check for data info.

Gyro_3_curb_up.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  112 non-null    object 
 1   Mode       112 non-null    object 
 2   Data       112 non-null    object 
 3   X          112 non-null    float64
 4   Y          112 non-null    float64
 5   Z          112 non-null    float64
dtypes: float64(3), object(3)
memory usage: 5.4+ KB


In [110]:
# Check for missing values.

Gyro_3_curb_up.isna().sum()

Timestamp    0
Mode         0
Data         0
X            0
Y            0
Z            0
dtype: int64

In [111]:
# Create copy of dataframe.

Gyro_3_curb_up_copy = Gyro_3_curb_up.copy()

In [112]:
# Remove 'Mode' column.

Gyro_3_curb_up_copy.drop(labels=['Mode'], axis=1, inplace=True)

In [113]:
# Preview the data.

Gyro_3_curb_up_copy.head()

Unnamed: 0,Timestamp,Data,X,Y,Z
0,18:54:49:306,0000272396 [app] INFO: GYRO:,-7.311,-2.552,52.319
1,18:54:49:395,0000272532 [app] INFO: GYRO:,-7.311,-2.552,52.319
2,18:54:49:577,0000272622 [app] INFO: GYRO:,-7.311,-2.552,52.319
3,18:54:49:666,0000272802 [app] INFO: GYRO:,-7.311,-2.552,52.319
4,18:54:49:803,0000272892 [app] INFO: GYRO:,-7.311,-2.552,52.319


In [114]:
# Save dataframe as csv format.

Gyro_3_curb_up_copy.to_csv('Gyro_3_curb_up_31_05_2022_cleaned.csv')

## Gyro_4_throwing_bike_31_05:

In [115]:
# Load and preview the data. 

Gyro_4_throwing_bike = pd.read_csv('gyro_4_throwing_bike_31_05_2022.csv')

Gyro_4_throwing_bike.head()

Unnamed: 0,Timestamp,Mode,Data,X,Y,Z
0,18:55:43:665,RX,0000326756 [app] INFO: GYRO:,-4.695,-0.52,19.148
1,18:55:43:800,RX,0000326898 [app] INFO: GYRO:,-4.695,-0.52,19.148
2,18:55:43:891,RX,0000327026 [app] INFO: GYRO:,-4.695,-0.52,19.148
3,18:55:44:027,RX,0000327116 [app] INFO: GYRO:,-4.695,-0.52,19.148
4,18:55:44:117,RX,0000327251 [app] INFO: GYRO:,-4.695,-0.52,19.148


In [116]:
# Check data info.

Gyro_4_throwing_bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  182 non-null    object 
 1   Mode       182 non-null    object 
 2   Data       182 non-null    object 
 3   X          182 non-null    float64
 4   Y          182 non-null    float64
 5   Z          182 non-null    float64
dtypes: float64(3), object(3)
memory usage: 8.7+ KB


In [117]:
# Check for missing values.

Gyro_4_throwing_bike.isna().sum()

Timestamp    0
Mode         0
Data         0
X            0
Y            0
Z            0
dtype: int64

In [118]:
# Create copy of dataframe.

Gyro_4_throwing_bike_copy = Gyro_4_throwing_bike.copy()

In [119]:
# Remove 'Mode' column.

Gyro_4_throwing_bike_copy.drop(labels=['Mode'], axis=1, inplace=True)

In [120]:
# Preview data.

Gyro_4_throwing_bike_copy.head()

Unnamed: 0,Timestamp,Data,X,Y,Z
0,18:55:43:665,0000326756 [app] INFO: GYRO:,-4.695,-0.52,19.148
1,18:55:43:800,0000326898 [app] INFO: GYRO:,-4.695,-0.52,19.148
2,18:55:43:891,0000327026 [app] INFO: GYRO:,-4.695,-0.52,19.148
3,18:55:44:027,0000327116 [app] INFO: GYRO:,-4.695,-0.52,19.148
4,18:55:44:117,0000327251 [app] INFO: GYRO:,-4.695,-0.52,19.148


In [121]:
# Save dataframe as csv format.

Gyro_4_throwing_bike_copy.to_csv('Gyro_4_throwing_bike_31_05_2022_cleaned.csv')

## Gyro_5_throwing_bike_31_05:

In [122]:
# Load and previw data. 

Gyro_5_throwing_bike = pd.read_csv('gyro_5_throwing_bike_31_05_2022.csv')

Gyro_5_throwing_bike.head()

Unnamed: 0,Timestamp,Mode,Data,X,Y,Z
0,18:56:36:812,RX,0000379945 [app] INFO: GYRO:,0.066,0.027,0.901
1,18:56:36:902,RX,0000380036 [app] INFO: GYRO:,0.066,0.027,0.901
2,18:56:36:992,RX,0000380126 [app] INFO: GYRO:,0.066,0.027,0.901
3,18:56:37:082,RX,0000380216 [app] INFO: GYRO:,0.066,0.027,0.901
4,18:56:37:171,RX,0000380306 [app] INFO: GYRO:,0.066,0.027,0.901


In [123]:
# Get data info.

Gyro_5_throwing_bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Timestamp  97 non-null     object 
 1   Mode       97 non-null     object 
 2   Data       97 non-null     object 
 3   X          97 non-null     float64
 4   Y          97 non-null     float64
 5   Z          97 non-null     float64
dtypes: float64(3), object(3)
memory usage: 4.7+ KB


In [124]:
# Check for NaNs.

Gyro_5_throwing_bike.isna().sum()

Timestamp    0
Mode         0
Data         0
X            0
Y            0
Z            0
dtype: int64

In [125]:
# Create copy of dataframe.

Gyro_5_throwing_bike_copy = Gyro_5_throwing_bike.copy()

In [126]:
# Remove the 'Mode' column

Gyro_5_throwing_bike_copy.drop(labels=['Mode'], axis=1, inplace=True)

In [127]:
# Preview data.

Gyro_5_throwing_bike_copy.head()

Unnamed: 0,Timestamp,Data,X,Y,Z
0,18:56:36:812,0000379945 [app] INFO: GYRO:,0.066,0.027,0.901
1,18:56:36:902,0000380036 [app] INFO: GYRO:,0.066,0.027,0.901
2,18:56:36:992,0000380126 [app] INFO: GYRO:,0.066,0.027,0.901
3,18:56:37:082,0000380216 [app] INFO: GYRO:,0.066,0.027,0.901
4,18:56:37:171,0000380306 [app] INFO: GYRO:,0.066,0.027,0.901


In [128]:
# Save datat frame as csv format.

Gyro_5_throwing_bike_copy.to_csv('Gyro_5_throwing_bike_31_05_2022_cleaned.csv')

## 1hr_normal_rider

In [129]:
# Load and previw data. 

Normal_Rider_2 = pd.read_csv('1hr_normal_rider.csv')

In [130]:
# Create a copy of df

Normal_Rider_2_copy = Normal_Rider_2.copy()

In [131]:
# Remove NaNs of the data frame.

Normal_Rider_2_copy.dropna(axis=0, inplace=True)

In [132]:
# Check for NaNs again.

Normal_Rider_2_copy.isna().sum()

timestamp (ms)    0
acc_x             0
acc_y             0
acc_z             0
gyro_x            0
gyro_y            0
gyro_z            0
Speed(Km/h)       0
dtype: int64

In [133]:
# Data are still labelled as 'Object' data type.
# Thus some data points are not NaNs but are not relevant data.

def check_int(value):
    try:
        int(value)
        return 'int' # if value is an integer, show 'int'
    except ValueError:
        return value # if value is not an integer, show the value
    
# run the function on the y column
acc_y_int_check = Normal_Rider_2_copy['acc_y'].apply(check_int)

acc_y_int_check.loc[lambda x : x != 'int']

46       gps:locked
846      gps:locked
1640     gps:locked
2433     gps:locked
3230     gps:locked
4026     gps:locked
4821     gps:locked
5617     gps:locked
6413     gps:locked
7210     gps:locked
8008     gps:locked
8804     gps:locked
9601     gps:locked
10398    gps:locked
11193    gps:locked
11987    gps:locked
12783    gps:locked
13579    gps:locked
14374    gps:locked
15168    gps:locked
15970    gps:locked
16765    gps:locked
17562    gps:locked
18357    gps:locked
19154    gps:locked
19952    gps:locked
20745    gps:locked
21539    gps:locked
22333    gps:locked
23126    gps:locked
23923    gps:locked
24721    gps:locked
25516    gps:locked
26313    gps:locked
27108    gps:locked
27904    gps:locked
28705    gps:locked
29504    gps:locked
30305    gps:locked
31100    gps:locked
31531      id:0x2EE
31860    gps:locked
32554    gps:locked
33250    gps:locked
33948    gps:locked
34644    gps:locked
36036    gps:locked
37428    gps:locked
38122    gps:locked
38818    gps:locked


In [134]:
# Identify index that contains values not intiger (acc_y)

indexNames = acc_y_int_check.loc[lambda x : x != 'int'].index

In [135]:
# Delete these row indexes from dataFrame

Normal_Rider_2_copy.drop(indexNames, inplace = True)

In [136]:
# Run the function on the z column

acc_z_int_check = Normal_Rider_2_copy['acc_z'].apply(check_int)

acc_z_int_check.loc[lambda x : x != 'int']

31859    soc:100.00%
32553    soc:100.00%
33249    soc:100.00%
33947    soc:100.00%
34643    soc:100.00%
35340    soc:100.00%
36035    soc:100.00%
36733    soc:100.00%
38121    soc:100.00%
38817    soc:100.00%
Name: acc_z, dtype: object

In [137]:
# Identify index that contains values not intiger (acc_z)
indexNames2 = acc_z_int_check.loc[lambda x : x != 'int'].index

# Delete these row indexes from dataFrame
Normal_Rider_2_copy.drop(indexNames2, inplace = True)

In [138]:
# Data are still labelled as 'Object' data type. in gyro
# Thus some data points are not NaNs but are not relevant data.

def check_float(value):
    try:
        float(value)
        return 'float' # if value is an integer, show 'int'
    except ValueError:
        return value # if value is not an integer, show the value
    
# run the function on the y column
gyro_x_float_check = Normal_Rider_2_copy['gyro_x'].apply(check_float)

gyro_x_float_check.loc[lambda x : x != 'float']

Series([], Name: gyro_x, dtype: object)

In [139]:
# Correcting types

Normal_Rider_2_copy = Normal_Rider_2_copy.astype({'acc_y':int, 'acc_z':int, 'gyro_x':float, 'gyro_y':float, 'gyro_z':float})

In [140]:
# Check the types again

Normal_Rider_2_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52011 entries, 0 to 52114
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   timestamp (ms)  52011 non-null  int64  
 1   acc_x           52011 non-null  int64  
 2   acc_y           52011 non-null  int64  
 3   acc_z           52011 non-null  int64  
 4   gyro_x          52011 non-null  float64
 5   gyro_y          52011 non-null  float64
 6   gyro_z          52011 non-null  float64
 7   Speed(Km/h)     52011 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 3.6 MB


In [141]:
# Check the descriptive statistics

Normal_Rider_2_copy.describe()

Unnamed: 0,timestamp (ms),acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,Speed(Km/h)
count,52011.0,52011.0,52011.0,52011.0,52011.0,52011.0,52011.0,52011.0
mean,2518251.0,0.09598,3.305762,-2.052489,-1.609354,0.681488,-1.941006,6.819727
std,448593.8,2865.438987,1491.5865,3422.188867,9.505255,2.57263,17.807291,7.291429
min,2019518.0,-28512.0,-13952.0,-25184.0,-43.817001,-7.402,-80.455002,0.0
25%,2183082.0,-512.0,-320.0,-640.0,-5.767,-0.023,-7.027,0.0
50%,2346783.0,0.0,0.0,0.0,-0.123,0.329,0.086,4.6
75%,2895668.0,512.0,352.0,608.0,4.348,1.756,4.947,13.0
max,4038535.0,26112.0,13024.0,23200.0,15.391,9.203,37.088001,24.0


In [142]:
# Export cleaned data as CSV

Normal_Rider_2_copy.to_csv('1hr_normal_rider_cleaned.csv')

## Zoomo_IoT:

# Data Cleaning process: 

- Separate acceleration and gyroscopic datat into two different data frames.
- Remove NaNs and non Numeric values
- Remove unwanted columns
- Rename time columns to distinguish between acceleration and gyrometric time data.


In [143]:
# Import and preview the data.

Zoomo_IoT = pd.read_excel('Zoomo_IoT.xlsx')

Zoomo_IoT.head()

Unnamed: 0,time,ax (m/s^2),ay (m/s^2),az (m/s^2),aT (m/s^2),Unnamed: 5,time.1,wx (rad/s),wy (rad/s),wz (rad/s)
0,0.004313,-0.2607,0.0004,0.1696,0.311,,0.000821,0.0,0,0.0
1,0.016459,0.0307,0.0412,0.1094,0.121,,0.000997,0.0,0,0.0
2,0.022998,-0.0977,0.0356,0.0876,0.136,,0.001085,0.0,0,0.0
3,0.033322,0.096,0.082,0.0731,0.146,,0.001171,0.0,0,0.0
4,0.042877,0.0219,0.0941,0.0491,0.108,,0.002772,0.0,0,0.0


In [144]:
# Get data info. 

Zoomo_IoT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 518223 entries, 0 to 518222
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   time        84152 non-null   float64
 1   ax (m/s^2)  84152 non-null   float64
 2   ay (m/s^2)  84152 non-null   float64
 3   az (m/s^2)  84152 non-null   float64
 4   aT (m/s^2)  84152 non-null   float64
 5   Unnamed: 5  0 non-null       float64
 6   time.1      518223 non-null  float64
 7   wx (rad/s)  518222 non-null  float64
 8   wy (rad/s)  518222 non-null  object 
 9   wz (rad/s)  518221 non-null  float64
dtypes: float64(9), object(1)
memory usage: 39.5+ MB


In [145]:
# Check for missing values.

print(Zoomo_IoT.isna().sum())

print(Zoomo_IoT.tail())


time          434071
ax (m/s^2)    434071
ay (m/s^2)    434071
az (m/s^2)    434071
aT (m/s^2)    434071
Unnamed: 5    518223
time.1             0
wx (rad/s)         1
wy (rad/s)         1
wz (rad/s)         2
dtype: int64
        time  ax (m/s^2)  ay (m/s^2)  az (m/s^2)  aT (m/s^2)  Unnamed: 5  \
518218   NaN         NaN         NaN         NaN         NaN         NaN   
518219   NaN         NaN         NaN         NaN         NaN         NaN   
518220   NaN         NaN         NaN         NaN         NaN         NaN   
518221   NaN         NaN         NaN         NaN         NaN         NaN   
518222   NaN         NaN         NaN         NaN         NaN         NaN   

             time.1  wx (rad/s) wy (rad/s)  wz (rad/s)  
518218  1036.693259        0.00          0         0.0  
518219  1036.694490        0.01          0         0.0  
518220  1036.695822        0.00          0         0.0  
518221  1036.697831        0.00          0         0.0  
518222  1036.699798        0.00    

### Observation: 

- The acceleration data have 434071 missing values. However, when looking at the excel spreadsheet we can see that this is because there are simply no data in those fields.
-  Therefore, before doing any data cleaning I will separate the acceleration and gyrometric data into different dataframes.

In [146]:
# Create a copy of orinigal dataframe.

Zoomo_IoT_copy = Zoomo_IoT.copy()

In [147]:
# Create a subset of acceleration data.

Zoomo_IoT_accel = Zoomo_IoT_copy[['time', 'ax (m/s^2)', 'ay (m/s^2)', 'az (m/s^2)','aT (m/s^2)']]

In [148]:
# Preview acceleration data.

Zoomo_IoT_accel.head()

Unnamed: 0,time,ax (m/s^2),ay (m/s^2),az (m/s^2),aT (m/s^2)
0,0.004313,-0.2607,0.0004,0.1696,0.311
1,0.016459,0.0307,0.0412,0.1094,0.121
2,0.022998,-0.0977,0.0356,0.0876,0.136
3,0.033322,0.096,0.082,0.0731,0.146
4,0.042877,0.0219,0.0941,0.0491,0.108


In [149]:
# Create a subset of gyroscopic data.

Zoomo_IoT_gyro = Zoomo_IoT_copy[['time.1', 'wx (rad/s)', 'wy (rad/s)', 'wz (rad/s)']]

In [150]:
# Preview the data gyroscopic data.

Zoomo_IoT_gyro.head()

Unnamed: 0,time.1,wx (rad/s),wy (rad/s),wz (rad/s)
0,0.000821,0.0,0,0.0
1,0.000997,0.0,0,0.0
2,0.001085,0.0,0,0.0
3,0.001171,0.0,0,0.0
4,0.002772,0.0,0,0.0


In [151]:
# Remove NaNs for acceleration data.

Zoomo_IoT_accel.dropna(axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [152]:
# Verify if there are any missing data left.

Zoomo_IoT_accel.isna().sum()



time          0
ax (m/s^2)    0
ay (m/s^2)    0
az (m/s^2)    0
aT (m/s^2)    0
dtype: int64

In [153]:
# Remove NaNs for gyrometric data. 

Zoomo_IoT_gyro.dropna(axis=0, inplace=True)

In [154]:
# Verify if there are any missing data left.

Zoomo_IoT_gyro.isna().sum()

time.1        0
wx (rad/s)    0
wy (rad/s)    0
wz (rad/s)    0
dtype: int64

In [155]:
# Check data type for both data frame.

print(Zoomo_IoT_accel.info())


print(Zoomo_IoT_gyro.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 84152 entries, 0 to 84151
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   time        84152 non-null  float64
 1   ax (m/s^2)  84152 non-null  float64
 2   ay (m/s^2)  84152 non-null  float64
 3   az (m/s^2)  84152 non-null  float64
 4   aT (m/s^2)  84152 non-null  float64
dtypes: float64(5)
memory usage: 3.9 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 518221 entries, 0 to 518221
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   time.1      518221 non-null  float64
 1   wx (rad/s)  518221 non-null  float64
 2   wy (rad/s)  518221 non-null  object 
 3   wz (rad/s)  518221 non-null  float64
dtypes: float64(3), object(1)
memory usage: 19.8+ MB
None


### Observation:

- For acceleration data there are no missing data points or NaNs and the data type for each column is float64. Thus the data are clean.
- For Gyroscopic data, there are no missing data points or NaNs however for 'wy' column the data type is object. Thus I will change its data type to float64.

In [156]:
# Change data type of 'wy' column.

Zoomo_IoT_gyro['wy (rad/s)'] = Zoomo_IoT_gyro['wy (rad/s)'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Zoomo_IoT_gyro['wy (rad/s)'] = Zoomo_IoT_gyro['wy (rad/s)'].astype(float)


In [157]:
# Verify that 
Zoomo_IoT_gyro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 518221 entries, 0 to 518221
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   time.1      518221 non-null  float64
 1   wx (rad/s)  518221 non-null  float64
 2   wy (rad/s)  518221 non-null  float64
 3   wz (rad/s)  518221 non-null  float64
dtypes: float64(4)
memory usage: 19.8 MB


In [158]:
# Save both the data frames as csv format.

Zoomo_IoT_accel.to_csv('Zoomo_IoT_accel_Cleaned.csv')

Zoomo_IoT_gyro.to_csv('Zoomo_IoT_gyro_Cleaned.csv')
