In [54]:
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

#consolidate all the files for one client into a single array
#Listing all the patients (0-14 in our direction)

#file path must be specified correctly
sensor_data = sorted(glob('Topics in Data Science/FORTH_TRACE_DATASET-master/FORTH_TRACE_DATASET-master/part0/*.csv'))

for file in sensor_data:
    patient0 = pd.concat((pd.read_csv(file, header = None) for file in sensor_data), ignore_index=True)

df = pd.DataFrame(patient0)
#header previously set to None, naming columns
column_names = [ "device", "accelerometer1", "accelerometer2", "accelerometer3", "gyroscope1", "gyroscope2", "gyroscope3", "magnetometer1", "magnetometer2", "magnetometer3", "timestamp", "activity_label" ]
df.columns = column_names

#attempting to separate data frames by activity label
#df.groupby("activity_label")
#df.loc[df.device == 1 ,["accelerometer1", "accelerometer2", "accelerometer3"]]


#calculate the modulus for relevant columns
df["acc_mod"] = np.sqrt( (df.accelerometer1 ** 2) + (df.accelerometer2 ** 2) + (df.accelerometer3 ** 2) )
df["gyr_mod"] = np.sqrt( (df.gyroscope1 ** 2) + (df.gyroscope2 ** 2 ) + (df.gyroscope3 ** 2) )
df["mag_mod"] = np.sqrt( (df.magnetometer1 ** 2) + (df.magnetometer2 ** 2) + (df.magnetometer3 ** 2) )


#calculating the z score relevant columns using SciPy
df["acc_z"] = stats.zscore(df.acc_mod, axis=0, ddof=0, nan_policy="propagate")
df["gyr_z"] = stats.zscore(df.gyr_mod, axis=0, ddof=0, nan_policy="propagate")
df["mag_z"] = stats.zscore(df.gyr_mod, axis=0, ddof=0, nan_policy="propagate")



# identifying outliers when k is 3
df["acc_out"] = ~df["acc_z"].between(-3.0, 3.0, inclusive=False)
df["gyr_out"] = ~df["gyr_z"].between(-3.0, 3.0, inclusive=False)
df["mag_out"] = ~df["acc_z"].between(-3.0, 3.0, inclusive=False)

#identifying outliers when k is 3.5
#df["acc_out"] = ~df["acc_z"].between(-3.5, 3.5, inclusive=False)
#df["gyr_out"] = ~df["gyr_z"].between(-3.5, 3.5, inclusive=False)
#df["mag_out"] = ~df["acc_z"].between(-3.5, 3.5, inclusive=False)

#identifying outliers when k is 4
#df["acc_out"] = ~df["acc_z"].between(-4.0, 4.0, inclusive=False)
#df["gyr_out"] = ~df["gyr_z"].between(-4.0, 4.0, inclusive=False)
#df["mag_out"] = ~df["acc_z"].between(-4.0, 4.0, inclusive=False)

#drop unused columns to save meemory
df.drop(["accelerometer1", "accelerometer2", "accelerometer3", "gyroscope1", "gyroscope2", "gyroscope3", "magnetometer1", "magnetometer2", "magnetometer3"], axis=1)

#plot the relevant columns individually
#plt.boxplot(df.acc_mod)
#plt.boxplot(df.gyr_mod)
#plt.boxplot(df.mag_mod)

#returns relevant columns together
#fig, (mod_acc, mod_gyr, mod_mag) = plt.subplots(1,3)
#fig.suptitle('boxplots for relevant columns')
#mod_acc.boxplot(df.acc_mod)
#mod_gyr.boxplot(df.gyr_mod)
#mod_mag.boxplot(df.mag_mod)











#counting outliers
#df["acc_out"].value_counts()
#df["gyr_out"].value_counts()
#df["mag_out"].value_counts()

#find the mean of each relevant columns
#np.mean(df.acc_mod), np.mean(df.gyr_mod), np.mean(df.mag_mod)

#find the standard deviation of each relevant column
#np.std(df.acc_mod), np.std(df.gyr_mod), np.std(df.mag_mod)













Unnamed: 0,device,timestamp,activity_label,acc_mod,gyr_mod,mag_mod,acc_z,gyr_z,mag_z,acc_out,gyr_out,mag_out
0,1,505.89,1,9.914340,1.852517,1.226340,-0.137880,-0.733547,-0.733547,False,False,False
1,1,525.42,1,9.853184,1.798268,1.203995,-0.169199,-0.734474,-0.734474,False,False,False
2,1,544.95,1,9.773151,1.754919,1.227185,-0.210186,-0.735216,-0.735216,False,False,False
3,1,564.48,1,9.845011,2.158681,1.235692,-0.173385,-0.728311,-0.728311,False,False,False
4,1,584.01,1,9.815237,1.462196,1.244169,-0.188633,-0.740222,-0.740222,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
265339,5,1038000.00,1,9.706023,1.468925,0.728639,-0.244563,-0.740107,-0.740107,False,False,False
265340,5,1038000.00,1,9.717415,1.071500,0.687962,-0.238729,-0.746903,-0.746903,False,False,False
265341,5,1038000.00,1,9.711826,1.174106,0.708404,-0.241591,-0.745148,-0.745148,False,False,False
265342,5,1038100.00,1,9.717415,0.983234,0.734298,-0.238729,-0.748413,-0.748413,False,False,False
