## Setting up the Environment

In [1]:
import os
import re
import numpy as np
import pandas as pd
import math
import statistics as st
import itertools
import matplotlib.pyplot as plt
import matplotlib.colors as colors

from scipy import stats
from datetime import datetime
from statsmodels import robust
from scipy.fft import fft

## Our Raw Data

In [2]:
def correlation(data1, data2):
    corr, p = stats.pearsonr(data1, data2)
    if math.isnan(corr):
        return 0
    else:
        return corr

def sma(x, y, z):
    sum_ = 0
    X = list(x)
    Y = list(y)
    Z = list(z)
    for i in range(len(X)):
        sum_ += abs(X[i]) + abs(Y[i]) + abs(Z[i])
    return sum_ / len(X)

def calc_entropy(data):
    entropy = stats.entropy(data, base=2)
    if math.isinf(entropy) or math.isnan(entropy):
        return -1
    else:
        return entropy

def energy(data):
    sum_ = 0
    for d in data:
        sum_ += d ** 2
        
    return sum_ / len(data)

def iqr(data):
    return np.subtract(*np.percentile(data, [75, 25]))

In [17]:
raw_datasets = []
raw_datalabels = []

directory = "RawDataSet"

for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        raw = pd.read_csv(os.path.join(directory, filename))
        cols = raw.columns
        cols = cols.str.replace('([\(\[]).*?([\)\]])', '')
        cols = cols.str.replace('\s','_')
        raw.columns = cols
        to_drop = []
        for col in raw.columns:
            if not ("_X_" in col or "_Y_" in col or "_Z_" in col):
                to_drop.append(col)
        raw = raw.drop(to_drop, axis=1)
        column_names = raw.columns
        raw_datasets.append(raw)
        raw_datalabels.append(filename.split(".")[0])

raw = raw_datasets[0]["ACCELEROMETER_X_"][:5]

print(raw)

0    -0.8857
1    -0.8139
2   -10.2466
3    -4.3679
4     5.0936
Name: ACCELEROMETER_X_, dtype: float64


In [18]:
datasets = dict()
statistics = ["mean", "mad", "max", "min", "std", "energy", "iqr", "entropy"]
diff_col_names = []
for col in range(0,len(column_names),3):
    diff_col_names.append(column_names[col][:len(column_names[col])-3])
    
for col in column_names:
    for stat in statistics:
        key = col + "~" + stat
        datasets[key] = []
        
for col in diff_col_names:
    datasets[col+"_XY_~correlation"] = []
    datasets[col+"_YZ_~correlation"] = []
    datasets[col+"_ZX_~correlation"] = []
    datasets[col+"_XYZ_~sma"] = []
    
datasets["Activity"] = []

for ind,raw_data in enumerate(raw_datasets):
    print(ind)
    for d in range(0, len(raw_data), 5):
        if d+5 < len(raw_data):
            data = raw_data[d:d+5]
        else:
            break
        for c in diff_col_names:
            col_X = c + "_X_"
            col_Y = c + "_Y_"
            col_Z = c + "_Z_"

            datasets[col_X+"~mean"].append(st.mean(data[col_X])) # mean X
            datasets[col_Y+"~mean"].append(st.mean(data[col_Y])) # mean Y
            datasets[col_Z+"~mean"].append(st.mean(data[col_Z])) # mean Z

            datasets[col_X+"~mad"].append(robust.mad(np.array(data[col_X]))) # median absolute deviation X
            datasets[col_Y+"~mad"].append(robust.mad(np.array(data[col_Y]))) # median absolute deviation Y
            datasets[col_Z+"~mad"].append(robust.mad(np.array(data[col_Z]))) # median absolute deviation Z

            datasets[col_X+"~max"].append(max(data[col_X])) # maximum X
            datasets[col_Y+"~max"].append(max(data[col_Y])) # maximum Y
            datasets[col_Z+"~max"].append(max(data[col_Z])) # maximum Z

            datasets[col_X+"~min"].append(min(data[col_X])) # minimum X
            datasets[col_Y+"~min"].append(min(data[col_Y])) # minimum Y
            datasets[col_Z+"~min"].append(min(data[col_Z])) # minimum Z

            datasets[col_X+"~std"].append(st.stdev(data[col_X])) # standard deviation X
            datasets[col_Y+"~std"].append(st.stdev(data[col_Y])) # standard deviation Y
            datasets[col_Z+"~std"].append(st.stdev(data[col_Z])) # standard deviation Z

            datasets[col_X+"~energy"].append(energy(data[col_X])) # energy X
            datasets[col_Y+"~energy"].append(energy(data[col_Y])) # energy Y
            datasets[col_Z+"~energy"].append(energy(data[col_Z])) # energy Z

            datasets[col_X+"~iqr"].append(iqr(data[col_X])) # interquartile range X
            datasets[col_Y+"~iqr"].append(iqr(data[col_Y])) # interquartile range Y
            datasets[col_Z+"~iqr"].append(iqr(data[col_Z])) # interquartile range Z
            
            datasets[col_X+"~entropy"].append(calc_entropy(data[col_X])) # entropy X
            datasets[col_Y+"~entropy"].append(calc_entropy(data[col_Y])) # entropy Y
            datasets[col_Z+"~entropy"].append(calc_entropy(data[col_Z])) # entropy Z

            datasets[c+"_XY_~correlation"].append(correlation(data[col_X], data[col_Y])) # correlation between X and Y
            datasets[c+"_YZ_~correlation"].append(correlation(data[col_Y], data[col_Z])) # correlation between Y and Z
            datasets[c+"_ZX_~correlation"].append(correlation(data[col_Z], data[col_X])) # correlation between Z and X


            datasets[c+"_XYZ_~sma"].append(sma(data[col_X], data[col_Y], data[col_Z]))

        datasets["Activity"].append(raw_datalabels[ind])
        data = []


0
1
2
3
4


In [20]:
df = pd.DataFrame.from_dict(datasets, orient="columns")
df.to_csv("Processed_DataSet/ProcessedData.csv")

In [16]:
datasets = dict()
statistics_t = ["mean", "mad", "max", "min", "std", "energy", "iqr", "entropy"]
statistics_f = statistics_t + ["skewness", "kurtosis"]
diff_col_names = []
for col in range(0,len(column_names),3):
    diff_col_names.append(column_names[col][:len(column_names[col])-3])
    
for col in column_names:
    for stat in statistics_t:
        key = "t" + col + "~" + stat
        datasets[key] = []
        
for col in diff_col_names:
    datasets["t"+col+"_XY_~correlation"] = []
    datasets["t"+col+"_YZ_~correlation"] = []
    datasets["t"+col+"_ZX_~correlation"] = []
    datasets["t"+col+"_XYZ_~sma"] = []

for col in column_names:
    for stat in statistics_f:
        key = "f" + col + "~" + stat
        datasets[key] = []

for col in diff_col_names:
    datasets["f"+col+"_XYZ_~sma"] = []
    
datasets["Activity"] = []

for ind,raw_data in enumerate(raw_datasets):
    print(ind)
    for d in range(0, len(raw_data), 5):
        if d+5 < len(raw_data):
            data = raw_data[d:d+5]
        else:
            break
        for c in diff_col_names:
            col_X = c + "_X_"
            col_Y = c + "_Y_"
            col_Z = c + "_Z_"
            
            # time
            
            tcol_X = "t" + col_X
            tcol_Y = "t" + col_Y
            tcol_Z = "t" + col_Z

            datasets[tcol_X+"~mean"].append(st.mean(data[col_X])) # mean X
            datasets[tcol_Y+"~mean"].append(st.mean(data[col_Y])) # mean Y
            datasets[tcol_Z+"~mean"].append(st.mean(data[col_Z])) # mean Z

            datasets[tcol_X+"~mad"].append(robust.mad(np.array(data[col_X]))) # median absolute deviation X
            datasets[tcol_Y+"~mad"].append(robust.mad(np.array(data[col_Y]))) # median absolute deviation Y
            datasets[tcol_Z+"~mad"].append(robust.mad(np.array(data[col_Z]))) # median absolute deviation Z

            datasets[tcol_X+"~max"].append(max(data[col_X])) # maximum X
            datasets[tcol_Y+"~max"].append(max(data[col_Y])) # maximum Y
            datasets[tcol_Z+"~max"].append(max(data[col_Z])) # maximum Z

            datasets[tcol_X+"~min"].append(min(data[col_X])) # minimum X
            datasets[tcol_Y+"~min"].append(min(data[col_Y])) # minimum Y
            datasets[tcol_Z+"~min"].append(min(data[col_Z])) # minimum Z

            datasets[tcol_X+"~std"].append(st.stdev(data[col_X])) # standard deviation X
            datasets[tcol_Y+"~std"].append(st.stdev(data[col_Y])) # standard deviation Y
            datasets[tcol_Z+"~std"].append(st.stdev(data[col_Z])) # standard deviation Z

            datasets[tcol_X+"~energy"].append(energy(data[col_X])) # energy X
            datasets[tcol_Y+"~energy"].append(energy(data[col_Y])) # energy Y
            datasets[tcol_Z+"~energy"].append(energy(data[col_Z])) # energy Z

            datasets[tcol_X+"~iqr"].append(iqr(data[col_X])) # interquartile range X
            datasets[tcol_Y+"~iqr"].append(iqr(data[col_Y])) # interquartile range Y
            datasets[tcol_Z+"~iqr"].append(iqr(data[col_Z])) # interquartile range Z
            
            datasets[tcol_X+"~entropy"].append(calc_entropy(data[col_X])) # entropy X
            datasets[tcol_Y+"~entropy"].append(calc_entropy(data[col_Y])) # entropy Y
            datasets[tcol_Z+"~entropy"].append(calc_entropy(data[col_Z])) # entropy Z

            datasets["t"+c+"_XY_~correlation"].append(correlation(data[col_X], data[col_Y])) # correlation between X and Y
            datasets["t"+c+"_YZ_~correlation"].append(correlation(data[col_Y], data[col_Z])) # correlation between Y and Z
            datasets["t"+c+"_ZX_~correlation"].append(correlation(data[col_Z], data[col_X])) # correlation between Z and X


            datasets["t"+c+"_XYZ_~sma"].append(sma(data[col_X], data[col_Y], data[col_Z]))
            
            # frequency
            
            fcol_X = "f" + col_X
            fcol_Y = "f" + col_Y
            fcol_Z = "f" + col_Z
            
            fdata_X = fft(list(data[col_X]))
            fdata_Y = fft(list(data[col_Y]))
            fdata_Z = fft(list(data[col_Z]))
            
            datasets[fcol_X+"~mean"].append(st.mean(fdata_X)) # mean X
            datasets[fcol_Y+"~mean"].append(st.mean(fdata_Y)) # mean Y
            datasets[fcol_Z+"~mean"].append(st.mean(fdata_Z)) # mean Z

            datasets[fcol_X+"~mad"].append(robust.mad(np.array(fdata_X))) # median absolute deviation X
            datasets[fcol_Y+"~mad"].append(robust.mad(np.array(fdata_Y))) # median absolute deviation Y
            datasets[fcol_Z+"~mad"].append(robust.mad(np.array(fdata_Z))) # median absolute deviation Z

            datasets[fcol_X+"~max"].append(max(fdata_X)) # maximum X
            datasets[fcol_Y+"~max"].append(max(fdata_Y)) # maximum Y
            datasets[fcol_Z+"~max"].append(max(fdata_Z)) # maximum Z

            datasets[fcol_X+"~min"].append(min(fdata_X)) # minimum X
            datasets[fcol_Y+"~min"].append(min(fdata_Y)) # minimum Y
            datasets[fcol_Z+"~min"].append(min(fdata_Z)) # minimum Z

            datasets[fcol_X+"~std"].append(st.stdev(fdata_X)) # standard deviation X
            datasets[fcol_Y+"~std"].append(st.stdev(fdata_Y)) # standard deviation Y
            datasets[fcol_Z+"~std"].append(st.stdev(fdata_Z)) # standard deviation Z

            datasets[fcol_X+"~energy"].append(energy(fdata_X)) # energy X
            datasets[fcol_Y+"~energy"].append(energy(fdata_Y)) # energy Y
            datasets[fcol_Z+"~energy"].append(energy(fdata_Z)) # energy Z

            datasets[fcol_X+"~iqr"].append(iqr(fdata_X)) # interquartile range X
            datasets[fcol_Y+"~iqr"].append(iqr(fdata_Y)) # interquartile range Y
            datasets[fcol_Z+"~iqr"].append(iqr(fdata_Z)) # interquartile range Z
            
            datasets[fcol_X+"~entropy"].append(calc_entropy(fdata_X)) # entropy X
            datasets[fcol_Y+"~entropy"].append(calc_entropy(fdata_Y)) # entropy Y
            datasets[fcol_Z+"~entropy"].append(calc_entropy(fdata_Z)) # entropy Z

            datasets["f"+c+"_XYZ_~sma"].append(sma(fdata_X, fdata_Y, fdata_Z))
            
            datasets[fcol_X+"~kurtosis"].append(stats.kurtosis(fdata_X)) # kurtosis X
            datasets[fcol_Y+"~kurtosis"].append(stats.kurtosis(fdata_Y)) # kurtosis Y
            datasets[fcol_Z+"~kurtosis"].append(stats.kurtosis(fdata_Z)) # kurtosis Z

            datasets[fcol_X+"~skewness"].append(stats.skew(fdata_X)) # skewness X
            datasets[fcol_Y+"~skewness"].append(stats.skew(fdata_Y)) # skewness Y
            datasets[fcol_Z+"~skewness"].append(stats.skew(fdata_Z)) # skewness Z
            

        datasets["Activity"].append(raw_datalabels[ind])
        data = []


0


TypeError: can't convert type 'complex128' to numerator/denominator

In [6]:
df = pd.DataFrame.from_dict(datasets, orient="columns")
df.to_csv("Processed_DataSet/fProcessedData.csv")