In [1]:
# Relative path of the folder that holds the raw Basketball sensor CSV files.
dataset_path = '../datasets/Basketball/'
# Relative path of the folder where this notebook writes its intermediate CSV results.
result_dataset_path = './intermediate_datafiles/Basketball/'

# Import the relevant classes.

from Chapter2.CreateDataset import CreateDataset
from util.VisualizeDataset import VisualizeDataset
from util import util
import copy
import os
import pandas as pd
import numpy as np
from Chapter3.KalmanFilters import KalmanFilters
from Chapter3.DataTransformation import PrincipalComponentAnalysis


# Plotting helper shared by all cells below.
DataViz = VisualizeDataset()
# Create the output folder on first use so the later to_csv calls have a target directory.
if not os.path.exists(result_dataset_path):
    print('Creating result directory: ' + result_dataset_path)
    os.makedirs(result_dataset_path)

In [None]:
# Chapter 2: Initial exploration of the dataset.

# The granularity is the length of one discrete time step in milliseconds.
# The dataset is built twice: first coarse grained (500 ms, i.e. two instances
# per second) and then finer grained (250 ms, i.e. four instances per second).
granularities = [500, 250]
datasets = []

# One entry per numerical sensor file: (csv file, value columns, column prefix).
# Orientation.csv is not loaded; its line was commented out in the original cell.
numerical_sources = [
    ('Accelerometer.csv', ['x', 'y', 'z'], 'acc_'),
    ('Gravity.csv', ['x', 'y', 'z'], 'grav_'),
    ('Gyroscope.csv', ['x', 'y', 'z'], 'gyr_'),
    ('Linear_Acceleration.csv', ['x', 'y', 'z'], 'lin_acc_'),
    ('Magnetometer.csv', ['x', 'y', 'z'], 'mag_'),
    ('Pressure.csv', ['value1', 'value2'], 'press_'),
    ('Rotation vector.csv', ['xsin', 'ysin', 'zsin', 'cos'], 'rot_vec_'),
    ('Step_Counter.csv', ['step'], 'step_'),
]

# Arguments of the overview plot: column prefixes, how each prefix is matched,
# and how each group is drawn.
overview_columns = ['acc_', 'grav_', 'gyr_', 'lin_acc_', 'mag_', 'press_', 'rot_vec_', 'step_', 'label']
overview_match = ['like'] * len(overview_columns)
overview_display = ['line'] * 7 + ['points'] * 2

for milliseconds_per_instance in granularities:

    # Fresh dataset object for this granularity, rooted at the raw data folder.
    DataSet = CreateDataset(dataset_path, milliseconds_per_instance)

    # Every numerical stream is aggregated per time step by averaging ('avg').
    for csv_file, value_columns, prefix in numerical_sources:
        DataSet.add_numerical_dataset(csv_file, 'timestamps', list(value_columns), 'avg', prefix)

    # The activity labels come in as events with a start and an end time.
    DataSet.add_event_dataset('labels.csv', 'label_start', 'label_end', 'label', 'binary')

    # The resulting pandas data table.
    dataset = DataSet.data_table

    # Boxplot of the accelerometer axes.
    DataViz.plot_dataset_boxplot(dataset, ['acc_x', 'acc_y', 'acc_z'])

    # Overview plot of all sensor groups and the labels. Fresh list copies are
    # passed so every call receives its own argument objects.
    DataViz.plot_dataset(dataset, list(overview_columns), list(overview_match), list(overview_display), 'basketball.png')

    # Summary statistics; keep an independent copy of this granularity's table.
    util.print_statistics(dataset)
    datasets.append(copy.deepcopy(dataset))

# Print the LaTeX table comparing the two granularities.
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we have generated (250 ms).
dataset.to_csv(result_dataset_path + 'chapter2_result.csv')


In [None]:
# Number of missing values per column of the 250 ms table.
pd.isnull(dataset).sum()

In [None]:
# Keep only the time steps that have a linear-acceleration x reading.
dataset2 = dataset.loc[dataset['lin_acc_x'].notnull()]

In [None]:
# Persist the filtered table, then show the missing values that remain per column.
dataset2.to_csv(result_dataset_path + 'chapter2_result_dataset2.csv')
pd.isnull(dataset2).sum()

In [None]:
# Overview plot of all sensor groups and the labels for the filtered rows.
# NOTE(review): the last argument is the same 'basketball.png' that the plot of
# the unfiltered table uses; if it is an output file name, that earlier figure
# is overwritten here - confirm this is intended.
DataViz.plot_dataset(
    dataset2,
    ['acc_', 'grav_', 'gyr_', 'lin_acc_', 'mag_', 'press_', 'rot_vec_', 'step_', 'label'],
    ['like'] * 9,
    ['line'] * 7 + ['points'] * 2,
    'basketball.png'
)

In [None]:
# Reload the filtered table from disk; the first CSV column becomes the index.
csv_path = result_dataset_path + 'chapter2_result_dataset2.csv'
try:
    dataset3 = pd.read_csv(csv_path, index_col=0)
except IOError as e:
    # The file only exists after the cell that writes it has been run.
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

In [None]:
# Quick line plot of the accelerometer z axis.
dataset3.loc[:, 'acc_z'].plot()

In [None]:
# Display the reloaded table for inspection.
dataset3

In [None]:
# Turn the index into an ordinary column (later cells access it as 'index')
# and number the rows 0..n-1. Not idempotent: run this cell only once.
dataset3 = dataset3.reset_index(drop=False)

In [None]:
# Same for the unfiltered table: its index becomes an ordinary column (later
# cells access it as 'index'). Not idempotent: run this cell only once.
dataset = dataset.reset_index(drop=False)

In [None]:
# Overwrite the 'index' column of dataset3 with the values of the 'index'
# column of the unfiltered table, starting at row 270.
# NOTE(review): the offset 270 is hard coded - presumably the number of leading
# rows dropped by the lin_acc_x filter; confirm against the data.
#
# This is done as one column assignment instead of the former element-wise
# `dataset3['index'][i] = ...` loop: chained indexing assigns through a
# temporary object (SettingWithCopyWarning, and no effect at all under pandas
# copy-on-write) and is slow for large tables. A length mismatch (fewer than
# 270 + len(dataset3) rows in `dataset`) raises instead of passing silently.
first_row = 270
dataset3['index'] = dataset['index'].iloc[first_row:first_row + len(dataset3)].values

In [None]:
# Store the table with the timestamps as its index. At this point the
# timestamps still live in the ordinary 'index' column, so writing dataset3
# directly would put the 0..n-1 row numbers in the first CSV column, and the
# later read_csv(..., index_col=0) would load those row numbers as the index
# instead of the timestamps. set_index returns a new frame; dataset3 itself is
# left unchanged for the following cells.
dataset3.set_index('index').to_csv(result_dataset_path + 'chapter2_result_dataset3.csv')

In [None]:
# Make the 'index' column the index again (the column is removed from the data columns).
dataset3 = dataset3.set_index('index', drop=True)

In [3]:
# Summarise the table: index range, column dtypes and non-null counts per column.
dataset3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1338 entries, 2018-06-22 08:49:01.702 to 2018-06-22 08:54:35.952
Data columns (total 29 columns):
acc_x             1338 non-null float64
acc_y             1338 non-null float64
acc_z             1338 non-null float64
grav_x            1338 non-null float64
grav_y            1338 non-null float64
grav_z            1338 non-null float64
gyr_x             1338 non-null float64
gyr_y             1338 non-null float64
gyr_z             1338 non-null float64
lin_acc_x         1338 non-null float64
lin_acc_y         1338 non-null float64
lin_acc_z         1338 non-null float64
mag_x             1338 non-null float64
mag_y             1338 non-null float64
mag_z             1338 non-null float64
press_value1      1202 non-null float64
press_value2      1202 non-null float64
rot_vec_xsin      0 non-null float64
rot_vec_ysin      0 non-null float64
rot_vec_zsin      0 non-null float64
rot_vec_cos       0 non-null float64
step_step         411 non-nul

In [2]:
# Load the stored dataset3 table; the first CSV column becomes the index.
csv_path = result_dataset_path + 'chapter2_result_dataset3.csv'
try:
    dataset3 = pd.read_csv(csv_path, index_col=0)
except IOError as e:
    # The file only exists after the cell that writes it has been run.
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

In [4]:
# Let us try the Kalman filter to impute missing data

# Every column whose name does not contain 'label' is treated as a predictor.
selected_predictor_cols = [c for c in dataset3.columns if 'label' not in c]

# The CSV round trip leaves the timestamps as strings; parse them back into a
# DatetimeIndex. pd.to_datetime() is used because Index.to_datetime() is no
# longer available in current pandas releases.
dataset3.index = pd.to_datetime(dataset3.index)

KalFilter = KalmanFilters()
for col in selected_predictor_cols:
    # NOTE(review): every iteration passes dataset3 and rebinds dataset4. Whether
    # dataset4 ends up holding the filtered values of all columns depends on
    # apply_kalman_filter changing its input table in place - confirm in
    # Chapter3/KalmanFilters.py (also the meaning of the third argument).
    dataset4 = KalFilter.apply_kalman_filter(dataset3, col, True)

# NOTE(review): the recorded run of this cell ended with KeyError: 'acc_x_kalman'.
# That name does not occur in this cell, so the lookup happens inside one of the
# helpers. Both plots below also pass the same 'acc_x' data twice; presumably the
# second series should be the Kalman-filtered one - confirm which column
# apply_kalman_filter writes its result to.
DataViz.plot_imputed_values(dataset4, ['original', 'kalman'], 'acc_x', dataset4['acc_x'])
DataViz.plot_dataset(dataset4, ['acc_x', 'acc_x'], ['exact', 'exact'], ['line', 'line'])

  


KeyError: 'acc_x_kalman'

In [None]:
# Display the list of predictor column names (all columns without 'label' in the name).
selected_predictor_cols

In [None]:
# Summarise the table returned by the Kalman filter step (dtypes and non-null counts).
dataset4.info()

In [None]:

# Determine the principal components for all columns except the targets.
# In this dataset the targets are the label columns, so every column whose
# name does not contain 'label' takes part in the PCA.
selected_predictor_cols = [c for c in dataset4.columns if 'label' not in c]

# Explained variance per principal component, used to choose the number of components.
PCA = PrincipalComponentAnalysis()
pc_values = PCA.determine_pc_explained_variance(dataset4, selected_predictor_cols)


In [None]:

# Plot the variance explained.

# The former version called `plot.plot(...)`, but no cell of this notebook
# imports or defines `plot`, so it raised a NameError. The curve is drawn
# through pandas plotting instead, which needs no additional import.
# The x axis is the component number 1..N, the y axis the explained variance;
# pc_values must hold one value per predictor column.
explained_variance = pd.Series(pc_values, index=range(1, len(selected_predictor_cols) + 1))
ax = explained_variance.plot(style='b-')
ax.set_xlabel('principal component number')
ax.set_ylabel('explained variance')


In [None]:

# We select 7 as the best number of PC's as this explains most of the variance

n_pcs = 7

# The PCA is applied to a deep copy, so dataset4 itself is not modified; the
# table that carries the PCA result is the returned one, stored in `dataset`.
dataset = PCA.apply_pca(copy.deepcopy(dataset4), selected_predictor_cols, n_pcs)

# And we visualize the result of the PC's. The returned table is plotted here:
# the former version plotted dataset4, which never receives the PCA result
# because only its copy was handed to apply_pca.
# (The 'pca_' prefix presumably matches the columns apply_pca adds - confirm.)
DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])
