In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics

from src.stroop_analysis import plot_stroop_stacceleration
from src.accelerometer import plot_fourier_transformation, plot_acceleration, plot_feature_columns, accelerometer_feature_engineering
from src.plotting import box_plot_columns
from src.kmeans import kmeans
from src.principal_component_analysis import principal_component_analysis, plot_principal_component_analysis
from src.decision_tree import decision_tree
from src.time_series import median_filter, run_time_series_algorithms
from src.ml_util import evaluate_model

In [None]:
# Read the recordings and keep only the rows whose age_group is not 0.
stroop_df = (
    pd.read_csv('stroop_accelerations.csv')
    .loc[lambda frame: frame['age_group'].ne(0)]
)

In [None]:
# Preview the first five rows (five is head()'s default).
stroop_df.head()

In [None]:
# Summary statistics of the loaded recordings (rows with age_group == 0 were removed at load time).
stroop_df.describe()

In [None]:
# Pick one session to inspect in detail.
# Series.unique() returns values in order of appearance, so element 0 is the
# uuid of the first row and is always a real identifier. (`.any()` is a
# truthiness reduction: on NumPy >= 2.0 it returns a bool even for object
# arrays, which would make the comparison below match no session.)
uuid = stroop_df['uuid'].unique()[0]
single_session_df = stroop_df[stroop_df['uuid'] == uuid]

In [None]:
# Distinct values found in the click_number column across all sessions.
stroop_df['click_number'].unique()

In [None]:
# Plot the selected session before any filtering is applied.
# The second argument is presumably used as the plot title -- confirm in src.stroop_analysis.
plot_stroop_stacceleration(single_session_df, 'test session')

In [None]:
# Rebind the session frame to the output of median_filter, then plot it again
# so it can be compared with the unfiltered plot.
# NOTE(review): the name is rebound in place, so re-running this cell passes
# already-filtered data through median_filter a second time.
single_session_df = median_filter(single_session_df)
plot_stroop_stacceleration(single_session_df, 'test session - filtered')

In [None]:
# Project helper from src.accelerometer, applied to the session frame that was
# rebound to the median_filter output in the previous cell.
plot_fourier_transformation(single_session_df, 'test session')

In [None]:
# Project helper from src.plotting, called with the session frame only; the
# later calls on the feature table additionally pass a class key and a column list.
box_plot_columns(single_session_df)

In [None]:
# Project helper from src.accelerometer, applied to the (filtered) single session.
plot_acceleration(single_session_df)

In [None]:
# Apply the same median_filter helper to the full data set. stroop_df is rebound,
# so every later cell (feature engineering, time-series models) sees the filtered frame.
# NOTE(review): re-running this cell filters the already-filtered data again.
stroop_df = median_filter(stroop_df)

In [None]:
# Build the feature table from the filtered recordings (project helper from src.accelerometer).
# Presumably one row per session -- confirm against the helper.
stroop_processed_df = accelerometer_feature_engineering(stroop_df)

In [None]:
# Attach the click statistics to the feature table: take the first value of
# each click column per (age_group, subject, device, hand, uuid) group and
# merge it onto the engineered features using the same keys.
group_by_keys = ['age_group', 'subject', 'device', 'hand', 'uuid']
stroop_data_df = (
    stroop_df
    .groupby(group_by_keys)[
        ['click_distance_mean', 'click_distance_std', 'click_success_rate']
    ]
    .first()
)
stroop_processed_df = pd.merge(stroop_processed_df, stroop_data_df, on=group_by_keys)

In [None]:
# First five rows of the merged feature table.
stroop_processed_df.head(5)

In [None]:
# Pairwise correlations between the engineered features.
# Only numeric columns are correlated: since pandas 2.0 DataFrame.corr() no
# longer drops non-numeric columns silently and raises if the frame still
# holds string keys such as device, hand or uuid.
stroop_processed_df.select_dtypes(include='number').corr()

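The standard error of the mean (SEM) is the standard deviation divided by √n, so the `*_sem` columns carry almost the same information as the `*_std` columns; we therefore drop them.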

In [None]:
# Remove the SEM features and show the correlation matrix again.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
stroop_processed_df = stroop_processed_df.drop(
    columns=['x_sem', 'y_sem', 'z_sem', 'mag_sem'],
    errors='ignore',
)
# Correlate numeric columns only; pandas >= 2.0 raises on non-numeric columns.
stroop_processed_df.select_dtypes(include='number').corr()

In [None]:
# Visual comparison of the engineered features between the age groups.
# The repeated helper calls are driven by two loops so that adding or removing
# a feature family is a one-item change; call order is unchanged.
class_key = 'age_group'

# One plot_feature_columns call per feature suffix.
for feature_suffix in ('std', 'mean', 'peaks', 'sal', 'snr'):
    plot_feature_columns(stroop_processed_df, feature_suffix)

# One box_plot_columns call per column group, split by class_key.
for column_group in (
    ['click_distance_mean', 'click_distance_std'],
    ['click_success_rate'],
    ['duration'],
):
    box_plot_columns(stroop_processed_df, class_key, column_group)

In the plots above, the following features show no clear difference between the two age groups:
- z_std
- mag_std
- z_mean
- mag_mean
- y_snr
- z_snr
- mag_snr
- z_sal
- mag_sal

In [None]:
# Move the current index into regular columns (drop=False keeps its values).
stroop_processed_df = stroop_processed_df.reset_index(drop=False)
# Hold out 10% of the rows for testing. The seed is fixed so that the split,
# and therefore every score collected in the results dict, is reproducible
# across "Restart & Run All".
train_df, test_df = train_test_split(
    stroop_processed_df,
    test_size=0.10,
    random_state=42,
)

In [None]:
# Feature columns passed to the k-means, PCA and decision-tree cells.
feature_keys = [
    'y_std',
    'x_mean',
    'y_peaks',
    'x_snr',
    'duration',
]

# ML Models

In [None]:
# Collects one entry per model (e.g. 'kmeans', 'decision_tree'); filled in by the cells below.
results = dict()

## Apply K-Means Clustering

In [None]:
# Build the model from the training split via the project helper, then assign
# each held-out row to a cluster.
kmeans_model = kmeans(train_df, feature_keys)
kmeans_predicitons = kmeans_model.predict(test_df[feature_keys])

# Put the true age group and the assigned cluster side by side, indexed like test_df.
predictions_df = test_df[['age_group']].copy()
predictions_df['cluster'] = kmeans_predicitons
predictions_df

In [None]:
# Score the clustering by purity: label every cluster with its most frequent
# age group and report the fraction of held-out rows that carry their
# cluster's label (1.0 = each cluster contains a single age group).
# The earlier score was derived from the standard error of the cluster ids per
# age group; SEM shrinks with the number of rows, so it drifted towards 1
# regardless of separation, and it stored a one-element Series, not a number.
cluster_vs_age_group = pd.crosstab(predictions_df['cluster'], predictions_df['age_group'])
results['kmeans'] = float(
    cluster_vs_age_group.max(axis=1).sum() / cluster_vs_age_group.to_numpy().sum()
)

## Principal Component Analysis

In [None]:
# PCA over the selected feature_keys, run on the full feature table rather than only train_df.
principal_components_df = principal_component_analysis(stroop_processed_df, feature_keys)

In [None]:
# Plot the components against the age_group column of the feature table.
# NOTE(review): [30,50] is presumably the list of age_group values to draw -- confirm in src.principal_component_analysis.
plot_principal_component_analysis(stroop_processed_df, principal_components_df, 'age_group', [30,50])

## Decision Tree

In [None]:
# Build the tree from the training split via the project helper.
model = decision_tree(train_df, feature_keys)

# Evaluate on the held-out split: selected feature columns vs. the true age_group.
test_features = test_df[feature_keys]
test_labels = test_df['age_group']
results['decision_tree'] = evaluate_model(model, test_features, test_labels)

## Run Time Series Algorithms

In [None]:
# stroop_df here is the median-filtered recording table, not the engineered feature table.
# The helper's return value is merged with dict.update, so it must be a mapping
# (or an iterable of key/value pairs); any key it shares with an existing entry
# in results overwrites that entry.
time_series_results = run_time_series_algorithms(stroop_df)
results.update(time_series_results)

## ML Results

In [None]:
# Display the evaluation results collected from all model cells.
results