In [None]:
import pandas as pd
import numpy as np

from src.stroop_analysis import plot_stroop_stacceleration
from src.accelerometer import plot_fourier_transformation, plot_acceleration, plot_feature_columns, accelerometer_feature_engineering
from src.plotting import box_plot_columns
from src.kmeans import kmeans
from src.principal_component_analysis import principal_component_analysis, plot_principal_component_analysis
from src.time_series import median_filter, run_time_series_algorithms
from src.ml_util import run_feature_algorithms
from src.velocity_peaks import velocity_peaks
from src.pandas_util import correlation_matrix, get_min_value_across_columns

In [None]:
# Lift the display limits so rendered DataFrames show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Load the recorded data and keep only rows whose age_group is not 0.
stroop_df = pd.read_csv('stroop_accelerations.csv')
stroop_df = stroop_df.loc[stroop_df['age_group'].ne(0)]

In [None]:
# Preview the first five rows of the loaded data.
stroop_df.head()

In [None]:
# Summary statistics of stroop_df (pandas defaults).
stroop_df.describe()

In [None]:
# Pick one example session per age group: the uuid of the first row in that group.
# The previous `.unique().any()` only returned an element because of the legacy
# object-array reduction in NumPy < 2.0; on NumPy >= 2.0 it returns a bool, which
# makes the uuid comparison below select no rows at all.
uuid_30 = stroop_df.loc[stroop_df['age_group'] == 30, 'uuid'].iloc[0]
uuid_50 = stroop_df.loc[stroop_df['age_group'] == 50, 'uuid'].iloc[0]

# Explicit copies: these slices are processed further in later cells, and a copy
# avoids SettingWithCopyWarning if a helper assigns columns on them.
single_session_30_df = stroop_df[stroop_df['uuid'] == uuid_30].copy()
single_session_50_df = stroop_df[stroop_df['uuid'] == uuid_50].copy()

In [None]:
# Plot the example session of age group 30 before any filtering has been applied to it.
plot_stroop_stacceleration(single_session_30_df, 'test session - raw')

In [None]:
# Run median_filter on both example sessions.
# The results overwrite the input names, so re-running this cell passes the
# already-filtered data through the filter again.
single_session_30_df = median_filter(single_session_30_df)
single_session_50_df = median_filter(single_session_50_df)

In [None]:
# Plot both example sessions after median_filter, one figure per age group.
plot_stroop_stacceleration(single_session_30_df, 'test session 30 - filtered')
plot_stroop_stacceleration(single_session_50_df, 'test session 50 - filtered')

In [None]:
# Plot all sessions of one subject from age group 30.
# The subject is taken from the first matching row. The previous `.unique().any()`
# returns a bool on NumPy >= 2.0, which selected no rows and made the title
# concatenation fail.
subject = stroop_df.loc[stroop_df['age_group'] == 30, 'subject'].iloc[0]
subject_df = stroop_df[stroop_df['subject'] == subject]
subject_df = median_filter(subject_df)
plot_stroop_stacceleration(subject_df, f'All sessions of {subject}')

In [None]:
# Plot all sessions of one subject from age group 50.
# The subject is taken from the first matching row. The previous `.unique().any()`
# returns a bool on NumPy >= 2.0, which selected no rows and made the title
# concatenation fail.
subject = stroop_df.loc[stroop_df['age_group'] == 50, 'subject'].iloc[0]
subject_df = stroop_df[stroop_df['subject'] == subject]
subject_df = median_filter(subject_df)
plot_stroop_stacceleration(subject_df, f'All sessions of {subject}')

In [None]:
# Fourier-transformation plot for the (already filtered) example session of age group 30.
plot_fourier_transformation(single_session_30_df, 'test session')

In [None]:
# Box plots for the example session of age group 30, using the helper's default arguments.
box_plot_columns(single_session_30_df)

In [None]:
# Acceleration plot for the example session of age group 30.
plot_acceleration(single_session_30_df)

In [None]:
# Run median_filter on the full data set.
# stroop_df is overwritten, so every later cell that reads it (feature
# engineering, click peaks, time-series algorithms) works on the filtered data.
# Re-running this cell filters the already-filtered frame a second time.
stroop_df = median_filter(stroop_df)

In [None]:
# Build the feature table from the filtered data and move its index into
# regular columns (needed for the merge on the grouping keys below).
stroop_processed_df = accelerometer_feature_engineering(stroop_df).reset_index()

In [None]:
# Attach the click statistics to the feature table.
# For each group the first recorded value of every click column is used.
group_by_keys = ['age_group', 'subject', 'device', 'hand', 'uuid']
click_stat_columns = ['click_distance_mean', 'click_distance_std', 'click_success_rate']
stroop_data_df = stroop_df.groupby(group_by_keys)[click_stat_columns].first()

# Default (inner) join on the grouping keys. Re-running this cell merges the
# click columns a second time, so run it once per kernel session.
stroop_processed_df = pd.merge(stroop_processed_df, stroop_data_df, on=group_by_keys)

In [None]:
# Preview the feature table after the click statistics were merged in.
stroop_processed_df.head()

In [None]:
# Correlation matrix of the feature table; used to decide which columns to drop in the next step.
correlation_matrix(stroop_processed_df)

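The standard error of the mean (SEM) is closely related to the standard deviation, so the SEM columns are dropped. The SNR columns and the per-axis peak columns (`x_peaks`, `y_peaks`, `z_peaks`) are dropped in the same step.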

In [None]:
# Columns removed from the feature table: all SEM columns, all SNR columns
# and the per-axis peak columns (mag_peaks is kept).
redundant_columns = [
    'x_sem', 'y_sem', 'z_sem', 'mag_sem',
    'x_snr', 'z_snr', 'y_snr', 'mag_snr',
    'x_peaks', 'y_peaks', 'z_peaks',
]
stroop_processed_df = stroop_processed_df.drop(columns=redundant_columns)

In [None]:
def extract_peaks_for_clicks(base_df, processed_df):
    """Add per-click peak features to a copy of ``processed_df``.

    ``velocity_peaks`` is applied to the ``x``, ``y``, ``z`` and ``mag`` columns
    of ``base_df`` for every (age_group, subject, device, hand, uuid,
    click_number) group. For each click number and axis the result is written
    to a column named ``click_<click_number>_<axis>_peaks`` in the rows whose
    ``uuid`` matches. If one uuid has several groups for the same click, the
    maximum is used.

    Args:
        base_df: Frame containing the grouping columns listed above and the
            ``x``, ``y``, ``z`` and ``mag`` columns.
        processed_df: Frame with a ``uuid`` column. It is not modified.

    Returns:
        A copy of ``processed_df`` with the click columns added. A cell for
        which ``base_df`` has no matching (uuid, click_number) group keeps its
        previous value, or NaN when the column is newly created.
    """
    axes = ['x', 'y', 'z', 'mag']
    group_keys = ['age_group', 'subject', 'device', 'hand', 'uuid', 'click_number']

    processed_df_copy = processed_df.copy()
    click_peaks_df = base_df.groupby(group_keys)[axes].agg(velocity_peaks)
    click_peaks_df = click_peaks_df.reset_index(drop=False)

    # One pass per click number instead of one boolean scan of both frames for
    # every (uuid, click_number) pair.
    session_ids = processed_df_copy['uuid']
    for click_number, click_rows in click_peaks_df.groupby('click_number'):
        # Index: uuid, columns: axes.
        peaks_by_uuid = click_rows.groupby('uuid')[axes].max()
        # Rows of the output whose session actually contains this click.
        has_click = session_ids.isin(peaks_by_uuid.index).to_numpy()
        for axis in axes:
            column_name = f'click_{click_number}_{axis}_peaks'
            if column_name not in processed_df_copy.columns:
                processed_df_copy[column_name] = np.nan
            if has_click.any():
                # Positional assignment (plain array) so a non-unique index on
                # processed_df cannot break alignment.
                processed_df_copy.loc[has_click, column_name] = (
                    session_ids[has_click].map(peaks_by_uuid[axis]).to_numpy()
                )
    return processed_df_copy

In [None]:
# Add the click_<n>_<axis>_peaks columns, computed from the filtered stroop_df, to the feature table.
stroop_processed_df = extract_peaks_for_clicks(stroop_df, stroop_processed_df)

In [None]:
# Column used to split every box plot into classes.
class_key = 'age_group'

# Feature-column plots for the 'std', 'mean' and 'sal' groups.
for feature_group in ('std', 'mean', 'sal'):
    plot_feature_columns(stroop_processed_df, feature_group)

# Remaining features: one box-plot figure per group of columns.
for session_columns in (
    ['mag_peaks'],
    ['click_distance_mean', 'click_distance_std'],
    ['click_success_rate'],
    ['duration'],
):
    box_plot_columns(stroop_processed_df, class_key, session_columns)

# Per-click peak columns for clicks 0-4, one figure per axis (order: x, z, y, mag).
for peak_axis in ('x', 'z', 'y', 'mag'):
    click_peak_columns = [f'click_{click_index}_{peak_axis}_peaks' for click_index in range(5)]
    box_plot_columns(stroop_processed_df, class_key, click_peak_columns)

The plots above show a clear difference between the two age groups for the following features:
- x_mean
- x_peaks
- y_peaks
- x_snr
- duration
- click_0_z_peaks

In [None]:
# Feature columns passed to the PCA and to run_feature_algorithms below.
# NOTE(review): the markdown list above this cell names x_peaks, y_peaks and x_snr,
# but those columns are dropped from stroop_processed_df earlier in the notebook,
# and y_std / mag_peaks used here are not in that list — confirm the intended selection.
feature_keys = ['y_std','x_mean','mag_peaks','duration']

# ML Models

In [None]:
# Collects the output of every algorithm run below.
results = dict()

## Principal Component Analysis

In [None]:
# Principal component analysis of the feature table, restricted to feature_keys.
principal_components_df = principal_component_analysis(stroop_processed_df, feature_keys)

In [None]:
# Plot the principal components against the 'age_group' column.
# reset_index(drop=False) passes a copy of the feature table with its current index added as a column.
# [30, 50] are presumably the age_group classes to display — confirm against the helper.
plot_principal_component_analysis(stroop_processed_df.reset_index(drop=False), principal_components_df, 'age_group', [30,50])

## Run Algorithms on extracted Features

In [None]:
# Run the feature-based algorithms on the selected feature columns and
# add their output to the shared results dict.
feature_results = run_feature_algorithms(stroop_processed_df, feature_keys)
results.update(feature_results)

## Run Time Series Algorithm

In [None]:
# Run the time-series algorithms on the filtered stroop_df (not the feature table)
# and add their output to the shared results dict.
# Keys that already exist in results are overwritten by dict.update.
time_series_results = run_time_series_algorithms(stroop_df)
results.update(time_series_results)

## ML Results

In [None]:
# Display the combined output of the feature-based and time-series runs.
results