In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics

from src.accelerometer import plot_fourier_transformation, plot_acceleration, plot_feature_columns, accelerometer_feature_engineering
from src.kmeans import kmeans
from src.principal_component_analysis import principal_component_analysis, plot_principal_component_analysis
from src.decision_tree import decision_tree
from src.device_handling import split_by_device
from src.plotting import box_plot_columns
from src.time_series import align_sequences_to_same_length, median_filter, run_time_series_algorithms
from src.ml_util import evaluate_model

In [None]:
# Load the recorded accelerations and discard every row whose age_group is 0.
drift_df = (
    pd.read_csv('drift_accelerations.csv')
    .loc[lambda frame: frame['age_group'] != 0]
)

In [None]:
# Preview the first five rows (head() defaults to n=5).
drift_df.head()

In [None]:
# Summary statistics (pandas describe) of the loaded frame after the age_group filter.
drift_df.describe()

In [None]:
# Use the second distinct session id (position 1 in order of appearance) as the example session.
example_uuid = drift_df['uuid'].unique()[1]
single_session_df = drift_df.loc[drift_df['uuid'] == example_uuid]

In [None]:
# Split the example session by recording device; the result is iterated one
# device frame at a time elsewhere in this notebook (see drift_features).
single_sessions_devices_df = split_by_device(single_session_df)

In [None]:
# Plot the unfiltered acceleration of the example session.
# NOTE(review): subplots=False presumably draws all devices on shared axes — confirm in src.accelerometer.
plot_acceleration(single_sessions_devices_df, subplots=False)

In [None]:
# Apply the median filter to the example session and plot it again for comparison.
# The filtered frame gets its own name so that re-running this cell never
# filters an already-filtered frame (the original rebound single_session_df
# to its own filtered version).
filtered_session_df = median_filter(single_session_df)
single_sessions_devices_df = split_by_device(filtered_session_df)
plot_acceleration(single_sessions_devices_df, subplots=False)

In [None]:
# Fourier view of the example session. In a top-to-bottom run the device
# frames passed here are the median-filtered ones from the previous cell.
# NOTE(review): the second argument is presumably a plot title/label — confirm.
plot_fourier_transformation(single_sessions_devices_df, 'test session')

In [None]:
# Box plots for the example session's device frames, using the helper's default arguments.
box_plot_columns(single_sessions_devices_df)

In [None]:
def _first_value(series):
    """Return the first non-null entry of ``series``, or ``None`` if there is none."""
    non_null = series.dropna()
    return non_null.iloc[0] if len(non_null) else None


def drift_features(df):
    """Build one feature row per recording session (``uuid``).

    Each row holds the session metadata (``age_group``, ``subject``, ``uuid``,
    ``file``, ``duration``) followed by the engineered accelerometer features of
    every device in the session. Feature columns of the n-th device returned by
    ``split_by_device`` carry the suffix ``_n`` (``_0``, ``_1``, ...).

    Args:
        df: Long-format frame with at least the columns ``uuid``, ``age_group``,
            ``subject``, ``file`` and ``duration``, plus whatever
            ``split_by_device`` and ``accelerometer_feature_engineering`` need.

    Returns:
        A ``pd.DataFrame`` with one row per session, in order of first
        appearance of the session id. Rows without a ``uuid`` are skipped.
    """
    entries = []
    # One groupby pass instead of re-filtering the whole frame for every uuid;
    # sort=False keeps sessions in order of first appearance.
    for uuid, current_df in df.groupby('uuid', sort=False):
        entry = {
            'age_group': current_df['age_group'].unique().max(),
            # ndarray.any() is a truthiness reduction, not a value selector: it
            # yields a bool for numeric data (and for object arrays on
            # NumPy >= 2), which would corrupt these identifier columns.
            # Take the actual first value instead.
            'subject': _first_value(current_df['subject']),
            'uuid': uuid,
            'file': _first_value(current_df['file']),
            'duration': current_df['duration'].max(),
        }
        for device_count, device_df in enumerate(split_by_device(current_df)):
            device_processed_df = accelerometer_feature_engineering(device_df)
            device_processed_df = device_processed_df.drop(
                columns=['x_sem', 'y_sem', 'z_sem', 'mag_sem', 'duration']
            )
            # Collapse each feature column to a single value per device and
            # tag it with the device's position.
            for column in device_processed_df.columns:
                entry[f'{column}_{device_count}'] = device_processed_df[column].max()
        entries.append(entry)
    return pd.DataFrame(entries)

In [None]:
def calc_device_diffs(df, columns):
    """Replace each ``<name>_0`` / ``<name>_1`` column pair by ``<name>_diff``.

    For every distinct base name found in ``columns`` the difference
    ``df[<name>_0] - df[<name>_1]`` is stored as ``<name>_diff`` and the two
    source columns are removed. The input frame is not modified.

    Args:
        df: Frame containing both device columns for every base name.
        columns: Iterable of column names; each may carry a ``_0`` or ``_1``
            device suffix (a name without suffix is used as the base name as is).

    Returns:
        A copy of ``df`` with the device pairs replaced by their differences;
        the new ``_diff`` columns are appended in processing order.

    Raises:
        KeyError: If ``<name>_0`` or ``<name>_1`` is missing from ``df``.
    """
    device_suffixes = ('_0', '_1')
    new_suffix = '_diff'
    local_df = df.copy()
    handled_bases = set()
    for column in columns:
        # Strip exactly one trailing device suffix. Chaining two removesuffix
        # calls would also eat a legitimate '_1' in names such as 'y_1_0'.
        base_name = column
        for device_suffix in device_suffixes:
            if column.endswith(device_suffix):
                base_name = column[:-len(device_suffix)]
                break
        if base_name in handled_bases:
            continue
        handled_bases.add(base_name)
        column_device_1 = base_name + '_0'
        column_device_2 = base_name + '_1'
        local_df[base_name + new_suffix] = local_df[column_device_1] - local_df[column_device_2]
        local_df = local_df.drop(columns=[column_device_1, column_device_2])
    return local_df

In [None]:
# Median-filter the full data set, then derive one row of per-device features per session.
# NOTE(review): this rebinds drift_df, so re-running the cell without reloading
# the data filters an already-filtered frame.
drift_df = median_filter(drift_df)
device_features_df = drift_features(drift_df)
# Pick the per-device feature columns by their device suffix instead of by
# position (previously columns[5:]), so the selection does not silently shift
# if a metadata column is added or removed in drift_features.
device_feature_columns = [
    column for column in device_features_df.columns
    if column.endswith(('_0', '_1'))
]
feature_device_based_df = calc_device_diffs(device_features_df, device_feature_columns)
feature_device_based_df.head()

In [None]:
# Lift the column display limit so the full correlation matrix is visible.
# This is a global pandas option and stays in effect for all later cells.
pd.options.display.max_columns = None
feature_device_based_df.corr(numeric_only=True)

In [None]:
# One box_plot_columns call per statistic on the device-difference features,
# each passing 'age_group' and the x/y/z/mag columns of that statistic.
# A loop replaces five copy-pasted calls; the call order is unchanged.
for statistic in ('std', 'mean', 'peaks', 'snr', 'sal'):
    diff_columns = [f'{axis}_{statistic}_diff' for axis in ('x', 'y', 'z', 'mag')]
    box_plot_columns(feature_device_based_df, 'age_group', diff_columns)

In [None]:
# Features computed on the whole frame (not split by device). The *_sem
# columns are dropped and the index produced by the feature step is turned
# back into regular columns.
sem_columns = ['x_sem', 'y_sem', 'z_sem', 'mag_sem']
feature_df = (
    accelerometer_feature_engineering(drift_df)
    .drop(columns=sem_columns)
    .reset_index(drop=False)
)

In [None]:
# One plot_feature_columns call per statistic; a loop replaces five copy-pasted calls
# and keeps the original call order.
for statistic in ('std', 'mean', 'peaks', 'sal', 'snr'):
    plot_feature_columns(feature_df, statistic)

In [None]:
# Join the whole-frame features with the device-difference features
# (pandas' default inner join on the shared identifying columns).
merge_keys = ['age_group', 'subject', 'uuid']
merged_df = pd.merge(feature_df, feature_device_based_df, on=merge_keys)

# ML Models

In [None]:
# Collects the evaluation result of every model, keyed by model name
# ('kmeans', 'decision_tree', plus the keys returned by the time-series run).
results = {}

In [None]:
# Feature columns fed to k-means, PCA and the decision tree.
feature_keys = ['z_std','y_std','z_std_diff']
# NOTE(review): class_key is not referenced by any cell visible in this notebook;
# the label column is spelled out as 'age_group' where it is needed.
class_key = ['age_group']

In [None]:
# Hold out 10 % of the sessions for testing. The seed is fixed so the split,
# and every score derived from it, is reproducible across kernel restarts.
train_df, test_df = train_test_split(merged_df, test_size=0.10, random_state=42)

## Apply K-Means Clustering

In [None]:
# Fit k-means on the training features and assign every test session to a cluster.
kmeans_model = kmeans(train_df, feature_keys)
kmeans_predictions = kmeans_model.predict(test_df[feature_keys])
# Pair each test session's true age group with its predicted cluster; the
# frame keeps test_df's index.
predictions_df = pd.DataFrame({
    'age_group': test_df['age_group'],
    'cluster': kmeans_predictions,
})
predictions_df

In [None]:
# Mean predicted cluster id per age group, summed over the groups and halved.
# NOTE(review): the stored value is a one-element Series (index 'cluster'),
# not a scalar, and the hard-coded 2 presumably stands for the number of age
# groups. This figure does not measure agreement between clusters and labels —
# confirm the intended metric.
cluster_mean_by_age_group = predictions_df.groupby('age_group')[['cluster']].mean()
results['kmeans'] = cluster_mean_by_age_group.sum() / 2

## Principal Component Analysis

In [None]:
# PCA over the selected feature columns of all sessions (train and test together).
principal_components_df = principal_component_analysis(merged_df, feature_keys)

In [None]:
# Plot the principal components against the 'age_group' column.
# NOTE(review): [30,50] are presumably the age_group values to highlight — confirm in src.principal_component_analysis.
plot_principal_component_analysis(merged_df, principal_components_df, 'age_group', [30,50])

## Decision Tree

In [None]:
# Train the decision tree on the training split and score it on the held-out sessions.
model = decision_tree(train_df, feature_keys)
test_features = test_df[feature_keys]
test_labels = test_df['age_group']
results['decision_tree'] = evaluate_model(model, test_features, test_labels)

## Time Series Pre-Processing

In [None]:
def extract_sequences_by_device(session_df):
    """Collect the x/y/z/mag samples of one session, separately for each device.

    Args:
        session_df: Frame with a ``device`` column and the signal columns
            ``x``, ``y``, ``z`` and ``mag``.

    Returns:
        A tuple ``(device_sequences, device_sequences_length)``: the first dict
        maps each device id (in order of first appearance) to a 2-D array with
        one row per sample and the columns x, y, z, mag; the second maps the
        same ids to the number of rows in that array.
    """
    signal_columns = ['x', 'y', 'z', 'mag']
    device_ids = session_df['device'].unique()
    device_sequences = {
        device_id: session_df.loc[session_df['device'] == device_id, signal_columns].values
        for device_id in device_ids
    }
    device_sequences_length = {
        device_id: len(samples) for device_id, samples in device_sequences.items()
    }
    return device_sequences, device_sequences_length


In [None]:
def build_session_sequence(device_sequences, device_sequences_length):
    """Merge the per-device sequences of one session into a single sequence.

    All devices are truncated to the length of the shortest one. Step ``i`` of
    the result is the concatenation of step ``i`` of every device, in the
    iteration order of ``device_sequences``.

    Args:
        device_sequences: Mapping of device id to an indexable sequence of samples.
        device_sequences_length: Mapping of device id to the sample count.

    Returns:
        A list with one entry per time step. With a single device the entries
        are that device's original rows; with several devices each entry is a
        flat list holding the values of all devices.
    """
    shortest = min(device_sequences_length.values())
    merged_steps = []
    for position, samples in enumerate(device_sequences.values()):
        if position == 0:
            # The first device seeds the sequence with its own rows.
            merged_steps.extend(samples[step] for step in range(shortest))
            continue
        # Every further device widens each existing step with its values.
        for step in range(shortest):
            merged_steps[step] = list(np.append(merged_steps[step], samples[step]))
    return merged_steps

In [None]:
def extract_sequences_and_labels(df, length_quantile=0.65):
    """Turn the long-format frame into equally long session sequences plus labels.

    For every session (``uuid``) the per-device signals are merged into one
    sequence via ``extract_sequences_by_device`` and ``build_session_sequence``.
    All sequences are then brought to a common length, chosen as the
    ``length_quantile`` quantile of the individual sequence lengths.

    Args:
        df: Frame with the columns ``uuid``, ``age_group``, ``device``, ``x``,
            ``y``, ``z`` and ``mag``.
        length_quantile: Quantile of the session lengths used as the common
            target length. Defaults to 0.65, the value that was previously
            hard-coded.

    Returns:
        A tuple ``(sequences, labels)``: ``sequences`` is the ``np.stack`` of
        the aligned session sequences and ``labels`` is a list with the
        maximum ``age_group`` of each session, in the same order.

    Raises:
        ValueError: If ``df`` contains no session at all.
    """
    sequences = []
    labels = []
    # Single groupby pass instead of re-filtering the whole frame per uuid;
    # sort=False keeps sessions in order of first appearance.
    for _, current_df in df.groupby('uuid', sort=False):
        labels.append(current_df['age_group'].max())
        device_sequences, device_sequences_length = extract_sequences_by_device(current_df)
        sequences.append(build_session_sequence(device_sequences, device_sequences_length))
    if not sequences:
        # Without this guard the quantile of an empty series is NaN and the
        # failure only surfaces later with an unrelated message.
        raise ValueError('extract_sequences_and_labels: df contains no sessions')
    lengths = pd.Series([len(sequence) for sequence in sequences])
    target_length = np.int_(lengths.quantile(length_quantile))
    final_sequences = align_sequences_to_same_length(sequences, target_length)
    return np.stack(final_sequences), labels

## Run Time Series Algorithms

In [None]:
# Run the time-series models on the filtered long-format data; the helper receives
# extract_sequences_and_labels as the function that builds its sequences and labels.
time_series_results = run_time_series_algorithms(drift_df, compile_sequences_function=extract_sequences_and_labels)
# Add the returned entries to the shared results dict (existing keys would be overwritten).
results.update(time_series_results)

## ML Results

In [None]:
# Display the collected results of all models.
results