# <strong>3b: Apex Only Notebook</strong>


In [1]:
import os
import glob
import pandas as pd
from scipy.signal import find_peaks
import numpy as np
import warnings
warnings.simplefilter('ignore', category=FutureWarning)

# Function to align data

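This Python function, `align_data`, aligns two dataframes: `mtdf`, which contains the motion-tracking data, and `annotdf`, which contains the ELAN annotations. Each annotation is given a sequential `gesture_id`, and every motion-tracking row whose `time_ms` lies between an annotation's `start_time` and `end_time` (both inclusive) is labeled with that annotation's `phase` and `gesture_id`.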


In [None]:
# Function to align data
def align_data(mtdf, annotdf):
    """Label motion-tracking rows with the annotation interval they fall in.

    Both dataframes are modified in place: ``annotdf`` gains a 1-based
    ``gesture_id`` column (numbered in row order), and ``mtdf`` gains
    ``phase`` and ``gesture_id`` columns that stay ``None`` for rows outside
    every annotation interval.

    Args:
        mtdf: Motion-tracking dataframe with a ``time_ms`` column.
        annotdf: Annotation dataframe with ``start_time``, ``end_time`` and
            ``phase`` columns.

    Returns:
        The same ``mtdf`` object, with the two label columns filled in.
    """
    label_columns = ('phase', 'gesture_id')

    # Number the annotations 1..N so each one can be identified later.
    annotdf['gesture_id'] = range(1, len(annotdf) + 1)

    # Every motion-tracking row starts out unlabeled.
    for label_column in label_columns:
        mtdf[label_column] = None

    for _, annotation in annotdf.iterrows():
        # Both interval ends are inclusive; when annotations overlap, the
        # later annotation overwrites the earlier one's labels.
        in_interval = mtdf['time_ms'].between(
            annotation['start_time'], annotation['end_time']
        )
        for label_column in label_columns:
            mtdf.loc[in_interval, label_column] = annotation[label_column]

    return mtdf

In this code cell, several directory paths are defined, and files within those directories are gathered:

### Directory Definitions
- `phase_dir`: The directory for the ELAN Phases data.
- `processed_dir`: The directory for processed manual gesture data.
- `output_dir`: The directory for output or annotated data.

### Output File Path
An `output_file` variable is created by joining `output_dir` with a modified version of `processed_file`. It replaces '_processed_data.csv' with '_final_annotations.csv' in the filename.

### Gathering Files
The `glob` library is used to get lists of files within the `phase_dir` and `processed_dir` directories. These lists are stored in the variables `phase_files` and `processed_files`, respectively.

### List of Gathered Files
Here are the lists of files found in the respective directories:
- `phase_files`: Contains a list of files from the `phase_dir`.
- `processed_files`: Contains a list of files from the `processed_dir`.


In [None]:
# Define directories
# Input: ELAN phase annotation CSVs.
phase_dir = '/n/kfranich_speech_l3/Lab/6000_EMA/processed_articulatory_data/video_processing/elan_phases/SN6014_ARTGEST_ENG'
# Input: processed motion-tracking CSVs.
processed_dir = '/n/kfranich_speech_l3/Lab/6000_EMA/processed_articulatory_data/video_processing/speed_and_upsample/SN6014_ARTGEST_ENG'
# Output: destination directory for the apex-annotated CSVs.
output_dir = '/n/kfranich_speech_l3/Lab/6000_EMA/processed_articulatory_data/video_processing/apex_annotated/SN6014_ARTGEST_ENG'

# Output Structuring
# NOTE(review): `processed_file` is not assigned anywhere earlier in this
# file, so on a fresh top-to-bottom run this line raises NameError. It only
# succeeds when an earlier run of the notebook left `processed_file` bound,
# in which case the path is derived from that leftover value.
output_file = os.path.join(output_dir, os.path.basename(processed_file).replace('_processed_data.csv', '_final_annotations.csv'))

# Get list of files in directories
# glob does not guarantee any particular ordering of the returned paths.
phase_files = glob.glob(os.path.join(phase_dir, '*.csv'))
processed_files = glob.glob(os.path.join(processed_dir, '*.csv'))

# List gathered files
# When run as a notebook cell, only the value of the last bare expression is
# displayed, so `phase_files` on its own line produces no output.
phase_files
processed_files

### Matching Phase and Processed Files

- **Matching Logic**: A list called `matched_files` is initialized to store pairs of matched files.
  
- **Iteration**: The code iterates through each `phase_file` in the `phase_files` list.

- **Filename Extraction**: For each `phase_file`, the code extracts the filename without the '_extended_annotations.csv' part using `os.path.basename(phase_file).split('_extended_annotations.csv')[0]`.

- **Matching**: It then iterates through each `processed_file` in the `processed_files` list and checks if the extracted `basename` is present in the `processed_file`.

- **Appending Matched Pairs**: If a match is found, a tuple containing the `phase_file` and the corresponding `processed_file` is appended to the `matched_files` list. The loop breaks after finding the first match for efficiency.

This code effectively pairs up files from the `phase_dir` and `processed_dir` directories based on their filenames, allowing further processing or analysis of matched pairs.


In [None]:
# Match phase and processed files based on filenames
# Each phase file is paired with the first processed file whose *file name*
# contains the phase file's stem (its name with '_extended_annotations.csv'
# stripped). Phase files without a matching processed file are left out of
# `matched_files`.
matched_files = []
for phase_file in phase_files:
    basename = os.path.basename(phase_file).split('_extended_annotations.csv')[0]
    for processed_file in processed_files:
        # Compare against the file name only. Testing the full path would
        # also match when the stem merely appears in a directory name (for
        # example the session folder), pairing the phase file with whatever
        # processed file happens to be listed first.
        if basename in os.path.basename(processed_file):
            matched_files.append((phase_file, processed_file))
            break

### Processing Matched File Pairs

- **File Reading**: For each pair of matched files (`phase_file` and `processed_file`), the code reads the data from these CSV files into Pandas DataFrames using `pd.read_csv()`.

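- **Column Renaming**: The code overwrites the column names of the `df_phase` DataFrame with 'start_time', 'end_time', and 'phase'. The assignment is unconditional and positional, so each phase file must contain exactly these three columns in this order.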

- **Main Processing Logic**: The core processing logic begins here. It uses a function named `align_data()` to align data in the `df_processed` DataFrame with the phase information in the `df_phase` DataFrame.

- **Data Filtering**: A new DataFrame named `filtered_df` is created by dropping rows with missing values in the 'right_wrist_speed' and 'phase' columns.

- **Gesture and Apex Detection**: The code iterates over the unique 'gesture_id' values in the `filtered_df` DataFrame. For each gesture, it extracts the rows whose phase is 'S' or 'Stroke'; gestures without any stroke rows are skipped. It computes a dynamic peak threshold as the mean plus one standard deviation of the wrist speed during the stroke and uses it as the minimum height when detecting peaks in the stroke's wrist speed. For each detected peak, the sample with the lowest speed between that peak and the end of the stroke is marked as 'AX' (apex). If no peaks are found, the sample with the lowest speed in the whole stroke is marked as 'AX'. The resulting stroke rows are appended to the `more_refined_apexes_df` DataFrame.

- **Output Saving**: The `more_refined_apexes_df` DataFrame is saved as a CSV file at the specified `output_file` path using `to_csv()`.

- **Print Status Message**: A message is printed indicating the completion of processing for the current `output_file`.

This code block effectively processes each matched file pair, aligns data, filters it, and detects apexes in the wrist speed data, saving the results in the specified output file. It appears to be part of a data preprocessing pipeline.

In [None]:
# Process each matched file pair
def _mark_stroke_apexes(stroke_data):
    """Return a copy of one gesture's stroke rows with apex samples labeled.

    Peaks in ``right_wrist_speed`` that reach at least mean + one standard
    deviation of the stroke's speed are located with ``find_peaks``. For each
    such peak, the lowest-speed sample from the peak to the end of the stroke
    is labeled 'AX' in the ``apex`` column. When no peak qualifies, the
    lowest-speed sample of the whole stroke is labeled instead.

    Args:
        stroke_data: Non-empty dataframe of stroke rows for a single gesture,
            containing a ``right_wrist_speed`` column.

    Returns:
        A new dataframe (the input is left untouched) with an ``apex`` column.
    """
    # Work on a copy so the labels are not written through a filtered slice
    # of the parent dataframe (chained assignment).
    labeled = stroke_data.copy()
    speed = labeled['right_wrist_speed']
    speeds = speed.values

    dynamic_large_peak_threshold = speed.mean() + speed.std()
    peaks, _ = find_peaks(speeds, height=dynamic_large_peak_threshold)

    if len(peaks) > 0:
        # find_peaks never reports the first or last sample, so every slice
        # below holds the peak plus at least one later sample.
        apex_positions = [peak + np.argmin(speeds[peak:]) for peak in peaks]
    else:
        apex_positions = [np.argmin(speeds)]

    for position in apex_positions:
        labeled.loc[labeled.index[position], 'apex'] = 'AX'
    return labeled


# Columns written first in every output file; any remaining columns follow in
# the order they appear in the stroke data.
leading_columns = ['time_ms', 'right_wrist_speed', 'gesture_id', 'apex']

# Motion-tracking columns that are not needed in the apex output.
unused_columns = ['right_wrist_delta_x', 'right_wrist_delta_y', 'right_index_x',
                  'right_index_y', 'right_index_speed', 'right_index_delta_x',
                  'right_index_delta_y']

for phase_file, processed_file in matched_files:
    df_phase = pd.read_csv(phase_file)
    df_processed = pd.read_csv(processed_file)

    # Positional rename: the phase file must have exactly three columns.
    df_phase.columns = ['start_time', 'end_time', 'phase']

    # Main processing logic
    df_processed = align_data(df_processed, df_phase)
    df_processed = df_processed.drop(columns=unused_columns)
    filtered_df = df_processed.dropna(subset=['right_wrist_speed', 'phase'])

    # Collect the labeled strokes and concatenate once at the end instead of
    # growing a dataframe row block by row block.
    labeled_strokes = []
    for gesture_id in filtered_df['gesture_id'].dropna().unique():
        gesture_data = filtered_df[filtered_df['gesture_id'] == gesture_id]
        stroke_data = gesture_data[gesture_data['phase'].isin(['S', 'Stroke'])]
        if stroke_data.empty:
            continue
        labeled_strokes.append(_mark_stroke_apexes(stroke_data))

    if labeled_strokes:
        more_refined_apexes_df = pd.concat(labeled_strokes)
        trailing_columns = [column for column in more_refined_apexes_df.columns
                            if column not in leading_columns]
        more_refined_apexes_df = more_refined_apexes_df.reindex(
            columns=leading_columns + trailing_columns)
    else:
        # No strokes in this file: still write a header-only CSV.
        more_refined_apexes_df = pd.DataFrame(columns=leading_columns)

    # Save output
    # The path is derived from the file currently being processed so that each
    # pair gets its own output instead of all pairs overwriting a single file.
    output_file = os.path.join(
        output_dir,
        os.path.basename(processed_file).replace('_processed_data.csv', '_final_annotations.csv'))
    os.makedirs(output_dir, exist_ok=True)
    more_refined_apexes_df.to_csv(output_file, index=False)

    print(f'Done Processing {output_file}')