In [1]:
import numpy as np
import pandas as pd
import os
import sys
import re
import wfdb
import scipy.stats as stats
from scipy.fft import fft, fftfreq

In [2]:
def extract_unique_values_from_folder(folder:str):
    """
    Function to extract just the unique headers of the dat and hea files (removing the file name) 
    """
    unique = set()
    for f in os.listdir(folder):
        unique.add(extract_basename(os.path.join(folder, f)))
        
    return list(unique)

In [3]:
def extract_basename(filename):
    """
    Function to extract just the unique headers of the dat and hea files (removing the file spec) 
    """
    return os.path.splitext(filename)[0]

In [4]:
def extract_info_from_name(filename):
    parsed = {}
    basename = os.path.basename(filename).split(".")[0]
    pattern = r"session(\d+)_participant(\d+)_gesture(\d+)_trial(\d+)"
    match = re.match(pattern, basename)
    parsed['session'] = match.group(1) 
    parsed['participant'] = match.group(2)
    parsed['gesture'] = match.group(3) 
    parsed['trial'] = match.group(4)
    parsed['filename'] = filename
    return parsed

In [5]:
base_path = "../../data/gesture-recognition-and-biometrics-electromyogram-grabmyo-1.0.2/"

In [118]:
session1 = os.path.abspath(base_path+"Session1")
session2 = os.path.abspath(base_path+"Session2")
session3 = os.path.abspath(base_path+"Session3")
unique_values = extract_unique_values_from_folder(session1)

parsed_data_1 = []
parsed_data_2 = []
parsed_data_3 = []

for participant_dir in os.listdir(session1):
    participant_path = os.path.join(session1, participant_dir)
    if os.path.isdir(participant_path):
        # Loop through files in participant directory
        for file_name in os.listdir(participant_path):
            if file_name.endswith(".hea"):  # Process only .hea files
                file_path = os.path.join(participant_path, file_name)
                if os.path.isfile(file_path):
                    parsed_data_1.append(extract_info_from_name(file_path))

for participant_dir in os.listdir(session2):
    participant_path = os.path.join(session2, participant_dir)
    if os.path.isdir(participant_path):
        # Loop through files in participant directory
        for file_name in os.listdir(participant_path):
            if file_name.endswith(".hea"):  # Process only .hea files
                file_path = os.path.join(participant_path, file_name)
                if os.path.isfile(file_path):
                    parsed_data_2.append(extract_info_from_name(file_path))

for participant_dir in os.listdir(session3):
    participant_path = os.path.join(session3, participant_dir)
    if os.path.isdir(participant_path):
        # Loop through files in participant directory
        for file_name in os.listdir(participant_path):
            if file_name.endswith(".hea"):  # Process only .hea files
                file_path = os.path.join(participant_path, file_name)
                if os.path.isfile(file_path):
                    parsed_data_3.append(extract_info_from_name(file_path))

#parsed_data
df1 = pd.DataFrame(parsed_data_1)
df2 = pd.DataFrame(parsed_data_2)
df3 = pd.DataFrame(parsed_data_3)

df = pd.concat([df1, df2, df3], axis=0)
# df = df1.iloc[:10, :]
df['filename'] = df['filename'].str.replace('.hea', '')
df

Unnamed: 0,session,participant,gesture,trial,filename
0,1,1,10,1,E:\DS5500-project\data\gesture-recognition-and...
1,1,1,10,2,E:\DS5500-project\data\gesture-recognition-and...
2,1,1,10,3,E:\DS5500-project\data\gesture-recognition-and...
3,1,1,10,4,E:\DS5500-project\data\gesture-recognition-and...
4,1,1,10,5,E:\DS5500-project\data\gesture-recognition-and...
...,...,...,...,...,...
5112,3,9,9,3,E:\DS5500-project\data\gesture-recognition-and...
5113,3,9,9,4,E:\DS5500-project\data\gesture-recognition-and...
5114,3,9,9,5,E:\DS5500-project\data\gesture-recognition-and...
5115,3,9,9,6,E:\DS5500-project\data\gesture-recognition-and...


In [119]:
# df['trial'].unique()

In [120]:
# df['participant'].unique()

In [121]:

# def integrated_EMG(signal):
#     return np.sum(np.abs(signal))

# def mean_absolute_value(signal):
#     return np.mean(np.abs(signal))

# def simple_square_integral(signal):
#     return np.sum(signal**2)

# def root_mean_square(signal):
#     return np.sqrt(np.mean(signal**2))

# def variance(signal):
#     return np.var(signal)

# def myopulse_percentage_rate(signal):
#     return len(np.where(np.diff(np.sign(signal)))[0]) / len(signal)

# def waveform_length(signal):
#     return np.sum(np.abs(np.diff(signal)))

# def difference_variance(signal):
#     return np.var(np.diff(signal))

# def difference_absolute_standard_deviation(signal):
#     return np.std(np.diff(np.abs(signal)))

# def willison_amplitude(signal, threshold=0.1):
#     return np.sum(np.abs(np.diff(signal)) > threshold)

In [122]:
t = np.random.randn(10,4)

th = np.mean(t, axis = 0) + 3 * np.std(t, axis = 0)

In [123]:
th

array([3.53320721, 3.82634451, 3.23082561, 1.6049687 ])

In [124]:
np.mean(t, axis = 0)

array([ 0.1260221 ,  0.11618935,  0.78446029, -0.4030366 ])

In [125]:
 np.sum(t**2, axis=0)

array([13.05760502, 15.42972331, 12.80344981,  6.10447978])

In [126]:
np.mean(t, axis = 0)

array([ 0.1260221 ,  0.11618935,  0.78446029, -0.4030366 ])

In [127]:
np.std(t, axis = 0)

array([1.13572837, 1.23671839, 0.8154551 , 0.6693351 ])

In [128]:
th

array([3.53320721, 3.82634451, 3.23082561, 1.6049687 ])

In [129]:
np.sum(np.abs(np.diff(t, axis = 0)) > th, axis = 0)

array([0, 0, 0, 0])

In [130]:
np.diff(t, axis = 0)

array([[ 2.99335433, -3.45811759,  0.26152882,  0.24160798],
       [-2.34207696,  2.35791158,  1.56628613, -0.24404068],
       [ 0.36437501, -3.22662996, -0.1890656 , -1.14076241],
       [ 0.14327592,  3.17231789,  0.93899233,  0.71616219],
       [-0.37651959,  0.05651657, -0.56747082,  0.25337814],
       [-2.31498092, -1.71324716, -0.18585936, -1.42222018],
       [ 2.53359462,  0.0922157 ,  0.44544493,  1.21581115],
       [-1.37045244,  0.90993165, -0.59303695,  1.33051117],
       [ 0.15225351, -0.74953327, -0.94581912, -1.37692946]])

In [134]:
cols = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11',
       'F12', 'F13', 'F14', 'F15', 'F16', 'W1', 'W2', 'W3', 'W4', 'W5', 'W6',
       'W7', 'W8', 'W9', 'W10', 'W11', 'W12']

In [135]:
def extract_features(df):
    """
    Extracting the following features:
    'MIN','MAX','MEAN','RMS','VAR','STD','POWER','PEAK','P2P','CREST FACTOR','SKEW','KURTOSIS',
            'MAX_f','SUM_f','MEAN_f','VAR_f','PEAK_f','SKEW_f','KURTOSIS_f'
    These will be an array of values for each of the 32 sensors
    """
    #time data
    features = {}
    #time domain features
    time_data = df.values
    #Three std seems excessive. This can be changed later
    th = np.mean(time_data, axis = 0) + 2 * np.std(time_data, axis = 0)
    features['iemg'] = np.sum(np.abs(time_data), axis=0)
    features['mav'] = np.mean(np.abs(time_data), axis=0)
    features['ssi'] = np.sum(time_data**2, axis=0)
    # features['myopulse'] = len(np.where(np.diff(np.sign(time_data)))[0]) / len(time_data)
    #ref = https://github.com/tanmaygadgil/DS5500-project/blob/main/code_snippets/python-sebastian/feature_extraction.py
    features['myopulse'] = np.sum(time_data >= th, axis = 0) / len(time_data)
    features['wflen'] = np.sum(np.abs(np.diff(time_data, axis = 0)), axis=0)
    features['diffvar'] = np.var(np.diff(time_data, axis = 0), axis = 0)
    features['dasd'] = np.std(np.diff(np.abs(time_data), axis = 0), axis = 0)
    #Ref #ref = https://github.com/tanmaygadgil/DS5500-project/blob/main/code_snippets/python-sebastian/feature_extraction.py
    features['willison'] = np.sum(np.abs(np.diff(time_data, axis = 0)) > th, axis = 0)
    
    features['mean'] = np.mean(time_data, axis = 0)
    features['min'] = np.min(time_data, axis = 0)
    features['max'] = np.max(time_data, axis = 0)
    features['rms'] = np.sqrt(np.mean(time_data**2, axis = 0))
    features['max'] = np.std(time_data, axis = 0)
    features['power'] = np.mean(time_data**2, axis = 0)
    features['peak'] = np.max(np.abs(time_data), axis = 0)
    features['p2p'] = np.ptp(time_data, axis = 0)
    features['crest_factor'] = np.max(np.abs(time_data), axis = 0)/np.sqrt(np.mean(time_data**2, axis = 0))
    features['skew'] = stats.skew(time_data, axis = 0)
    features['kurtosis'] = stats.kurtosis(time_data, axis = 0)
    features['form_factor'] =np.sqrt(np.mean(time_data**2, axis = 0)) / np.mean(time_data, axis = 0)
    features['pulse_indicator'] = np.max(np.abs(time_data), axis = 0)/np.mean(time_data, axis = 0) 
    
    #Convert to frequency domain
    freq_data = fft(time_data)
    S_f = np.abs(freq_data**2)/len(df)
    features['max_f'] = np.max(S_f, axis = 0)
    features['sum_f'] = np.sum(S_f, axis = 0)
    features['mean_f'] = np.mean(S_f, axis = 0)
    features['var_f'] = np.var(S_f, axis = 0)
    features['peak_f'] = np.max(np.abs(S_f), axis = 0)
    features['skew_f'] = stats.skew(S_f, axis = 0)
    features['kurtosis_f'] = stats.kurtosis(S_f, axis = 0)
    
    return features
    

In [136]:
from tqdm import tqdm

In [84]:
wave = wfdb.rdrecord(df['filename'].iloc[0])
wave_df = wave.to_dataframe()
wave_df = wave_df.drop(['U1','U2', 'U3','U4'], axis = 1)

In [85]:
wave_df

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,W3,W4,W5,W6,W7,W8,W9,W10,W11,W12
0 days 00:00:00,0.018842,0.003025,0.046460,0.104214,0.078568,0.139992,0.093088,0.033105,0.003268,-0.046261,...,0.013156,-0.041120,-0.085215,0.001288,0.099477,0.077323,0.006146,-0.032260,-0.080570,0.010847
0 days 00:00:00.000488281,0.007086,-0.007874,0.004497,0.054350,-0.043084,0.000402,0.053829,0.018269,0.013704,-0.032529,...,-0.021570,-0.043785,-0.120230,-0.029939,0.118181,0.076367,-0.011716,-0.036650,-0.117002,-0.010532
0 days 00:00:00.000976562,-0.018860,-0.037642,-0.040174,-0.003930,-0.175231,-0.169030,-0.003461,-0.003630,-0.009620,-0.050602,...,-0.032417,-0.050653,-0.124954,-0.034229,0.126799,0.077234,-0.018928,-0.040363,-0.123522,-0.021235
0 days 00:00:00.001464843,-0.013056,-0.036159,-0.049285,-0.019746,-0.233636,-0.278486,-0.028711,-0.003322,-0.020864,-0.047258,...,-0.024087,-0.049352,-0.109183,-0.034107,0.109105,0.067106,-0.017590,-0.037744,-0.108100,-0.025145
0 days 00:00:00.001953124,0.025837,0.006250,-0.017654,0.013535,-0.205318,-0.291812,-0.012940,0.019870,0.004719,-0.003508,...,-0.002855,-0.024827,-0.082030,-0.039545,0.057957,0.042862,-0.012414,-0.026509,-0.081828,-0.032091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0 days 00:00:04.997556035,-0.046034,-0.075884,-0.073877,-0.076665,-0.115671,-0.118026,-0.048888,-0.043507,-0.046553,-0.068948,...,-0.006198,-0.006975,0.028131,0.032132,-0.009935,0.007103,-0.004418,-0.006439,0.023369,0.011748
0 days 00:00:04.998044316,-0.016398,-0.042997,-0.033536,-0.052874,-0.050325,-0.047177,-0.006694,-0.010307,-0.014983,-0.036733,...,-0.006423,-0.009028,0.035122,0.045688,0.011053,0.014663,-0.002744,-0.001974,0.031485,0.029136
0 days 00:00:04.998532597,0.001108,-0.024600,-0.008593,-0.027242,-0.008290,-0.007932,0.023454,0.010110,-0.000144,-0.024079,...,-0.005212,-0.002708,0.042888,0.058652,0.023663,0.012143,-0.002599,0.002870,0.042420,0.039055
0 days 00:00:04.999020878,0.003021,-0.018727,-0.003628,-0.012806,-0.012712,-0.004160,0.032820,0.016069,-0.001028,-0.023260,...,0.001042,0.006685,0.045966,0.057129,0.017105,0.003124,-0.002948,0.004054,0.047149,0.035199


In [86]:
cols = wave_df.columns

In [87]:
cols

Index(['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11',
       'F12', 'F13', 'F14', 'F15', 'F16', 'W1', 'W2', 'W3', 'W4', 'W5', 'W6',
       'W7', 'W8', 'W9', 'W10', 'W11', 'W12'],
      dtype='object')

In [88]:
f = extract_features(wave_df)

In [91]:
f

{'iemg': array([365.45665831, 486.2520967 , 572.29991509, 549.59147577,
        596.68857045, 574.35190439, 410.86856226, 340.19879641,
        406.83270407, 550.19696813, 546.10203721, 610.10055259,
        614.92412744, 617.93006649, 441.81474245, 389.79408233,
        395.75247076, 316.91692102, 318.13335483, 420.81227781,
        427.66678497, 374.33304146, 270.31216611, 241.42230482,
        174.74936456, 219.91073327, 431.81029796, 331.64010952]),
 'mav': array([0.03568913, 0.04748556, 0.05588866, 0.05367104, 0.05827037,
        0.05608905, 0.04012388, 0.03322254, 0.03972976, 0.05373017,
        0.05333028, 0.05958013, 0.06005118, 0.06034473, 0.04314597,
        0.03806583, 0.0386477 , 0.03094892, 0.03106771, 0.04109495,
        0.04176433, 0.03655596, 0.02639767, 0.0235764 , 0.01706537,
        0.02147566, 0.04216897, 0.03238673]),
 'ssi': array([23.51964209, 42.42566996, 61.31071888, 56.9693793 , 62.63437547,
        58.45371472, 28.43166296, 19.79431186, 29.34707639, 54.767860

In [103]:
a = pd.DataFrame(f, index=cols)

In [104]:
a = a.unstack().to_frame().sort_index(level=1).T

In [105]:
a.columns

MultiIndex([(   'crest_factor', 'F1'),
            (           'dasd', 'F1'),
            (        'diffvar', 'F1'),
            (    'form_factor', 'F1'),
            (           'iemg', 'F1'),
            (       'kurtosis', 'F1'),
            (     'kurtosis_f', 'F1'),
            (            'mav', 'F1'),
            (            'max', 'F1'),
            (          'max_f', 'F1'),
            ...
            (          'power', 'W9'),
            ('pulse_indicator', 'W9'),
            (            'rms', 'W9'),
            (           'skew', 'W9'),
            (         'skew_f', 'W9'),
            (            'ssi', 'W9'),
            (          'sum_f', 'W9'),
            (          'var_f', 'W9'),
            (          'wflen', 'W9'),
            (       'willison', 'W9')],
           length=756)

In [101]:
a.columns = a.columns.map('_'.join)

In [102]:
a

Unnamed: 0,crest_factor_F1,dasd_F1,diffvar_F1,form_factor_F1,iemg_F1,kurtosis_F1,kurtosis_f_F1,mav_F1,max_F1,max_f_F1,...,power_W9,pulse_indicator_W9,rms_W9,skew_W9,skew_f_W9,ssi_W9,sum_f_W9,var_f_W9,wflen_W9,willison_W9
0,7.091099,0.021834,0.000606,-6502.754507,365.456658,2.574033,85.547962,0.035689,0.047925,0.0046,...,0.00051,-22508.900332,0.022588,-0.148877,5.529415,5.224663,0.109574,2.901826e-10,69.944068,15.0


In [107]:
temp

Unnamed: 0,crest_factor_F1,dasd_F1,diffvar_F1,form_factor_F1,iemg_F1,kurtosis_F1,kurtosis_f_F1,mav_F1,max_F1,max_f_F1,...,rms_W9,skew_W9,skew_f_W9,ssi_W9,sum_f_W9,var_f_W9,wflen_W9,willison_W9,gesture,participant
0,7.091099,0.021834,0.000606,-6502.754507,365.456658,2.574033,85.547962,0.035689,0.047925,0.0046,...,0.022588,-0.148877,5.529415,5.224663,0.109574,2.901826e-10,69.944068,15.0,10,1


In [None]:
df_list = []
for i in tqdm(range(len(df))):
    wave = wfdb.rdrecord(df['filename'].iloc[i])
    wave_df = wave.to_dataframe()
    wave_df = wave_df.drop(['U1','U2', 'U3','U4'], axis = 1)
    f = extract_features(wave_df)
    temp = pd.DataFrame(f, index=cols)
    temp = temp.unstack().to_frame().sort_index(level=1).T
    temp.columns = temp.columns.map('_'.join)
    temp['gesture'] = df['gesture'].iloc[i]
    temp['participant'] = df['participant'].iloc[i]

    # e = pd.DataFrame.from_dict(temp, orient='index').T
    # new_df = pd.concat([new_df, temp])
    df_list.append(temp)

new_df = pd.concat(df_list)

new_df

  4%|██▊                                                                           | 554/15351 [01:11<33:45,  7.30it/s]

In [59]:
new_df.reset_index()

Unnamed: 0,index,iemg,mav,ssi,myopulse,wflen,diffvar,dasd,willison,mean,...,pulse_indicator,max_f,sum_f,mean_f,var_f,peak_f,skew_f,kurtosis_f,gesture,participant
0,0,365.456658,0.035689,23.519642,0.02168,188.828533,0.000606,0.021834,27,-0.000007,...,-46111.677787,0.0046,0.91217,0.000089,0.0,0.0046,6.936966,85.547962,1,1
1,1,486.252097,0.047486,42.42567,0.022559,224.464037,0.000868,0.02654,21,0.000011,...,36791.06522,0.001667,0.330219,0.000032,0.0,0.001667,8.149979,118.918727,0,
2,2,572.299915,0.055889,61.310719,0.023242,244.651459,0.001057,0.029418,18.0,0.000016,...,36933.187764,0.000121,0.044339,0.000004,0.0,0.000121,4.885285,42.644289,,
3,3,549.591476,0.053671,56.969379,0.022461,258.37943,0.001225,0.031316,34.0,-0.000041,...,-18318.669347,0.000404,0.131098,0.000013,0.0,0.000404,5.660337,56.951175,,
4,4,596.68857,0.05827,62.634375,0.022754,317.52357,0.001762,0.037045,44.0,-0.000035,...,-14193.066218,0.000334,0.109574,0.000011,0.0,0.000334,5.529415,54.880235,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429823,23,254.723731,0.024875,11.717116,0.026855,98.032284,0.000196,0.012823,25.0,0.000046,...,8552.141143,0.000068,0.034487,0.000003,0.0,0.000068,3.591427,23.562969,,
429824,24,112.723308,0.011008,2.435599,0.021484,40.497393,0.000028,0.004867,4.0,0.000008,...,14955.134796,0.000112,0.071311,0.000007,0.0,0.000112,3.590272,20.576495,,
429825,25,143.339922,0.013998,3.677581,0.031348,50.187893,0.000042,0.006065,1.0,-0.000008,...,-17841.908487,0.000202,0.113225,0.000011,0.0,0.000202,3.59251,21.367926,,
429826,26,181.603388,0.017735,5.517318,0.033008,62.935376,0.000063,0.007376,0.0,-0.000021,...,-5549.261444,0.000044,0.034422,0.000003,0.0,0.000044,2.889464,12.371417,,


In [370]:
# new_df1 = pd.DataFrame()
# for i in range(10):
#     wave = wfdb.rdrecord(df['filename'].iloc[i])
#     wave_df = wave.to_dataframe()
#     wave_df = wave_df.drop(['U1','U2', 'U3','U4'], axis = 1)
#     f = extract_features(wave_df)
#     f['gesture'] = df['gesture'].iloc[i]
#     f['participant'] = df['participant'].iloc[i]

#     e = pd.DataFrame.from_dict(f, orient='index').T
#     new_df1 = new_df1.append(e, ignore_index = False)

# new_df1.reset_index()

In [369]:
new_df.reset_index().to_csv(base_path+"intermediate.csv", index = False)

In [371]:
# i = 53
# wave = wfdb.rdrecord(df['filename'].iloc[i])
# wave_df = wave.to_dataframe()
# wave_df = wave_df.drop(['U1','U2', 'U3','U4'], axis = 1)
# f = extract_features(wave_df)
# f['gesture'] = df['gesture'].iloc[i]
# f['participant'] = df['participant'].iloc[i]

# g = pd.DataFrame.from_dict(f, orient='index').T

# # new_df = pd.DataFrame(columns = ['iemg', 'mav', 'ssi', 'myopulse', 'wflen', 'diffvar', 'dasd',
# #        'willison', 'mean', 'min', 'max', 'rms', 'power', 'peak', 'p2p',
# #        'crest_factor', 'skew', 'kurtosis', 'form_factor', 'pulse_indicator',
# #        'max_f', 'sum_f', 'mean_f', 'var_f', 'peak_f', 'skew_f', 'kurtosis_f',
# #        'gesture', 'participant'])
# # new_df.append(e, ignore_index=True)
# # new_df
# e = e.append(g)
# e