In [26]:
%matplotlib inline

import numpy as np
import pandas as pd # Data frames
import matplotlib.pyplot as plt # Visuals
import seaborn as sns 
sns.set()
import csv
import re
from sklearn.model_selection import train_test_split # Create training and test sets
from sklearn.tree import DecisionTreeClassifier # Decision Trees
from sklearn import tree 
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn import svm #SVM
from sklearn.metrics import roc_curve # ROC Curves
from sklearn.model_selection import cross_val_score  #cross validation 
from sklearn.metrics import precision_score, recall_score, f1_score
import pywt
from sklearn.metrics import mean_squared_error
import plotly.offline as py
from scipy import signal
from scipy.signal import butter, filtfilt
py.init_notebook_mode(connected=True)
plt.style.use('ggplot')

In [2]:
import os

In [3]:
# Working directory of the running kernel. The data directory used below is
# resolved relative to this value, so the notebook has to be started from the
# folder shown in the output for the relative path to be valid.
root = os.getcwd()
root

'/home/bench-user/data/mt/EMG/EMG-Signal-Classification/Src/Preprocessing'

In [15]:
# Directory holding the per-recording CSV files, two levels above the working
# directory and then under Data/csv/AH.
file_dir = root + "/../../Data/csv/AH"
# Raw directory listing; os.listdir returns entries in arbitrary order and
# includes non-CSV entries, hence the explicit sort that follows.
file_list = os.listdir(file_dir)

def custom_sort_key(filename):
    """Return a sort key that orders the entries of the AH data directory.

    The key is a 2-tuple whose first element is a bucket number:

    * ``(0, filename)`` for the ``.ipynb_checkpoints`` entry,
    * ``(1, N)`` for ``ah_<N>...`` entries, where ``N`` is the integer between
      the first underscore and the next ``_`` or ``.`` (so ``ah_10.csv`` sorts
      after ``ah_9.csv`` instead of after ``ah_1.csv``),
    * ``(2, filename)`` for everything else, which therefore sorts last,
      alphabetically.

    Parameters
    ----------
    filename : str
        A single directory entry name (no path).

    Returns
    -------
    tuple
        Key suitable for ``sorted(..., key=custom_sort_key)``.
    """
    if filename == '.ipynb_checkpoints':
        return (0, filename)

    # Split the filename into parts based on underscores.
    parts = filename.split('_')
    if len(parts) > 1 and parts[0] == 'ah':
        suffix = parts[1].split('.')[0]  # Remove the file extension if present
        try:
            return (1, int(suffix))
        except ValueError:
            # 'ah_<something non-numeric>' (e.g. 'ah_notes.csv'): previously
            # this raised and aborted the whole sort; treat it as "other".
            pass

    # Sort other files at the end
    return (2, filename)

# Order the directory entries with custom_sort_key so that the ah_<N>.csv
# files come out in numeric order of N, then display the resulting list.
sorted_file_list = sorted(file_list, key=custom_sort_key)
sorted_file_list


['.ipynb_checkpoints',
 'ah_1.csv',
 'ah_2.csv',
 'ah_3.csv',
 'ah_4.csv',
 'ah_5.csv',
 'ah_6.csv']

In [16]:
def add_iteration_column(folder_path):
    """Load every CSV file in ``folder_path`` and tag each with a ``class`` label.

    The files are discovered inside ``folder_path`` itself (no notebook-level
    variable is consulted), non-CSV entries such as ``.ipynb_checkpoints`` are
    ignored, and the CSV files are visited in natural filename order, i.e.
    digit runs compare as integers (``ah_2.csv`` before ``ah_10.csv``).

    The label written to the ``class`` column is the file's 1-based position in
    that order, so ``ah_1.csv`` .. ``ah_6.csv`` receive 1 .. 6 whether or not
    other, non-CSV entries are present in the folder.

    Parameters
    ----------
    folder_path : str
        Directory containing the CSV files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file, in label order, each with an added integer
        ``class`` column. Empty list when the folder holds no CSV files.
    """
    def _natural_key(name):
        # re.split with a capturing group alternates text / digit chunks, so
        # odd positions are always the digit runs and can be compared as ints.
        chunks = re.split(r'(\d+)', name)
        return [int(chunk) if pos % 2 else chunk for pos, chunk in enumerate(chunks)]

    csv_files = sorted(
        (name for name in os.listdir(folder_path) if name.endswith(".csv")),
        key=_natural_key,
    )

    dataframes = []
    for label, name in enumerate(csv_files, start=1):
        df = pd.read_csv(os.path.join(folder_path, name))
        df["class"] = label
        dataframes.append(df)
    return dataframes



In [17]:
# add_iteration_column returns a list of DataFrames, one per CSV file read,
# each carrying its integer "class" label (note: `dataframe` is that list).
dataframe = add_iteration_column(file_dir)
# Stack the per-file frames row-wise into a single frame; ignore_index=True
# renumbers the rows 0..n-1 across all files.
final_df = pd.concat(dataframe, axis=0, ignore_index=True)
final_df

Unnamed: 0,seconds,data1,data2,data3,data4,data5,data6,data7,data8,data9,class
0,0.002,25.793845,1620.981925,1591.963850,1739.472399,1794.284319,1617.757694,1682.242306,1665.315095,1665.315095,1
1,0.004,1643.551539,1628.236444,1647.581827,1537.957987,1760.429897,1652.418173,1645.163654,1649.193942,1649.193942,1
2,0.006,1657.254519,1629.042501,1735.442110,1298.558867,1731.411822,1686.272594,1589.545677,1633.072789,1633.072789,1
3,0.008,1670.151441,1620.175867,1787.835857,1259.062042,1704.811920,1718.514900,1571.812408,1621.787982,1621.787982,1
4,0.010,1661.284807,1628.236444,1779.775281,1520.224719,1754.787494,1677.405960,1605.666830,1640.327308,1640.327308,1
...,...,...,...,...,...,...,...,...,...,...,...
378849,123.752,1671.763556,1641.133366,1664.509038,1429.946263,1653.224231,1657.254519,1630.654617,1669.345383,1669.345383,6
378850,123.754,1674.987787,1649.193942,1674.987787,1550.048852,1654.836346,1665.315095,1632.266732,1660.478749,1660.478749,6
378851,123.756,1662.896922,1650.000000,1690.302882,1738.666341,1658.060576,1666.121153,1626.624328,1668.539326,1668.539326,6
378852,123.758,1679.018075,1654.030288,1692.721055,1826.526624,1660.478749,1668.539326,1630.654617,1660.478749,1660.478749,6


In [18]:
# Divide every signal channel by 1000 (displayed values go from ~1650 to ~1.65).
# Channels are selected by their "data" name prefix instead of by column
# position, so `seconds` and `class` cannot be rescaled by accident if the
# column layout of the input files ever changes.
# NOTE: this rewrites final_df in place, so re-running this cell on its own
# divides the channels by 1000 a second time.
signal_columns = [name for name in final_df.columns if str(name).startswith("data")]
final_df[signal_columns] = final_df[signal_columns] / 1000

final_df

Unnamed: 0,seconds,data1,data2,data3,data4,data5,data6,data7,data8,data9,class
0,0.002,0.025794,1.620982,1.591964,1.739472,1.794284,1.617758,1.682242,1.665315,1.665315,1
1,0.004,1.643552,1.628236,1.647582,1.537958,1.760430,1.652418,1.645164,1.649194,1.649194,1
2,0.006,1.657255,1.629043,1.735442,1.298559,1.731412,1.686273,1.589546,1.633073,1.633073,1
3,0.008,1.670151,1.620176,1.787836,1.259062,1.704812,1.718515,1.571812,1.621788,1.621788,1
4,0.010,1.661285,1.628236,1.779775,1.520225,1.754787,1.677406,1.605667,1.640327,1.640327,1
...,...,...,...,...,...,...,...,...,...,...,...
378849,123.752,1.671764,1.641133,1.664509,1.429946,1.653224,1.657255,1.630655,1.669345,1.669345,6
378850,123.754,1.674988,1.649194,1.674988,1.550049,1.654836,1.665315,1.632267,1.660479,1.660479,6
378851,123.756,1.662897,1.650000,1.690303,1.738666,1.658061,1.666121,1.626624,1.668539,1.668539,6
378852,123.758,1.679018,1.654030,1.692721,1.826527,1.660479,1.668539,1.630655,1.660479,1.660479,6


In [19]:
# Per-column count of missing entries (all zeros means nothing to impute).
final_df.isna().sum()

seconds    0
data1      0
data2      0
data3      0
data4      0
data5      0
data6      0
data7      0
data8      0
data9      0
class      0
dtype: int64

In [21]:
def remove_duplicate_columns(data_frame):
    """Return a copy of ``data_frame`` without columns that repeat an earlier one.

    Columns are compared by their values only (labels are ignored). The first
    occurrence of each distinct column is kept and the original column order is
    preserved. The input frame is not modified.

    Columns are selected positionally from the input, so the result keeps the
    caller's row index and column dtypes, and columns that happen to share a
    label are each compared on their own data.

    Values are compared with ``==`` semantics, so NaN never equals NaN: two
    columns holding NaN in the same rows are not treated as duplicates.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to de-duplicate.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the first occurrence of each distinct column.
    """
    kept_positions = []
    kept_values = []

    for position in range(data_frame.shape[1]):
        values = data_frame.iloc[:, position].to_numpy()
        # Compare against the columns already kept; array_equal is a vectorised
        # check and avoids turning each column into a tuple of Python objects.
        if any(np.array_equal(values, seen) for seen in kept_values):
            continue
        kept_positions.append(position)
        kept_values.append(values)

    return data_frame.iloc[:, kept_positions].copy()

# Remove duplicate columns. In the rows displayed above, data9 carries the same
# values as data8, and the channel list printed further down stops at 8, so
# data9 is the column dropped here.
unique_df = remove_duplicate_columns(final_df)
# From here on final_df refers to the de-duplicated frame.
final_df = unique_df

In [22]:
# Band-pass filter settings used to design the Butterworth filter below.
# NOTE(review): the `seconds` column shown in the outputs above advances in
# 0.002 s steps, which corresponds to 500 samples/s, not 1000. If the
# recordings really are 500 Hz, the pass band designed with fs = 1000 is
# effectively 7.5-225 Hz on this data, and a 450 Hz cut-off would lie above
# the 250 Hz Nyquist limit. Confirm the acquisition rate before relying on
# these values.
lowcut = 15 # Hz
highcut = 450 # Hz
order = 4 # filter order
fs = 1000 # sampling frequency. It should be at least twice the maximum frequency present in the EMG signal

def butter_bandpass(lowcut, highcut, fs, order=4):
    """Design a Butterworth band-pass filter and return its ``(b, a)`` coefficients.

    Both cut-off frequencies are expressed in the same unit as ``fs`` and are
    divided by the Nyquist frequency (``fs / 2``) before being passed to
    ``scipy.signal.butter``.

    Parameters
    ----------
    lowcut, highcut : float
        Lower and upper edge of the pass band.
    fs : float
        Sampling frequency of the signal the filter will be applied to.
    order : int, optional
        Order passed to ``scipy.signal.butter`` (default 4).

    Returns
    -------
    tuple
        ``(b, a)`` numerator and denominator coefficient arrays.
    """
    nyquist = 0.5 * fs
    normalized_band = [edge / nyquist for edge in (lowcut, highcut)]
    numerator, denominator = signal.butter(order, normalized_band, btype='band')
    return numerator, denominator

# Design the filter once; the same (b, a) coefficients are applied to every
# data channel in the filtering loop further down.
b, a = butter_bandpass(lowcut, highcut, fs, order)

In [23]:
# Get the column names of the DataFrame
col_names = final_df.columns.tolist()

# Collect the channel number N of every column whose name contains "data<N>".
# The pattern is a raw string: in a plain literal "\d" is an invalid escape
# sequence that Python warns about.
channel_pattern = re.compile(r'data(\d+)')
numbers = [
    int(found.group(1))
    for found in map(channel_pattern.search, col_names)
    if found
]

print(numbers)

# Per channel: zero-phase band-pass (filtfilt), full-wave rectification (abs),
# then min-max scaling to [0, 1]. Each stage is computed once and the column
# is written back a single time.
# NOTE(review): final_df is the row-wise concatenation of all input files, so
# the filter and the min/max are evaluated over all recordings as one
# continuous signal - confirm this is intended rather than per recording.
for i in numbers:
    channel = 'data' + str(i)
    filtered = filtfilt(b, a, final_df[channel])
    rectified = np.abs(filtered)
    lowest = rectified.min()
    final_df[channel] = (rectified - lowest) / (rectified.max() - lowest)

[1, 2, 3, 4, 5, 6, 7, 8]


In [25]:
# Persist the preprocessed frame. The path is relative, so the file is written
# into the kernel's current working directory.
# NOTE(review): to_csv writes the row index as an extra leading column by
# default; pass index=False if downstream readers should not see it - confirm
# what the consumers of this file expect.
final_df.to_csv("preprocessed_data.csv")