In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
pd.set_option("display.max_columns", None)
sns.set_theme(style = "whitegrid")
warnings.filterwarnings("ignore")
plt.rcParams['figure.figsize'] = (20, 10)

In [3]:
# run the following commands in the Terminal
# cd emg_data
# extract files, unrar x "./emg.rar" "./"
# install tree sudo apt install tree
# tree "./EMG Physical Action Data Set/sub1"
# ls -lrt ./EMG\ Physical\ Action\ Data\ Set/sub1/Aggressive/txt/
# cat ./EMG\ Physical\ Action\ Data\ Set/sub1/Aggressive/txt/Slapping.txt

In [4]:
# run the following commands in the command line (MacOS)
# cd emg_data
# extract files, tar -xf emg.rar
# install brew, brew install tree
# view the tree disgram of the file system in the EMG dataset directory, tree "./EMG Physical Action Data Set/sub1"
# view the files in one of the node directories, ls -lrt ./EMG\ Physical\ Action\ Data\ Set/sub1/Aggressive/txt/
# view the data in slappint.txt, cat ./EMG\ Physical\ Action\ Data\ Set/sub1/Aggressive/txt/Slapping.txt

# Data Chunking
Data chunking is a technique used to divide a large dataset into smaller, more manageable chunks. In the context of Electromyography (EMG) signals, chunking is employed to address the issue of data redundancy, where consequtive samples exhibit high similarity.

### Why chunk EMG data?
1. Redundancy reduction: By identifying and removing redundant data, chunking can significantly reduce the dataset's size.
2. Improved training efficiency: Smaller datasets require less computational resourves and time for training Machine Learning models.
3. Enhanced model performance: By focusing on unique information, models can learn more effectively and achieve better performance.

### Chunking process
1. Interval selection: The optimal chunk size depends on the specific dataset and application. In the given scenario, an interval size of 10 is chosen.
2. Data segmentation: The dataset is divided into consecutive intervals of length 10.
3. Feature extraction: For each interval, a representative feature is extracted, such as mean, median, or maximum value.

### Note on data loss
While chunking inevitably leads to some loss of information, the benefits often outweigh the drawbacks, especially when dealing with redundant data. The key is to choose an appropriate chunk size that balances data reduction with information preservation.

In [10]:
import os

actions = {}
data_dirs = [
    "./emg_data/EMG Physical Action Data Set/sub1/Aggressive/txt",
    "./emg_data/EMG Physical Action Data Set/sub1/Normal/txt"
]
ind = 0
df = pd.DataFrame()

for dirs in data_dirs:
    for files in os.listdir(dirs):
        with open(os.path.join(dirs, files), "r") as file:
            temp = pd.read_csv(
                file.name,
                sep = "\t",
                header = None,
                names = ["ch" + str(i) for i in range(1, 9)]
            )

            # chunking using max of every 10 sequential values
            temp_chunked = pd.DataFrame()

            for i in range(0, len(temp), 10):
                temp_chunked = temp_chunked.append(temp.iloc[i: i+10].max(), ignore_index = True)

            labels = [files[: -4] for i in range(len(temp_chunked))] # remove the last 4 characters = ".txt" from the filename
            actions[files[: -4]] = ind

            temp_chunked["Action"] = labels

            df = pd.concat([df, temp_chunked], ignore_index = True)

            ind += 1

actions

AttributeError: 'DataFrame' object has no attribute 'append'