In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder

In [2]:
def load_all_data(directory):
    left_data = []
    right_data = []
    subjects = []
    act_classes = []
    
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            subject = filename.split("_")[0].strip()
            act_class = filename.split("_")[1].split(".")[0].strip()
            df = pd.read_csv(os.path.join(directory, filename), header=None)

            # Add the entire first and second column as an array to the respective lists
            left_data.append(df[0].values)
            right_data.append(df[1].values)
            subjects.append(subject)
            act_classes.append(act_class)

    # Combine into a DataFrame
    combined_data = pd.DataFrame({
        'left_data': left_data,
        'right_data': right_data,
        'subject': subjects,
        'act_class': act_classes
    })

    return combined_data

def normalize_all_data(df):
    # Concatenate all arrays in 'left_data' and 'right_data' to find the global min and max
    all_left_data = np.concatenate(df['left_data'].tolist())
    all_right_data = np.concatenate(df['right_data'].tolist())

    left_min, left_max = all_left_data.min(), all_left_data.max()
    right_min, right_max = all_right_data.min(), all_right_data.max()

    print(f"left_min: {left_min}, left_max: {left_max}")
    print(f"right_min: {right_min}, right_max: {right_max}")

    # Define normalization functions based on the global min and max
    def normalize(array, min_val, max_val):
        if max_val - min_val == 0:
            return array  # Avoid division by zero
        return (array - min_val) / (max_val - min_val)

    # Apply normalization to each array in 'left_data' and 'right_data'
    df['left_data'] = df['left_data'].apply(normalize, args=(left_min, left_max))
    df['right_data'] = df['right_data'].apply(normalize, args=(right_min, right_max))

    return df

In [3]:
all_data = load_all_data("data/DesktopActivity/ALL")
all_data_normalized = normalize_all_data(all_data)

left_min: -0.12492, left_max: 1.0873
right_min: -0.19863, right_max: 1.4762


In [4]:
def calculate_std_deviation(df):
    # Concatenate all arrays in 'left_data' and 'right_data'
    all_left_data = np.concatenate(df['left_data'].tolist())
    all_right_data = np.concatenate(df['right_data'].tolist())

    # Calculate the standard deviation for the concatenated data
    std_dev_left = np.std(all_left_data)
    std_dev_right = np.std(all_right_data)

    # Print the standard deviations
    print(f"Standard Deviation for left_data: {std_dev_left}")
    print(f"Standard Deviation for right_data: {std_dev_right}")

    return std_dev_left, std_dev_right

# Calculate and print the standard deviations
std_left, std_right = calculate_std_deviation(all_data_normalized)

Standard Deviation for left_data: 0.09782551290226099
Standard Deviation for right_data: 0.12052316716926113


In [5]:
def apply_windowing(df, window_size, overlap):
    step_size = int(window_size * (1 - overlap))
    print(f"The step size of each sample is {step_size}, this is determined via the overlap")

    left_data_segments = []
    right_data_segments = []
    labels = []
    subjects = []

    for index, row in df.iterrows():
        left_data_array = row['left_data']
        right_data_array = row['right_data']
        label = row['act_class']  # or any other label you wish to use
        subject = row['subject']

        # Apply windowing for left_data
        for i in range(0, len(left_data_array) - window_size + 1, step_size):
            window = left_data_array[i:i + window_size]
            left_data_segments.append(window)
            labels.append(label)
            subjects.append(subject)

        # Apply windowing for right_data
        for i in range(0, len(right_data_array) - window_size + 1, step_size):
            window = right_data_array[i:i + window_size]
            right_data_segments.append(window)
            # Labels and subjects are the same for both left and right data

    # Combine into a DataFrame
    windowed_data = pd.DataFrame({
        'left_data': left_data_segments,
        'right_data': right_data_segments,
        'label': labels,
        'subject': subjects
    })

    return windowed_data
windowed_data_normalized = apply_windowing(all_data_normalized, 300, 0.8)

The step size of each sample is 59, this is determined via the overlap


In [6]:
print(windowed_data_normalized.head())
print(windowed_data_normalized.info())

                                           left_data  \
0  [0.417828446981571, 0.4195855537773672, 0.4193...   
1  [0.567636237646632, 0.559857121644586, 0.47971...   
2  [0.5321888766065566, 0.5156489746085695, 0.512...   
3  [0.49554536305291125, 0.49622180792265436, 0.4...   
4  [0.5517480325353484, 0.5503868934681825, 0.549...   

                                          right_data label subject  
0  [0.4559507532107736, 0.45598657774221857, 0.45...  PLAY     P04  
1  [0.4485828412435889, 0.44666622881128226, 0.41...  PLAY     P04  
2  [0.3223133094105073, 0.3336935688995301, 0.331...  PLAY     P04  
3  [0.36400709325722613, 0.3664909274374115, 0.36...  PLAY     P04  
4  [0.3300633497131052, 0.33221282159980414, 0.33...  PLAY     P04  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7104 entries, 0 to 7103
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   left_data   7104 non-null   object
 1   right_data  7104 n