In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.fft import fft
import time
from pyentrp import entropy as ent
from sklearn.model_selection import GroupKFold

def load_data(subject, repetition, experiment_type, minute):
    """Read one recording CSV, coerce it to numeric and attach a class label.

    The file is looked up under the module-level ``data_path`` as
    ``{subject}_{repetition}_{experiment_type}{minute}.csv``.

    :param subject: subject identifier used in the file name.
    :param repetition: repetition identifier used in the file name.
    :param experiment_type: one of 'm', 'l', 'c', 'e' (mapped to labels 0-3).
    :param minute: minute identifier used in the file name.
    :return: DataFrame whose original columns are numeric (unparseable cells
        become NaN) plus an integer 'label' column.
    :raises ValueError: if ``experiment_type`` is not a known code; note the
        check happens only after the file has been read.
    """
    label_by_type = {'m': 0, 'l': 1, 'c': 2, 'e': 3}

    csv_path = os.path.join(data_path, f'{subject}_{repetition}_{experiment_type}{minute}.csv')

    # Everything is read as text first and malformed rows are skipped, so a
    # stray non-numeric cell ends up as NaN instead of aborting the load.
    raw_text = pd.read_csv(
        csv_path,
        dtype=str,
        low_memory=False,
        on_bad_lines='skip',
    )
    data = raw_text.apply(pd.to_numeric, errors='coerce')

    if experiment_type not in label_by_type:
        raise ValueError("Invalid experiment_type.")
    data['label'] = label_by_type[experiment_type]

    return data

In [2]:
def calculate_total_band_energy(signal):
    """Return the sum of squared FFT magnitudes of ``signal`` divided by its length.

    :param signal: 1-D sequence of samples (array or pandas Series).
    :return: scalar energy value.
    """
    spectrum = np.fft.fft(signal)
    return np.square(np.abs(spectrum)).sum() / len(signal)

def max_power(signal):
    """Return the largest bin of the FFT power spectrum of ``signal``.

    The power spectrum is the squared FFT magnitude divided by the number of
    samples.

    :param signal: 1-D sequence of samples (array or pandas Series).
    :return: scalar maximum power.
    """
    n_samples = len(signal)
    spectrum = np.fft.fft(signal)
    return (np.square(np.abs(spectrum)) / n_samples).max()

def shannon_entropy(signal):
    """Return the Shannon entropy (in bits) of the distinct values in ``signal``.

    Probabilities are the relative frequencies of each unique sample value.

    :param signal: 1-D sequence of samples.
    :return: scalar entropy.
    """
    _, occurrences = np.unique(signal, return_counts=True)
    p = occurrences / occurrences.sum()
    # The small offset inside the logarithm guards against log2(0).
    return -(p * np.log2(p + 1e-10)).sum()

def calculate_peak_to_peak(signal):
    """Return the difference between the largest and smallest sample.

    :param signal: object exposing ``min()`` and ``max()`` (numpy array or
        pandas Series).
    :return: ``signal.max() - signal.min()``.
    """
    lowest, highest = signal.min(), signal.max()
    return highest - lowest

def differential_entropy(signal):
    """Return ``0.5 * ln(2 * pi * e * var(signal))``.

    This is the closed-form differential entropy of a Gaussian with the same
    variance as the signal. A zero-variance signal yields 0 rather than
    evaluating log(0).

    :param signal: 1-D sequence of samples.
    :return: scalar entropy, or 0 when the variance is exactly zero.
    """
    variance = np.var(signal)
    if variance != 0:
        return 0.5 * np.log(2 * np.pi * np.e * variance)
    # Constant signal: no variability, so report 0 instead of -inf.
    return 0


def sample_entropy(signal, m=2, r=None):
    """Return the first sample-entropy value reported by ``ent.sample_entropy``.

    Any failure is reported on stdout and mapped to 0 so that every caller
    always receives a single scalar.

    :param signal: 1-D sequence of samples.
    :param m: passed to ``ent.sample_entropy`` as its second argument.
    :param r: tolerance; defaults to 0.2 times the signal's standard deviation.
    :return: element 0 of the library result, or 0 for a constant signal, an
        empty result, or any exception.
    """
    try:
        spread = np.std(signal)
        if spread == 0:
            # Constant signal: keep the output shape consistent with a placeholder.
            return 0
        tolerance = 0.2 * spread if r is None else r
        values = ent.sample_entropy(signal, m, tolerance)
        # NOTE(review): the library seems to return one value per template
        # length, so index 0 may not be the order-m estimate -- confirm
        # against the pyentrp documentation.
        if len(values) == 0:
            return 0
        return values[0]
    except Exception as exc:
        print(f"Error calculating sample entropy: {str(exc)}")
        return 0



def extract_features(data, num_windows):
    """Split ``data`` into consecutive windows and compute per-channel features.

    Each window spans ``int(60*256 / num_windows)`` rows (presumably one
    minute at 256 Hz -- confirm against the recording setup). For every
    window the feature vector is laid out statistic by statistic, each
    statistic covering all channels in column order:
    mean, max power, Shannon entropy, band energy, sample entropy,
    differential entropy, peak-to-peak.

    :param data: DataFrame of channel columns, optionally with a 'label' column.
    :param num_windows: number of windows to cut from the recording.
    :return: ``(features, labels)`` numpy arrays; ``labels`` holds the label of
        the first row of each window and is empty when there is no 'label'
        column.
    :raises ValueError: if a window contains fewer rows than the window size.
    """
    window_size = int((60*256) / num_windows)  # Adjusted to dataset specifics
    layout = (
        "mean",
        "max_power",
        "entropy",
        "band_energy",
        "sample_entropy",
        "differential_entropy",
        "peak_to_peak",
    )

    feature_rows = []
    window_labels = []
    for window_index in range(num_windows):
        start = window_index * window_size
        window = data.iloc[start:start + window_size]
        if len(window) < window_size:
            raise ValueError("NOT ENOUGH DATA FOR 1 MINUTE.")

        # The label is metadata, not a channel: record it and keep it out of the features.
        has_label = 'label' in window.columns
        if has_label:
            window_labels.append(window['label'].iloc[0])
        channels = window.drop(columns='label') if has_label else window

        # One value per channel for each statistic.
        stats = {
            "mean": channels.mean().values,
            "max_power": channels.apply(max_power).values,
            "entropy": channels.apply(shannon_entropy).values,
            "band_energy": channels.apply(calculate_total_band_energy).values,
            "peak_to_peak": channels.apply(calculate_peak_to_peak).values,
            "differential_entropy": channels.apply(differential_entropy).values,
        }
        sampen = channels.apply(sample_entropy)
        sampen_values = sampen.values
        if sampen.ndim > 1:
            sampen_values = sampen_values.flatten()
        stats["sample_entropy"] = sampen_values

        # Flatten into a single row following the documented layout.
        feature_rows.append(np.concatenate([stats[name] for name in layout]))

    return np.array(feature_rows), np.array(window_labels)

def generate_feature_names(data):
    """Build column names matching the feature vector laid out by ``extract_features``.

    ``extract_features`` concatenates its output statistic by statistic (all
    channel means, then all max powers, and so on), so the names must be
    generated in that same statistic-major order. Emitting them channel by
    channel attaches the wrong name to almost every value.

    :param data: DataFrame with the same channel columns as the feature data;
        a 'label' column, if present, is ignored.
    :return: list of ``"<statistic>_<channel>"`` strings, seven per channel.
    """
    channels = [column for column in data.columns if column != 'label']

    # Must stay in the same order as the np.concatenate call in extract_features.
    statistics = (
        "mean",
        "max_power",
        "entropy",
        "band_energy",
        "sample_entropy",
        "differential_entropy",
        "peak_to_peak",
    )

    return [f"{statistic}_{column}" for statistic in statistics for column in channels]


In [3]:
def save_features_labels(features, labels, subject_ids, num_windows, feature_names, folder_path):
    """
    Write subject IDs, features and labels to a CSV without overwriting earlier files.

    The file is named ``features_<num_windows>.csv``; if that name is taken,
    ``(1)``, ``(2)``, ... is inserted before the extension until a free name
    is found.

    :param features: numpy array of features, one row per sample.
    :param labels: numpy array of labels.
    :param subject_ids: numpy array of subject identifiers.
    :param num_windows: number of windows, used for naming the file.
    :param feature_names: list of column names for the feature columns.
    :param folder_path: directory path where the file will be saved (created
        if missing).
    """
    os.makedirs(folder_path, exist_ok=True)

    # Column layout: subject id first, then the features, label last.
    table = pd.DataFrame(np.column_stack((subject_ids, features, labels)))
    table.columns = ["subject_id"] + feature_names + ["label"]

    stem = os.path.join(folder_path, f"features_{num_windows}")
    target = stem + ".csv"
    attempt = 0
    # Probe numbered variants until one does not exist yet.
    while os.path.exists(target):
        attempt += 1
        target = f"{stem}({attempt}).csv"

    table.to_csv(target, index=False)
    print(f"File saved: {target}")

In [4]:
# Input/output locations and the experiment grid
data_path = '/home/ximo/Escritorio/ProyectoTFG/MusePreprocessed'
folder_name = "/home/ximo/Escritorio/ProyectoTFG/featuresExtended"
subjects = range(1,31)
repetitions = ['1', '2']
experiment_types = ['m', 'l', 'c', 'e']
minutes = ['1', '2', '3']
num_windows_options = [1]

# Every (repetition, experiment type, minute) combination, in the order the
# recordings are processed for each subject.
recording_keys = [
    (rep, exp, minute_id)
    for rep in repetitions
    for exp in experiment_types
    for minute_id in minutes
]

for num_windows in num_windows_options:

    # Fresh accumulators for this window configuration
    all_data = {}
    all_features, all_labels, all_subject_ids = [], [], []

    # A single recording is enough to derive the feature column names
    sample_data = load_data(subjects[0], repetitions[0], experiment_types[0], minutes[0])
    feature_names = generate_feature_names(sample_data)

    print("Number of feature names:", len(feature_names))
    print("Expected number of DataFrame columns:", len(feature_names) + 2)  # +2 for "subject_id" and "label"

    for subject in subjects:
        subject_data = []
        for repetition, exp_type, minute in recording_keys:
            data = load_data(subject, repetition, exp_type, minute)
            features, labels = extract_features(data, num_windows)
            subject_data.append((features, labels))
            # Flat, row-aligned lists: one entry per extracted window
            all_features.extend(features)
            all_labels.extend(labels)
            all_subject_ids.extend([subject] * len(features))
        all_data[subject] = subject_data

    # Convert the accumulated lists to arrays before saving
    all_features = np.array(all_features)
    all_labels = np.array(all_labels)
    all_subject_ids = np.array(all_subject_ids)

    # Save the features to a file
    save_features_labels(all_features, all_labels, all_subject_ids, num_windows, feature_names, folder_name)

Number of feature names: 175
Expected number of DataFrame columns: 177
File saved: /home/ximo/Escritorio/ProyectoTFG/featuresExtended/features_1.csv
