In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.fft import fft
import time
from pyentrp import entropy as ent
from concurrent.futures import ProcessPoolExecutor, as_completed

# Function to load and preprocess data
def load_data(subject, repetition, experiment_type, minute):
    file_path = os.path.join(data_path, f'{subject}_{repetition}_{experiment_type}{minute}.csv')

    # Read data, treating potential mixed types as strings, and coercing errors to NaN
    data = pd.read_csv(file_path, dtype=str, low_memory=False, on_bad_lines='skip')    

    data = data.apply(pd.to_numeric, errors='coerce') 

    # Label assignment based on experiment type
    if experiment_type == 'm':
        data['label'] = 0
    elif experiment_type == 'l':
        data['label'] = 1
    elif experiment_type == 'c':
        data['label'] = 2
    elif experiment_type == 'e':
        data['label'] = 3
    else:
        raise ValueError("Invalid experiment_type.")

    return data

# Determinar el número óptimo de características usando SelectKBest con validación cruzada
def optimal_k(features, labels):
    scores = []
    for k in range(1, min(20, features.shape[1]+1)):  # Asumimos un máximo de 20 características o menos si hay menos disponibles
        selector = SelectKBest(f_classif, k=k)
        selected_features = selector.fit_transform(features, labels)
        score = np.mean(cross_val_score(LinearSVC(dual=False), selected_features, labels, cv=5))
        scores.append((k, score))
    best_k = sorted(scores, key=lambda x: x[1], reverse=True)[0]
    print(f"Optimal number of features: {best_k[0]} with cross-validation score: {best_k[1]:.2f}")
    return best_k[0]

# Function to reduce features using SelectKBest
def select_features(X_train, y_train, X_test, k=10):
    # Initialize and fit SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X_train, y_train)
    
    # Transform both training and testing data
    X_train_reduced = selector.transform(X_train)
    X_test_reduced = selector.transform(X_test)
    
    return X_train_reduced, X_test_reduced

# Function to help calculate specifity and sensibility
def calc_metrics(cm):
    # Sum all confusion matrix entries to get total number of instances
    total = cm.sum()

    # Sum along the main diagonal to get all true positives
    TP = np.diag(cm)

    # Calculate False Positives, False Negatives, and True Negatives
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = total - (FP + FN + TP)

    # Calculate sensitivity (recall) and specificity for each class
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)

    # Calculate average sensitivity and specificity if needed
    avg_sensitivity = np.mean(sensitivity)
    avg_specificity = np.mean(specificity)

    return avg_sensitivity, avg_specificity


In [2]:
import numpy as np
import pandas as pd

def calculate_total_band_energy(signal):
    """Calculates the total energy of the frequency band of a signal using FFT."""
    fft_values = np.fft.fft(signal)
    magnitude_squared = np.abs(fft_values) ** 2
    total_energy = np.sum(magnitude_squared) / len(signal)
    return total_energy

def max_power(signal):
    """Calculates the maximum power of a signal using FFT."""
    fft_values = np.fft.fft(signal)
    power_spectrum = np.abs(fft_values) ** 2 / len(signal)
    return np.max(power_spectrum)

def shannon_entropy(signal):
    """Calculates Shannon entropy of a signal."""
    value, counts = np.unique(signal, return_counts=True)
    probabilities = counts / counts.sum()
    entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))  # To avoid log(0)
    return entropy

def calculate_peak_to_peak(signal):
    """Calculates the peak-to-peak value of a signal."""
    peak_to_peak_value = signal.max() - signal.min()
    return peak_to_peak_value

def sample_entropy(signal, m=2, r=None):
    """Calculates the sample entropy of a signal.
    - m: the length of compared run of data
    - r: the filtering level, a percentage of the data's standard deviation
    """
    if r is None:
        r = 0.2 * np.std(signal)  # Typically 0.1 to 0.25 times the standard deviation of the signal
    return ent.sample_entropy(signal, m, r)

def extract_features(data, num_windows):
    window_size = int((60*256) / num_windows)  # Adjusted to dataset specifics
    features, labels = [], []
    for i in range(num_windows):
        window_data = data.iloc[i*window_size : (i+1)*window_size]
        if len(window_data) < window_size:
            raise ValueError("NOT ENOUGH DATA FOR 1 MINUTE.")
        
        # Drop label from the window data if present
        if 'label' in window_data.columns:
            labels.append(window_data['label'].iloc[0])
            feature_data = window_data.drop(columns='label')
        else:
            feature_data = window_data

        # Calculate features for each channel
        means = feature_data.mean()
        max_powers = feature_data.apply(max_power)
        entropies = feature_data.apply(shannon_entropy)
        band_energies = feature_data.apply(calculate_total_band_energy)
        peak_to_peak_values = feature_data.apply(calculate_peak_to_peak)
        #sample_entropies = feature_data.apply(sample_entropy)
        #sample_entropies_array = sample_entropies.values.flatten() if sample_entropies.ndim > 1 else sample_entropies.values

        # Combine all features into a single array
        combined_features = np.concatenate([
            means.values,
            max_powers.values,
            entropies.values,
            band_energies.values,
            peak_to_peak_values.values
            #sample_entropies_array
        ])

        # Append combined features and labels
        features.append(combined_features)

    features, labels = np.array(features), np.array(labels)
    return features, labels

In [3]:
def load_and_extract_features(args):
    subject, repetition, exp_type, minute, num_windows = args
    data = load_data(subject, repetition, exp_type, minute)
    features, labels = extract_features(data, num_windows)
    return features, labels

# Main processing function
def process_data(subjects, repetitions, experiment_types, minutes, num_windows):
    all_features, all_labels = [], []

    # Prepare arguments for parallel processing
    tasks = []
    for subject in subjects:
        for repetition in repetitions:
            for exp_type in experiment_types:
                for minute in minutes:
                    tasks.append((subject, repetition, exp_type, minute, num_windows))

    # Use ProcessPoolExecutor to execute tasks in parallel
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        future_to_task = {executor.submit(load_and_extract_features, task): task for task in tasks}
        for future in as_completed(future_to_task):
            features, labels = future.result()
            all_features.extend(features)
            all_labels.extend(labels)

    return np.array(all_features), np.array(all_labels)

In [4]:
data_path = '/home/ximo/Escritorio/ProyectoTFG/MusePreprocessed'
subjects = range(1, 31)
repetitions = ['1', '2']
minutes = ['1', '2', '3']
experiment_types = ['m', 'l', 'c', 'e']
num_windows_options = [1, 2]

loo = LeaveOneOut()

for num_windows in num_windows_options:
    start_time = time.time()
    all_features, all_labels = process_data(subjects, repetitions, experiment_types, minutes, num_windows)
    print(f"Completed processing for window size {num_windows}. Time taken: {time.time() - start_time:.2f} seconds")
    
    # Select optimal number of features
    k = optimal_k(all_features, all_labels)
    selector = SelectKBest(f_classif, k=k)
    all_features = selector.fit_transform(all_features, all_labels)

    # Prepare for leave-one-out cross-validation
    for train_index, test_index in loo.split(all_features):
        X_train, X_test = all_features[train_index], all_features[test_index]
        y_train, y_test = all_labels[train_index], all_labels[test_index]
        
        # Evaluate different models
        models = {
            'SVM': LinearSVC(dual=True, max_iter=100),
            'Random Forest': RandomForestClassifier(n_estimators=100),
            'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
            'KNN': KNeighborsClassifier(n_neighbors=4)
        }
        results = []
        for name, model in models.items():
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)
            cm = confusion_matrix(y_test, predictions)
            sens, spec = calc_metrics(cm)
            results.append([name, f"{accuracy*100:.2f}%", f"{spec:.2f}%", f"{sens:.2f}%"])
        
        # Display results for the current leave-one-out iteration
        print(tabulate(results, headers=["Model", "Accuracy", "Specificity", "Sensitivity"]))
        print("\n")

Completed processing for window size 1. Time taken: 40.71 seconds
Optimal number of features: 8 with cross-validation score: 0.41


  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            100.00%     nan%           1.00%




  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  0.00%       nan%           nan%
XGBoost        0.00%       nan%           nan%
KNN            0.00%       nan%           nan%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            100.00%     nan%           1.00%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            100.00%     nan%           1.00%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            100.00%     nan%           1.00%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            100.00%     nan%           1.00%




  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  0.00%       nan%           nan%
XGBoost        0.00%       nan%           nan%
KNN            100.00%     nan%           1.00%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            100.00%     nan%           1.00%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            100.00%     nan%           1.00%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            0.00%       nan%           nan%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            100.00%     nan%           1.00%




  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            100.00%     nan%           1.00%
Random Forest  0.00%       nan%           nan%
XGBoost        0.00%       nan%           nan%
KNN            0.00%       nan%           nan%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            0.00%       nan%           nan%




  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  100.00%     nan%           1.00%
XGBoost        0.00%       nan%           nan%
KNN            0.00%       nan%           nan%




  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            100.00%     nan%           1.00%
Random Forest  0.00%       nan%           nan%
XGBoost        0.00%       nan%           nan%
KNN            100.00%     nan%           1.00%




  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            100.00%     nan%           1.00%
Random Forest  100.00%     nan%           1.00%
XGBoost        0.00%       nan%           nan%
KNN            100.00%     nan%           1.00%




  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  0.00%       nan%           nan%
XGBoost        0.00%       nan%           nan%
KNN            0.00%       nan%           nan%




  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  0.00%       nan%           nan%
XGBoost        0.00%       nan%           nan%
KNN            0.00%       nan%           nan%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            100.00%     nan%           1.00%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            100.00%     nan%           1.00%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            100.00%     nan%           1.00%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            0.00%       nan%           nan%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            100.00%     nan%           1.00%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            100.00%     nan%           1.00%




  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  0.00%       nan%           nan%
XGBoost        0.00%       nan%           nan%
KNN            0.00%       nan%           nan%




  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  0.00%       nan%           nan%
XGBoost        0.00%       nan%           nan%
KNN            0.00%       nan%           nan%




  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)
  sensitivity = TP / (TP + FN)
  specificity = TN / (TN + FP)
  specificity = TN / (TN + FP)


Model          Accuracy    Specificity    Sensitivity
-------------  ----------  -------------  -------------
SVM            0.00%       nan%           nan%
Random Forest  100.00%     nan%           1.00%
XGBoost        100.00%     nan%           1.00%
KNN            0.00%       nan%           nan%




KeyboardInterrupt: 

In [1]:
#FEATURES BIEN
import numpy as np
import pandas as pd

def calculate_total_band_energy(signal):
    """Calculates the total energy of the frequency band of a signal using FFT."""
    fft_values = np.fft.fft(signal)
    magnitude_squared = np.abs(fft_values) ** 2
    total_energy = np.sum(magnitude_squared) / len(signal)
    return total_energy

def max_power(signal):
    """Calculates the maximum power of a signal using FFT."""
    fft_values = np.fft.fft(signal)
    power_spectrum = np.abs(fft_values) ** 2 / len(signal)
    return np.max(power_spectrum)

def shannon_entropy(signal):
    """Calculates Shannon entropy of a signal."""
    value, counts = np.unique(signal, return_counts=True)
    probabilities = counts / counts.sum()
    entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))  # To avoid log(0)
    return entropy

def calculate_peak_to_peak(signal):
    """Calculates the peak-to-peak value of a signal."""
    peak_to_peak_value = signal.max() - signal.min()
    return peak_to_peak_value

def extract_features(data, num_windows):
    window_size = int((60*256) / num_windows)  # Adjusted to dataset specifics
    features, labels = [], []
    for i in range(num_windows):
        window_data = data.iloc[i*window_size : (i+1)*window_size]
        if len(window_data) < window_size:
            raise ValueError("NOT ENOUGH DATA FOR 1 MINUTE.")
        
        # Drop label from the window data if present
        if 'label' in window_data.columns:
            labels.append(window_data['label'].iloc[0])
            feature_data = window_data.drop(columns='label')
        else:
            feature_data = window_data

        # Calculate features for each channel
        means = feature_data.mean()
        max_powers = feature_data.apply(max_power)
        entropies = feature_data.apply(shannon_entropy)
        band_energies = feature_data.apply(calculate_total_band_energy)
        peak_to_peak_values = feature_data.apply(calculate_peak_to_peak)

        # Combine all features into a single array
        combined_features = np.concatenate([
            means.values,
            max_powers.values,
            entropies.values,
            band_energies.values,
            peak_to_peak_values.values
        ])

        # Append combined features and labels
        features.append(combined_features)

    features, labels = np.array(features), np.array(labels)
    return features, labels

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from statistics import mode
from sklearn.neighbors import KNeighborsClassifier

# Define excluded columns
excluded_columns = ['TimeStamp', 'Battery', 'HeadBandOn', 'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z', 'Mellow', 'Concentration', 'Elements']

# Function to load and preprocess data
def load_data(subject, repetition, experiment_type, minute):
    file_path = os.path.join(data_path, f'{subject}_{repetition}_{experiment_type}{minute}.csv')

    # Read data, treating potential mixed types as strings, and coercing errors to NaN
    data = pd.read_csv(file_path, dtype=str, low_memory=False, on_bad_lines='skip')
    data.drop(columns=excluded_columns, inplace=True)

    # Convert all columns to numeric, coercing errors to NaN
    for col in data.columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Replace infinities with NaN explicitly
    data.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Impute NaNs using the mean of each column
    mean_values = data.mean()
    for col in data.columns:
        if pd.isna(mean_values[col]):
            print(f"Error: Corrupted data in file: {subject}_{repetition}_{experiment_type}{minute}.csv")
            mean_values[col] = 1000000  # Super high number
    data.fillna(mean_values, inplace=True)


    # Additional step to check and ensure no infinity or extreme values remain
    if np.any(np.isinf(data)) or np.any(data.abs() > 1e308):
        raise ValueError("Data still contains infinities or extremely large values.")


    # Label assignment based on experiment type
    if experiment_type == 'm':
        data['label'] = 0
    elif experiment_type == 'l':
        data['label'] = 1
    elif experiment_type == 'c':
        data['label'] = 2
    elif experiment_type == 'e':
        data['label'] = 3
    else:
        raise ValueError("Invalid experiment_type.")

    return data


def max_power(signal):
    # Perform Fourier Transform
    fft_values = np.fft.fft(signal)
    # Calculate the power spectrum
    power_spectrum = np.abs(fft_values) ** 2 / len(signal)
    # Return the maximum power
    return np.max(power_spectrum)

# Function to extract features
def extract_features(data, num_windows):
    window_size = int((60*256) / num_windows)  # Adjusted to your dataset specifics
    num_features = 1  # We are only calculating 1 feature: max power
    features, labels = [], []
    for i in range(num_windows):
        window_data = data.iloc[i*window_size : (i+1)*window_size]
        if len(window_data) < window_size:
            raise ValueError("NOT ENOUGH DATA FOR 1 MINUTE.")
        
        # Drop label and calculate max power
        feature_data = window_data.drop(columns='label')
        max_powers = feature_data.apply(max_power)

        # Verify that the feature vector length is correct
        if len(max_powers) != 24*num_features:
            raise ValueError("Feature vector length mismatch.")
        
        # Collect max power features
        features.append(max_powers.values)
        labels.append(window_data.iloc[0]['label'])

    features, labels = np.array(features), np.array(labels)
    return features, labels


In [None]:
import os
import pandas as pd

# Define excluded columns
excluded_columns = ['TimeStamp', 'Battery', 'HeadBandOn', 'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z', 'Mellow', 'Concentration', 'Elements']

# Function to load and preprocess data
def load_data(subject, repetition, experiment_type, minute):
    file_path = os.path.join(data_path, f'{subject}_{repetition}_{experiment_type}{minute}.csv')
    try:
        # Read data, treating potential mixed types as strings, and coercing errors to NaN
        data = pd.read_csv(file_path, dtype=str, low_memory=False)
        data.drop(columns=excluded_columns, inplace=True)
        return True  # Return True to indicate success
    except Exception as e:
        print(f"Failed to drop columns in file: {file_path}")
        print(f"Error: {e}")
        return False  # Return False to indicate failure

# Define paths and parameters
data_path = '/content/drive/My Drive/Muse'
subjects = ['1', '2', '4', '5', '6', '7', '8', '9']
repetitions = ['1', '2']
minutes  = ['1','2','3']
experiment = ['m', 'l', 'c', 'e']

# Check each file for errors in dropping columns
error_count = 0
for subject in subjects:
    for repetition in repetitions:
        for exp_type in experiment:
            for minute in minutes:
                if not load_data(subject, repetition, exp_type, minute):
                    error_count += 1

print(f"Total files with errors: {error_count}")


In [4]:
import pandas as pd
import os

# Define the path to the data and the details of the experiment setup
data_path = './Muse'
subjects = range(1,19)
repetitions = ['1', '2']
minutes = ['1', '2', '3']
experiment_types = ['m', 'l', 'c', 'e']
window_size = 60*256  # Number of rows expected per minute

# Function to load data and check for sufficient length
def check_data_length(subject, repetition, experiment_type, minute):
    file_path = os.path.join(data_path, f'{subject}_{repetition}_{experiment_type}{minute}.csv')
    try:
        data = pd.read_csv(file_path)
        if len(data) < window_size:
            print(f"File {file_path} does NOT contain enough data for 1 minute: only {len(data)} rows available.")
    except Exception as e:
        print(f"Failed to process file {file_path}: {str(e)}")

# Loop over all combinations of subject, repetition, experiment type, and minute
for subject in subjects:
    for repetition in repetitions:
        for experiment_type in experiment_types:
            for minute in minutes:
                check_data_length(subject, repetition, experiment_type, minute)


  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)


File ./Muse/3_1_c1.csv does NOT contain enough data for 1 minute: only 14986 rows available.
Failed to process file ./Muse/3_2_m1.csv: [Errno 2] No such file or directory: './Muse/3_2_m1.csv'
Failed to process file ./Muse/3_2_m2.csv: [Errno 2] No such file or directory: './Muse/3_2_m2.csv'
Failed to process file ./Muse/3_2_m3.csv: [Errno 2] No such file or directory: './Muse/3_2_m3.csv'
Failed to process file ./Muse/3_2_l1.csv: [Errno 2] No such file or directory: './Muse/3_2_l1.csv'
Failed to process file ./Muse/3_2_l2.csv: [Errno 2] No such file or directory: './Muse/3_2_l2.csv'
Failed to process file ./Muse/3_2_l3.csv: [Errno 2] No such file or directory: './Muse/3_2_l3.csv'
Failed to process file ./Muse/3_2_e1.csv: [Errno 2] No such file or directory: './Muse/3_2_e1.csv'
Failed to process file ./Muse/3_2_e2.csv: [Errno 2] No such file or directory: './Muse/3_2_e2.csv'
Failed to process file ./Muse/3_2_e3.csv: [Errno 2] No such file or directory: './Muse/3_2_e3.csv'


  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data =

In [None]:
# Load data for all subjects and all experiment types
subjects = ['1','2','4','5','6','7','9','8', '10', '11']
repetitions = ['1', '2']
minutes = ['1', '2', '3']
experiment_types = ['m', 'l', 'c', 'e']
data_path = '/home/ximo/Escritorio/ProyectoTFG/Muse'

# Number of windows
num_windows_options = [60, 1]

# Setting up leave-one-out cross-validation
loo = LeaveOneOut()

for num_windows in num_windows_options:
    print(f"Running with {num_windows} windows")
    svm_accuracies, rf_accuracies, xgb_accuracies, knn_accuracies = [], [], [], []

    for train_index, test_index in loo.split(subjects):
        train_subjects = [subjects[i] for i in train_index]
        test_subject = subjects[test_index[0]]

        # Load and preprocess training data
        X_train, y_train = [], []
        for subject in train_subjects:
            for repetition in repetitions:
                for exp_type in experiment_types:
                    for minute in minutes:
                        data = load_data(subject, repetition, exp_type, minute)
                        features, labels = extract_features(data, num_windows)
                        X_train.extend(features)
                        y_train.extend(labels)

        X_train = np.array(X_train)
        y_train = np.array(y_train)

        # Load and preprocess test data
        X_test, y_test = [], []
        for repetition in repetitions:
            for exp_type in experiment_types:
                for minute in minutes:
                    data = load_data(test_subject, repetition, exp_type, minute)
                    features, labels = extract_features(data, num_windows)
                    X_test.extend(features)
                    y_test.extend(labels)

        X_test = np.array(X_test)
        y_test = np.array(y_test)

        print("Entra SVM")

        # Train and Test SVM model
        svm_model = SVC(kernel='linear', verbose=True)
        svm_model.fit(X_train, y_train)
        svm_predictions = svm_model.predict(X_test)
        svm_accuracy = accuracy_score(y_test, svm_predictions)
        svm_accuracies.append(svm_accuracy)

        print("Sale SVM")

        # Train and Test Random Forest model
        rf_model = RandomForestClassifier(n_estimators=100)
        rf_model.fit(X_train, y_train)
        rf_predictions = rf_model.predict(X_test)
        rf_accuracy = accuracy_score(y_test, rf_predictions)
        rf_accuracies.append(rf_accuracy)

        # Train and Test XGBoost model
        xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
        xgb_model.fit(np.array(X_train), y_train)
        xgb_predictions = xgb_model.predict(np.array(X_test))
        xgb_accuracy = accuracy_score(y_test, xgb_predictions)
        xgb_accuracies.append(xgb_accuracy)

        # Train and Test KNN model
        knn_model = KNeighborsClassifier(n_neighbors=4)
        knn_model.fit(X_train, y_train)
        knn_predictions = knn_model.predict(X_test)
        knn_accuracy = accuracy_score(y_test, knn_predictions)
        knn_accuracies.append(knn_accuracy)

        # Print results
        print(f"Test on subject {test_subject} results:")
        print(f"SVM Accuracy: {svm_accuracy:.2f}")
        print(f"Random Forest Accuracy: {rf_accuracy:.2f}")
        print(f"XGBoost Accuracy: {xgb_accuracy:.2f}")
        print(f"KNN Accuracy: {knn_accuracy:.2f}")

    # Calculate mean accuracy for each classifier
    print(f"Mean accuracy for {num_windows} windows:")
    print(f"Mean SVM Accuracy: {np.mean(svm_accuracies):.2f}")
    print(f"Mean Random Forest Accuracy: {np.mean(rf_accuracies):.2f}")
    print(f"Mean XGBoost Accuracy: {np.mean(xgb_accuracies):.2f}")
    print(f"Mean KNN Accuracy: {np.mean(knn_accuracies):.2f}")

Running with 60 windows
[LibSVM]........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Delta_TP9: 0.7022680956280096
Delta_AF7: 0.4841606237491383
Delta_AF8: 0.6239981684431315
Delta_TP10: 0.0
Theta_TP9: 0.8822259153606598
Theta_AF7: 0.3631642347010586
Theta_AF8: 0.6034722704332841
Theta_TP10: 0.0
Alpha_TP9: 1.024559508277942
Alpha_AF7: 0.5939760714143281
Alpha_AF8: 0.81000935788097
Alpha_TP10: 0.0
Beta_TP9: 0.7816741958456426
Beta_AF7: 0.5472108381794683
Beta_AF8: 1.3554521819337761
Beta_TP10: 0.0
Gamma_TP9: 0.44364432540558846
Gamma_AF7: 0.24647286811829144
Gamma_AF8: 0.44323237686176764
Gamma_TP10: 0.0
RAW_TP9: 794.707583705687
RAW_AF7: 795.4374168094535
RAW_AF8: 795.2574749009109
RAW_TP10: 507.10187372488923
AUX_RIGHT: 799.6403819048498
Mellow: 0.0
Concentration: 0.0
Accelerometer_X: 0.6293184401541728
Accelerometer_Y: -0.40894100127227195
Accelerometer_Z: 0.6686388294665112
Gyro_X: 5.4184318328361565
Gyro_Y: -0.081615363477428
Gyro_Z: 4.796012529066077
HeadBandOn: 1.0
HSI_TP9: 1.0528064992614476
HSI_AF7: 1.0
HSI_AF8: 1.0
HSI_TP10: 2.0
Battery: 100.0


In [4]:
import os
import pandas as pd

# Define excluded columns
excluded_columns = ['TimeStamp', 'Battery', 'HeadBandOn', 'HSI_TP9', 'HSI_AF7', 'HSI_AF8', 'HSI_TP10', 'AUX_RIGHT', 'Accelerometer_X', 'Accelerometer_Y', 'Accelerometer_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z', 'Mellow', 'Concentration', 'Elements']

# Function to load and preprocess data
def load_data(subject, repetition, experiment_type, minute):
    file_path = os.path.join(data_path, f'{subject}_{repetition}_{experiment_type}{minute}.csv')
    try:
        # Read data, treating potential mixed types as strings, and coercing errors to NaN
        data = pd.read_csv(file_path, dtype=str, low_memory=False)
        data.drop(columns=excluded_columns, inplace=True)
        return True  # Return True to indicate success
    except Exception as e:
        print(f"Failed to drop columns in file: {file_path}")
        print(f"Error: {e}")
        return False  # Return False to indicate failure

# Define paths and parameters
data_path = './Muse'
subjects = range(1,31)
repetitions = ['1', '2']
minutes  = ['1','2','3']
experiment = ['m', 'l', 'c', 'e']

# Check each file for errors in dropping columns
error_count = 0
for subject in subjects:
    for repetition in repetitions:
        for exp_type in experiment:
            for minute in minutes:
                if not load_data(subject, repetition, exp_type, minute):
                    error_count += 1

print(f"Total files with errors: {error_count}")


Total files with errors: 0


In [9]:
import pandas as pd
import os

# Define the path to the data and the details of the experiment setup
data_path = './Muse'
subjects = range(1,31)
repetitions = ['1', '2']
minutes = ['1', '2', '3']
experiment_types = ['m', 'l', 'c', 'e']
window_size = 60*256  # Number of rows expected per minute

# Function to load data and check for sufficient length
def check_data_length(subject, repetition, experiment_type, minute):
    file_path = os.path.join(data_path, f'{subject}_{repetition}_{experiment_type}{minute}.csv')
    try:
        data = pd.read_csv(file_path)
        if len(data) < window_size:
            print(f"File {file_path} does NOT contain enough data for 1 minute: only {len(data)} rows available.")
    except Exception as e:
        print(f"Failed to process file {file_path}: {str(e)}")

# Loop over all combinations of subject, repetition, experiment type, and minute
for subject in subjects:
    for repetition in repetitions:
        for experiment_type in experiment_types:
            for minute in minutes:
                check_data_length(subject, repetition, experiment_type, minute)


  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data = pd.read_csv(file_path)
  data =

File ./Muse/30_2_c3.csv does NOT contain enough data for 1 minute: only 15237 rows available.
