In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import LeaveOneOut, cross_val_score, GroupKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.fft import fft
import time
from pyentrp import entropy as ent
import warnings
from sklearn.exceptions import ConvergenceWarning
import csv
from sklearn.pipeline import Pipeline

In [2]:
# Function to reduce features using SelectKBest
def select_features(X_train, y_train, X_test, k=10):
    # Initialize and fit SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X_train, y_train)
    
    # Transform both training and testing data
    X_train_reduced = selector.transform(X_train)
    X_test_reduced = selector.transform(X_test)
    
    return X_train_reduced, X_test_reduced

In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def run_rf_grid_search(features, labels):
    pipe = Pipeline([
        ('select_k_best', SelectKBest(score_func=f_classif)),
        ('random_forest', RandomForestClassifier())
    ])

    # Definir rangos para 'k' con saltos de 5 en 5, ajusta según necesidad
    k_range = list(range(20, min(40, features.shape[1] + 1), 1))

    params = {
        'select_k_best__k': k_range,
        'random_forest__n_estimators': [300],

        'random_forest__max_depth': [None, 10, 20],
        'random_forest__min_samples_split': [2, 5, 10],
        'random_forest__min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(pipe, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(features, labels)
    
    print("Optimized parameters for Random Forest:", grid_search.best_params_)
    print("Optimized cross-validation score: {:.2f}".format(grid_search.best_score_))

def process_file(file_path):
    df = pd.read_csv(file_path)
    labels = df['label']
    features = df.drop(columns=['subject_id', 'label'])
    run_rf_grid_search(features, labels)

def main(folder_path):
    import os
    for file_name in os.listdir(folder_path):
        if file_name.startswith("features_") and file_name.endswith(".csv"):
            file_path = os.path.join(folder_path, file_name)
            print(f"\nProcessing file: {file_name}")
            process_file(file_path)

# Cambia la ruta según corresponda
main('/home/ximo/Escritorio/ProyectoTFG/featuresExtended')



Processing file: features_1.csv


Optimized parameters for Random Forest: {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 300, 'select_k_best__k': 39}
Optimized cross-validation score: 0.55

Processing file: features_4.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 300, 'select_k_best__k': 38}
Optimized cross-validation score: 0.53

Processing file: features_3.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 300, 'select_k_best__k': 38}
Optimized cross-validation score: 0.54

Processing file: features_2.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'r

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def run_rf_grid_search(features, labels):
    pipe = Pipeline([
        ('select_k_best', SelectKBest(score_func=f_classif)),
        ('random_forest', RandomForestClassifier())
    ])

    # Definir rangos para 'k' con saltos de 5 en 5, ajusta según necesidad
    k_range = list(range(40, 51, 1))

    params = {
        'select_k_best__k': k_range,
        'random_forest__n_estimators': [300],

        'random_forest__max_depth': [None, 10, 20],
        'random_forest__min_samples_split': [2, 5, 10],
        'random_forest__min_samples_leaf': [2, 4, 6]
    }

    grid_search = GridSearchCV(pipe, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(features, labels)
    
    print("Optimized parameters for Random Forest:", grid_search.best_params_)
    print("Optimized cross-validation score: {:.2f}".format(grid_search.best_score_))

def process_file(file_path):
    df = pd.read_csv(file_path)
    labels = df['label']
    features = df.drop(columns=['subject_id', 'label'])
    run_rf_grid_search(features, labels)

def main(folder_path):
    import os
    for file_name in os.listdir(folder_path):
        if file_name.startswith("features_") and file_name.endswith(".csv"):
            file_path = os.path.join(folder_path, file_name)
            print(f"\nProcessing file: {file_name}")
            process_file(file_path)

# Cambia la ruta según corresponda
main('/home/ximo/Escritorio/ProyectoTFG/featuresExtended')



Processing file: features_1.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 300, 'select_k_best__k': 49}
Optimized cross-validation score: 0.56

Processing file: features_4.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 300, 'select_k_best__k': 49}
Optimized cross-validation score: 0.55

Processing file: features_3.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 300, 'select_k_best__k': 50}
Optimized cross-validation score: 0.55

Processing file: features_2.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 6, 'random_f

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def run_rf_grid_search(features, labels):
    pipe = Pipeline([
        ('select_k_best', SelectKBest(score_func=f_classif)),
        ('random_forest', RandomForestClassifier())
    ])

    # Definir rangos para 'k' con saltos de 5 en 5, ajusta según necesidad
    k_range = list(range(51, features.shape[1], 1))

    params = {
        'select_k_best__k': k_range,
        'random_forest__n_estimators': [100, 200, 300],

        'random_forest__max_depth': [None, 10, 20],
        'random_forest__min_samples_split': [2, 5, 10],
        'random_forest__min_samples_leaf': [2, 4, 6]
    }

    grid_search = GridSearchCV(pipe, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(features, labels)
    
    print("Optimized parameters for Random Forest:", grid_search.best_params_)
    print("Optimized cross-validation score: {:.2f}".format(grid_search.best_score_))

def process_file(file_path):
    df = pd.read_csv(file_path)
    labels = df['label']
    features = df.drop(columns=['subject_id', 'label'])
    run_rf_grid_search(features, labels)

def main(folder_path):
    import os
    for file_name in os.listdir(folder_path):
        if file_name.startswith("features_") and file_name.endswith(".csv"):
            file_path = os.path.join(folder_path, file_name)
            print(f"\nProcessing file: {file_name}")
            process_file(file_path)

# Cambia la ruta según corresponda
main('/home/ximo/Escritorio/ProyectoTFG/featuresExtended')


Processing file: features_1.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 130}
Optimized cross-validation score: 0.59

Processing file: features_4.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 100, 'select_k_best__k': 129}
Optimized cross-validation score: 0.56

Processing file: features_3.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 200, 'select_k_best__k': 134}
Optimized cross-validation score: 0.57

Processing file: features_2.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 300, 'select_k_best__k': 82}
Optimized cross-validation score: 0.58

In [4]:
import os
import numpy as np
import pandas as pd
import time
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GroupKFold
from sklearn.metrics import confusion_matrix
from sklearn.exceptions import ConvergenceWarning


folder_path = '/home/ximo/Escritorio/ProyectoTFG/featuresExtended'
output_path = '/home/ximo/Escritorio/ProyectoTFG/resultsEXTRA'

# Diccionario con parámetros optimizados solo para Random Forest
optimized_parameters = {
    "features_1.csv": {
        "Random Forest": {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 130}
    },
    "features_4.csv": {
        "Random Forest": {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 100, 'select_k_best__k': 129}
    },
    "features_3.csv": {
        "Random Forest": {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 200, 'select_k_best__k': 134}
    },
    "features_2.csv": {
        "Random Forest": {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 300, 'select_k_best__k': 82}
    }
}

# Modelo base para Random Forest
models = {
    "Random Forest": RandomForestClassifier
}

# Function to create a unique file path if the file already exists
def create_unique_file_path(base_path, filename):
    counter = 1
    original_filename = filename
    while os.path.exists(os.path.join(base_path, filename)):
        filename = f"{os.path.splitext(original_filename)[0]}({counter}){os.path.splitext(original_filename)[1]}"
        counter += 1
    return os.path.join(base_path, filename)

# Function to perform evaluation for a given k and write the results to a file
def evaluate_and_save_results(features, labels, subjects, models, model_name, model_params, k, output_path, num_windows):
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    selector = SelectKBest(f_classif, k=k)
    selected_features = selector.fit_transform(features, labels)
    gkf = GroupKFold(n_splits=len(np.unique(subjects)))

    # Remove the pipeline step prefix from the model parameters
    clean_params = {key.split('__')[-1]: value for key, value in model_params.items()}

    # Ajuste específico para SVM
    if model_name == "SVM" and clean_params.get('penalty') == 'l1':
        clean_params['dual'] = False  # Asegurarse de que dual=False cuando penalty='l1'
    
    model_cls = models[model_name]
    model = model_cls(**clean_params)
    cm_global = np.zeros((4, 4))  # Ajusta el tamaño según el número de etiquetas únicas
    model_times = []

    for train_idx, test_idx in gkf.split(selected_features, labels, groups=subjects):
        X_train, X_test = selected_features[train_idx], selected_features[test_idx]
        y_train, y_test = labels.iloc[train_idx], labels.iloc[test_idx]

        start_time = time.time()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ConvergenceWarning)
            model.fit(X_train, y_train)
        
        end_time = time.time()
        predictions = model.predict(X_test)
        model_time = end_time - start_time
        model_times.append(model_time)

        cm = confusion_matrix(y_test, predictions, labels=[0, 1, 2, 3])
        cm_global += cm  # Acumular resultados de la matriz de confusión

    cm_df = pd.DataFrame(cm_global, index=[0, 1, 2, 3], columns=[0, 1, 2, 3])
    filename = f"conf_matrix_{num_windows}_{model_name}_{k}.csv"
    unique_file_path = create_unique_file_path(output_path, filename)
    cm_df.to_csv(unique_file_path)

    # Guardar el tiempo de ejecución del modelo
    average_time = sum(model_times)
    times_file_path = f"{output_path}/model_times.csv"
    header = not os.path.exists(times_file_path)  # Comprueba si el archivo ya existe
    with open(times_file_path, "a") as f:
        if header:
            f.write("Model,Window,K,Time(s)\n")  # Escribe los encabezados si el archivo no existe
            
        f.write(f"{model_name},{num_windows},{k},{average_time}\n")

# Main processing and evaluation function
def process_and_evaluate(file_path, models, optimized_parameters, output_path):
    df = pd.read_csv(file_path)
    drop_cols = ['subject_id', 'label']
    features = df.drop(columns=drop_cols)
    labels = df['label']
    subjects = df['subject_id']
    num_windows = file_path.split('_')[-1].split('.')[0]  # Extract number of windows from filename

    file_name = os.path.basename(file_path)
    if file_name in optimized_parameters:
        for model_name, params in optimized_parameters[file_name].items():
            k = params.pop('select_k_best__k')
            evaluate_and_save_results(features, labels, subjects, models, model_name, params, k, output_path, num_windows)

# Run the processing
for file_name in os.listdir(folder_path):
    if file_name.startswith("features_") and file_name.endswith(".csv"):
        file_path = os.path.join(folder_path, file_name)
        process_and_evaluate(file_path, models, optimized_parameters, output_path)