In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold
import os

# Function to reduce features using SelectKBest
def select_features(X_train, y_train, X_test, k=10):
    # Initialize and fit SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X_train, y_train)
    
    # Transform both training and testing data
    X_train_reduced = selector.transform(X_train)
    X_test_reduced = selector.transform(X_test)
    
    return X_train_reduced, X_test_reduced

def run_rf_grid_search(features, labels, groups):
    pipe = Pipeline([
        ('select_k_best', SelectKBest(score_func=f_classif)),
        ('random_forest', RandomForestClassifier())
    ])

    # Definir rangos para 'k' y otros hiperparámetros
    k_range = list(range(5, 50, 3))
    n_estimators_range = [100, 200, 300, 400]
    max_depth_range = [None, 10, 20]
    min_samples_split_range = [5, 10, 15]
    min_samples_leaf_range = [2, 4, 6]

    params = {
        'select_k_best__k': k_range,
        'random_forest__n_estimators': n_estimators_range,
        'random_forest__max_depth': max_depth_range,
        'random_forest__min_samples_split': min_samples_split_range,
        'random_forest__min_samples_leaf': min_samples_leaf_range
    }

    group_kfold = GroupKFold(n_splits=5)
    grid_search = GridSearchCV(pipe, param_grid=params, cv=group_kfold, scoring='accuracy', n_jobs=-1)
    grid_search.fit(features, labels, groups=groups)
    
    print("Optimized parameters for Random Forest:", grid_search.best_params_)
    print("Optimized cross-validation score: {:.2f}".format(grid_search.best_score_))

def process_file(file_path):
    df = pd.read_csv(file_path)
    df = df[df['label'].isin([3, 2])]  # Filtrar solo etiquetas 3 y 2
    labels = df['label']
    features = df.drop(columns=['subject_id', 'label'])
    groups = df['subject_id']  # Esta será la base para el GroupKFold
    run_rf_grid_search(features, labels, groups)

def main(folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.startswith("features_") and file_name.endswith(".csv"):
            file_path = os.path.join(folder_path, file_name)
            print(f"\nProcessing file: {file_name}")
            process_file(file_path)

# Cambia la ruta según corresponda
main('/home/ximo/Escritorio/ProyectoTFG/featuresExtended')



Processing file: features_1.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': None, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 100, 'select_k_best__k': 47}
Optimized cross-validation score: 0.77

Processing file: features_4.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 200, 'select_k_best__k': 47}
Optimized cross-validation score: 0.77

Processing file: features_3.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 100, 'select_k_best__k': 47}
Optimized cross-validation score: 0.78

Processing file: features_2.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 6, 'random_

In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold
import os

# Function to reduce features using SelectKBest
def select_features(X_train, y_train, X_test, k=10):
    # Initialize and fit SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X_train, y_train)
    
    # Transform both training and testing data
    X_train_reduced = selector.transform(X_train)
    X_test_reduced = selector.transform(X_test)
    
    return X_train_reduced, X_test_reduced

def run_rf_grid_search(features, labels, groups):
    pipe = Pipeline([
        ('select_k_best', SelectKBest(score_func=f_classif)),
        ('random_forest', RandomForestClassifier())
    ])

    # Definir rangos para 'k' y otros hiperparámetros
    k_range = list(range(45, 70, 1))
    n_estimators_range = [100, 200, 300]
    max_depth_range = [None, 10, 20, 30]
    min_samples_split_range = [5, 10]
    min_samples_leaf_range = [2, 6, 8]

    params = {
        'select_k_best__k': k_range,
        'random_forest__n_estimators': n_estimators_range,
        'random_forest__max_depth': max_depth_range,
        'random_forest__min_samples_split': min_samples_split_range,
        'random_forest__min_samples_leaf': min_samples_leaf_range
    }

    group_kfold = GroupKFold(n_splits=5)
    grid_search = GridSearchCV(pipe, param_grid=params, cv=group_kfold, scoring='accuracy', n_jobs=-1)
    grid_search.fit(features, labels, groups=groups)
    
    print("Optimized parameters for Random Forest:", grid_search.best_params_)
    print("Optimized cross-validation score: {:.2f}".format(grid_search.best_score_))

def process_file(file_path):
    df = pd.read_csv(file_path)
    df = df[df['label'].isin([3, 2])]  # Filtrar solo etiquetas 3 y 2
    labels = df['label']
    features = df.drop(columns=['subject_id', 'label'])
    groups = df['subject_id']  # Esta será la base para el GroupKFold
    run_rf_grid_search(features, labels, groups)

def main(folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.startswith("features_") and file_name.endswith(".csv"):
            file_path = os.path.join(folder_path, file_name)
            print(f"\nProcessing file: {file_name}")
            process_file(file_path)

# Cambia la ruta según corresponda
main('/home/ximo/Escritorio/ProyectoTFG/featuresExtended')



Processing file: features_1.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 100, 'select_k_best__k': 59}
Optimized cross-validation score: 0.79

Processing file: features_4.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 8, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 200, 'select_k_best__k': 69}
Optimized cross-validation score: 0.79

Processing file: features_3.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 100, 'select_k_best__k': 63}
Optimized cross-validation score: 0.80

Processing file: features_2.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': None, 'random_forest__min_samples_leaf': 6, 'random

In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold
import os

# Function to reduce features using SelectKBest
def select_features(X_train, y_train, X_test, k=10):
    # Initialize and fit SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X_train, y_train)
    
    # Transform both training and testing data
    X_train_reduced = selector.transform(X_train)
    X_test_reduced = selector.transform(X_test)
    
    return X_train_reduced, X_test_reduced

def run_rf_grid_search(features, labels, groups):
    pipe = Pipeline([
        ('select_k_best', SelectKBest(score_func=f_classif)),
        ('random_forest', RandomForestClassifier())
    ])

    # Definir rangos para 'k' y otros hiperparámetros
    k_range = list(range(48, 90, 1))
    n_estimators_range = [100, 200, 300, 400]
    max_depth_range = [None, 30, 40, 50]
    min_samples_split_range = [5, 10]
    min_samples_leaf_range = [2, 6, 8]

    params = {
        'select_k_best__k': k_range,
        'random_forest__n_estimators': n_estimators_range,
        'random_forest__max_depth': max_depth_range,
        'random_forest__min_samples_split': min_samples_split_range,
        'random_forest__min_samples_leaf': min_samples_leaf_range
    }

    group_kfold = GroupKFold(n_splits=5)
    grid_search = GridSearchCV(pipe, param_grid=params, cv=group_kfold, scoring='accuracy', n_jobs=-1)
    grid_search.fit(features, labels, groups=groups)
    
    print("Optimized parameters for Random Forest:", grid_search.best_params_)
    print("Optimized cross-validation score: {:.2f}".format(grid_search.best_score_))

def process_file(file_path):
    df = pd.read_csv(file_path)
    df = df[df['label'].isin([3, 2])]  # Filtrar solo etiquetas 3 y 2
    labels = df['label']
    features = df.drop(columns=['subject_id', 'label'])
    groups = df['subject_id']  # Esta será la base para el GroupKFold
    run_rf_grid_search(features, labels, groups)

def main(folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.startswith("features_") and file_name.endswith(".csv"):
            file_path = os.path.join(folder_path, file_name)
            print(f"\nProcessing file: {file_name}")
            process_file(file_path)

# Cambia la ruta según corresponda
main('/home/ximo/Escritorio/ProyectoTFG/featuresExtended')



Processing file: features_1.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 40, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 87}
Optimized cross-validation score: 0.80

Processing file: features_4.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 76}
Optimized cross-validation score: 0.80

Processing file: features_3.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 40, 'random_forest__min_samples_leaf': 2, 'random_forest__min_samples_split': 5, 'random_forest__n_estimators': 100, 'select_k_best__k': 83}
Optimized cross-validation score: 0.80

Processing file: features_2.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': None, 'random_forest__min_samples_leaf': 6, 'rando

In [6]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, GroupKFold
import os

# Function to reduce features using SelectKBest
def select_features(X_train, y_train, X_test, k=10):
    # Initialize and fit SelectKBest
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X_train, y_train)
    
    # Transform both training and testing data
    X_train_reduced = selector.transform(X_train)
    X_test_reduced = selector.transform(X_test)
    
    return X_train_reduced, X_test_reduced

def run_rf_grid_search(features, labels, groups):
    pipe = Pipeline([
        ('select_k_best', SelectKBest(score_func=f_classif)),
        ('random_forest', RandomForestClassifier())
    ])

    # Definir rangos para 'k' y otros hiperparámetros
    k_range = list(range(90, features.shape[1]+1, 1))
    n_estimators_range = [100, 200]
    max_depth_range = [None, 30, 40, 50]
    min_samples_split_range = [5, 10]
    min_samples_leaf_range = [2, 4, 6]

    params = {
        'select_k_best__k': k_range,
        'random_forest__n_estimators': n_estimators_range,
        'random_forest__max_depth': max_depth_range,
        'random_forest__min_samples_split': min_samples_split_range,
        'random_forest__min_samples_leaf': min_samples_leaf_range
    }

    group_kfold = GroupKFold(n_splits=5)
    grid_search = GridSearchCV(pipe, param_grid=params, cv=group_kfold, scoring='accuracy', n_jobs=-1)
    grid_search.fit(features, labels, groups=groups)
    
    print("Optimized parameters for Random Forest:", grid_search.best_params_)
    print("Optimized cross-validation score: {:.2f}".format(grid_search.best_score_))

def process_file(file_path):
    df = pd.read_csv(file_path)
    df = df[df['label'].isin([3, 2])]  # Filtrar solo etiquetas 3 y 2
    labels = df['label']
    features = df.drop(columns=['subject_id', 'label'])
    groups = df['subject_id']  # Esta será la base para el GroupKFold
    run_rf_grid_search(features, labels, groups)

def main(folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.startswith("features_") and file_name.endswith(".csv"):
            file_path = os.path.join(folder_path, file_name)
            print(f"\nProcessing file: {file_name}")
            process_file(file_path)

# Cambia la ruta según corresponda
main('/home/ximo/Escritorio/ProyectoTFG/featuresExtended')



Processing file: features_1.csv


Optimized parameters for Random Forest: {'random_forest__max_depth': None, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 116}
Optimized cross-validation score: 0.80

Processing file: features_4.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 111}
Optimized cross-validation score: 0.81

Processing file: features_3.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 115}
Optimized cross-validation score: 0.81

Processing file: features_2.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split':

Processing file: features_1.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': None, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 116}
Optimized cross-validation score: 0.80

Processing file: features_4.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 111}
Optimized cross-validation score: 0.81

Processing file: features_3.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 115}
Optimized cross-validation score: 0.81

Processing file: features_2.csv
Optimized parameters for Random Forest: {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 138}
Optimized cross-validation score: 0.82

In [2]:
import os
import time
import warnings
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GroupKFold, train_test_split

# Directorio base y modelo
folder_path = '/home/ximo/Escritorio/ProyectoTFG/featuresExtended'
output_path = '/home/ximo/Escritorio/ProyectoTFG/resultsBINMATES'
models = {
    "Random Forest": RandomForestClassifier
}

# Parámetros optimizados para Random Forest
optimized_parameters = {
    "features_1.csv": {'random_forest__max_depth': None, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 116},
    "features_4.csv": {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 111},
    "features_3.csv": {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 115},
    "features_2.csv": {'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 6, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100, 'select_k_best__k': 138}
}

# Funciones auxiliares
def create_unique_file_path(base_path, filename):
    counter = 1
    original_filename = filename
    while os.path.exists(os.path.join(base_path, filename)):
        filename = f"{os.path.splitext(original_filename)[0]}({counter}){os.path.splitext(original_filename)[1]}"
        counter += 1
    return os.path.join(base_path, filename)

def process_and_evaluate(file_path, models, optimized_parameters, output_path):
    df = pd.read_csv(file_path)
    df = df[df['label'].isin([2, 3])]  # Filtrar solo etiquetas 3 y 2
    features = df.drop(columns=['subject_id', 'label'])
    labels = df['label']
    subjects = df['subject_id']
    num_windows = file_path.split('_')[-1].split('.')[0]

    file_name = os.path.basename(file_path)
    if file_name in optimized_parameters:
        params = optimized_parameters[file_name]
        model_name = "Random Forest"
        k = params.pop('select_k_best__k')
        evaluate_and_save_results(features, labels, subjects, models, model_name, params, k, output_path, num_windows)

def evaluate_and_save_results(features, labels, subjects, models, model_name, model_params, k, output_path, num_windows):
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    
    selector = SelectKBest(f_classif, k=k)
    selected_features = selector.fit_transform(features, labels)
    gkf = GroupKFold(n_splits=len(np.unique(subjects)))

    clean_params = {key.split('__')[-1]: value for key, value in model_params.items()}
    model_cls = models[model_name]
    model = model_cls(**clean_params)
    cm_global = np.zeros((2, 2))
    model_times = []

    for train_idx, test_idx in gkf.split(selected_features, labels, groups=subjects):
        X_train, X_test = selected_features[train_idx], selected_features[test_idx]
        y_train, y_test = labels.iloc[train_idx], labels.iloc[test_idx]

        start_time = time.time()
        model.fit(X_train, y_train)
        end_time = time.time()
        predictions = model.predict(X_test)
        model_time = end_time - start_time
        model_times.append(model_time)

        cm = confusion_matrix(y_test, predictions, labels=[2, 3])
        cm_global += cm

    cm_df = pd.DataFrame(cm_global, index=[2, 3], columns=[2, 3])
    filename = f"conf_matrix_{num_windows}_{model_name}_{k}.csv"
    unique_file_path = create_unique_file_path(output_path, filename)
    cm_df.to_csv(unique_file_path)

    average_time = sum(model_times)
    times_file_path = f"{output_path}/model_times.csv"
    header = not os.path.exists(times_file_path)
    with open(times_file_path, "a") as f:
        if header:
            f.write("Model,Window,K,Time(s)\n")
        f.write(f"{model_name},{num_windows},{k},{average_time:.2f}\n")

# Procesamiento principal
for file_name in os.listdir(folder_path):
    if file_name.startswith("features_") and file_name.endswith(".csv"):
        file_path = os.path.join(folder_path, file_name)
        process_and_evaluate(file_path, models, optimized_parameters, output_path)
