# In [None]:
import pandas as pd
import numpy as np
import os
import time
import warnings
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, StackingClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GroupKFold
from sklearn.exceptions import ConvergenceWarning
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split


# Dictionary of optimized hyperparameters.
# Layout: CSV file name -> model name -> {parameter key: value}.
# Parameter keys use the "<prefix>__<parameter>" form; create_base_classifiers
# keeps only the part after the last "__" and drops 'select_k_best__k'.
# NOTE(review): the values presumably come from a pipeline grid search per
# feature file -- confirm against the tuning notebook.
optimized_parameters = {
    "features_1.csv": {
        "HistGradientBoosting": {'histgb__learning_rate': 0.05, 'histgb__max_depth': None, 'histgb__max_iter': 200, 'histgb__min_samples_leaf': 20, 'select_k_best__k': 38},
        "CatBoost": {'catboost__depth': 6, 'catboost__iterations': 200, 'catboost__l2_leaf_reg': 3, 'catboost__learning_rate': 0.05, 'select_k_best__k': 41},
        "ExtraTrees": {'et__max_depth': None, 'et__min_samples_split': 15, 'et__n_estimators': 100, 'select_k_best__k': 43},
        "Random Forest": {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 300, 'select_k_best__k': 39},
        "XGBoost": {'select_k_best__k': 44, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 7, 'xgb__n_estimators': 300},
        "KNN": {'knn__algorithm': 'auto', 'knn__n_neighbors': 49, 'knn__weights': 'distance', 'select_k_best__k': 8},
        "AdaBoost": {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 200, 'select_k_best__k': 37},
        "MLP": {'mlp__activation': 'relu', 'mlp__alpha': 0.01, 'mlp__early_stopping': True, 'mlp__hidden_layer_sizes': (100,), 'mlp__n_iter_no_change': 20, 'mlp__solver': 'adam', 'mlp__validation_fraction': 0.1, 'select_k_best__k': 31},
        "GaussianNB": {'gnb__var_smoothing': 0.001, 'select_k_best__k': 28},
        "SVM": {'select_k_best__k': 44, 'svm__C': 250, 'svm__loss': 'squared_hinge', 'svm__penalty': 'l1'}
    },
    "features_4.csv": {
        "HistGradientBoosting": {'histgb__learning_rate': 0.05, 'histgb__max_depth': 5, 'histgb__max_iter': 200, 'histgb__min_samples_leaf': 40, 'select_k_best__k': 38},
        "CatBoost": {'catboost__depth': 4, 'catboost__iterations': 200, 'catboost__l2_leaf_reg': 3, 'catboost__learning_rate': 0.05, 'select_k_best__k': 42},
        "ExtraTrees": {'et__max_depth': None, 'et__min_samples_split': 10, 'et__n_estimators': 100, 'select_k_best__k': 28},
        "Random Forest": {'random_forest__max_depth': 20, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 300, 'select_k_best__k': 38},
        "XGBoost": {'select_k_best__k': 41, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 3, 'xgb__n_estimators': 100},
        "KNN": {'knn__algorithm': 'auto', 'knn__n_neighbors': 35, 'knn__weights': 'uniform', 'select_k_best__k': 8},
        "AdaBoost": {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 100, 'select_k_best__k': 38},
        "MLP": {'mlp__activation': 'relu', 'mlp__alpha': 0.3, 'mlp__early_stopping': True, 'mlp__hidden_layer_sizes': (150,), 'mlp__n_iter_no_change': 20, 'mlp__solver': 'adam', 'mlp__validation_fraction': 0.1, 'select_k_best__k': 12},
        "GaussianNB": {'gnb__var_smoothing': 0.0001, 'select_k_best__k': 22},
        "SVM": {'select_k_best__k': 50, 'svm__C': 250, 'svm__loss': 'squared_hinge', 'svm__penalty': 'l1'}
    },
    "features_3.csv": {
        "HistGradientBoosting": {'histgb__learning_rate': 0.01, 'histgb__max_depth': 5, 'histgb__max_iter': 200, 'histgb__min_samples_leaf': 20, 'select_k_best__k': 33},
        "CatBoost": {'catboost__depth': 4, 'catboost__iterations': 200, 'catboost__l2_leaf_reg': 3, 'catboost__learning_rate': 0.05, 'select_k_best__k': 35},
        "ExtraTrees": {'et__max_depth': None, 'et__min_samples_split': 15, 'et__n_estimators': 100, 'select_k_best__k': 38},
        "Random Forest": {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 2, 'random_forest__n_estimators': 300, 'select_k_best__k': 38},
        "XGBoost": {'select_k_best__k': 35, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 300},
        "KNN": {'knn__algorithm': 'brute', 'knn__n_neighbors': 23, 'knn__weights': 'distance', 'select_k_best__k': 5},
        "AdaBoost": {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 150, 'select_k_best__k': 35},
        "MLP": {'mlp__activation': 'relu', 'mlp__alpha': 0.3, 'mlp__early_stopping': True, 'mlp__hidden_layer_sizes': (200,), 'mlp__n_iter_no_change': 20, 'mlp__solver': 'adam', 'mlp__validation_fraction': 0.1, 'select_k_best__k': 29},
        "GaussianNB": {'gnb__var_smoothing': 0.0001, 'select_k_best__k': 25},
        "SVM": {'select_k_best__k': 46, 'svm__C': 250, 'svm__loss': 'squared_hinge', 'svm__penalty': 'l1'}
    },
    "features_2.csv": {
        "HistGradientBoosting": {'histgb__learning_rate': 0.01, 'histgb__max_depth': 10, 'histgb__max_iter': 200, 'histgb__min_samples_leaf': 20, 'select_k_best__k': 33},
        "CatBoost": {'catboost__depth': 4, 'catboost__iterations': 200, 'catboost__l2_leaf_reg': 3, 'catboost__learning_rate': 0.1, 'select_k_best__k': 41},
        "ExtraTrees": {'et__max_depth': None, 'et__min_samples_split': 15, 'et__n_estimators': 200, 'select_k_best__k': 36},
        "Random Forest": {'random_forest__max_depth': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 300, 'select_k_best__k': 39},
        "XGBoost": {'select_k_best__k': 35, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__n_estimators': 200},
        "KNN": {'knn__algorithm': 'auto', 'knn__n_neighbors': 15, 'knn__weights': 'uniform', 'select_k_best__k': 5},
        "AdaBoost": {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 150, 'select_k_best__k': 38},
        "MLP": {'mlp__activation': 'relu', 'mlp__alpha': 0.1, 'mlp__early_stopping': True, 'mlp__hidden_layer_sizes': (100,), 'mlp__n_iter_no_change': 20, 'mlp__solver': 'adam', 'mlp__validation_fraction': 0.1, 'select_k_best__k': 17},
        "GaussianNB": {'gnb__var_smoothing': 0.0001, 'select_k_best__k': 28},
        "SVM": {'select_k_best__k': 39, 'svm__C': 100, 'svm__loss': 'squared_hinge', 'svm__penalty': 'l1'}
    }
}

# Base models.
# Maps each model name used in optimized_parameters to its estimator class
# (not an instance); create_base_classifiers instantiates these classes with
# the tuned keyword arguments.
models = {
    "Random Forest": RandomForestClassifier,
    "XGBoost": xgb.XGBClassifier,
    "HistGradientBoosting": HistGradientBoostingClassifier,
    "CatBoost": CatBoostClassifier,
    "ExtraTrees": ExtraTreesClassifier,
    "KNN": KNeighborsClassifier,
    "AdaBoost": AdaBoostClassifier,
    "MLP": MLPClassifier,
    "GaussianNB": GaussianNB,
    "SVM": LinearSVC,
}

# Build the base classifiers from the optimized hyperparameters
def create_base_classifiers(file_name):
    """Instantiate one estimator per model configured for ``file_name``.

    The tuned parameters are read from the module-level
    ``optimized_parameters`` and the estimator classes from the module-level
    ``models``.

    Args:
        file_name: Key into ``optimized_parameters`` (a CSV file name).

    Returns:
        A list of ``(model_name, estimator)`` tuples, in the order the models
        appear in ``optimized_parameters[file_name]``.

    Raises:
        KeyError: If ``file_name`` or one of its model names is not configured.
    """
    def _to_constructor_kwargs(raw_params):
        # Keep only the text after the last "__" of each key and drop the
        # feature-selection setting, which is not an estimator argument.
        return {
            name.split('__')[-1]: setting
            for name, setting in raw_params.items()
            if name != 'select_k_best__k'
        }

    named_estimators = []
    for model_name, raw_params in optimized_parameters[file_name].items():
        kwargs = _to_constructor_kwargs(raw_params)

        # SVM-specific tweak: force dual=False, needed when penalty='l1'
        if model_name == "SVM":
            kwargs['dual'] = False

        estimator = models[model_name](**kwargs)
        named_estimators.append((model_name, estimator))
    return named_estimators

# Function to create a unique file path if the file already exists
def create_unique_file_path(base_path, filename):
    """Return a path inside ``base_path`` that does not exist yet.

    If ``base_path/filename`` is free it is returned unchanged. Otherwise a
    counter is inserted before the extension -- ``name(1).ext``,
    ``name(2).ext``, ... -- until an unused path is found.

    Args:
        base_path: Directory the file will live in.
        filename: Preferred file name.

    Returns:
        The first candidate path for which ``os.path.exists`` is false.
    """
    stem, extension = os.path.splitext(filename)
    candidate = os.path.join(base_path, filename)
    attempt = 0
    while os.path.exists(candidate):
        attempt += 1
        candidate = os.path.join(base_path, f"{stem}({attempt}){extension}")
    return candidate

# Function to perform evaluation for a given k and write the results to a file
def evaluate_and_save_results(features, labels, subjects, model, model_name, k, output_path, num_windows):
    """Cross-validate ``model`` with one fold per subject and save the results.

    A ``GroupKFold`` with as many splits as there are distinct values in
    ``subjects`` is used. In every fold the ``k`` best features are chosen by
    ``SelectKBest(f_classif)`` fitted on the training rows only, so the
    held-out rows never influence the selection. The model is then fitted and
    its predictions on the held-out rows are added to one accumulated 4x4
    confusion matrix.

    Two files are written under ``output_path`` (created if missing):

    * ``conf_matrix_{num_windows}_{model_name}_{k}.csv`` -- the accumulated
      confusion matrix; ``create_unique_file_path`` picks a free name if that
      one is already taken.
    * ``model_times.csv`` -- one row appended per call, holding the total time
      spent in ``model.fit`` summed over all folds.

    Args:
        features: 2-D feature table (DataFrame or array-like), one row per sample.
        labels: pandas Series of class labels aligned with ``features``
            (indexed with ``.iloc``). Only labels 0, 1, 2 and 3 are counted.
        subjects: Group identifier per sample, used as the GroupKFold groups.
        model: Estimator with ``fit`` and ``predict``; refitted on every fold.
        model_name: Name used in the output file name and the timing row.
        k: Number of features kept by ``SelectKBest``.
        output_path: Directory that receives the result files.
        num_windows: Value written to the file name and the ``Window`` column.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_path, exist_ok=True)

    class_labels = [0, 1, 2, 3]  # Adjust to the unique labels present in the data
    feature_matrix = np.asarray(features)
    gkf = GroupKFold(n_splits=len(np.unique(subjects)))

    cm_global = np.zeros((len(class_labels), len(class_labels)))
    model_times = []

    for train_idx, test_idx in gkf.split(feature_matrix, labels, groups=subjects):
        y_train, y_test = labels.iloc[train_idx], labels.iloc[test_idx]

        # Fit the selector on the training fold only; fitting it on the whole
        # dataset would leak the held-out labels into the feature ranking.
        selector = SelectKBest(f_classif, k=k)
        X_train = selector.fit_transform(feature_matrix[train_idx], y_train)
        X_test = selector.transform(feature_matrix[test_idx])

        # Only model.fit is timed (feature selection and prediction are excluded)
        start_time = time.perf_counter()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ConvergenceWarning)
            model.fit(X_train, y_train)
        model_times.append(time.perf_counter() - start_time)

        predictions = model.predict(X_test)
        # Accumulate this fold's confusion matrix into the global one
        cm_global += confusion_matrix(y_test, predictions, labels=class_labels)

    cm_df = pd.DataFrame(cm_global, index=class_labels, columns=class_labels)
    filename = f"conf_matrix_{num_windows}_{model_name}_{k}.csv"
    unique_file_path = create_unique_file_path(output_path, filename)
    cm_df.to_csv(unique_file_path)

    # Save the model's fitting time: the Time(s) column is the sum over folds
    total_time = sum(model_times)
    times_file_path = os.path.join(output_path, "model_times.csv")
    write_header = not os.path.exists(times_file_path)  # Header only for a new file
    with open(times_file_path, "a") as f:
        if write_header:
            f.write("Model,Window,K,Time(s)\n")
        f.write(f"{model_name},{num_windows},{k},{total_time}\n")

# Main processing and evaluation function
def process_and_evaluate(file_path, models, optimized_parameters, output_path):
    """Evaluate a stacking ensemble on one feature CSV for several values of k.

    Files whose base name is not a key of ``optimized_parameters`` are skipped
    before the CSV is opened. Otherwise the CSV is loaded, the ``subject_id``
    and ``label`` columns are separated from the features, and a
    ``StackingClassifier`` (base estimators from ``create_base_classifiers``,
    random-forest final estimator) is evaluated once per candidate ``k`` via
    ``evaluate_and_save_results``.

    Args:
        file_path: Path to a CSV containing ``subject_id``, ``label`` and the
            feature columns.
        models: Accepted for interface compatibility; not used in this function.
        optimized_parameters: Mapping whose keys are the supported CSV file
            names. Only membership is checked here.
        output_path: Directory passed on to ``evaluate_and_save_results``.

    Returns:
        None.
    """
    file_name = os.path.basename(file_path)
    if file_name not in optimized_parameters:
        # No tuned parameters for this file: skip it without reading the CSV.
        return

    df = pd.read_csv(file_path)
    drop_cols = ['subject_id', 'label']
    features = df.drop(columns=drop_cols)
    labels = df['label']
    subjects = df['subject_id']
    # Extract the number of windows from the file name ("features_4.csv" -> "4").
    # Parsing the base name keeps directory names out of the result.
    num_windows = file_name.split('_')[-1].split('.')[0]

    base_classifiers = create_base_classifiers(file_name)

    k_values = [33, 35, 37, 39, 41, 43, 45, 50]  # Numbers of selected features to evaluate
    for k in k_values:
        # Stacking Classifier
        stacking_model = StackingClassifier(estimators=base_classifiers, final_estimator=RandomForestClassifier(n_estimators=100))
        evaluate_and_save_results(features, labels, subjects, stacking_model, 'StackingClassifier', k, output_path, num_windows)
            

# Run the processing: evaluate every "features_*.csv" file found in folder_path
folder_path = '/home/ximo/Escritorio/ProyectoTFG/featuresExtended'
output_path = '/home/ximo/Escritorio/ProyectoTFG/results'
for file_name in os.listdir(folder_path):
    # Ignore anything that is not a feature CSV
    if not (file_name.startswith("features_") and file_name.endswith(".csv")):
        continue
    file_path = os.path.join(folder_path, file_name)
    process_and_evaluate(file_path, models, optimized_parameters, output_path)
