In [None]:
# Importar las librer√≠as necesarias
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score
)

# diccionario para guardar todas las m√©tricas
metricas = {}

In [2]:
# --- 1. Carga del Conjunto de Datos ---
data = pd.read_csv("../data/subset/clean_subset_lifestyledata_rows5200_seed5200.csv")

# --- 2. Codificaci√≥n de Etiquetas (Label Encoding) ---
label_encoder = LabelEncoder()
# Se transforma la variable objetivo 'Workout_Type' a valores num√©ricos.
data['Workout_Type'] = label_encoder.fit_transform(data['Workout_Type'])

# --- 3. Codificaci√≥n One-Hot (One-Hot Encoding) ---
# Se define la lista de columnas categ√≥ricas nominales a transformar.
nominal_cols = ['Gender']
# sparse_output=False: Devuelve una matriz densa (array de NumPy) en lugar de una dispersa.
# handle_unknown='ignore': Si aparece una categor√≠a no vista durante la transformaci√≥n, la ignora.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Esto crea nuevas columnas binarias para cada categor√≠a.
encoded = ohe.fit_transform(data[nominal_cols])
# Se convierte la matriz resultante en un DataFrame con nombres de columna apropiados.
encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(nominal_cols))

# --- 4. Combinaci√≥n de los Datos Procesados ---
# Se elimina la columna original 'Gender' del DataFrame principal.
# reset_index(drop=True) asegura que los √≠ndices se alineen correctamente para la concatenaci√≥n.
data = data.drop(columns=nominal_cols).reset_index(drop=True)
encoded_df = encoded_df.reset_index(drop=True)

# Se concatenan el DataFrame original y el nuevo DataFrame con las columnas codificadas.
# axis=1 indica que la uni√≥n se realiza por columnas.
data = pd.concat([data, encoded_df], axis=1)

# --- 5. Visualizaci√≥n ---
data.head()

Unnamed: 0,Age,Weight_kg,Height_m,Max_BPM,Avg_BPM,Resting_BPM,Session_Duration_hours,Calories_Burned,Workout_Type,Gender_Female,Gender_Male
0,21.14,101.05,1.95,171.17,130.81,68.96,0.97,959.43,2,0.0,1.0
1,44.17,41.63,1.78,167.33,158.46,63.95,1.48,1424.35,0,0.0,1.0
2,20.07,63.81,1.78,187.86,137.11,60.93,1.7,1766.64,0,1.0,0.0
3,36.3,59.77,1.78,183.83,120.32,60.01,0.85,1028.5,1,1.0,0.0
4,51.99,57.6,1.56,166.25,151.82,67.97,1.66,1295.8,3,0.0,1.0


# √Årbol de Decisiones:

#### 1. √Årbol de Decisiones - CC:SI - ED:NO - Outliers:NO - Balanceo: NO

In [None]:
# ================================================================
# üìÇ Datos base
# ================================================================
data_tree_1 = data.copy()
X = data_tree_1.drop("Workout_Type", axis=1)
y = data_tree_1["Workout_Type"]

# ================================================================
# üîÅ Tres muestras
# ================================================================
random_states = [111, 222, 333]  # tres seeds diferentes
resultados = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=========== üß† CASO DE PRUEBA {i} (random_state={seed}) ===========")

    # Dividir datos
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # --- Modelo 1: √Årbol con Gini ---
    modelo_gini = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=seed)
    modelo_gini.fit(X_train, y_train)
    y_pred_gini = modelo_gini.predict(X_test)

    # --- Modelo 2: √Årbol con Entrop√≠a ---
    modelo_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=seed)
    modelo_entropy.fit(X_train, y_train)
    y_pred_entropy = modelo_entropy.predict(X_test)

    # --- Modelo 3: Entrop√≠a con poda ---
    modelo_entropy_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=5, random_state=seed)
    modelo_entropy_pruned.fit(X_train, y_train)
    y_pred_entropy_pruned = modelo_entropy_pruned.predict(X_test)

    # --- Calcular m√©tricas ---
    modelos = {
        "Gini": y_pred_gini,
        "Entrop√≠a": y_pred_entropy,
        "Entrop√≠a Podado": y_pred_entropy_pruned
    }

    for nombre, y_pred in modelos.items():
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        
        resultados.append({
            "Caso": i,
            "Random State": seed,
            "Modelo": nombre,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })
        
        print(f"\nModelo: {nombre}")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")




Modelo: Gini
Accuracy: 0.7327, Precision: 0.7412, Recall: 0.7327, F1-Score: 0.7350

Modelo: Entrop√≠a
Accuracy: 0.7779, Precision: 0.7923, Recall: 0.7779, F1-Score: 0.7804

Modelo: Entrop√≠a Podado
Accuracy: 0.7779, Precision: 0.7923, Recall: 0.7779, F1-Score: 0.7804


Modelo: Gini
Accuracy: 0.7452, Precision: 0.7431, Recall: 0.7452, F1-Score: 0.7438

Modelo: Entrop√≠a
Accuracy: 0.7875, Precision: 0.7968, Recall: 0.7875, F1-Score: 0.7892

Modelo: Entrop√≠a Podado
Accuracy: 0.7875, Precision: 0.7968, Recall: 0.7875, F1-Score: 0.7892


Modelo: Gini
Accuracy: 0.7587, Precision: 0.7538, Recall: 0.7587, F1-Score: 0.7559

Modelo: Entrop√≠a
Accuracy: 0.7837, Precision: 0.8018, Recall: 0.7837, F1-Score: 0.7823

Modelo: Entrop√≠a Podado
Accuracy: 0.7837, Precision: 0.8018, Recall: 0.7837, F1-Score: 0.7823


In [4]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la importancia de variables y gr√°ficar el arbol gini

#### 2. √Årbol de Decisiones - CC:SI - ED:NO - Outliers:NO - Balanceo: SI

In [32]:
# ================================================================
# üìÇ Datos base
# ================================================================
data_tree_2 = data.copy()
X = data_tree_2.drop("Workout_Type", axis=1)
y = data_tree_2["Workout_Type"]

# ================================================================
# üîÅ Tres muestras con class_weight='balanced'
# ================================================================
random_states = [111, 222, 333]  # Tres seeds diferentes
resultados = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=========== üß† CASO DE PRUEBA {i} (random_state={seed}) ===========")

    # Dividir datos (80/20 estratificado)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # --- Modelo 1: √Årbol con Gini (class_weight='balanced') ---
    modelo_gini = DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=seed,
        class_weight='balanced' 
    )
    modelo_gini.fit(X_train, y_train)
    y_pred_gini = modelo_gini.predict(X_test)

    # --- Modelo 2: √Årbol con Entrop√≠a (class_weight='balanced') ---
    modelo_entropy = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        random_state=seed,
        class_weight='balanced' 
    )
    modelo_entropy.fit(X_train, y_train)
    y_pred_entropy = modelo_entropy.predict(X_test)

    # --- Modelo 3: Entrop√≠a con poda (class_weight='balanced') ---
    modelo_entropy_pruned = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        min_samples_split=5, # Poda
        random_state=seed,
        class_weight='balanced' 
    )
    modelo_entropy_pruned.fit(X_train, y_train)
    y_pred_entropy_pruned = modelo_entropy_pruned.predict(X_test)

    # --- Calcular m√©tricas ---
    modelos = {
        "Gini_Balanced": y_pred_gini,
        "Entrop√≠a_Balanced": y_pred_entropy,
        "Entrop√≠a_Podado_Balanced": y_pred_entropy_pruned
    }

    for nombre, y_pred in modelos.items():
        # Calcular m√©tricas, usando zero_division=0 para un manejo robusto
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        resultados.append({
            "Caso": i,
            "Random State": seed,
            "Modelo": nombre,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

        print(f"\nModelo: {nombre}")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")



Modelo: Gini_Balanced
Accuracy: 0.7327, Precision: 0.7412, Recall: 0.7327, F1-Score: 0.7350

Modelo: Entrop√≠a_Balanced
Accuracy: 0.7779, Precision: 0.7923, Recall: 0.7779, F1-Score: 0.7804

Modelo: Entrop√≠a_Podado_Balanced
Accuracy: 0.7779, Precision: 0.7923, Recall: 0.7779, F1-Score: 0.7804


Modelo: Gini_Balanced
Accuracy: 0.7375, Precision: 0.7338, Recall: 0.7375, F1-Score: 0.7348

Modelo: Entrop√≠a_Balanced
Accuracy: 0.7740, Precision: 0.7726, Recall: 0.7740, F1-Score: 0.7720

Modelo: Entrop√≠a_Podado_Balanced
Accuracy: 0.7740, Precision: 0.7726, Recall: 0.7740, F1-Score: 0.7720


Modelo: Gini_Balanced
Accuracy: 0.7365, Precision: 0.7361, Recall: 0.7365, F1-Score: 0.7243

Modelo: Entrop√≠a_Balanced
Accuracy: 0.7846, Precision: 0.8026, Recall: 0.7846, F1-Score: 0.7833

Modelo: Entrop√≠a_Podado_Balanced
Accuracy: 0.7846, Precision: 0.8026, Recall: 0.7846, F1-Score: 0.7833


In [6]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la importancia de variables y gr√°ficar el arbol gini

#### 3. √Årbol de Decisiones - CC:SI - ED:NO - Outliers:SI - Balanceo: NO

In [33]:
# ================================================================
# üìÇ Datos base
# ================================================================
data_tree_3 = data.copy()

# seleccionar solo las columnas num√©ricas
num_cols = data_tree_3.select_dtypes(include=['float64', 'int64']).columns

# calcular Q1, Q3 y el rango intercuart√≠lico (IQR)
Q1 = data_tree_3[num_cols].quantile(0.25)
Q3 = data_tree_3[num_cols].quantile(0.75)
IQR = Q3 - Q1

# crear una m√°scara booleana que identifique las filas SIN outliers
mask = ~((data_tree_3[num_cols] < (Q1 - 1.5 * IQR)) |
         (data_tree_3[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)

# filtrar los datos limpios
data_clean = data_tree_3[mask].reset_index(drop=True)

print("Tama√±o original:", data_tree_3.shape)
print("Tama√±o sin outliers:", data_clean.shape)

# Separar X e y con datos limpios
X = data_clean.drop("Workout_Type", axis=1)
y = data_clean["Workout_Type"]

# ================================================================
# üîÅ Tres muestras
# ================================================================
random_states = [111, 222, 333]  
resultados = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=========== üß† CASO DE PRUEBA {i} (random_state={seed}) ===========")

    # Dividir datos (80/20 estratificado con el seed actual)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # --- Modelo 1: √Årbol con Gini ---
    modelo_gini = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=seed)
    modelo_gini.fit(X_train, y_train)
    y_pred_gini = modelo_gini.predict(X_test)

    # --- Modelo 2: √Årbol con Entrop√≠a ---
    modelo_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=seed)
    modelo_entropy.fit(X_train, y_train)
    y_pred_entropy = modelo_entropy.predict(X_test)

    # --- Modelo 3: Entrop√≠a con poda ---
    modelo_entropy_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=5, random_state=seed)
    modelo_entropy_pruned.fit(X_train, y_train)
    y_pred_entropy_pruned = modelo_entropy_pruned.predict(X_test)

    # --- Calcular m√©tricas ---
    modelos = {
        "Gini_Clean": y_pred_gini,
        "Entrop√≠a_Clean": y_pred_entropy,
        "Entrop√≠a_Podado_Clean": y_pred_entropy_pruned
    }

    for nombre, y_pred in modelos.items():
        # Calcular m√©tricas (usando zero_division=0 por robustez)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        resultados.append({
            "Caso": i,
            "Random State": seed,
            "Modelo": nombre,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

        print(f"\nModelo: {nombre}")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")


Tama√±o original: (5200, 11)
Tama√±o sin outliers: (5055, 11)


Modelo: Gini_Clean
Accuracy: 0.7399, Precision: 0.7479, Recall: 0.7399, F1-Score: 0.7403

Modelo: Entrop√≠a_Clean
Accuracy: 0.8042, Precision: 0.8154, Recall: 0.8042, F1-Score: 0.8054

Modelo: Entrop√≠a_Podado_Clean
Accuracy: 0.8042, Precision: 0.8154, Recall: 0.8042, F1-Score: 0.8054


Modelo: Gini_Clean
Accuracy: 0.7933, Precision: 0.8097, Recall: 0.7933, F1-Score: 0.7919

Modelo: Entrop√≠a_Clean
Accuracy: 0.7982, Precision: 0.8104, Recall: 0.7982, F1-Score: 0.8004

Modelo: Entrop√≠a_Podado_Clean
Accuracy: 0.7982, Precision: 0.8104, Recall: 0.7982, F1-Score: 0.8004


Modelo: Gini_Clean
Accuracy: 0.7923, Precision: 0.7979, Recall: 0.7923, F1-Score: 0.7928

Modelo: Entrop√≠a_Clean
Accuracy: 0.8200, Precision: 0.8163, Recall: 0.8200, F1-Score: 0.8177

Modelo: Entrop√≠a_Podado_Clean
Accuracy: 0.8180, Precision: 0.8149, Recall: 0.8180, F1-Score: 0.8160


In [8]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la importancia de variables y gr√°ficar el arbol gini

#### 4. √Årbol de Decisiones - CC:SI - ED:NO - Outliers:SI - Balanceo: SI

In [34]:
# ================================================================
# üìÇ Datos base
# ================================================================
data_tree_4 = data.copy()

# seleccionar solo las columnas num√©ricas
num_cols = data_tree_4.select_dtypes(include=['float64', 'int64']).columns

# calcular Q1, Q3 y el rango intercuart√≠lico (IQR)
Q1 = data_tree_4[num_cols].quantile(0.25)
Q3 = data_tree_4[num_cols].quantile(0.75)
IQR = Q3 - Q1

# crear una m√°scara booleana que identifique las filas SIN outliers
mask = ~((data_tree_4[num_cols] < (Q1 - 1.5 * IQR)) |
         (data_tree_4[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)

# filtrar los datos limpios
data_clean = data_tree_4[mask].reset_index(drop=True)

print("Tama√±o original:", data_tree_4.shape)
print("Tama√±o sin outliers:", data_clean.shape)

# Separar X e y con datos limpios
X = data_clean.drop("Workout_Type", axis=1)
y = data_clean["Workout_Type"]

# ================================================================
# üîÅ Tres muestras 
# ================================================================
random_states = [111, 222, 333]  #
resultados = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=========== üß† CASO DE PRUEBA {i} (random_state={seed}) ===========")

    # Dividir datos (80/20 estratificado con el seed actual)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # --- Modelo 1: √Årbol con Gini + Balanced ---
    modelo_gini = DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=seed,
        class_weight='balanced' 
    )
    modelo_gini.fit(X_train, y_train)
    y_pred_gini = modelo_gini.predict(X_test)

    # --- Modelo 2: √Årbol con Entrop√≠a + Balanced ---
    modelo_entropy = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        random_state=seed,
        class_weight='balanced'
    )
    modelo_entropy.fit(X_train, y_train)
    y_pred_entropy = modelo_entropy.predict(X_test)

    # --- Modelo 3: Entrop√≠a con poda + Balanced ---
    modelo_entropy_pruned = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        min_samples_split=5, # Poda
        random_state=seed,
        class_weight='balanced' 
    )
    modelo_entropy_pruned.fit(X_train, y_train)
    y_pred_entropy_pruned = modelo_entropy_pruned.predict(X_test)

    # --- Calcular m√©tricas ---
    modelos = {
        "Gini_Clean_Balanced": y_pred_gini,
        "Entrop√≠a_Clean_Balanced": y_pred_entropy,
        "Entrop√≠a_Podado_Clean_Balanced": y_pred_entropy_pruned
    }

    for nombre, y_pred in modelos.items():
        # Calcular m√©tricas (usando zero_division=0 por robustez)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        resultados.append({
            "Caso": i,
            "Random State": seed,
            "Modelo": nombre,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

        print(f"\nModelo: {nombre}")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")

Tama√±o original: (5200, 11)
Tama√±o sin outliers: (5055, 11)


Modelo: Gini_Clean_Balanced
Accuracy: 0.7745, Precision: 0.7935, Recall: 0.7745, F1-Score: 0.7710

Modelo: Entrop√≠a_Clean_Balanced
Accuracy: 0.8042, Precision: 0.8154, Recall: 0.8042, F1-Score: 0.8054

Modelo: Entrop√≠a_Podado_Clean_Balanced
Accuracy: 0.8042, Precision: 0.8154, Recall: 0.8042, F1-Score: 0.8054


Modelo: Gini_Clean_Balanced
Accuracy: 0.7933, Precision: 0.8097, Recall: 0.7933, F1-Score: 0.7919

Modelo: Entrop√≠a_Clean_Balanced
Accuracy: 0.7982, Precision: 0.8104, Recall: 0.7982, F1-Score: 0.8004

Modelo: Entrop√≠a_Podado_Clean_Balanced
Accuracy: 0.7982, Precision: 0.8104, Recall: 0.7982, F1-Score: 0.8004


Modelo: Gini_Clean_Balanced
Accuracy: 0.7943, Precision: 0.7994, Recall: 0.7943, F1-Score: 0.7946

Modelo: Entrop√≠a_Clean_Balanced
Accuracy: 0.8190, Precision: 0.8156, Recall: 0.8190, F1-Score: 0.8168

Modelo: Entrop√≠a_Podado_Clean_Balanced
Accuracy: 0.8180, Precision: 0.8149, Recall: 0.8180, F1-Score: 

In [10]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la importancia de variables y gr√°ficar el arbol gini

#### 5. √Årbol de Decisiones - CC:SI - ED:SI - Outliers:NO - Balanceo: NO

In [35]:
# ================================================================
# üìÇ Datos base
# ================================================================
data_tree_5 = data.copy()

# Se definen las caracter√≠sticas (X) y la variable objetivo (y)
X = data_tree_5.drop("Workout_Type", axis=1)
y = data_tree_5["Workout_Type"]

# Definir el escalador
scaler = StandardScaler()

# ================================================================
# üîÅ Tres muestras 
# ================================================================
random_states = [111, 222, 333]  
resultados = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=========== üß† CASO DE PRUEBA {i} (random_state={seed}) ===========")

    # 1. Dividir datos (80/20 estratificado con el seed actual)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # 2. Preprocesamiento: Escalado solo de columnas num√©ricas (excluyendo 'Gender')
    numeric_cols = [col for col in X_train.columns if col not in ['Gender']] 

    # Hacer copias para el escalado
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # Escalar
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

    # 3. Entrenamiento y Predicci√≥n de Modelos
    
    # --- Modelo 1: √Årbol con Gini ---
    modelo_gini = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=seed)
    modelo_gini.fit(X_train_scaled, y_train)
    y_pred_gini = modelo_gini.predict(X_test_scaled)

    # --- Modelo 2: √Årbol con Entrop√≠a ---
    modelo_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=seed)
    modelo_entropy.fit(X_train_scaled, y_train)
    y_pred_entropy = modelo_entropy.predict(X_test_scaled)

    # --- Modelo 3: Entrop√≠a con poda ---
    modelo_entropy_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=5, random_state=seed)
    modelo_entropy_pruned.fit(X_train_scaled, y_train)
    y_pred_entropy_pruned = modelo_entropy_pruned.predict(X_test_scaled)

    # 4. Calcular m√©tricas
    modelos = {
        "Gini_Scaled": y_pred_gini,
        "Entrop√≠a_Scaled": y_pred_entropy,
        "Entrop√≠a_Podado_Scaled": y_pred_entropy_pruned
    }

    for nombre, y_pred in modelos.items():
        # Calcular m√©tricas (usando zero_division=0 por robustez)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        resultados.append({
            "Caso": i,
            "Random State": seed,
            "Modelo": nombre,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

        print(f"\nModelo: {nombre}")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")



Modelo: Gini_Scaled
Accuracy: 0.7327, Precision: 0.7412, Recall: 0.7327, F1-Score: 0.7350

Modelo: Entrop√≠a_Scaled
Accuracy: 0.7779, Precision: 0.7923, Recall: 0.7779, F1-Score: 0.7804

Modelo: Entrop√≠a_Podado_Scaled
Accuracy: 0.7779, Precision: 0.7923, Recall: 0.7779, F1-Score: 0.7804


Modelo: Gini_Scaled
Accuracy: 0.7452, Precision: 0.7431, Recall: 0.7452, F1-Score: 0.7438

Modelo: Entrop√≠a_Scaled
Accuracy: 0.7875, Precision: 0.7968, Recall: 0.7875, F1-Score: 0.7892

Modelo: Entrop√≠a_Podado_Scaled
Accuracy: 0.7875, Precision: 0.7968, Recall: 0.7875, F1-Score: 0.7892


Modelo: Gini_Scaled
Accuracy: 0.7587, Precision: 0.7538, Recall: 0.7587, F1-Score: 0.7559

Modelo: Entrop√≠a_Scaled
Accuracy: 0.7837, Precision: 0.8018, Recall: 0.7837, F1-Score: 0.7823

Modelo: Entrop√≠a_Podado_Scaled
Accuracy: 0.7837, Precision: 0.8018, Recall: 0.7837, F1-Score: 0.7823


In [12]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la importancia de variables y gr√°ficar el arbol gini

#### 6. √Årbol de Decisiones - CC:SI - ED:SI - Outliers:NO - Balanceo: SI

In [36]:
# ================================================================
# üìÇ Datos base
# ================================================================
data_tree_6 = data.copy()

# Se definen las caracter√≠sticas (X) y la variable objetivo (y)
X = data_tree_6.drop("Workout_Type", axis=1)
y = data_tree_6["Workout_Type"]

# Definir el escalador
scaler = StandardScaler()

# ================================================================
# üîÅ Tres muestras 
# ================================================================
random_states = [111, 222, 333]  
resultados = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=========== üß† CASO DE PRUEBA {i} (random_state={seed}) ===========")

    # 1. Dividir datos (80/20 estratificado con el seed actual)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # 2. Preprocesamiento: Escalado solo de columnas num√©ricas
    numeric_cols = [col for col in X_train.columns if col not in ['Gender']] # Se asume 'Gender' es la √∫nica no num√©rica relevante aqu√≠

    # Hacer copias para el escalado
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # Escalar
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

    # 3. Entrenamiento y Predicci√≥n de Modelos (con class_weight='balanced')

    # --- Modelo 1: √Årbol con Gini + Balanced ---
    modelo_gini = DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=seed,
        class_weight='balanced' 
    )
    modelo_gini.fit(X_train_scaled, y_train)
    y_pred_gini = modelo_gini.predict(X_test_scaled)

    # --- Modelo 2: √Årbol con Entrop√≠a + Balanced ---
    modelo_entropy = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        random_state=seed,
        class_weight='balanced' 
    )
    modelo_entropy.fit(X_train_scaled, y_train)
    y_pred_entropy = modelo_entropy.predict(X_test_scaled)

    # --- Modelo 3: Entrop√≠a con poda + Balanced ---
    modelo_entropy_pruned = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        min_samples_split=5, # Poda
        random_state=seed,
        class_weight='balanced' 
    )
    modelo_entropy_pruned.fit(X_train_scaled, y_train)
    y_pred_entropy_pruned = modelo_entropy_pruned.predict(X_test_scaled)

    # 4. Calcular m√©tricas
    modelos = {
        "Gini_Scaled_Balanced": y_pred_gini,
        "Entrop√≠a_Scaled_Balanced": y_pred_entropy,
        "Entrop√≠a_Podado_Scaled_Balanced": y_pred_entropy_pruned
    }

    for nombre, y_pred in modelos.items():
        # Calcular m√©tricas (usando zero_division=0 por robustez)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        resultados.append({
            "Caso": i,
            "Random State": seed,
            "Modelo": nombre,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

        print(f"\nModelo: {nombre}")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")



Modelo: Gini_Scaled_Balanced
Accuracy: 0.7327, Precision: 0.7412, Recall: 0.7327, F1-Score: 0.7350

Modelo: Entrop√≠a_Scaled_Balanced
Accuracy: 0.7779, Precision: 0.7923, Recall: 0.7779, F1-Score: 0.7804

Modelo: Entrop√≠a_Podado_Scaled_Balanced
Accuracy: 0.7779, Precision: 0.7923, Recall: 0.7779, F1-Score: 0.7804


Modelo: Gini_Scaled_Balanced
Accuracy: 0.7375, Precision: 0.7338, Recall: 0.7375, F1-Score: 0.7348

Modelo: Entrop√≠a_Scaled_Balanced
Accuracy: 0.7740, Precision: 0.7726, Recall: 0.7740, F1-Score: 0.7720

Modelo: Entrop√≠a_Podado_Scaled_Balanced
Accuracy: 0.7740, Precision: 0.7726, Recall: 0.7740, F1-Score: 0.7720


Modelo: Gini_Scaled_Balanced
Accuracy: 0.7365, Precision: 0.7361, Recall: 0.7365, F1-Score: 0.7243

Modelo: Entrop√≠a_Scaled_Balanced
Accuracy: 0.7846, Precision: 0.8026, Recall: 0.7846, F1-Score: 0.7833

Modelo: Entrop√≠a_Podado_Scaled_Balanced
Accuracy: 0.7846, Precision: 0.8026, Recall: 0.7846, F1-Score: 0.7833


In [14]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la importancia de variables y gr√°ficar el arbol gini

#### 7. √Årbol de Decisiones - CC:SI - ED:SI - Outliers:SI - Balanceo: NO

In [37]:
# ================================================================
# üìÇ Datos base
# ================================================================
data_tree_7 = data.copy()

# seleccionar solo las columnas num√©ricas
num_cols = data_tree_7.select_dtypes(include=['float64', 'int64']).columns

# calcular Q1, Q3 y el rango intercuart√≠lico (IQR)
Q1 = data_tree_7[num_cols].quantile(0.25)
Q3 = data_tree_7[num_cols].quantile(0.75)
IQR = Q3 - Q1

# crear una m√°scara booleana que identifique las filas SIN outliers
mask = ~((data_tree_7[num_cols] < (Q1 - 1.5 * IQR)) |
         (data_tree_7[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)

# filtrar los datos limpios
data_clean = data_tree_7[mask].reset_index(drop=True)

print("Tama√±o original:", data_tree_7.shape)
print("Tama√±o sin outliers:", data_clean.shape)

# Separar X e y con datos limpios
X = data_clean.drop("Workout_Type", axis=1)
y = data_clean["Workout_Type"]

# Definir el escalador
scaler = StandardScaler()
# Definir columnas a escalar (asumiendo 'Gender' es la √∫nica no num√©rica en X)
numeric_cols = [col for col in X.columns if col not in ['Gender']]

# ================================================================
# üîÅ Tres muestras 
# ================================================================
random_states = [111, 222, 333]  # Tres seeds diferentes
resultados = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=========== üß† CASO DE PRUEBA {i} (random_state={seed}) ===========")

    # 1. Dividir datos (80/20 estratificado con el seed actual)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # 2. Preprocesamiento: Escalado
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # Ajustar y transformar solo las columnas num√©ricas
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])


    # 3. Entrenamiento y Predicci√≥n de Modelos

    # --- Modelo 1: √Årbol con Gini ---
    modelo_gini = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=seed)
    modelo_gini.fit(X_train_scaled, y_train)
    y_pred_gini = modelo_gini.predict(X_test_scaled)

    # --- Modelo 2: √Årbol con Entrop√≠a ---
    modelo_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=seed)
    modelo_entropy.fit(X_train_scaled, y_train)
    y_pred_entropy = modelo_entropy.predict(X_test_scaled)

    # --- Modelo 3: Entrop√≠a con poda ---
    modelo_entropy_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=5, random_state=seed)
    modelo_entropy_pruned.fit(X_train_scaled, y_train)
    y_pred_entropy_pruned = modelo_entropy_pruned.predict(X_test_scaled)

    # 4. Calcular m√©tricas
    modelos = {
        "Gini_Clean_Scaled": y_pred_gini,
        "Entrop√≠a_Clean_Scaled": y_pred_entropy,
        "Entrop√≠a_Podado_Clean_Scaled": y_pred_entropy_pruned
    }

    for nombre, y_pred in modelos.items():
        # Calcular m√©tricas (usando zero_division=0 por robustez)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        resultados.append({
            "Caso": i,
            "Random State": seed,
            "Modelo": nombre,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

        print(f"\nModelo: {nombre}")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")

Tama√±o original: (5200, 11)
Tama√±o sin outliers: (5055, 11)


Modelo: Gini_Clean_Scaled
Accuracy: 0.7399, Precision: 0.7479, Recall: 0.7399, F1-Score: 0.7403

Modelo: Entrop√≠a_Clean_Scaled
Accuracy: 0.8042, Precision: 0.8154, Recall: 0.8042, F1-Score: 0.8054

Modelo: Entrop√≠a_Podado_Clean_Scaled
Accuracy: 0.8042, Precision: 0.8154, Recall: 0.8042, F1-Score: 0.8054


Modelo: Gini_Clean_Scaled
Accuracy: 0.7933, Precision: 0.8097, Recall: 0.7933, F1-Score: 0.7919

Modelo: Entrop√≠a_Clean_Scaled
Accuracy: 0.7982, Precision: 0.8104, Recall: 0.7982, F1-Score: 0.8004

Modelo: Entrop√≠a_Podado_Clean_Scaled
Accuracy: 0.7982, Precision: 0.8104, Recall: 0.7982, F1-Score: 0.8004


Modelo: Gini_Clean_Scaled
Accuracy: 0.7923, Precision: 0.7979, Recall: 0.7923, F1-Score: 0.7928

Modelo: Entrop√≠a_Clean_Scaled
Accuracy: 0.8200, Precision: 0.8163, Recall: 0.8200, F1-Score: 0.8177

Modelo: Entrop√≠a_Podado_Clean_Scaled
Accuracy: 0.8180, Precision: 0.8149, Recall: 0.8180, F1-Score: 0.8160


In [16]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la importancia de variables y gr√°ficar el arbol gini

#### 8. √Årbol de Decisiones - CC:SI - ED:SI - Outliers:SI - Balanceo: SI

In [38]:
# ================================================================
# üìÇ Datos base
# ================================================================
data_tree_8 = data.copy()

# 1. Eliminar outliers (IQR)
num_cols = data_tree_8.select_dtypes(include=['float64', 'int64']).columns
Q1 = data_tree_8[num_cols].quantile(0.25)
Q3 = data_tree_8[num_cols].quantile(0.75)
IQR = Q3 - Q1
mask = ~((data_tree_8[num_cols] < (Q1 - 1.5 * IQR)) |
         (data_tree_8[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
data_clean = data_tree_8[mask].reset_index(drop=True)

print("Tama√±o original:", data_tree_8.shape)
print("Tama√±o sin outliers:", data_clean.shape)

# 2. Separar X e y
X = data_clean.drop("Workout_Type", axis=1)
y = data_clean["Workout_Type"]

# Definir el escalador y las columnas num√©ricas (excluyendo 'Gender')
scaler = StandardScaler()
numeric_cols = [col for col in X.columns if col not in ['Gender']]


# ================================================================
# üîÅ Tres muestras 
# ================================================================
random_states = [111, 222, 333]  
resultados = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=========== üß† CASO DE PRUEBA {i} (random_state={seed}) ===========")

    # 1. Dividir datos (80/20 estratificado con el seed actual)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # 2. Aplicar Escalado (fit/transform en train, transform en test)
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])


    # 3. Entrenamiento y Predicci√≥n de Modelos (con class_weight='balanced')

    # --- Modelo 1: Gini + Balanced ---
    modelo_gini = DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=seed,
        class_weight='balanced'
    )
    modelo_gini.fit(X_train_scaled, y_train)
    y_pred_gini = modelo_gini.predict(X_test_scaled)

    # --- Modelo 2: Entrop√≠a + Balanced ---
    modelo_entropy = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        random_state=seed,
        class_weight='balanced'
    )
    modelo_entropy.fit(X_train_scaled, y_train)
    y_pred_entropy = modelo_entropy.predict(X_test_scaled)

    # --- Modelo 3: Entrop√≠a con poda + Balanced ---
    modelo_entropy_pruned = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=5,
        min_samples_split=5, # Poda
        random_state=seed,
        class_weight='balanced'
    )
    modelo_entropy_pruned.fit(X_train_scaled, y_train)
    y_pred_entropy_pruned = modelo_entropy_pruned.predict(X_test_scaled)

    # 4. Calcular m√©tricas
    modelos = {
        "Gini_Clean_Scaled_Balanced": y_pred_gini,
        "Entrop√≠a_Clean_Scaled_Balanced": y_pred_entropy,
        "Entrop√≠a_Podado_Clean_Scaled_Balanced": y_pred_entropy_pruned
    }

    for nombre, y_pred in modelos.items():
        # Calcular m√©tricas (usando zero_division=0 por robustez)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        resultados.append({
            "Caso": i,
            "Random State": seed,
            "Modelo": nombre,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

        print(f"\nModelo: {nombre}")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")

Tama√±o original: (5200, 11)
Tama√±o sin outliers: (5055, 11)


Modelo: Gini_Clean_Scaled_Balanced
Accuracy: 0.7745, Precision: 0.7935, Recall: 0.7745, F1-Score: 0.7710

Modelo: Entrop√≠a_Clean_Scaled_Balanced
Accuracy: 0.8042, Precision: 0.8154, Recall: 0.8042, F1-Score: 0.8054

Modelo: Entrop√≠a_Podado_Clean_Scaled_Balanced
Accuracy: 0.8042, Precision: 0.8154, Recall: 0.8042, F1-Score: 0.8054


Modelo: Gini_Clean_Scaled_Balanced
Accuracy: 0.7933, Precision: 0.8097, Recall: 0.7933, F1-Score: 0.7919

Modelo: Entrop√≠a_Clean_Scaled_Balanced
Accuracy: 0.7982, Precision: 0.8104, Recall: 0.7982, F1-Score: 0.8004

Modelo: Entrop√≠a_Podado_Clean_Scaled_Balanced
Accuracy: 0.7982, Precision: 0.8104, Recall: 0.7982, F1-Score: 0.8004


Modelo: Gini_Clean_Scaled_Balanced
Accuracy: 0.7943, Precision: 0.7994, Recall: 0.7943, F1-Score: 0.7946

Modelo: Entrop√≠a_Clean_Scaled_Balanced
Accuracy: 0.8190, Precision: 0.8156, Recall: 0.8190, F1-Score: 0.8168

Modelo: Entrop√≠a_Podado_Clean_Scaled_Balanced


In [18]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la importancia de variables y gr√°ficar el arbol gini

# K Vecinos M√°s Cercanos:

#### 1. KNN - CC:SI - ED:NO - Outliers:NO - Balanceo: NO

In [40]:
# ================================================================
# üìÇ Preparaci√≥n de los datos
# ================================================================
data_knn_1 = data.copy()

X = data_knn_1.drop("Workout_Type", axis=1)
y = data_knn_1["Workout_Type"]

# Definici√≥n de la funci√≥n de evaluaci√≥n
def evaluar_modelo(X_train, X_test, y_train, y_test, metric_name, k_value, seed):
    """Entrena y eval√∫a un modelo KNN."""
    knn = KNeighborsClassifier(n_neighbors=k_value, metric=metric_name)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")

    return {
        'Random State': seed,
        'M√©trica': metric_name,
        'k': k_value,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1
    }

# ================================================================
# üîÅ Tres muestras
# ================================================================
random_states = [111, 222, 333]
k_range = range(1, 100)
resultados_finales = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=================================================")
    print(f"üß† CASO DE PRUEBA {i} (random_state={seed})")
    print(f"=================================================")

    # Divisi√≥n de datos (80% entrenamiento, 20% prueba)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # üîé B√∫squeda de mejores K para el split actual
    accuracies_euclidean = []
    accuracies_manhattan = []

    for metric in ['euclidean', 'manhattan']:
        for k in k_range:
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            
            if metric == 'euclidean':
                accuracies_euclidean.append(acc)
            else:
                accuracies_manhattan.append(acc)

    # Encontrar el mejor K
    best_k_euclidean = k_range[accuracies_euclidean.index(max(accuracies_euclidean))]
    best_k_manhattan = k_range[accuracies_manhattan.index(max(accuracies_manhattan))]

    print(f"üîπ Mejor K (Euclidiana): {best_k_euclidean} | Max Acc: {max(accuracies_euclidean):.4f}")
    print(f"üîπ Mejor K (Manhattan): {best_k_manhattan} | Max Acc: {max(accuracies_manhattan):.4f}")

    # üß† Evaluaci√≥n final con los K √≥ptimos
    print("\n--- Evaluaci√≥n Final ---")
    
    # Evaluar Euclidiana
    print(f"-> Evaluaci√≥n Euclidiana (k={best_k_euclidean}):")
    resultados_finales.append(
        evaluar_modelo(X_train, X_test, y_train, y_test, 'euclidean', best_k_euclidean, seed)
    )
    
    # Evaluar Manhattan
    print(f"-> Evaluaci√≥n Manhattan (k={best_k_manhattan}):")
    resultados_finales.append(
        evaluar_modelo(X_train, X_test, y_train, y_test, 'manhattan', best_k_manhattan, seed)
    )


üß† CASO DE PRUEBA 1 (random_state=111)
üîπ Mejor K (Euclidiana): 1 | Max Acc: 0.7365
üîπ Mejor K (Manhattan): 1 | Max Acc: 0.7913

--- Evaluaci√≥n Final ---
-> Evaluaci√≥n Euclidiana (k=1):
Accuracy: 0.7365, Precision: 0.7362, Recall: 0.7365, F1-Score: 0.7362
-> Evaluaci√≥n Manhattan (k=1):
Accuracy: 0.7913, Precision: 0.7919, Recall: 0.7913, F1-Score: 0.7916

üß† CASO DE PRUEBA 2 (random_state=222)
üîπ Mejor K (Euclidiana): 1 | Max Acc: 0.7423
üîπ Mejor K (Manhattan): 1 | Max Acc: 0.7981

--- Evaluaci√≥n Final ---
-> Evaluaci√≥n Euclidiana (k=1):
Accuracy: 0.7423, Precision: 0.7452, Recall: 0.7423, F1-Score: 0.7429
-> Evaluaci√≥n Manhattan (k=1):
Accuracy: 0.7981, Precision: 0.8021, Recall: 0.7981, F1-Score: 0.7988

üß† CASO DE PRUEBA 3 (random_state=333)
üîπ Mejor K (Euclidiana): 1 | Max Acc: 0.7596
üîπ Mejor K (Manhattan): 1 | Max Acc: 0.8115

--- Evaluaci√≥n Final ---
-> Evaluaci√≥n Euclidiana (k=1):
Accuracy: 0.7596, Precision: 0.7602, Recall: 0.7596, F1-Score: 0.7596
-

In [None]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la gr√°fica de knn con el modelo entrenado

#### 2. KNN - CC:SI - ED:NO - Outliers:NO - Balanceo: SI

In [41]:
# ================================================================
# üìÇ Preparaci√≥n de los datos
# ================================================================
data_knn_2 = data.copy()

X = data_knn_2.drop("Workout_Type", axis=1)
y = data_knn_2["Workout_Type"]

# Definici√≥n de la funci√≥n de evaluaci√≥n
def evaluar_modelo(X_train, X_test, y_train, y_test, metric_name, k_value, seed):
    """Entrena y eval√∫a un modelo KNN usando weights='distance'."""
    # weights='distance' prioriza los vecinos m√°s cercanos, actuando como un balanceo ponderado.
    knn = KNeighborsClassifier(n_neighbors=k_value, metric=metric_name, weights='distance')
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")

    return {
        'Random State': seed,
        'M√©trica': metric_name,
        'k': k_value,
        'Weights': 'distance',
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1
    }

# ================================================================
# üîÅ Tres muestras 
# ================================================================
random_states = [111, 222, 333]
k_range = range(1, 100)
resultados_finales = []

for i, seed in enumerate(random_states, start=1):
    print(f"\n=================================================")
    print(f"üß† CASO DE PRUEBA {i} (random_state={seed}) - Balanced")
    print(f"=================================================")

    # Divisi√≥n de datos (80% entrenamiento, 20% prueba)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # üîé B√∫squeda de mejores K para el split actual (usando weights='distance')
    accuracies_euclidean = []
    accuracies_manhattan = []

    for metric in ['euclidean', 'manhattan']:
        for k in k_range:
            # Importante: Usamos weights='distance' en la b√∫squeda de K
            knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance')
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            
            if metric == 'euclidean':
                accuracies_euclidean.append(acc)
            else:
                accuracies_manhattan.append(acc)

    # Encontrar el mejor K
    best_k_euclidean = k_range[accuracies_euclidean.index(max(accuracies_euclidean))]
    best_k_manhattan = k_range[accuracies_manhattan.index(max(accuracies_manhattan))]

    print(f"üîπ Mejor K (Euclidiana): {best_k_euclidean} | Max Acc: {max(accuracies_euclidean):.4f}")
    print(f"üîπ Mejor K (Manhattan): {best_k_manhattan} | Max Acc: {max(accuracies_manhattan):.4f}")

    # üß† Evaluaci√≥n final con los K √≥ptimos
    print("\n--- Evaluaci√≥n Final ---")
    
    # Evaluar Euclidiana
    print(f"-> Evaluaci√≥n Euclidiana (k={best_k_euclidean}, weights='distance'):")
    resultados_finales.append(
        evaluar_modelo(X_train, X_test, y_train, y_test, 'euclidean', best_k_euclidean, seed)
    )
    
    # Evaluar Manhattan
    print(f"-> Evaluaci√≥n Manhattan (k={best_k_manhattan}, weights='distance'):")
    resultados_finales.append(
        evaluar_modelo(X_train, X_test, y_train, y_test, 'manhattan', best_k_manhattan, seed)
    )


üß† CASO DE PRUEBA 1 (random_state=111) - Balanced
üîπ Mejor K (Euclidiana): 1 | Max Acc: 0.7365
üîπ Mejor K (Manhattan): 1 | Max Acc: 0.7913

--- Evaluaci√≥n Final ---
-> Evaluaci√≥n Euclidiana (k=1, weights='distance'):
Accuracy: 0.7365, Precision: 0.7362, Recall: 0.7365, F1-Score: 0.7362
-> Evaluaci√≥n Manhattan (k=1, weights='distance'):
Accuracy: 0.7913, Precision: 0.7919, Recall: 0.7913, F1-Score: 0.7916

üß† CASO DE PRUEBA 2 (random_state=222) - Balanced
üîπ Mejor K (Euclidiana): 1 | Max Acc: 0.7423
üîπ Mejor K (Manhattan): 1 | Max Acc: 0.7981

--- Evaluaci√≥n Final ---
-> Evaluaci√≥n Euclidiana (k=1, weights='distance'):
Accuracy: 0.7423, Precision: 0.7452, Recall: 0.7423, F1-Score: 0.7429
-> Evaluaci√≥n Manhattan (k=1, weights='distance'):
Accuracy: 0.7981, Precision: 0.8021, Recall: 0.7981, F1-Score: 0.7988

üß† CASO DE PRUEBA 3 (random_state=333) - Balanced
üîπ Mejor K (Euclidiana): 1 | Max Acc: 0.7596
üîπ Mejor K (Manhattan): 1 | Max Acc: 0.8115

--- Evaluaci√≥n F

In [None]:
# TODO guardar m√©tricas en el diccionario
# TODO hacer la gr√°fica de knn con el modelo entrenado

#### 3. KNN - CC:SI - ED:NO - Outliers:SI - Balanceo: NO