## McNemar

In [None]:
!pip3 install numpy seaborn pandas matplotlib scikit-learn statsmodels

from functools import partial
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.contingency_tables import mcnemar

# 1) Load the data
try:
    diabetes_df = pd.read_csv('../preprocessing_scripts/Data_preprocessed_10000.csv')
except FileNotFoundError:
    raise FileNotFoundError("File not found. Please check the file path.")
else:
    # Reached only when read_csv succeeded.
    print("Dataset loaded successfully.")

# Features: every column except the target, cast to float32.
X = diabetes_df.drop(columns='Diabetes_binary').astype('float32')
# Target: the underlying array of the 'Diabetes_binary' column.
y = diabetes_df['Diabetes_binary'].values

# 2) Train/validation split: 30% held out, stratified on the target,
#    with a fixed seed so the partition is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 3) Fit each model and generate predictions on the validation set

# 3.1 GaussianNB with a tuned decision threshold
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)
# Keep column 1 of predict_proba and binarize it with a custom cut-off
# instead of using predict().
probas_nb = clf_nb.predict_proba(X_val)[:, 1]
# NOTE(review): hard-coded threshold; presumably tuned in another notebook --
# confirm where this value comes from.
threshold = 0.08234715264318251
y_nb = np.greater_equal(probas_nb, threshold).astype(int)

# 3.2 Decision tree
clf_dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    min_impurity_decrease=0.0,
    random_state=42,
)
clf_dt.fit(X_train, y_train)
y_dt = clf_dt.predict(X_val)

# 3.3 AdaBoost
clf_ada = AdaBoostClassifier(
    n_estimators=141,
    learning_rate=0.5,
    random_state=42,
)
clf_ada.fit(X_train, y_train)
y_ada = clf_ada.predict(X_val)

# 3.4 KNN (distance-weighted votes)
clf_knn = KNeighborsClassifier(n_neighbors=74, weights='distance')
clf_knn.fit(X_train, y_train)
y_knn = clf_knn.predict(X_val)

# 3.5 Bagging over decision trees
clf_bg = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=75,
    random_state=42,
)
clf_bg.fit(X_train, y_train)
y_bg = clf_bg.predict(X_val)

# 3.6 Pipeline: SelectKBest + KNN
# mutual_info_classif accepts a random_state; left unset, the feature scores
# (and therefore which 7 features are kept) can differ between runs, which
# would make y_pipe and its McNemar rows non-reproducible. Pin it to the same
# seed (42) used by the other models in this notebook.
clf_pipe = Pipeline([
    ('selector', SelectKBest(
        score_func=partial(mutual_info_classif, random_state=42),
        k=7
    )),
    ('knn', KNeighborsClassifier(
        n_neighbors=39, weights='uniform',
        metric='minkowski', p=1.75
    ))
]).fit(X_train, y_train)
y_pipe = clf_pipe.predict(X_val)

# 3.7 SVM with RBF kernel, fitted on standardized features
# NOTE(review): only predict() is called on this model in the visible code;
# confirm whether probability=True is needed elsewhere in the notebook.
clf_svm = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='rbf',
        C=100,
        gamma=0.001,
        probability=True,
        random_state=42,
    ),
)
clf_svm.fit(X_train, y_train)
y_svm = clf_svm.predict(X_val)

# 4) Collect the ground truth and every model's validation predictions
#    side by side, one column per model.
preds = pd.DataFrame(dict(
    y_true=y_val,
    NB=y_nb,
    DT=y_dt,
    ADA=y_ada,
    KNN=y_knn,
    BAG=y_bg,
    PIPE=y_pipe,
    SVM=y_svm,
))

# 5) Helper that counts the discordant pairs b and c
def contar_b_c(y_true, y_a, y_b):
    """Count the discordant pairs between two classifiers.

    Inputs are compared position by position, so lists, NumPy arrays and
    pandas Series (whatever their index) are all accepted.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_a : array-like
        Predictions of model A, same shape as ``y_true``.
    y_b : array-like
        Predictions of model B, same shape as ``y_true``.

    Returns
    -------
    tuple of (int, int)
        ``(b, c)`` where ``b`` is the number of samples A gets right and B
        gets wrong, and ``c`` the number A gets wrong and B gets right.

    Raises
    ------
    ValueError
        If the three inputs do not share the same shape.
    """
    # Coerce to ndarrays so `==` is element-wise even for plain lists and is
    # not affected by pandas index alignment.
    y_true = np.asarray(y_true)
    y_a = np.asarray(y_a)
    y_b = np.asarray(y_b)
    if not (y_true.shape == y_a.shape == y_b.shape):
        raise ValueError(
            "y_true, y_a and y_b must have the same shape; got "
            f"{y_true.shape}, {y_a.shape} and {y_b.shape}."
        )

    ac_a = y_a == y_true  # where A is correct
    ac_b = y_b == y_true  # where B is correct
    b = int((ac_a & ~ac_b).sum())  # A right, B wrong
    c = int((~ac_a & ac_b).sum())  # A wrong, B right
    return b, c

# 6) McNemar test for every unordered pair of models
resultados = []
for A, B in combinations(['NB','DT','ADA','KNN','BAG','PIPE','SVM'], 2):
    b, c = contar_b_c(preds['y_true'], preds[A], preds[B])

    # 2x2 table holding only the discordant counts; the concordant
    # (diagonal) cells are left at 0.
    tabla = [[0, b], [c, 0]]

    # Chi-square version of the test with continuity correction.
    test = mcnemar(tabla, exact=False, correction=True)

    resultados.append(
        {
            'Modelo A': A,
            'Modelo B': B,
            'b (A acierta, B falla)': b,
            'c (A falla, B acierta)': c,
            'χ²': round(test.statistic, 3),
            'p-valor': round(test.pvalue, 4),
        }
    )

# One row per model pair.
df_mcnemar = pd.DataFrame(resultados)

# 7) Show the results table
print(df_mcnemar)

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting matplotlib
  Downloading matplotlib-3.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting statsmodels
  Downloading statsmodels-0.14.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
 