## McNemar

In [2]:
!pip3 install numpy seaborn pandas matplotlib scikit-learn statsmodels

from functools import partial
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.contingency_tables import mcnemar

# 1) Load the data
# Only the read itself sits inside the `try`, so an unrelated failure cannot be
# mistaken for a missing file; the success message goes in the `else` branch.
try:
    diabetes_df = pd.read_csv('../preprocessing_scripts/Data_preprocessed_10000.csv')
except FileNotFoundError as err:
    # Chain explicitly so the original error (which names the missing path)
    # is preserved as the cause of the friendlier message.
    raise FileNotFoundError("File not found. Please check the file path.") from err
else:
    print("Dataset loaded successfully.")

# Features: every column except the target, cast to float32.
X = diabetes_df.drop('Diabetes_binary', axis=1).astype('float32')
# Target: the 'Diabetes_binary' column as a plain array.
y = diabetes_df['Diabetes_binary'].values

# 2) Train/validation split: 30% held out, stratified on the target, fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 3) Train each model and generate its predictions on the validation set

# 3.1 GaussianNB with a tuned decision threshold
# NOTE(review): the threshold is hard-coded; presumably it comes from an
# earlier tuning step that is not visible here — confirm its origin.
threshold = 0.08234715264318251
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)
# Second column of predict_proba, compared against the custom threshold
# instead of relying on clf_nb.predict().
probas_nb = clf_nb.predict_proba(X_val)[:, 1]
y_nb = (probas_nb >= threshold).astype(int)

# 3.2 Decision tree (gini criterion, depth limited to 4, fixed seed)
clf_dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    min_impurity_decrease=0.0,
    random_state=42,
)
clf_dt.fit(X_train, y_train)
y_dt = clf_dt.predict(X_val)

# 3.3 AdaBoost with 100 estimators and a fixed seed
clf_ada = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_ada.fit(X_train, y_train)
y_ada = clf_ada.predict(X_val)

# 3.4 KNN with 74 distance-weighted neighbours
clf_knn = KNeighborsClassifier(n_neighbors=74, weights='distance')
clf_knn.fit(X_train, y_train)
y_knn = clf_knn.predict(X_val)

# 3.5 Bagging of 75 decision trees (base tree and ensemble both seeded)
clf_bg = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=75,
    random_state=42,
)
clf_bg.fit(X_train, y_train)
y_bg = clf_bg.predict(X_val)

# 3.6 Pipeline: SelectKBest (mutual information, k=7) + KNN
# mutual_info_classif is stochastic, so it is seeded here like every other
# random component in this script; otherwise the selected features (and hence
# the PIPE predictions and McNemar results) can change between runs.
clf_pipe = Pipeline([
    ('selector', SelectKBest(
        score_func=partial(mutual_info_classif, random_state=42),
        k=7,
    )),
    ('knn', KNeighborsClassifier(
        n_neighbors=39, weights='uniform',
        metric='minkowski', p=1.75
    )),
]).fit(X_train, y_train)
y_pipe = clf_pipe.predict(X_val)

# 3.7 RBF-kernel SVM fitted on standardized features
clf_svm = make_pipeline(
    StandardScaler(),
    SVC(C=100, kernel='rbf', gamma=0.001, probability=True, random_state=42),
)
clf_svm.fit(X_train, y_train)
y_svm = clf_svm.predict(X_val)

# 3.8 Random forest with 75 trees and a fixed seed
clf_rf = RandomForestClassifier(n_estimators=75, random_state=42)
clf_rf.fit(X_train, y_train)
y_rf = clf_rf.predict(X_val)

# 3.9 Majority (hard) voting over GaussianNB, KNN and a decision tree,
# using the same hyper-parameters as the stand-alone KNN and tree above.
clf_mv = VotingClassifier(
    estimators=[
        ("nb", GaussianNB()),
        ("knn", KNeighborsClassifier(n_neighbors=74, weights="distance")),
        ("dt", DecisionTreeClassifier(
            criterion="gini",
            max_depth=4,
            min_samples_split=2,
            min_samples_leaf=1,
            min_impurity_decrease=0.0,
            random_state=42,
        )),
    ],
    voting="hard",
)
clf_mv.fit(X_train, y_train)
y_mv = clf_mv.predict(X_val)

# 4) Gather the true labels and every model's predictions in one DataFrame
# (one column per model, one row per validation sample).
columnas_pred = {
    'y_true': y_val,
    'NB': y_nb,
    'DT': y_dt,
    'ADA': y_ada,
    'KNN': y_knn,
    'BAG': y_bg,
    'PIPE': y_pipe,
    'SVM': y_svm,
    'RF': y_rf,
    'MV': y_mv,
}
preds = pd.DataFrame(columnas_pred)

# 5) Función auxiliar para contar b y c
def contar_b_c(y_true, y_a, y_b):
    ac_a = (y_a == y_true)
    ac_b = (y_b == y_true)
    b = int(((ac_a) & (~ac_b)).sum())  # A acierta, B falla
    c = int(((~ac_a) & (ac_b)).sum())  # A falla, B acierta
    return b, c

# 6) McNemar test for every pair of models
modelos = ['NB', 'DT', 'ADA', 'KNN', 'BAG', 'PIPE', 'SVM', 'RF', 'MV']
resultados = []
for A, B in combinations(modelos, 2):
    b, c = contar_b_c(preds['y_true'], preds[A], preds[B])
    # Only the discordant counts are filled in; the diagonal (concordant)
    # cells are left at 0 because McNemar's statistic depends on b and c only.
    tabla = [[0, b], [c, 0]]
    # Chi-square approximation with continuity correction.
    test = mcnemar(tabla, exact=False, correction=True)
    fila = {
        'Modelo A': A,
        'Modelo B': B,
        'b (A acierta, B falla)': b,
        'c (A falla, B acierta)': c,
        'χ²': round(test.statistic, 3),
        'p-valor': round(test.pvalue, 4),
    }
    resultados.append(fila)

df_mcnemar = pd.DataFrame(resultados)

# 7) Show the results table
print(df_mcnemar)

Dataset loaded successfully.
   Modelo A Modelo B  b (A acierta, B falla)  c (A falla, B acierta)      χ²  \
0        NB       DT                     297                     303   0.042   
1        NB      ADA                     209                     299  15.593   
2        NB      KNN                     225                     227   0.002   
3        NB      BAG                     255                     281   1.166   
4        NB     PIPE                     172                     257  16.448   
5        NB      SVM                     135                     231  24.658   
6        NB       RF                     225                     273   4.436   
7        NB       MV                     194                     250   6.813   
8        DT      ADA                     120                     204  21.262   
9        DT      KNN                     284                     280   0.016   
10       DT      BAG                     227                     247   0.762   
11       DT