In [168]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer

from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import  train_test_split

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier as KNC

In [139]:
# Initialise X / y: read the training CSV, drop duplicated rows,
# then separate the 'label' column (cast to int) from the features.
data = pd.read_csv('../dataset.csv').drop_duplicates()

y = data['label'].astype(int)
X = data.drop(columns=['label'])

# Read the test CSV and tag every row with split == 'test'.
X_test = pd.read_csv('../test_dataset.csv')
X_test['split'] = 'test'

print(y.value_counts())

X.info()

0    399
1    185
3    125
2    123
4     33
Name: label, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 865 entries, 0 to 865
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       865 non-null    float64
 1   sex       865 non-null    float64
 2   cp        865 non-null    float64
 3   trestbps  808 non-null    float64
 4   chol      839 non-null    float64
 5   fbs       783 non-null    float64
 6   restecg   864 non-null    float64
 7   thalach   810 non-null    float64
 8   exang     810 non-null    float64
 9   oldpeak   805 non-null    float64
 10  slope     570 non-null    float64
 11  ca        297 non-null    float64
 12  thal      413 non-null    float64
 13  split     865 non-null    object 
dtypes: float64(13), object(1)
memory usage: 101.4+ KB


In [140]:
# Column groups used by the preprocessing pipelines:
# categorical features, numerical features and the target column.
categorical_vars = [
    'sex', 'cp', 'fbs', 'restecg',
    'exang', 'slope', 'ca', 'thal',
]
numerical_vars = [
    'age', 'trestbps', 'chol', 'thalach', 'oldpeak',
]
target = ['label']

In [153]:
def drop_duplicates(X):
    """Return ``X`` with duplicated rows removed, keeping the first occurrence.

    The surviving rows keep their original index labels.
    """
    deduplicated = X.drop_duplicates(keep="first")
    return deduplicated

def extract_columns(X, columns=None):
    """Select a subset of columns from ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to select from.
    columns : list of column labels, optional
        Columns to keep, in the given order. When omitted, an empty
        selection is used, so the result has the rows of ``X`` and no columns.

    Returns
    -------
    The result of ``X[columns]``.
    """
    # A ``None`` sentinel replaces the former mutable ``[]`` default, which
    # was a single list object shared by every call.
    if columns is None:
        columns = []
    return X[columns]

def reset_index(X):
    """Return ``X`` with a fresh default integer index; the old index is discarded."""
    renumbered = X.reset_index(drop=True)
    return renumbered

def num_clipp_outliers(X):
    """Clip every column of ``X`` to its 1.5 * IQR fences.

    For each column the first and third quartiles are computed on the
    non-missing values; anything below ``q1 - 1.5 * iqr`` or above
    ``q3 + 1.5 * iqr`` is replaced by the corresponding fence. Missing values
    are left untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric columns to clip.

    Returns
    -------
    pandas.DataFrame
        A clipped copy; the input frame is not modified.
    """
    X = X.copy()
    for var in X.columns:
        observed = X[var].dropna()
        if observed.empty:
            # No data to derive quartiles from: leave the column as it is
            # instead of asking numpy for quantiles of an empty array.
            continue
        q1, q3 = np.quantile(observed, [0.25, 0.75])
        cut_off = (q3 - q1) * 1.5
        X[var] = X[var].clip(lower=q1 - cut_off, upper=q3 + cut_off)
    return X


def num_na_impute_knn(X, n_neighbors=11):
    """Fill missing values in ``X`` with a k-nearest-neighbours imputer.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric columns, possibly containing ``pd.NA`` / ``NaN``.
    n_neighbors : int, default 11
        Number of neighbours handed to ``KNNImputer``. The default matches the
        value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, holding the imputer's output.

    Notes
    -----
    The imputer is fitted on the very frame it transforms, on every call.
    """
    # ``replace`` returns a new frame, so the caller's data is left alone;
    # ``pd.NA`` is mapped to ``np.nan``, the marker the imputer is told to look for.
    X = X.replace(pd.NA, np.nan)
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors)
    X[X.columns] = imputer.fit_transform(X)
    return X

def num_scale(X):
    """Standardise every column of ``X`` with ``sklearn.preprocessing.scale``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        A scaled copy with the same columns and index.
    """
    # Work on a copy (as num_clipp_outliers does) so the caller's frame is not
    # overwritten by the column assignment below.
    X = X.copy()
    X[X.columns] = scale(X)
    return X

def cat_na_encode(X):
    """Return ``X`` with every ``NaN`` replaced by the code ``-1``."""
    missing_code = -1
    encoded = X.replace(to_replace=np.nan, value=missing_code)
    return encoded


def cat_int_transform(X):
    """Return ``X`` with every column cast to ``int``."""
    as_integers = X.astype(dtype=int)
    return as_integers

def cat_one_hot_encode(X):
    """One-hot encode every column of ``X``.

    A ``OneHotEncoder(drop='first', sparse_output=False)`` is fitted on ``X``
    and applied to it in the same call.

    Parameters
    ----------
    X : pandas.DataFrame
        Categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded columns only, named with the encoder's
        ``get_feature_names_out()``.

    Notes
    -----
    The encoder is re-fitted on whatever frame is passed in, so the set of
    output columns depends on the categories present in that frame.
    """
    ohe = OneHotEncoder(drop='first', sparse_output=False)
    # Fit once: fit_transform both learns the categories and encodes X
    # (a separate ``fit`` call beforehand only repeats the same work).
    X_ohe = pd.DataFrame(ohe.fit_transform(X))
    X_ohe.columns = ohe.get_feature_names_out()
    return X_ohe

def cat_add_cp2(X):
    """Add a binary ``cp2`` column: 1 where ``cp == 4``, 0 otherwise.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``cp`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``cp2`` column; the input frame is not
        modified.
    """
    # ``assign`` builds a new frame instead of writing the column into the
    # caller's DataFrame.
    return X.assign(cp2=np.where(X['cp'] == 4, 1, 0))


def cat_add_thal2(X):
    """Add a ``thal2`` column that groups the ``thal`` codes.

    Mapping (stored as floats):

    * ``thal`` equal to 6 or 7 -> 1.0
    * ``thal`` equal to 3      -> 0.0
    * any other value          -> -1.0

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``thal`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``thal2`` column; the input frame is not
        modified.
    """
    thal = X['thal']
    # First matching condition wins; rows matching neither get the default.
    thal2 = np.select(
        [thal.isin([6, 7]).to_numpy(), (thal == 3).to_numpy()],
        [1.0, 0.0],
        default=-1.0,
    )
    return X.assign(thal2=thal2)


def slope_transform(X):
    """Add a ``slope2`` column that groups the ``slope`` codes.

    Mapping (integers):

    * ``slope`` is missing (NaN) -> -1
    * ``slope`` equal to 2 or 3  -> 1
    * any other value            -> 0

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``slope`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``slope2`` column; the input frame is not
        modified.
    """
    # Read the column from the frame that was passed in, not from a
    # module-level DataFrame, so the result depends only on ``X``.
    slope = X['slope']
    # The missing-value test goes first: np.where(..., 1, 0) can never yield
    # NaN, so a later isna() check on the new column would never fire.
    slope2 = np.select(
        [slope.isna().to_numpy(), slope.isin([2, 3]).to_numpy()],
        [-1, 1],
        default=0,
    )
    return X.assign(slope2=slope2)

def grid_search(model, params, X, y, n_splits = 5, n_jobs = 8, scoring = 'f1_micro'):
    """Run an exhaustive grid search and report a 10-fold check of the winner.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid.
    X, y :
        Training features and labels; both are also forwarded to
        ``kfold_test``.
    n_splits : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.
    n_jobs : int, default 8
        Number of parallel jobs for the search.
    scoring : str, default 'f1_micro'
        Scoring passed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object. Because ``refit=True``, its
        ``best_estimator_`` is refit by ``GridSearchCV`` with the best
        parameters.

    Side effects
    ------------
    Prints the best parameters, the best search score and the mean of the
    scores returned by ``kfold_test``.
    """
    # Local import keeps this block self-contained; sklearn is already a
    # dependency of this notebook.
    from sklearn.base import clone

    grid = GridSearchCV(estimator=model,
                        param_grid=params,
                        scoring=scoring,
                        cv=n_splits,
                        verbose=1,
                        refit=True,  # so the search returns a model trained with the best params
                        n_jobs=n_jobs)
    grid.fit(X, y)
    # kfold_test calls ``fit`` on the estimator it is given. Hand it an
    # unfitted clone so ``grid.best_estimator_`` stays as GridSearchCV refit it,
    # rather than ending up trained on the last fold's training split.
    kfold_scores = kfold_test(clone(grid.best_estimator_), 10, X, y)
    print(f'Best params: {grid.best_params_}')
    print(f'Best F1 score: {grid.best_score_}')
    print(f"F1 score 10-fold: {np.mean(kfold_scores)}")

    return grid

def kfold_test(model, n_splits, X, y):
    """Score ``model`` with shuffled stratified k-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Fitted in place on each fold's training split (so after the call it
        holds the fit from the last fold).
    n_splits : int
        Number of folds.
    X : pandas.DataFrame
        Features; rows are selected positionally with ``.iloc``.
    y : array-like
        Labels, one per row of ``X``.

    Returns
    -------
    list of float
        Micro-averaged F1 score on each validation fold.
    """
    # Honour the caller's fold count (it used to be fixed at 10 regardless).
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    # kf.split yields positional indices. Indexing a plain array keeps the
    # label lookup positional too, whatever index a pandas ``y`` carries.
    y_values = np.asarray(y)
    cv_score = []
    for train_idx, val_idx in kf.split(X, y_values):
        x_tr, x_val = X.iloc[train_idx, :], X.iloc[val_idx, :]
        y_tr, y_val = y_values[train_idx], y_values[val_idx]

        model.fit(x_tr, y_tr)
        score_f1 = f1_score(y_val, model.predict(x_val), average='micro')
        cv_score.append(score_f1)

    return cv_score

In [169]:
# Numerical branch: select the numeric columns, clip outliers, KNN-impute,
# scale, then renumber the rows.
num_pipe = Pipeline([
    ("extract", FunctionTransformer(extract_columns, kw_args={"columns": numerical_vars})),
    ("num_clipp_outliers", FunctionTransformer(num_clipp_outliers)),
    ("num_na_imputer_knn", FunctionTransformer(num_na_impute_knn)),
    ("num_scale", FunctionTransformer(num_scale)),
    ("reset_index", FunctionTransformer(reset_index)),
])

# Categorical branch: select the categorical columns, code NaN as -1, cast to
# int, add the derived cp2 / thal2 columns, one-hot encode, renumber the rows.
cat_pipe = Pipeline([
    ("extract", FunctionTransformer(extract_columns, kw_args={"columns": categorical_vars})),
    ("cat_na_encode", FunctionTransformer(cat_na_encode)),
    ("cat_int_transform", FunctionTransformer(cat_int_transform)),
    ("cat_add_cp2", FunctionTransformer(cat_add_cp2)),
    ("cat_add_thal2", FunctionTransformer(cat_add_thal2)),
    ('cat_one_hot_encode', FunctionTransformer(cat_one_hot_encode)),
    ("reset_index", FunctionTransformer(reset_index)),
])

# Both branches side by side, with pandas output requested.
full_pipe = Pipeline([
    ('features', FeatureUnion([
        ('numbers', num_pipe),
        ('categories', cat_pipe),
    ])),
])
full_pipe.set_output(transform='pandas')

# Labels get the same 0..n-1 renumbering that each branch ends with.
y_train = y.astype(int).reset_index(drop=True)
X_train = full_pipe.fit_transform(X)

model_knn = KNC()

# Search space for the KNN classifier ('cosine' metric left out).
params = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': list(range(1, 14)),
    'algorithm': ['ball_tree', 'brute', 'kd_tree'],
    'leaf_size': [10, 15, 20],
    'metric': ['euclidean', 'manhattan'],
}

grid = grid_search(model_knn, params, X_train, y_train, n_jobs=6)



Fitting 5 folds for each of 468 candidates, totalling 2340 fits
Best params: {'algorithm': 'ball_tree', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 6, 'weights': 'uniform'}
Best F1 score: 0.5526011560693642
F1 score 10-fold: 0.5444399893076718


In [170]:
# Numerical branch: select the numeric columns, clip outliers, KNN-impute,
# scale, then renumber the rows.
num_pipe = Pipeline([
    ("extract", FunctionTransformer(extract_columns, kw_args={"columns": numerical_vars})),
    ("num_clipp_outliers", FunctionTransformer(num_clipp_outliers)),
    ("num_na_imputer_knn", FunctionTransformer(num_na_impute_knn)),
    ("num_scale", FunctionTransformer(num_scale)),
    ("reset_index", FunctionTransformer(reset_index)),
])

# Categorical branch for this run: same steps as the previous cell except
# that the cat_add_thal2 step is left out (only cp2 is added).
cat_pipe = Pipeline([
    ("extract", FunctionTransformer(extract_columns, kw_args={"columns": categorical_vars})),
    ("cat_na_encode", FunctionTransformer(cat_na_encode)),
    ("cat_int_transform", FunctionTransformer(cat_int_transform)),
    ("cat_add_cp2", FunctionTransformer(cat_add_cp2)),
    ('cat_one_hot_encode', FunctionTransformer(cat_one_hot_encode)),
    ("reset_index", FunctionTransformer(reset_index)),
])

# Both branches side by side, with pandas output requested.
full_pipe = Pipeline([
    ('features', FeatureUnion([
        ('numbers', num_pipe),
        ('categories', cat_pipe),
    ])),
])
full_pipe.set_output(transform='pandas')

# Labels get the same 0..n-1 renumbering that each branch ends with.
y_train = y.astype(int).reset_index(drop=True)
X_train = full_pipe.fit_transform(X)

model_knn = KNC()

# Search space for the KNN classifier ('cosine' metric left out).
params = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': list(range(1, 14)),
    'algorithm': ['ball_tree', 'brute', 'kd_tree'],
    'leaf_size': [10, 15, 20],
    'metric': ['euclidean', 'manhattan'],
}

grid = grid_search(model_knn, params, X_train, y_train, n_jobs=6)



Fitting 5 folds for each of 468 candidates, totalling 2340 fits
Best params: {'algorithm': 'ball_tree', 'leaf_size': 10, 'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}
Best F1 score: 0.5549132947976878
F1 score 10-fold: 0.5410318096765571
