In [188]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer

from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import  train_test_split

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier as KNC

In [442]:
# Inicializar X / y
data = pd.read_csv('../dataset.csv')
data = data.drop_duplicates()

X = data.drop(['label'], axis=1)
y = data['label'].astype(int)

X_test = pd.read_csv('../test_dataset.csv')
X_test['split'] = 'test'

print(y.value_counts())

X.info()

0    399
1    185
3    125
2    123
4     33
Name: label, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 865 entries, 0 to 865
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       865 non-null    float64
 1   sex       865 non-null    float64
 2   cp        865 non-null    float64
 3   trestbps  808 non-null    float64
 4   chol      839 non-null    float64
 5   fbs       783 non-null    float64
 6   restecg   864 non-null    float64
 7   thalach   810 non-null    float64
 8   exang     810 non-null    float64
 9   oldpeak   805 non-null    float64
 10  slope     570 non-null    float64
 11  ca        297 non-null    float64
 12  thal      413 non-null    float64
 13  split     865 non-null    object 
dtypes: float64(13), object(1)
memory usage: 101.4+ KB


In [443]:
# Definimos variables categóricas y numéricas
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numerical_vars = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
target = ['label']

In [444]:
def drop_duplicates(X):
    return X.drop_duplicates()

def extract_columns(X, columns=[]):
    return X[columns]

def drop_columns(X, columns=[]):
    return X.drop(columns, axis=1)

def reset_index(X):
     return X.reset_index(drop=True)

def num_clipp_outliers(X):
    X = X.copy()
    for var in X.columns:
        tmp_data = X[var].dropna()
        # X.loc[X[var].isna(), var] = np.quantile(tmp_data, 0.5)
        q1, q3 = np.quantile(tmp_data, 0.25), np.quantile(tmp_data, 0.75)
        iqr = q3 - q1
        cut_off = iqr * 1.5
        lower, upper = q1 - cut_off, q3 + cut_off
        X.loc[X[var] > upper, var] = upper
        X.loc[X[var] < lower, var] = lower
    return X


def num_na_impute_knn(X):
    X = X.replace(pd.NA, np.nan)
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=11)
    X[X.columns] = imputer.fit_transform(X)
    return X

def num_scale(X):
    X[X.columns] = scale(X)
    return X

def cat_na_encode(X):
    return X.replace(np.nan, -1)


def cat_int_transform(X):
    return X.astype(int)

def cat_one_hot_encode(X):    
    ohe = OneHotEncoder(drop='first', sparse_output=False)
    ohe_fit = ohe.fit(X)
    X_ohe = pd.DataFrame(ohe.fit_transform(X))
    X_ohe.columns = ohe_fit.get_feature_names_out()

    # X = pd.concat((X_ohe, X[numerical_vars + ['split']].reset_index()), axis=1)
    # X.set_index('index')
    # X.drop('index', axis=1, inplace=True)
    return X_ohe

def cat_add_cp2(X):
    X['cp2'] = np.where(X['cp'] == 4, 1, 0)
    return X


def cat_add_thal2(X):
    X['thal2'] = np.where((X['thal'] == 7) | (X['thal'] == 6), 1, 0)
    return X

def cat_add_age2(X): 
    X['age2'] = X['age'] // 20
    return X
    
def cat_add_slope2(X):
    X['slope2'] = np.where((data['slope'] == 2) | (X['slope'] == 3) , 1, 0)
    return X

def cat_add_ca2(X):
    X['ca2'] = np.where(X['ca'] >= 1, 1, 0)
    return X

def cat_add_oldpeak2(X):
    X['oldpeak2'] = np.where(np.abs(X['oldpeak']) >= 0.5, 1, 0)
    return X

def cat_add_chol2(X):
    X['chol2'] = X['chol'] // 200
    return X

def grid_search(model, params, X, y, n_splits = 5, n_jobs = 8, scoring = 'f1_micro'):
    grid = GridSearchCV(estimator=model, 
                        param_grid=params, 
                        scoring=scoring,
                        cv=n_splits, 
                        verbose=1,
                        refit=True, # para que devuelva el model entrenado con los mejores params
                        n_jobs=n_jobs)
    grid.fit(X, y)
    kfold_scores = kfold_test(grid.best_estimator_, 10, X, y)
    print(f'Best params: {grid.best_params_}')
    print(f'Best F1 score: {grid.best_score_}')
    print(f"F1 score 10-fold: {np.mean(kfold_scores)}")
    
    return grid

def kfold_test(model, n_splits, X, y):
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_score =[]
    for train_idx, val_idx in kf.split(X, y):
        x_tr, x_val = X.iloc[train_idx,:], X.iloc[val_idx,:]
        y_tr, y_val = y[train_idx], y[val_idx]

        model.fit(x_tr, y_tr)
        score_f1 = f1_score(y_val, model.predict(x_val), average='micro')
        cv_score.append(score_f1)

    return cv_score

In [445]:
num_pipe = Pipeline(
    steps=[
        ("extract", FunctionTransformer(extract_columns, kw_args={"columns": numerical_vars})),
        ("num_clipp_outliers", FunctionTransformer(num_clipp_outliers)),
        ("num_na_imputer_knn", FunctionTransformer(num_na_impute_knn)),
        ("num_scale", FunctionTransformer(num_scale)),
        ("reset_index", FunctionTransformer(reset_index))
    ]
)

cat_pipe = Pipeline(
    steps=[
        ("extract", FunctionTransformer(extract_columns, kw_args={"columns": categorical_vars})),
        ("cat_na_encode", FunctionTransformer(cat_na_encode)),
        ("cat_int_transform", FunctionTransformer(cat_int_transform)), 
        ('cat_one_hot_encode', FunctionTransformer(cat_one_hot_encode)),
        ("reset_index", FunctionTransformer(reset_index))
    ]
)

feat_pipe = Pipeline(
    steps=[
        ("cp2", FunctionTransformer(cat_add_cp2)),  
        ("thal2", FunctionTransformer(cat_add_thal2)),
        ("slope2", FunctionTransformer(cat_add_slope2)),
        ("oldpeak2", FunctionTransformer(cat_add_oldpeak2)),
        # ("chol2", FunctionTransformer(cat_add_chol2)),
        ("drop_cols", FunctionTransformer(drop_columns, kw_args={"columns": (numerical_vars +  categorical_vars)})),
        ('cat_one_hot_encode', FunctionTransformer(cat_one_hot_encode)),
        ("reset_index", FunctionTransformer(reset_index))
    ]
)

full_pipe = Pipeline([
    ('features', FeatureUnion([
        ('numbers', num_pipe),
        ('categories', cat_pipe),
        ('new_feats', feat_pipe),
    ])),
])

full_pipe.set_output(transform='pandas')

y_train = y.astype(int)
y_train = y_train.reset_index(drop=True)
X_train = full_pipe.fit_transform(X)




In [446]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 865 entries, 0 to 864
Data columns (total 32 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         865 non-null    float64
 1   trestbps    865 non-null    float64
 2   chol        865 non-null    float64
 3   thalach     865 non-null    float64
 4   oldpeak     865 non-null    float64
 5   sex_1       865 non-null    float64
 6   cp_2        865 non-null    float64
 7   cp_3        865 non-null    float64
 8   cp_4        865 non-null    float64
 9   fbs_0       865 non-null    float64
 10  fbs_1       865 non-null    float64
 11  restecg_0   865 non-null    float64
 12  restecg_1   865 non-null    float64
 13  restecg_2   865 non-null    float64
 14  exang_0     865 non-null    float64
 15  exang_1     865 non-null    float64
 16  slope_1     865 non-null    float64
 17  slope_2     865 non-null    float64
 18  slope_3     865 non-null    float64
 19  ca_0        865 non-null    f

In [204]:
from sklearn.feature_selection import chi2,SelectKBest,SelectFromModel,RFE,VarianceThreshold, mutual_info_classif

In [424]:
Xf['chol2'] =  Xf['chol'] // 200


In [425]:
cols = ['ca', 'sex', 'cp', 'fbs', 'oldpeak3', 'exang', 'slope', 'thal', 'thal2', 'cp2', 'age2', 'slope2', 'chol2']
# Xf[cols] = Xf[cols].astype(object)
kbest = SelectKBest(score_func=mutual_info_classif, k='all')
fit = bestfeatures.fit(Xf[cols], y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(Xf[cols].columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score'] 

# featureScores.nlargest(20,'Score')['Feature'].tolist()

featureScores.sort_values(by=['Score'], ascending=False)

Unnamed: 0,Feature,Score
2,cp,0.138023
5,exang,0.11735
9,cp2,0.114908
11,slope2,0.095792
7,thal,0.090701
3,fbs,0.086337
6,slope,0.071944
1,sex,0.060446
0,ca,0.052337
10,age2,0.049747


In [418]:
cols = ['ca', 'sex', 'cp', 'fbs', 'exang', 'slope', 'thal', 'thal2', 'cp2', 'age2', 'slope2', 'oldpeak3', 'chol2'] # 'oldpeak3', 
Xf[cols] = Xf[cols].astype(object)
kbest = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(Xf[cols], y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(Xf[cols].columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score'] 

# featureScores.nlargest(20,'Score')['Feature'].tolist()

featureScores.sort_values(by=['Score'], ascending=False)

Unnamed: 0,Feature,Score
8,cp2,0.144817
2,cp,0.12857
4,exang,0.127309
5,slope,0.106161
6,thal,0.103013
7,thal2,0.066733
10,slope2,0.066197
3,fbs,0.044146
0,ca,0.040628
9,age2,0.028696


In [429]:
from sklearn.ensemble import RandomForestClassifier
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=200), threshold='1*median')
embeded_rf_selector.fit(scale(Xf), y_train)
embeded_rf_support = embeded_rf_selector.get_support()
Xf.loc[:,embeded_rf_support].columns.tolist()


['age',
 'trestbps',
 'chol',
 'thalach',
 'oldpeak',
 'cp',
 'restecg',
 'exang',
 'slope',
 'thal',
 'cp2']

In [428]:
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=10, step=100, verbose=5)
rfe_selector.fit(scale(Xf), y_train)
rfe_support = rfe_selector.get_support()
Xf.loc[:,rfe_support].columns.tolist()


Fitting estimator with 21 features.


['age',
 'thalach',
 'oldpeak',
 'sex',
 'ca',
 'thal',
 'cp2',
 'thal2',
 'ca2',
 'slope2']