In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.impute import KNNImputer
from sklearn.preprocessing import scale
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE

from sklearn.metrics import roc_auc_score, f1_score

In [2]:
data = pd.read_csv('dataset.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 866 entries, 0 to 865
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       866 non-null    float64
 1   sex       866 non-null    float64
 2   cp        866 non-null    float64
 3   trestbps  809 non-null    float64
 4   chol      840 non-null    float64
 5   fbs       784 non-null    float64
 6   restecg   865 non-null    float64
 7   thalach   811 non-null    float64
 8   exang     811 non-null    float64
 9   oldpeak   806 non-null    float64
 10  slope     570 non-null    float64
 11  ca        297 non-null    float64
 12  thal      413 non-null    float64
 13  label     866 non-null    int64  
 14  split     866 non-null    object 
dtypes: float64(13), int64(1), object(1)
memory usage: 101.6+ KB


## Data preprocessing

In [3]:

# Eliminamos filas duplicadas
data = data.drop_duplicates()

# Definimos variables categóricas y numéricas
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numerical_vars = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
target = ['label']

# Parseamos las variables categóricas como enteros, para eliminar los decimales, y luego las pasamos a string, para que su valor 
# numérico no influya en el análisis ni en las predicciones
data[categorical_vars + target] = data[categorical_vars + target].astype(float).astype('Int64')
data[categorical_vars + target] = data[categorical_vars + target].astype(str)
# ca = 9 es un oulier, lo clippeamos a 3, que es el valor máximo según las especificaciones
data['ca'] = data['ca'].replace('9', '3')

# Reemplazamos 0s en chol y trestbps con NAs
data[['trestbps', 'chol']] = data[['trestbps', 'chol']].replace(0, np.nan)

# Rellenamos los valores numéricos no informados
# data[numerical_vars ] = data[numerical_vars].replace(pd.NA, np.nan)
imputer = KNNImputer(missing_values=np.nan, n_neighbors=8)
data[numerical_vars + target] = imputer.fit_transform(data[numerical_vars + target])

# Clippeamos los outliers
for var in ['trestbps', 'chol', 'thalach', 'oldpeak']:
    tmp_data = data[var][data[var] > 0].dropna()
    q1, q3 = np.quantile(tmp_data, 0.25), np.quantile(tmp_data, 0.75)
    iqr = q3 - q1
    cut_off = iqr * 1.5
    lower, upper = q1 - cut_off, q3 + cut_off
    data[var].loc[data[var] > upper] = upper
    data[var].loc[data[var] < lower] = lower

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[var].loc[data[var] > upper] = upper
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[var].loc[data[var] < lower] = lower


In [4]:
X = data.drop(['label', 'split'], axis=1)
y = data['label'].astype(int)

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 865 entries, 0 to 865
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       865 non-null    float64
 1   sex       865 non-null    object 
 2   cp        865 non-null    object 
 3   trestbps  865 non-null    float64
 4   chol      865 non-null    float64
 5   fbs       865 non-null    object 
 6   restecg   865 non-null    object 
 7   thalach   865 non-null    float64
 8   exang     865 non-null    object 
 9   oldpeak   865 non-null    float64
 10  slope     865 non-null    object 
 11  ca        865 non-null    object 
 12  thal      865 non-null    object 
dtypes: float64(5), object(8)
memory usage: 94.6+ KB


In [6]:
# Escalamos los datos numéricos para que se ajusten a una distribución normal
X[numerical_vars] = scale(X[numerical_vars])

X[numerical_vars].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 865 entries, 0 to 865
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       865 non-null    float64
 1   trestbps  865 non-null    float64
 2   chol      865 non-null    float64
 3   thalach   865 non-null    float64
 4   oldpeak   865 non-null    float64
dtypes: float64(5)
memory usage: 40.5 KB


In [7]:
# ahora aplicamos OHE a las variables categóricas
ohe = OneHotEncoder(drop= 'first', sparse = False)
ohe_fit = ohe.fit(X[categorical_vars])
X_ohe = pd.DataFrame(ohe.fit_transform(X[categorical_vars]))
X_ohe.columns = ohe_fit.get_feature_names_out()

X = pd.concat((X_ohe, X[numerical_vars].reset_index()), axis=1)
X.set_index('index')
X.drop('index', axis=1, inplace=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 865 entries, 0 to 864
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sex_1         865 non-null    float64
 1   cp_2          865 non-null    float64
 2   cp_3          865 non-null    float64
 3   cp_4          865 non-null    float64
 4   fbs_1         865 non-null    float64
 5   fbs_<NA>      865 non-null    float64
 6   restecg_1     865 non-null    float64
 7   restecg_2     865 non-null    float64
 8   restecg_<NA>  865 non-null    float64
 9   exang_1       865 non-null    float64
 10  exang_<NA>    865 non-null    float64
 11  slope_2       865 non-null    float64
 12  slope_3       865 non-null    float64
 13  slope_<NA>    865 non-null    float64
 14  ca_1          865 non-null    float64
 15  ca_2          865 non-null    float64
 16  ca_3          865 non-null    float64
 17  ca_<NA>       865 non-null    float64
 18  thal_6        865 non-null    

In [8]:
counts = data['split'].value_counts()

{ n: round(100 * val / sum(counts.values), 2) for n, val in counts.items() }

{'train': 79.88, 'val': 20.12}

In [9]:
counts = y.value_counts()

print(counts) 

{ n: round(100 * val / counts[0], 2) for n, val in counts.items() }

0    399
1    185
3    125
2    123
4     33
Name: label, dtype: int64


{0: 100.0, 1: 46.37, 3: 31.33, 2: 30.83, 4: 8.27}

In [10]:
# Como tenemos muy pocos datos de la clase 4, vamos a hacer oversampling para que esté más balanceado, 
# doblando el número de muestras de esta clase, de 33 a 66

sm = SMOTE(sampling_strategy={4: 66}, random_state = 1, k_neighbors = 7)
X_sm, y_sm = sm.fit_resample(X, y)

In [11]:
counts = y_sm.value_counts()

{ n: round(100 * val / counts[0], 2) for n, val in counts.items() }

{0: 100.0, 1: 46.37, 3: 31.33, 2: 30.83, 4: 16.54}

## Test dataset

Tenemos que aplicar las mismas transformaciones al test dataset

In [12]:
X_test = pd.read_csv('test_dataset.csv')
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       50 non-null     float64
 1   sex       50 non-null     float64
 2   cp        50 non-null     float64
 3   trestbps  48 non-null     float64
 4   chol      46 non-null     float64
 5   fbs       43 non-null     float64
 6   restecg   49 non-null     float64
 7   thalach   50 non-null     float64
 8   exang     50 non-null     float64
 9   oldpeak   48 non-null     float64
 10  slope     38 non-null     float64
 11  ca        12 non-null     float64
 12  thal      20 non-null     float64
dtypes: float64(13)
memory usage: 5.2 KB


In [13]:
# Parseamos las variables categóricas como enteros, para eliminar los decimales, y luego las pasamos a string, para que su valor 
# numérico no influya en el análisis ni en las predicciones
X_test[categorical_vars] = X_test[categorical_vars].astype(float).astype('Int64')
X_test[categorical_vars] = X_test[categorical_vars].astype(str)

# Reemplazamos 0s en chol y trestbps con NAs
X_test[['trestbps', 'chol']] = X_test[['trestbps', 'chol']].replace(0, np.nan)

# Rellenamos los valores numéricos no informados
X_test[numerical_vars] = X_test[numerical_vars].replace(pd.NA, np.nan)
imputer = KNNImputer(missing_values=np.nan, n_neighbors=8)
X_test[numerical_vars] = imputer.fit_transform(X_test[numerical_vars])

# Clippeamos los outliers
for var in ['trestbps', 'chol', 'thalach', 'oldpeak']:
    tmp_data = X_test[var][X_test[var] > 0].dropna()
    q1, q3 = np.quantile(tmp_data, 0.25), np.quantile(tmp_data, 0.75)
    iqr = q3 - q1
    cut_off = iqr * 1.5
    lower, upper = q1 - cut_off, q3 + cut_off
    X_test[var].loc[X_test[var] > upper] = upper
    X_test[var].loc[X_test[var] < lower] = lower

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[var].loc[X_test[var] > upper] = upper
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[var].loc[X_test[var] < lower] = lower


In [14]:
X_test[numerical_vars] = scale(X_test[numerical_vars])

ohe = OneHotEncoder(drop= 'first', sparse = False)
ohe_fit = ohe.fit(X_test[categorical_vars])
X_test_ohe = pd.DataFrame(ohe.fit_transform(X_test[categorical_vars]))
X_test_ohe.columns = ohe_fit.get_feature_names_out()

X_test = pd.concat((X_test_ohe, X_test[numerical_vars].reset_index()), axis=1)
X_test.set_index('index')
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sex_1         50 non-null     float64
 1   cp_2          50 non-null     float64
 2   cp_3          50 non-null     float64
 3   cp_4          50 non-null     float64
 4   fbs_1         50 non-null     float64
 5   fbs_<NA>      50 non-null     float64
 6   restecg_1     50 non-null     float64
 7   restecg_2     50 non-null     float64
 8   restecg_<NA>  50 non-null     float64
 9   exang_1       50 non-null     float64
 10  slope_2       50 non-null     float64
 11  slope_3       50 non-null     float64
 12  slope_<NA>    50 non-null     float64
 13  ca_1          50 non-null     float64
 14  ca_2          50 non-null     float64
 15  ca_3          50 non-null     float64
 16  ca_<NA>       50 non-null     float64
 17  thal_6        50 non-null     float64
 18  thal_7        50 non-null     fl

In [15]:
X_test.drop('index', axis=1, inplace=True)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sex_1         50 non-null     float64
 1   cp_2          50 non-null     float64
 2   cp_3          50 non-null     float64
 3   cp_4          50 non-null     float64
 4   fbs_1         50 non-null     float64
 5   fbs_<NA>      50 non-null     float64
 6   restecg_1     50 non-null     float64
 7   restecg_2     50 non-null     float64
 8   restecg_<NA>  50 non-null     float64
 9   exang_1       50 non-null     float64
 10  slope_2       50 non-null     float64
 11  slope_3       50 non-null     float64
 12  slope_<NA>    50 non-null     float64
 13  ca_1          50 non-null     float64
 14  ca_2          50 non-null     float64
 15  ca_3          50 non-null     float64
 16  ca_<NA>       50 non-null     float64
 17  thal_6        50 non-null     float64
 18  thal_7        50 non-null     fl

In [16]:
X_test['exang_<NA>'] = 0

## Selección de modelo

**1. Random forest**

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC

In [43]:
model_rf = RFC(random_state = 1, 
               class_weight = {0: 1, 1: 2, 2: 2, 3: 2, 4: 2})

In [44]:
params = {
'criterion': ['entropy', 'gini'],
'min_samples_leaf': [2, 3, 5, ], 
'min_samples_split': [2, 3, 5],
'max_depth': [5, 7, 10],
'max_features': [7, 15, 20],
'n_estimators': [25, 50, 75]
}

In [45]:
grid = GridSearchCV(estimator=model_rf, 
                    param_grid=params, 
                    scoring='f1_micro',
                    cv=5, 
                    n_jobs = 8)
grid.fit(X_sm, y_sm)
print(f'Best params : {grid.best_params_}')
print(f'Best score: {grid.best_score_}')

Best params : {'criterion': 'entropy', 'max_depth': 10, 'max_features': 7, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 75}
Best score: 0.5646182495344507


In [39]:
params = {
'criterion': ['entropy', 'gini'],
'min_samples_leaf': [2, 3], 
'min_samples_split': [4, 5, 7],
'max_depth': [9, 10, 11],
'max_features': [6, 7, 8],
'n_estimators': [25, 50, 75]
}

In [40]:
grid = GridSearchCV(estimator=model_rf, 
                    param_grid=params, 
                    scoring='f1_micro',
                    cv=5, 
                    n_jobs = 8)
grid.fit(X_sm, y_sm)
print(f'Best params : {grid.best_params_}')
print(f'Best score: {grid.best_score_}')

Best params : {'criterion': 'entropy', 'max_depth': 9, 'max_features': 6, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 25}
Best score: 0.5824084419615146


In [46]:
best_config = grid.best_params_

model_rf = RFC(criterion = best_config['criterion'], 
               min_samples_leaf = best_config['min_samples_leaf'],
               min_samples_split = best_config['min_samples_split'],
               max_depth = best_config['max_depth'],
               max_features = best_config['max_features'],
               n_estimators = best_config['max_features'], 
               class_weight = {0:1, 1: 2, 2: 2, 3: 2, 4: 2}, # best_config['weight']
               n_jobs = 7,
               random_state = 1)

model_rf.fit(X_sm, y_sm)

pred_y = model_rf.predict(X_sm) 

print(f"F1 score: {f1_score(y_sm, pred_y, average='micro')}")

F1 score: 0.8407572383073497


In [None]:
pred_y_test = model_rf.predict(X_test[X_sm.columns.to_list()])

In [None]:
pred_df = pd.DataFrame({'ID': X_test.index.values, 'label': pred_y_test.astype(int)})

pred_df.head()


In [None]:
pred_df.to_csv('submission.csv', index=False)

In [None]:
# Best params : {'criterion': 'entropy', 'max_depth': 10, 'max_features': 21, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
# Best score: 0.5601551831160769