## Importación de las librerías

In [50]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import PowerTransformer

import pandas as pd
import numpy as np

import seaborn as sns
sns.set_theme(style="whitegrid")

## Importación del dataset

In [51]:
# Load the stroke dataset produced by the previous (EDA) notebook and preview
# its first rows. Judging by the file name, this variant still contains the
# outliers and missing values that are handled further down.
dataset_path = '/kaggle/input/exploratory-data-analysis/stroke_df_with_outliers_missings.csv'
stroke_df = pd.read_csv(dataset_path)
stroke_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## División train-test

A continuación se divide el dataset importado en los conjuntos de **train** y **test**. Dado que se trata de un problema de clasificación bastante desbalanceado (alrededor de un 5 % de casos positivos), se utiliza ***StratifiedShuffleSplit*** para conservar las proporciones de cada clase en ambos conjuntos.

In [52]:
# Single stratified 80/20 split; the fixed random_state keeps it reproducible.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=99)

# With n_splits=1 the generator yields exactly one (train, test) pair of
# positional index arrays, so it can be consumed directly with next().
train_index, test_index = next(
    sss.split(stroke_df.drop(columns=['stroke']), stroke_df['stroke'])
)

train_df = stroke_df.iloc[train_index]
test_df = stroke_df.iloc[test_index]

# Separate the predictors from the target in each partition.
X_train = train_df.drop(columns=['stroke'])
y_train = train_df.stroke
X_test = test_df.drop(columns=['stroke'])
y_test = test_df.stroke

Comprobamos que las proporciones de cada clase se han mantenido en ambos conjuntos:

In [53]:
# Relative frequency of each target class in both partitions; the two should
# be nearly identical if the stratification worked.
train_class_ratio = y_train.value_counts(normalize=True)
test_class_ratio = y_test.value_counts(normalize=True)

print("TRAIN:\n", train_class_ratio)
print('-'*50)
print("TEST:\n", test_class_ratio)

TRAIN:
 0    0.951309
1    0.048691
Name: stroke, dtype: float64
--------------------------------------------------
TEST:
 0    0.951076
1    0.048924
Name: stroke, dtype: float64


## Tratamiento de los outliers

In [54]:
def detect_outliers_iqr(data, factor=3):
    """Count the values of ``data`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where the
    quartiles are computed ignoring NaNs.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        Numeric values to inspect. NaN entries never compare as outliers.
    factor : float, default 3
        Multiplier applied to the interquartile range to place the fences.

    Returns
    -------
    int
        Number of values strictly below the lower fence or strictly above
        the upper fence.
    """
    # Both quartiles in a single NaN-aware call.
    first_quartile, third_quartile = np.nanquantile(data, [0.25, 0.75])

    # Distance from each quartile to its fence.
    margin = factor * (third_quartile - first_quartile)
    low_fence = first_quartile - margin
    high_fence = third_quartile + margin

    # Boolean mask of the values beyond either fence.
    beyond_fences = (data > high_fence) | (data < low_fence)
    return len(data[beyond_fences])

Las variables que presentan outliers (con un factor de 3 sobre el IQR) son **bmi** y **avg_glucose_level**; **age** no presenta ninguno.

In [55]:
# Count IQR outliers for each continuous variable on the full dataset
# (before any transformation).
iqr_checks = (
    ("\nPosible Outliers IQR age:", stroke_df.age),
    ("\nPosible Outliers IQR bmi:", stroke_df.bmi),
    ("\nPosible Outliers IQR avg_glucose_level:", stroke_df.avg_glucose_level),
)
for report_label, column_values in iqr_checks:
    print(report_label, detect_outliers_iqr(column_values))


Posible Outliers IQR age: 0

Posible Outliers IQR bmi: 8

Posible Outliers IQR avg_glucose_level: 165


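Se realiza el tratamiento de los outliers de la misma forma que en el notebook anterior, con **transformaciones Box-Cox**. Cada transformación se ajusta únicamente con el conjunto de train y después se aplica al conjunto de test.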

### Variable avg_glucose_level

In [56]:
# Box-Cox transform of avg_glucose_level.
# PowerTransformer is imported directly at the top of the notebook; the
# previous `preprocessing.PowerTransformer` spelling raised a NameError on a
# fresh kernel because the `preprocessing` module itself is never imported.
power_transformer = PowerTransformer(method='box-cox')

# Fit on the training partition only so no information from test leaks into
# the estimated transformation parameters.
avg_glucose_level_transformed = power_transformer.fit_transform(X_train[['avg_glucose_level']])
X_train['avg_glucose_level'] = avg_glucose_level_transformed.flatten()

# Apply the already-fitted transformation to test. The (n, 1) result is
# flattened so both partitions receive a 1-D column in the same way.
X_test['avg_glucose_level'] = power_transformer.transform(X_test[['avg_glucose_level']]).flatten()

# NOTE(review): this cell overwrites the column in place, so it is not
# idempotent. Re-running it would feed already-transformed values to Box-Cox,
# which only accepts strictly positive data; restart and run all instead.

In [57]:
# Re-count the IQR outliers of avg_glucose_level in each partition after the
# Box-Cox transformation.
glucose_checks = (
    ("\nPosible Outliers train avg_glucose_level:", X_train),
    ("\nPosible Outliers test avg_glucose_level:", X_test),
)
for report_label, partition in glucose_checks:
    print(report_label, detect_outliers_iqr(partition.avg_glucose_level))


Posible Outliers train avg_glucose_level: 0

Posible Outliers test avg_glucose_level: 0


### Variable BMI

In [64]:
# Box-Cox transform of bmi.
# PowerTransformer is imported directly at the top of the notebook; the
# previous `preprocessing.PowerTransformer` spelling raised a NameError on a
# fresh kernel because the `preprocessing` module itself is never imported.
power_transformer = PowerTransformer(method='box-cox')

# Fit on the training partition only so no information from test leaks into
# the estimated transformation parameters.
# NOTE(review): bmi contains missing values at this point; per the
# scikit-learn docs PowerTransformer ignores NaNs in fit and keeps them in
# transform — confirm for the installed version.
bmi_transformed = power_transformer.fit_transform(X_train[['bmi']])
X_train['bmi'] = bmi_transformed.flatten()

# Apply the already-fitted transformation to test. The (n, 1) result is
# flattened so both partitions receive a 1-D column in the same way.
X_test['bmi'] = power_transformer.transform(X_test[['bmi']]).flatten()

# NOTE(review): this cell overwrites the column in place, so it is not
# idempotent. Re-running it would feed already-transformed values to Box-Cox,
# which only accepts strictly positive data; restart and run all instead.

In [65]:
# Re-count the IQR outliers of bmi in each partition after the Box-Cox
# transformation.
bmi_checks = (
    ("\nPosible Outliers train bmi:", X_train),
    ("\nPosible Outliers test bmi:", X_test),
)
for report_label, partition in bmi_checks:
    print(report_label, detect_outliers_iqr(partition.bmi))


Posible Outliers train bmi: 1

Posible Outliers test bmi: 0
