# Step 3: Data Cleaning & Feature Engineering (Modify Stage)

In [1]:
# Install the required packages (run once in a fresh environment, then keep commented out).
# NOTE(review): prefer `%pip install ...` over `!pip install ...` so the install targets
# the running kernel's environment, and pin versions for reproducibility.
## !pip -q install pycaret[full]
## !pip -q install dataprep
## !pip install matplotlib seaborn
## !pip install scikit-learn==1.2.2
## !pip install imblearn

In [3]:
import pandas as pd
import warnings

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Column labels applied positionally to the raw CSV.
column_names = [
    "id", "gender", "age", "hypertension", "heart_disease",
    "ever_married", "work_type", "residence_type", "avg_glucose_level",
    "bmi", "smoking_status", "stroke",
]

df = pd.read_csv("./data/stroke_data.csv").set_axis(column_names, axis=1)

# Cast the binary flags to int ("Yes"/"No" -> 1/0 for `ever_married`) and drop the
# `id` column. The string columns (gender, work_type, residence_type, smoking_status)
# are left with the dtype they were loaded with.
df = df.assign(
    ever_married=df["ever_married"].map({"Yes": 1, "No": 0}).astype(int),
    hypertension=df["hypertension"].astype(int),
    heart_disease=df["heart_disease"].astype(int),
).drop(columns=["id"])

AttributeError: module 'pyarrow' has no attribute '__version__'

In [34]:
# Column groups used for analysis. Names must match the labels assigned to `df.columns`.

# Continuous features (the glucose column is named `avg_glucose_level` in the frame).
numerical_var = ["age", "avg_glucose_level", "bmi"]
# Categorical / binary features; `residence_type` is included because it is
# one-hot encoded together with the other categorical columns.
categorical_var = ["gender", "hypertension", "heart_disease", "ever_married",
                   "work_type", "residence_type", "smoking_status"]
# Prediction target.
target_var = ["stroke"]

In [None]:
# Display the frame after the type casts and the removal of the `id` column
df

### (i) One-hot encoding

In [36]:
# Columns carried into the encoded frame (features + target), in this order.
selected_cols = [
    'gender', 'age', 'hypertension', 'heart_disease',
    'ever_married', 'work_type', 'residence_type', 'avg_glucose_level',
    'bmi', 'smoking_status', 'stroke',
]
# String-valued columns to expand into dummy variables.
dummy_cols = ['gender', 'smoking_status', 'work_type', 'residence_type']

# Work on an independent copy so `df` itself is left untouched.
data = df.loc[:, selected_cols].copy()

# drop_first=True removes the first level of each encoded column.
df_encoded = pd.get_dummies(data, columns=dummy_cols, drop_first=True)

In [None]:
# Display the one-hot-encoded frame
df_encoded

In [None]:
# Univariate analysis of the encoded dataframe with ydata-profiling

from ydata_profiling import ProfileReport

# Build the report from the one-hot-encoded frame
profile = ProfileReport(df_encoded)
# Write the report to an HTML file in the current working directory
profile.to_file("encoded_dataframe.html")
# Last expression: show the report as the cell output
profile

In [39]:
# Drop the `gender_Other` dummy, which is set for only 1 row.
# The row itself is removed as well: the encoding used drop_first=True, so once the
# `gender_Other` column is gone that row would have every remaining gender dummy at 0
# and be indistinguishable from the baseline gender level.
# The guard keeps the cell idempotent (re-running it no longer raises KeyError).
if "gender_Other" in df_encoded.columns:
    is_other = df_encoded["gender_Other"].astype(bool)
    df_encoded = df_encoded.loc[~is_other].drop(columns=["gender_Other"])


### (ii) Train Test split

In [40]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Separate the `stroke` label from the predictors.
y = df_encoded['stroke']  # Target
X = df_encoded.drop(columns='stroke')  # Features

# Stratified hold-out split: 20% test, class ratio of `stroke` preserved, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Re-attach the label to each partition (features first, target as the last column).
train_data = pd.concat((X_train, y_train), axis="columns")
test_data = pd.concat((X_test, y_test), axis="columns")

### (iii) Missing Value Imputation for `bmi` column

- `bmi`

  - **Type of missing value**: MAR 
  - **Explanation**: The missingness depends on observed variables such as gender and age group, not on the value of `bmi` itself
  - **Other observation**: 3% of the data is missing
  - **Approach**: 
      - Implement mean imputation
      - Create missingness indicator variable named `bmi_missing`

In [None]:
# Count the missing `bmi` values in each partition before imputation.
for partition_label, partition in (("training set X_train", X_train),
                                   ("test set X_test", X_test)):
    print(f"\nMissing values of `bmi` column in the {partition_label}:",
          partition["bmi"].isnull().sum())

In [42]:
# Mean-impute `bmi` using the TRAINING mean for both partitions. Filling the test set
# with its own mean would leak test-set statistics into the evaluation.
bmi_train_mean = X_train["bmi"].mean()

# Assign the filled column back instead of `X["bmi"].fillna(..., inplace=True)`:
# the chained in-place form acts on a temporary object and can silently leave the
# frame unchanged under pandas Copy-on-Write (warnings are suppressed in this notebook).
X_train = X_train.assign(bmi=X_train["bmi"].fillna(bmi_train_mean))
X_test = X_test.assign(bmi=X_test["bmi"].fillna(bmi_train_mean))

# NOTE(review): the `bmi_missing` indicator listed in the approach for this section
# is not created by this cell.


In [None]:
# Re-count the missing `bmi` values in each partition after imputation.
for partition_label, partition in (("training set X_train", X_train),
                                   ("test set X_test", X_test)):
    print(f"\nMissing values of `bmi` column in the {partition_label}:",
          partition["bmi"].isnull().sum())

### (iv) SMOTE re-sampling

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# Resample the TRAINING data only; the test set is left untouched.
# Both samplers are stochastic, so they are seeded (same seed as the train/test split)
# to make the resampled training set reproducible across runs.
resample_seed = 42

# Step 1: randomly under-sample the majority class (sampling_strategy=0.1).
under = RandomUnderSampler(sampling_strategy=0.1, random_state=resample_seed)
# Step 2: SMOTE over-sampling of the minority class (sampling_strategy=1.0).
over = SMOTE(sampling_strategy=1.0, random_state=resample_seed)

steps = [('under', under), ('over', over)]
pipeline = Pipeline(steps=steps)

# Apply the resampling
X_train_res, y_train_res = pipeline.fit_resample(X_train, y_train)
# Keep the original feature column names and give the resampled target
# the same name as the original y
X_train_res = pd.DataFrame(X_train_res, columns=X_train.columns)
y_train_res = pd.Series(y_train_res, name=y.name)


print("Original training set distribution:", Counter(y_train))
print("Resampled training set distribution:", Counter(y_train_res))
print("Test set distribution (unchanged):", Counter(y_test))

In [45]:
# Re-attach the target to the resampled training features, and rebuild the test frame
# from the current X_test / y_test (features first, target as the last column).
train_data_res = pd.concat((X_train_res, y_train_res), axis="columns")
test_data = pd.concat((X_test, y_test), axis="columns")

## Modeling

### (i) Building a decision tree model as benchmark

In [None]:
# Import the PyCaret functions used in this notebook by name instead of `import *`,
# so it is clear where every name comes from.
from pycaret.classification import (
    compare_models,
    create_model,
    ensemble_model,
    evaluate_model,
    get_config,
    get_logs,
    interpret_model,
    load_model,
    plot_model,
    predict_model,
    pull,
    save_model,
    setup,
    tune_model,
)

# NOTE(review): per the PyCaret docs, `preprocess=False` skips the built-in
# transformations, so `normalize` / `normalize_method` below presumably have no
# effect - confirm, and drop them or switch to `custom_pipeline` if scaling is wanted.
dt_experiment = setup(data=train_data_res,
                      target="stroke",
                      test_data=test_data,
                      session_id=24, # seed to keep the experiment reproducible
                      preprocess=False, # feature engineering was already done manually
                      #max_encoding_ohe = 25, 
                      normalize=True,
                      normalize_method = 'zscore', # z-score scaling (see NOTE above)
                      fix_imbalance=False, # training data was already resampled
                      # fix_imbalance_method='SMOTE',
                      experiment_name = "dt_stroke_classification",
                      index=False)
# Benchmark model: decision tree evaluated with 5-fold cross-validation
dt_model = create_model('dt', fold=5)

type(dt_model)

In [None]:
# Feature-importance plot for the benchmark decision tree
plot_model(dt_model, plot="feature")

In [None]:
# Confusion matrix for the benchmark decision tree
plot_model(dt_model, plot="confusion_matrix")

In [None]:
# Show the decision tree's parameters (plot="parameter"), then print the estimator
plot_model(dt_model, plot="parameter")
print(dt_model)

In [None]:
# Get the evaluation metrics of the test set for the decision tree model
predictions = predict_model(dt_model, data=test_data)
holdout_score = pull() # Pulls the test set's evaluation metrics
# Display the metrics themselves rather than their Python type
holdout_score

### (ii) Bagging for decision tree model

In [None]:
# Wrap the benchmark decision tree in a Bagging ensemble
bagging_dt = ensemble_model(dt_model, method="Bagging")

### (iii) Boosting for decision tree model

In [None]:
# Wrap the benchmark decision tree in a Boosting ensemble
boosting = ensemble_model(dt_model, method="Boosting")

## AutoML

In [None]:
# Use the manually prepared train/test splits in PyCaret.
# Functions are imported by name instead of `import *` so their origin is explicit.
from pycaret.classification import (
    compare_models,
    create_model,
    ensemble_model,
    evaluate_model,
    get_config,
    get_logs,
    interpret_model,
    load_model,
    plot_model,
    predict_model,
    pull,
    save_model,
    setup,
    tune_model,
)

# NOTE(review): per the PyCaret docs, `preprocess=False` skips the built-in
# transformations, so `normalize` / `normalize_method` below presumably have no
# effect - confirm, since scale-sensitive models are part of the comparison.
automl_experiment = setup(data=train_data_res,
                          target="stroke",
                          test_data=test_data,
                          session_id=24, # seed to keep the experiment reproducible
                          preprocess=False, # feature engineering was already done manually
                          #max_encoding_ohe = 25, # one hot encoding
                          normalize=True,
                          normalize_method = 'zscore', # z-score scaling (see NOTE above)
                          # fix_imbalance=True, # fix imbalance via smote
                          # fix_imbalance_method='SMOTE',
                          # experiment_name = "automl_stroke_classification",
                          fold=10,
                          index=False)

# Compare the available models and keep the single best one ranked by F1
# (no `n_select` is passed, so one model is returned, not a list).
best_model = compare_models(sort="F1")

# generate logs
# NOTE(review): experiment logging is not enabled in setup() above - confirm that
# get_logs() has anything to return.
get_logs()

In [None]:
# Retrieve the `pipeline` entry from the current experiment's configuration and display it
best_model_pipeline = get_config("pipeline")
best_model_pipeline

In [None]:
# Hyperparameter tuning with 10-fold cross-validation.
# Optimize F1 so tuning targets the same metric used to rank the models in
# compare_models(sort="F1"); without `optimize`, tune_model optimizes Accuracy.
tuned_best_model = tune_model(best_model, fold=10, optimize="F1")

In [None]:
# Score the tuned model on the test frame and display the returned predictions
predictions = predict_model(tuned_best_model, data = test_data)
predictions

In [None]:
# AUC plot for the tuned model
plot_model(tuned_best_model, plot="auc")

In [None]:
# Classification report plot for the tuned model
plot_model(tuned_best_model, plot='class_report')

In [None]:
# Confusion matrix for the tuned model.
# `plot_model` has no `data` parameter; it evaluates on the experiment's hold-out
# set, which is the `test_data` frame passed to setup().
plot_model(tuned_best_model, plot="confusion_matrix")

In [None]:
# Feature-importance plot for the tuned model.
# `plot_model` has no `data` parameter, so the unsupported `data=` argument is dropped.
plot_model(tuned_best_model, plot='feature')

In [None]:
# Interactive evaluation widget for the tuned model.
# `evaluate_model` has no `data` parameter; its plots use the experiment's hold-out
# set, which is the `test_data` frame passed to setup().
evaluate_model(tuned_best_model)

In [None]:
# SHAP-based interpretation of the tuned model.
# `interpret_model` has no `data` parameter (extra keyword arguments are forwarded to
# the underlying plot call); by default it explains the experiment's test set.
interpret_model(tuned_best_model)

In [None]:
# List the columns of the frame returned by predict_model
predictions.columns

## Save model

In [None]:
# Persist the tuned model under the name 'tuned_best_model'
save_model(tuned_best_model, model_name='tuned_best_model')

## Load model

In [None]:
# Reload the saved model by name, rebinding `tuned_best_model` to the loaded object
tuned_best_model = load_model('tuned_best_model')

In [None]:
# Display the loaded model object
tuned_best_model

## Computing Environment

In [None]:
# Load the `watermark` IPython extension
%load_ext watermark

# Report the versions of the imported packages
%watermark --iversions

# last-updated stamp: date, time and time zone
%watermark -u -n -t -z

## References

- Running Low on Time? Use PyCaret to Build your Machine Learning Model in Seconds https://www.analyticsvidhya.com/blog/2020/05/pycaret-machine-learning-model-seconds/
- Pycaret: https://pycaret.gitbook.io/docs/get-started/functions/others
- PyCaret + MLflow: https://towardsdatascience.com/easy-mlops-with-pycaret-mlflow-7fbcbf1e38c6
- Automated EDA https://towardsdatascience.com/comparing-five-most-popular-eda-tools-dccdef05aa4c, https://medium.com/@HeCanThink/discover-dataprep-make-exploratory-data-analysis-easier-in-python-%EF%B8%8F-254896034d70, https://arxiv.org/pdf/2104.00841
- https://www.kaggle.com/code/rhythmcam/titanic-pycaret-decision-tree (note: several `setup()` parameters are worth exploring, such as `use_pca`, `remove_outliers`, `normalize`, `transformation`, `remove_multicollinearity`, and `combine_rare_levels`)