# Practice:

### Dataset: [Stroke Prediction Dataset](https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset)

---

## 1. Frame the Problem:

### Business Objective: Given a set of features, try and predict if a patient will have a stroke or not.

---

## 2. Get the Data

In [None]:
import os 
import pandas as pd


DATASET_PATH = os.path.join('datasets')

def load_data(dataset_path=DATASET_PATH):
    csv_path = os.path.join(dataset_path, "healthcare-dataset-stroke-data.csv")
    return pd.read_csv(csv_path)

In [144]:
stroke_data = load_data()
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


### Look at the Data Structure

In [None]:
stroke_data.describe()

In [None]:
stroke_data.info()

### Seperate Features by Data Type (Numeric vs Categorical)

In [145]:
stroke_numerical = stroke_data.select_dtypes(exclude="object")
stroke_categorical = stroke_data.select_dtypes(include="object")

In [None]:
stroke_categorical.describe()

In [None]:
stroke_numerical.describe()

### Examine non-numeric values

### Plot the Data

#### Numeric

In [None]:
%matplotlib inline 

import matplotlib.pyplot as plt
stroke_data.hist(bins=50, figsize=(20, 15))
plt.show()

#### Categorical

In [None]:
categorical_features = list(stroke_categorical.columns)

fig, ax = plt.subplots(1, len(categorical_features))

for i, categorical_feature in enumerate(stroke_data[categorical_features]):
    stroke_data[categorical_feature].value_counts().plot(kind="bar", ax=ax[i], figsize=(15, 5)).set_title(categorical_feature)

plt.tight_layout()

### Create a Test Set of Data


In [147]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(stroke_data, test_size=0.2, random_state=42)

train_set

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
802,4970,Male,79.00,0,0,Yes,Self-employed,Rural,112.64,28.5,formerly smoked,0
3927,56137,Female,62.00,0,0,Yes,Private,Urban,88.32,36.3,Unknown,0
2337,54590,Female,21.00,0,0,No,Private,Rural,59.52,33.7,never smoked,0
3910,36548,Male,31.00,0,0,Yes,Govt_job,Urban,65.70,30.4,formerly smoked,0
1886,61171,Female,31.00,0,0,No,Private,Rural,59.63,19.9,never smoked,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4426,13846,Male,43.00,0,0,Yes,Govt_job,Rural,88.00,30.6,never smoked,0
466,1307,Female,61.00,1,0,Yes,Private,Rural,170.05,60.2,smokes,0
3092,31481,Female,1.16,0,0,No,children,Urban,97.28,17.8,Unknown,0
3772,61827,Male,80.00,0,0,Yes,Self-employed,Rural,196.08,31.0,formerly smoked,0


## 3. Discover & Visualize the Data to Gain Insights

### Make a copy of the training set

In [None]:
stroke_data = train_set.copy()

### Look for Correlations

In [None]:
corr_matrix = stroke_data.corr(numeric_only=True)
corr_matrix["stroke"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

numeric_features = ["stroke", "age", "heart_disease", "avg_glucose_level", "hypertension"]
scatter_matrix(stroke_data[numeric_features], figsize=(12,8))

## 4. Prepare the Data For Machine Learning Algorithms

### Drop 'ID' Column & Copy Labels

In [148]:
stroke_data = train_set.drop("id", axis=1)
stroke_data = train_set.drop("stroke", axis=1)
stroke_labels = train_set["stroke"].copy()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
802,4970,Male,79.00,0,0,Yes,Self-employed,Rural,112.64,28.5,formerly smoked
3927,56137,Female,62.00,0,0,Yes,Private,Urban,88.32,36.3,Unknown
2337,54590,Female,21.00,0,0,No,Private,Rural,59.52,33.7,never smoked
3910,36548,Male,31.00,0,0,Yes,Govt_job,Urban,65.70,30.4,formerly smoked
1886,61171,Female,31.00,0,0,No,Private,Rural,59.63,19.9,never smoked
...,...,...,...,...,...,...,...,...,...,...,...
4426,13846,Male,43.00,0,0,Yes,Govt_job,Rural,88.00,30.6,never smoked
466,1307,Female,61.00,1,0,Yes,Private,Rural,170.05,60.2,smokes
3092,31481,Female,1.16,0,0,No,children,Urban,97.28,17.8,Unknown
3772,61827,Male,80.00,0,0,Yes,Self-employed,Rural,196.08,31.0,formerly smoked


### Numerical Pipeline

In [134]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler())
])

### With Categorical Pipeline (E.g One-Hot-Encoding)

In [149]:
stroke_numerical = stroke_data.select_dtypes(exclude="object")
stroke_categorical = stroke_data.select_dtypes(include="object")

num_attribs = list(stroke_numerical.columns)
cat_attribs = list(stroke_categorical.columns)

print(num_attribs)
print(stroke_attribs)

['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


In [150]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

stroke_data_prepared = full_pipeline.fit_transform(stroke_data)

stroke_data_prepared

array([[-1.48974572,  1.58415472, -0.32198091, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.9372468 ,  0.82970766, -0.32198091, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.86386831, -0.98984115, -0.32198091, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.23225557, -1.87032526, -0.32198091, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.20713926,  1.62853396, -0.32198091, ...,  1.        ,
         0.        ,  0.        ],
       [-0.35311426,  0.11963983, -0.32198091, ...,  0.        ,
         0.        ,  1.        ]])

## 5. Select and Train a Model

### Helper Functions

In [176]:
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score

def quick_test_predictions(model, training_data, training_labels, pipeline=full_pipeline):
    some_data = training_data.iloc[:5]
    some_labels = training_labels.iloc[:5]
    some_data_prepared = pipeline.transform(some_data)
    
    print(f"Predictions: {model.predict(some_data_prepared)}")
    print(f"Labels: {list(some_labels)}")
    
def analyze_cost_function(model, prepared_train_data, train_labels, cost_function):
    predictions = model.predict_proba(prepared_train_data)
    log_loss = cost_function(train_labels, predictions)
    print(f"{round(log_loss * 100, 2)}%")
    
def cross_fold_validation(model, prepared_train_data, train_labels):
    scores = cross_val_score(model, prepared_train_data, train_labels, cv=10)
    print(f"Scores")
    for score in scores:
        print(f"{round(score * 100, 2)}%")

### Logistic Regression

In [177]:
from sklearn.linear_model import LogisticRegression

# Create Model
log_reg = LogisticRegression()
log_reg.fit(stroke_data_prepared, stroke_labels)

# Test Some Predictions to gauge accuracy
quick_test_predictions(log_reg, stroke_data, stroke_labels)

# Gauge Accuracy with Cost Function
analyze_cost_function(log_reg, stroke_data_prepared, stroke_labels, log_loss)

# Gauge Accuracy with Cross Fold Validation
cross_fold_validation(log_reg, stroke_data_prepared, stroke_labels)

Predictions: [0 0 0 0 0]
Labels: [0, 0, 0, 0, 0]
14.86%
Scores
95.6%
95.35%
95.35%
95.35%
95.35%
95.35%
95.35%
95.35%
95.59%
95.59%


### KNN

In [182]:
from sklearn.neighbors import KNeighborsClassifier

# Create Model
knn_class = KNeighborsClassifier()
knn_class.fit(stroke_data_prepared, stroke_labels)

# Test Some Predictions to gauge accuracy
quick_test_predictions(knn_class, stroke_data, stroke_labels)

# Gauge Accuracy with Cost Function
analyze_cost_function(knn_class, stroke_data_prepared, stroke_labels, log_loss)

# Gauge Accuracy with Cross Fold Validation
cross_fold_validation(knn_class, stroke_data_prepared, stroke_labels)

Predictions: [0 0 0 0 0]
Labels: [0, 0, 0, 0, 0]
9.69%
Scores
95.6%
94.87%
95.35%
95.35%
95.35%
95.35%
94.87%
95.11%
95.59%
95.83%


### DecisionTreeClassifier

In [183]:
from sklearn.tree import DecisionTreeClassifier

# Create Model
tree_class = DecisionTreeClassifier()
tree_class.fit(stroke_data_prepared, stroke_labels)

# Test Some Predictions to gauge accuracy
quick_test_predictions(tree_class, stroke_data, stroke_labels)

# Gauge Accuracy with Cost Function
analyze_cost_function(tree_class, stroke_data_prepared, stroke_labels, log_loss)

# Gauge Accuracy with Cross Fold Validation
cross_fold_validation(tree_class, stroke_data_prepared, stroke_labels)

Predictions: [0 0 0 0 0]
Labels: [0, 0, 0, 0, 0]
0.0%
Scores
91.44%
89.0%
92.42%
90.22%
89.73%
90.22%
90.22%
91.69%
91.18%
92.89%


### Random Forest Classifier

In [184]:
from sklearn.ensemble import RandomForestClassifier

# Create Model
forest_class = RandomForestClassifier()
forest_class.fit(stroke_data_prepared, stroke_labels)

# Test Some Predictions to gauge accuracy
quick_test_predictions(forest_class, stroke_data, stroke_labels)

# Gauge Accuracy with Cost Function
analyze_cost_function(forest_class, stroke_data_prepared, stroke_labels, log_loss)

# Gauge Accuracy with Cross Fold Validation
cross_fold_validation(forest_class, stroke_data_prepared, stroke_labels)

Predictions: [0 0 0 0 0]
Labels: [0, 0, 0, 0, 0]
3.4%
Scores
95.6%
95.35%
95.35%
95.35%
95.35%
95.11%
95.35%
95.35%
95.59%
95.59%


### GradientBoostingClassifier

In [185]:
from sklearn.ensemble import GradientBoostingClassifier

# Create Model
grad_class = GradientBoostingClassifier()
grad_class.fit(stroke_data_prepared, stroke_labels)

# Test Some Predictions to gauge accuracy
quick_test_predictions(grad_class, stroke_data, stroke_labels)

# Gauge Accuracy with Cost Function
analyze_cost_function(grad_class, stroke_data_prepared, stroke_labels, log_loss)

# Gauge Accuracy with Cross Fold Validation
cross_fold_validation(grad_class, stroke_data_prepared, stroke_labels)

Predictions: [0 0 0 0 0]
Labels: [0, 0, 0, 0, 0]
10.21%
Scores
95.6%
95.11%
95.11%
95.11%
95.11%
95.11%
95.35%
95.11%
95.34%
95.1%
