# Stroke Prediction 

## 1. Setup and Data Loading

In [36]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Read the raw stroke dataset from a hard-coded absolute path into `df`.
DATA_PATH = "/content/stroke_data.csv"
df = pd.read_csv(DATA_PATH)

## 2. Exploratory Data Analysis (EDA)

In [38]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

Unnamed: 0,0
id,5110
gender,3
age,104
hypertension,2
heart_disease,2
ever_married,2
work_type,5
Residence_type,2
avg_glucose_level,3979
bmi,418


In [None]:
# A notebook cell only renders its last bare expression, so a bare
# df.describe() followed by df.dtypes would silently drop the summary
# statistics. Print both explicitly so each one is shown.
print(df.describe())
print(df.dtypes)

In [None]:
# Text overview of the dataset: shape, summary statistics, dtypes and the
# number of distinct values per column. (A bare df.nunique() used to sit
# at the top of this cell; its result was discarded and the same values
# are printed by the last line below.)
print("Shape of the dataset:", df.shape)
print("\nSummary statistics:\n", df.describe())
print("\nData types:\n", df.dtypes)
print("\nNumber of unique values per column:\n", df.nunique())

In [None]:
# List the distinct values of each categorical column of interest.
# Each entry pairs the exact label to print with the column it describes.
unique_value_reports = (
    ("\nUnique values in 'work_type':", 'work_type'),
    ("Unique values in 'Residence_type':", 'Residence_type'),
    ("Unique values in 'smoking_status':", 'smoking_status'),
    ("Unique values in 'gender':", 'gender'),
)
for label, column in unique_value_reports:
    print(label, df[column].unique().tolist())

In [None]:
# Show how often each category occurs in 'gender' and 'smoking_status'.
value_count_reports = (
    ("\nValue counts for 'gender':\n", 'gender'),
    ("\nValue counts for 'smoking_status':\n", 'smoking_status'),
)
for heading, column in value_count_reports:
    print(heading, df[column].value_counts())

## 3. Data Cleaning and Preprocessing

In [None]:
# Convert textual placeholders for "missing" into real NaN so that the
# isnull()-based checks in the following cells can see them.
# `np` must come from the top-of-file imports: this cell runs before the
# scikit-learn import cell further down, so it cannot rely on the numpy
# import made there.
MISSING_TOKENS = ["", " ", "NA", "N/A", "None", "-", "nan", "null", "NULL"]
df.replace(MISSING_TOKENS, np.nan, inplace=True)

In [None]:
# Per-column NaN counts before any imputation.
missing_before = df.isnull().sum()
print("Missing values before handling:\n", missing_before)

# Impute missing 'bmi' entries with the column mean.
# NOTE(review): the mean is computed over the whole frame, i.e. before the
# train/test split performed later in this file — confirm that is intended.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

# Per-column NaN counts once 'bmi' has been filled.
missing_after = df.isnull().sum()
print("\nMissing values after handling BMI:\n", missing_after)

In [None]:
# Count rows that exactly repeat an earlier row.
duplicate_row_count = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_row_count)

In [None]:
# Remove the 'id' column in place; it is not used as a model feature.
df.drop(columns=['id'], inplace=True)

In [None]:
# Handle outliers in 'avg_glucose_level' and 'bmi' using z-scores.
# `np` must come from the top-of-file imports; this cell runs before the
# scikit-learn import cell that also imports numpy.
from scipy.stats import zscore

# Values whose |z-score| exceeds this are treated as outliers. The printed
# labels below spell the same threshold out literally ("z > 3").
OUTLIER_Z_THRESHOLD = 3


def _outlier_mask(values, threshold=OUTLIER_Z_THRESHOLD):
    """Return a boolean mask of entries whose absolute z-score exceeds `threshold`."""
    return np.abs(zscore(values)) > threshold


# Flag outliers in both columns before modifying either of them.
glucose_outliers = _outlier_mask(df['avg_glucose_level'])
bmi_outliers = _outlier_mask(df['bmi'])

# Count outliers (abs(z-score) > 3)
glucose_outliers_count = np.sum(glucose_outliers)
bmi_outliers_count = np.sum(bmi_outliers)
print(f"\nGlucose outliers (z > 3): {glucose_outliers_count}")
print(f"BMI outliers (z > 3): {bmi_outliers_count}")

# Replace flagged values with the column mean.
# NOTE(review): the mean used here still includes the outliers themselves,
# and is computed before the train/test split — confirm both are intended.
df['avg_glucose_level'] = np.where(glucose_outliers, df['avg_glucose_level'].mean(), df['avg_glucose_level'])
df['bmi'] = np.where(bmi_outliers, df['bmi'].mean(), df['bmi'])

# Re-score the modified columns and report what is still beyond the
# threshold. The z-scores are recomputed against the modified columns, so
# these counts are not guaranteed to be zero.
z_score_glucose = zscore(df['avg_glucose_level'])
z_score_bmi = zscore(df['bmi'])
glucose_outliers_count = np.sum(np.abs(z_score_glucose) > OUTLIER_Z_THRESHOLD)
bmi_outliers_count = np.sum(np.abs(z_score_bmi) > OUTLIER_Z_THRESHOLD)
print(f"\nRemaining Glucose outliers (z > 3): {glucose_outliers_count}")
print(f"Remaining BMI outliers (z > 3): {bmi_outliers_count}")

In [None]:
# Display the first few rows after cleaning
# (head() with no argument returns the first five rows).
df.head()

## 4. Feature Engineering and Selection

In [None]:
# Write the cleaned frame to a CSV in the working directory, leaving the
# row index out of the file.
cleaned_csv_path = 'cleaned_stroke_data.csv'
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned data saved to 'cleaned_stroke_data.csv'")

In [None]:
# Columns carried forward into modelling: the predictor columns followed by
# the 'stroke' target. The list is hand-picked; nothing in this cell
# computes a correlation or any other importance measure.
selected_features = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
    'stroke',
]

# Work on an independent copy restricted to those columns.
df_selected = df.loc[:, selected_features].copy()

print("\nDataFrame with selected features:")
print(df_selected.head())

# Show the dtype of every selected column.
print("\nData types of selected features:\n", df_selected.dtypes)

# Sanity check before splitting: report any NaNs left in the selection.
nan_counts = df_selected.isnull().sum()
if nan_counts.sum() == 0:
    print("\nNo NaNs found in selected feature data. Proceeding.")
else:
    print("\nWarning: NaNs still found in selected feature data. Please check cleaning steps.")
    print(nan_counts)

## 5. Model Training and Evaluation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

In [None]:
# Separate the 'stroke' target (y) from the predictor columns (X).
y = df_selected['stroke']
X = df_selected.drop('stroke', axis=1)

In [None]:
# Partition the predictors by dtype: object columns are treated as
# categorical, numeric columns as numerical.
categorical = list(X.select_dtypes(include='object').columns)
numerical = list(X.select_dtypes(include='number').columns)

In [None]:
# Report which columns fall into each preprocessing group.
categorical_summary = f"Categorical columns for preprocessing: {categorical}"
numerical_summary = f"Numerical columns for preprocessing: {numerical}"
print(categorical_summary)
print(numerical_summary)

In [None]:
# Build the preprocessing stage: each column group gets its own transformer.
# Categorical columns are one-hot encoded; categories unseen during fit are
# ignored at transform time rather than raising.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
# Numerical columns are standardised.
numerical_step = ('num', StandardScaler(), numerical)

preprocessor = ColumnTransformer(
    transformers=[categorical_step, numerical_step],
    remainder='passthrough',  # columns in neither group are passed through unchanged
)

In [None]:
# Chain preprocessing and a logistic-regression classifier into a single
# estimator so both are fitted (and later saved) together.
classifier = LogisticRegression(max_iter=1000)
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', classifier),
])

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Train the model pipeline
# Fits the preprocessing step and then the classifier on the training split only.
model_pipeline.fit(X_train, y_train)

In [None]:
# Make predictions
# Predicted class labels for the held-out test split.
y_pred = model_pipeline.predict(X_test)

In [None]:

# Evaluate the fitted pipeline on the held-out test split.
test_confusion = confusion_matrix(y_test, y_pred)

# zero_division=0: when a class is never predicted its precision is
# undefined (0/0). Reporting it as 0.0 instead of 1.0 keeps the per-class
# and averaged scores from looking better than the model actually is.
test_report = classification_report(y_test, y_pred, zero_division=0)

test_accuracy = accuracy_score(y_test, y_pred)

print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)
print("\nAccuracy Score:", test_accuracy)

## 6. Saving the Model

In [None]:
import joblib

# Destination file for the serialised pipeline (relative to the working directory).
filename = 'stroke_pred.joblib'

# Persist the whole fitted pipeline (preprocessing + classifier) in one file.
joblib.dump(value=model_pipeline, filename=filename)

print(f"✅ Trained Logistic Regression model pipeline saved successfully to {filename}.")