<a href="https://colab.research.google.com/github/tharund7/IT1703/blob/main/Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from IPython.display import display, Markdown
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')


# Loading the Dataset

In [2]:
# Load the dataset
stroke_data = pd.read_csv('stroke.csv')

# Display the first few rows of the dataset
display(stroke_data.head())


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Preprocessing  the Data

In [3]:
# Check for missing values
display(Markdown("### Missing Values"))
display(stroke_data.isnull().sum())

# Fill missing values (e.g., using mean for numerical columns)
stroke_data['bmi'].fillna(stroke_data['bmi'].mean(), inplace=True)

# Convert categorical variables using one-hot encoding
stroke_data = pd.get_dummies(stroke_data, drop_first=True)

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['age', 'avg_glucose_level', 'bmi']
stroke_data[numerical_features] = scaler.fit_transform(stroke_data[numerical_features])




### Missing Values

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [4]:
# Display the first few rows after preprocessing
display(Markdown("### Data After Preprocessing"))
display(stroke_data.head())

### Data After Preprocessing

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1.051434,0,1,2.706375,1.001234,1,True,False,True,False,True,False,False,True,True,False,False
1,51676,0.78607,0,0,2.121559,4.615554e-16,1,False,False,True,False,False,True,False,False,False,True,False
2,31112,1.62639,0,1,-0.005028,0.4685773,1,True,False,True,False,True,False,False,False,False,True,False
3,60182,0.255342,0,0,1.437358,0.7154182,1,False,False,True,False,True,False,False,True,False,False,True
4,1665,1.582163,1,0,1.501184,-0.6357112,1,False,False,True,False,False,True,False,False,False,True,False


# Defining Target and Features

In [5]:
# Define target and features
X = stroke_data.drop('stroke', axis=1)
y = stroke_data['stroke']

# Display shapes of X and y
display(Markdown(f"### Features Shape: {X.shape}"))
display(Markdown(f"### Target Shape: {y.shape}"))


### Features Shape: (5110, 17)

### Target Shape: (5110,)

# Handling Imbalanced Data

In [6]:
# Use SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Display shapes of resampled data
display(Markdown(f"### Resampled Features Shape: {X_res.shape}"))
display(Markdown(f"### Resampled Target Shape: {y_res.shape}"))


### Resampled Features Shape: (9722, 17)

### Resampled Target Shape: (9722,)

# Spliting the Data

In [7]:
# Split the resampled data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Display shapes of train and test sets
display(Markdown(f"### Training Set Shape: {X_train.shape}, {y_train.shape}"))
display(Markdown(f"### Test Set Shape: {X_test.shape}, {y_test.shape}"))


### Training Set Shape: (7777, 17), (7777,)

### Test Set Shape: (1945, 17), (1945,)

# Training the Models

In [8]:
# Initialize models with class weights
log_reg = LogisticRegression(random_state=42, max_iter=200, class_weight='balanced')
rand_forest = RandomForestClassifier(random_state=42, class_weight='balanced')
grad_boost = GradientBoostingClassifier(random_state=42)

# Train models
log_reg.fit(X_train, y_train)
rand_forest.fit(X_train, y_train)
grad_boost.fit(X_train, y_train)


# Making  Predictions

In [9]:
# Predict on test set
log_reg_pred = log_reg.predict(X_test)
rand_forest_pred = rand_forest.predict(X_test)
grad_boost_pred = grad_boost.predict(X_test)


# Evaluating the Models

In [10]:
# Define a function to evaluate models
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    return accuracy, precision, recall, f1

# Evaluate models
log_reg_eval = evaluate_model(y_test, log_reg_pred)
rand_forest_eval = evaluate_model(y_test, rand_forest_pred)
grad_boost_eval = evaluate_model(y_test, grad_boost_pred)

# Display results
results = {
    'Model': ['Logistic Regression', 'Random Forest', 'Gradient Boosting'],
    'Accuracy': [log_reg_eval[0], rand_forest_eval[0], grad_boost_eval[0]],
    'Precision': [log_reg_eval[1], rand_forest_eval[1], grad_boost_eval[1]],
    'Recall': [log_reg_eval[2], rand_forest_eval[2], grad_boost_eval[2]],
    'F1 Score': [log_reg_eval[3], rand_forest_eval[3], grad_boost_eval[3]]
}

results_df = pd.DataFrame(results)

display(Markdown("### Model Evaluation Results"))
display(results_df)


### Model Evaluation Results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.751671,0.703766,0.86701,0.776905
1,Random Forest,0.942416,0.935976,0.949485,0.942682
2,Gradient Boosting,0.913625,0.91002,0.917526,0.913758
