# Logistic regression implementation


#### 1. Preprocessing Code and importing all libraries (Shared Across All Algorithms)

In [9]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as GDA
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Loading the dataset
file_path = './DATA/healthcare-dataset-stroke-data.csv' 
data = pd.read_csv(file_path)

# Data Cleaning
data = data.dropna()

# Encode categorical variables
label_encoders = {}
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le 

# Scaling numerical features
scaler = StandardScaler()
numerical_columns = ['age', 'avg_glucose_level', 'bmi']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Split data into features (X) and target (y)
X = data.drop(columns=['id', 'stroke'])  # Features
y = data['stroke']  # Target

# Split into train (70%), test (20%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42, stratify=y_temp)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


#### 1. Logistic regression

In [10]:


# Training with Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_resampled, y_resampled)

# Evaluating on Test Set
y_pred_test = lr_model.predict(X_test)
print("Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))


Logistic Regression
Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.76      0.86       944
           1       0.12      0.76      0.21        42

    accuracy                           0.76       986
   macro avg       0.56      0.76      0.54       986
weighted avg       0.95      0.76      0.83       986

Confusion Matrix:
[[720 224]
 [ 10  32]]


#### 2. Decision Tree

In [11]:


# Training with Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_resampled, y_resampled)

# Evaluating with Test Set
y_pred_test = dt_model.predict(X_test)
print("Decision Tree")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))


Decision Tree
Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       944
           1       0.11      0.29      0.16        42

    accuracy                           0.87       986
   macro avg       0.54      0.59      0.54       986
weighted avg       0.93      0.87      0.90       986

Confusion Matrix:
[[844 100]
 [ 30  12]]


#### 3. Random Forest

In [12]:


# Training with Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_resampled, y_resampled)

# Evaluating on Test Set
y_pred_test = rf_model.predict(X_test)
print("Random Forest")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))


Random Forest
Accuracy: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       944
           1       0.11      0.21      0.15        42

    accuracy                           0.89       986
   macro avg       0.54      0.57      0.54       986
weighted avg       0.93      0.89      0.91       986

Confusion Matrix:
[[872  72]
 [ 33   9]]


#### 4. Support Vector Machine 

In [13]:


# Training with SVM
svm_model = SVC(random_state=42, kernel='linear', probability=True)
svm_model.fit(X_resampled, y_resampled)

# Evaluating on Test Set
y_pred_test = svm_model.predict(X_test)
print("Support Vector Machine")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))


Support Vector Machine
Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.75      0.85       944
           1       0.12      0.76      0.20        42

    accuracy                           0.75       986
   macro avg       0.55      0.75      0.53       986
weighted avg       0.95      0.75      0.82       986

Confusion Matrix:
[[705 239]
 [ 10  32]]


#### 5.Gradient Boosting (XGBoost)

In [14]:


# Train Gradient Boosting (XGBoost)
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')  
xgb_model.fit(X_resampled, y_resampled)

# Evaluate on Test Set
y_pred_test = xgb_model.predict(X_test)
print("Gradient Boosting (XGBoost)")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))


Gradient Boosting (XGBoost)
Accuracy: 0.88
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       944
           1       0.08      0.17      0.11        42

    accuracy                           0.88       986
   macro avg       0.52      0.54      0.52       986
weighted avg       0.92      0.88      0.90       986

Confusion Matrix:
[[864  80]
 [ 35   7]]


### 6. Gaussian Discriminant Analysis GDA

In [None]:


# Train Gaussian Discriminant Analysis (GDA)
gda_model = GDA()  
gda_model.fit(X_resampled, y_resampled) 

# Evaluate on Test Set
y_pred_test = gda_model.predict(X_test)  
print("Gaussian Discriminant Analysis (GDA)")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))


Gaussian Discriminant Analysis (GDA)
Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.75      0.85       944
           1       0.12      0.74      0.20        42

    accuracy                           0.75       986
   macro avg       0.55      0.75      0.53       986
weighted avg       0.95      0.75      0.83       986

Confusion Matrix:
[[712 232]
 [ 11  31]]


### 7. Naive Bayes

In [16]:


# Train Naive Bayes
nb_model = GaussianNB()  
nb_model.fit(X_resampled, y_resampled)  

# Evaluate on Test Set
y_pred_test = nb_model.predict(X_test) 
print("Naive Bayes")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))


Naive Bayes
Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.73      0.84       944
           1       0.11      0.79      0.20        42

    accuracy                           0.73       986
   macro avg       0.55      0.76      0.52       986
weighted avg       0.95      0.73      0.81       986

Confusion Matrix:
[[688 256]
 [  9  33]]


## Comparing the algorithms used

In [19]:



results = {
    "Algorithm": [],
    "Accuracy": [],
    "Precision (macro)": [],
    "Recall (macro)": [],
    "F1-score (macro)": []
}


def evaluate_model(model, algorithm_name, X_test, y_test):
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    
    results["Algorithm"].append(algorithm_name)
    results["Accuracy"].append(accuracy_score(y_test, y_pred))
    results["Precision (macro)"].append(report["macro avg"]["precision"])
    results["Recall (macro)"].append(report["macro avg"]["recall"])
    results["F1-score (macro)"].append(report["macro avg"]["f1-score"])

# Evaluate pre-trained models
evaluate_model(lr_model, "Logistic Regression", X_test, y_test)
evaluate_model(dt_model, "Decision Tree", X_test, y_test)
evaluate_model(rf_model, "Random Forest", X_test, y_test)
evaluate_model(svm_model, "Support Vector Machine", X_test, y_test)
evaluate_model(xgb_model, "Gradient Boosting (XGBoost)", X_test, y_test)
evaluate_model(gda_model, "Gaussian Discriminant Analysis(GDA)", X_test, y_test)
evaluate_model(nb_model, "Naive Bayes", X_test, y_test)
# Tabulate Results
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Algorithm,Accuracy,Precision (macro),Recall (macro),F1-score (macro)
0,Logistic Regression,0.762677,0.555651,0.762308,0.53749
1,Decision Tree,0.868154,0.536409,0.589891,0.542169
2,Random Forest,0.893509,0.537324,0.569007,0.544777
3,Support Vector Machine,0.747465,0.552048,0.754363,0.527191
4,Gradient Boosting (XGBoost),0.883367,0.520764,0.54096,0.523064
5,Gaussian Discriminant Analysis(GDA),0.75355,0.551328,0.746166,0.528754
6,Naive Bayes,0.731237,0.550637,0.757264,0.518954
