In [1]:
import os
import pandas as pd

!pip install hvplot
import hvplot.pandas

from pathlib import Path
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


#define spark version
spark_version = 'spark-3.5.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Collecting hvplot
  Downloading hvplot-0.9.2-py2.py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: hvplot
Successfully installed hvplot-0.9.2


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [51.0 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [770 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Hit:9 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,617 kB]
Hit:13 https://ppa.lau

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
from pyspark import SparkFiles
spark = SparkSession.builder.appName("Demographics").getOrCreate()

In [3]:
#Read in the data from the group github repo
url = "https://raw.githubusercontent.com/AlexFeeney/Project4_Group3/main/Data/healthcare-dataset-stroke-data.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("healthcare-dataset-stroke-data.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender|age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male| 67|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female| 61|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male| 80|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female| 49|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female| 79|           1|            0|         Yes|Self

In [4]:
#convert to pandas df
pandas_df = df.toPandas()


# Filter rows where 'gender' is 'Male' or 'Female'
filtered_df = pandas_df[(pandas_df['gender'] == 'Male') | (pandas_df['gender'] == 'Female')]
# Reassign the filtered DataFrame back to pandas_df
pandas_df = filtered_df.copy()
#check pandas df
pandas_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
# Get the target variables
y = pandas_df['stroke']
X = pandas_df.drop(['stroke', 'id','gender','ever_married','work_type','Residence_type','smoking_status','bmi'],axis=1)


In [6]:
#get the features
X.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level
0,67,0,1,228.69
1,61,0,0,202.21
2,80,0,1,105.92
3,49,0,0,171.23
4,79,1,0,174.12


In [7]:
#get the target
y.head()

0    1
1    1
2    1
3    1
4    1
Name: stroke, dtype: object

In [8]:
#Split data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1,
                                                    stratify=y)
X_train.shape

(3831, 4)

In [9]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [10]:
classifier.fit(X_train, y_train)

In [11]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9511876794570608
Testing Data Score: 0.9514866979655712


In [12]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,1


In [13]:
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.9514866979655712

In [14]:
# add target column

y = pandas_df['stroke']


In [15]:
#add and convert additional features
X = pandas_df.drop(['stroke', 'id','work_type','Residence_type','smoking_status','bmi'],axis=1)
X['ever_married'].fillna('No', inplace=True)
X['ever_married'] = X['ever_married'].map({'Yes': 1, 'No': 0})
X['gender'] = X['gender'].map({'Male': 1, 'Female': 0})
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level
0,1,67,0,1,1,228.69
1,0,61,0,0,1,202.21
2,1,80,0,1,1,105.92
3,0,49,0,0,1,171.23
4,0,79,1,0,1,174.12


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1,
                                                    stratify=y)
X_train.shape

(3831, 6)

In [17]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [18]:
classifier.fit(X_train, y_train)

In [19]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9511876794570608
Testing Data Score: 0.9514866979655712


In [20]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,1


In [21]:
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.9514866979655712

In [22]:
cm = confusion_matrix(y_test, predictions)



In [24]:
#attempt to optimiset the model
# Model Optimization
from sklearn.model_selection import GridSearchCV

# Define hyperparameters grid for logistic regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2']}  # Adjusted to include only 'l2' penalty

# Initialize logistic regression classifier
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model
best_classifier = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Print the training and testing scores of the best model
print("Training Data Score (after optimization):", best_classifier.score(X_train, y_train))
print("Testing Data Score (after optimization):", best_classifier.score(X_test, y_test))

# Feature Importance Analysis (if the best model supports it)
# For logistic regression, you can analyze coefficients as feature importances
feature_importances = best_classifier.coef_[0]
feature_names = X.columns

# Print feature importances
print("\nFeature Importances:")
for feature, importance in zip(feature_names, feature_importances):
    print(f"{feature}: {importance}")


Best Hyperparameters: {'C': 0.001, 'penalty': 'l2'}
Training Data Score (after optimization): 0.9511876794570608
Testing Data Score (after optimization): 0.9514866979655712

Feature Importances:
gender: 0.0021709465205578316
age: 0.07153149848800829
hypertension: 0.008266308556091801
heart_disease: 0.008543232209240057
avg_glucose_level: -0.004505799643233578
bmi: 0.004575498287225877


In [26]:
# Data Preprocessing
# You can apply more advanced preprocessing techniques such as handling outliers and feature scaling
from sklearn.preprocessing import RobustScaler

# Initialize the scaler
scaler = RobustScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data
X_test_scaled = scaler.transform(X_test)

# Model Optimization with Hyperparameter Tuning
# Let's perform more extensive hyperparameter tuning using GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2']}
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_classifier = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Print the training and testing scores of the best model
print("Training Data Score (after optimization):", best_classifier.score(X_train_scaled, y_train))
print("Testing Data Score (after optimization):", best_classifier.score(X_test_scaled, y_test))



Best Hyperparameters: {'C': 0.001, 'penalty': 'l2'}
Training Data Score (after optimization): 0.9511876794570608
Testing Data Score (after optimization): 0.9514866979655712
