In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier as SparkRandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier as SparkDecisionTreeClassifier
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource

# Set seed for reproducibility
np.random.seed(5)
random_state = np.random.randint(0, 10000)

output_notebook()


In [4]:
# Create Spark session
spark = SparkSession.builder \
    .appName("Alzheimer's Disease Analysis") \
    .getOrCreate()

# Load the CSV file into a Spark DataFrame
file_path = "Resources/alzheimers_disease_data.csv"
alzheimers_df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the schema of the DataFrame
alzheimers_df.printSchema()

# Display the first few rows of the DataFrame
alzheimers_df.show(5)


root
 |-- PatientID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Ethnicity: integer (nullable = true)
 |-- EducationLevel: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Smoking: integer (nullable = true)
 |-- AlcoholConsumption: double (nullable = true)
 |-- PhysicalActivity: double (nullable = true)
 |-- DietQuality: double (nullable = true)
 |-- SleepQuality: double (nullable = true)
 |-- FamilyHistoryAlzheimers: integer (nullable = true)
 |-- CardiovascularDisease: integer (nullable = true)
 |-- Diabetes: integer (nullable = true)
 |-- Depression: integer (nullable = true)
 |-- HeadInjury: integer (nullable = true)
 |-- Hypertension: integer (nullable = true)
 |-- SystolicBP: integer (nullable = true)
 |-- DiastolicBP: integer (nullable = true)
 |-- CholesterolTotal: double (nullable = true)
 |-- CholesterolLDL: double (nullable = true)
 |-- CholesterolHDL: double (nullable = true)
 |-- CholesterolTrig

In [5]:
# Separate target variable and features
features = alzheimers_df.columns
features.remove('PatientID')
features.remove('Diagnosis')
features.remove('DoctorInCharge')

# StringIndexer for the target variable
indexer = StringIndexer(inputCol="Diagnosis", outputCol="label")

# VectorAssembler for the feature variables
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Create a pipeline to transform the data
pipeline = Pipeline(stages=[indexer, assembler])

# Fit and transform the data
transformed_data = pipeline.fit(alzheimers_df).transform(alzheimers_df)

# Split the data into training and testing sets
train_data, test_data = transformed_data.randomSplit([0.8, 0.2], seed=random_state)


In [6]:
# Build Random Forest model
rf = SparkRandomForestClassifier(featuresCol="features", labelCol="label", seed=random_state)
rf_model = rf.fit(train_data)

# Make predictions
rf_predictions = rf_model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy_rf = evaluator.evaluate(rf_predictions)

conf_matrix_rf = rf_predictions.groupBy('label', 'prediction').count().show()
class_report_rf = rf_predictions.groupBy('label', 'prediction').count().toPandas()

# Display results
print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest Confusion Matrix:\n", conf_matrix_rf)
print("Random Forest Classification Report:\n", class_report_rf)


+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  114|
|  0.0|       1.0|    6|
|  1.0|       0.0|   35|
|  0.0|       0.0|  259|
+-----+----------+-----+

Random Forest Accuracy: 0.9009661835748792
Random Forest Confusion Matrix:
 None
Random Forest Classification Report:
    label  prediction  count
0    1.0         1.0    114
1    0.0         1.0      6
2    1.0         0.0     35
3    0.0         0.0    259


In [7]:
# Build Decision Tree model
dt = SparkDecisionTreeClassifier(featuresCol="features", labelCol="label", seed=random_state)
dt_model = dt.fit(train_data)

# Make predictions
dt_predictions = dt_model.transform(test_data)

# Evaluate the model
accuracy_dt = evaluator.evaluate(dt_predictions)

conf_matrix_dt = dt_predictions.groupBy('label', 'prediction').count().show()
class_report_dt = dt_predictions.groupBy('label', 'prediction').count().toPandas()

# Display results
print("Decision Tree Accuracy:", accuracy_dt)
print("Decision Tree Confusion Matrix:\n", conf_matrix_dt)
print("Decision Tree Classification Report:\n", class_report_dt)


+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  135|
|  0.0|       1.0|   10|
|  1.0|       0.0|   14|
|  0.0|       0.0|  255|
+-----+----------+-----+

Decision Tree Accuracy: 0.9420289855072463
Decision Tree Confusion Matrix:
 None
Decision Tree Classification Report:
    label  prediction  count
0    1.0         1.0    135
1    0.0         1.0     10
2    1.0         0.0     14
3    0.0         0.0    255


### ROC (Receiver Operating Characteristic) Curve and AUC (Area Under the Curve)
The Receiver Operating Characteristic (ROC) curve is a graphical plot that illustrates the diagnostic ability of a binary classifier as its discrimination threshold is varied. The Area Under the Curve (AUC) measures the entire two-dimensional area underneath the entire ROC curve.

In [8]:
# Compute ROC curve and AUC for Random Forest
rf_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName="areaUnderROC")
auc_rf = rf_evaluator.evaluate(rf_predictions)

# Compute ROC curve and AUC for Decision Tree
dt_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName="areaUnderROC")
auc_dt = dt_evaluator.evaluate(dt_predictions)

# Display AUC results
print("Random Forest AUC:", auc_rf)
print("Decision Tree AUC:", auc_dt)


Random Forest AUC: 0.8712295808534887
Decision Tree AUC: 0.9341522096998861


In [16]:
# Feature importance for Random Forest
rf_feature_importance = rf_model.featureImportances
features = alzheimers_df.columns
features.remove('PatientID')
features.remove('Diagnosis')
features.remove('DoctorInCharge')

rf_importance_df = pd.DataFrame(list(zip(features, rf_feature_importance)), columns=["Feature", "Importance"])
rf_importance_df = rf_importance_df.sort_values(by="Importance", ascending=True)

# Create a ColumnDataSource for Bokeh
source_rf = ColumnDataSource(rf_importance_df)

# Create a horizontal bar chart
p_rf = figure(y_range=rf_importance_df['Feature'], title="Feature Importance in Random Forest Model",
              x_axis_label='Importance', y_axis_label='Feature')

p_rf.hbar(y='Feature', right='Importance', source=source_rf, height=0.5)

show(p_rf)


In [17]:
# Feature importance for Decision Tree
dt_feature_importance = dt_model.featureImportances
dt_importance_df = pd.DataFrame(list(zip(features, dt_feature_importance)), columns=["Feature", "Importance"])
dt_importance_df = dt_importance_df.sort_values(by="Importance", ascending=True)

# Create a ColumnDataSource for Bokeh
source_dt = ColumnDataSource(dt_importance_df)

# Create a horizontal bar chart
p_dt = figure(y_range=dt_importance_df['Feature'], title="Feature Importance in Decision Tree Model",
              x_axis_label='Importance', y_axis_label='Feature')

p_dt.hbar(y='Feature', right='Importance', source=source_dt, height=0.5)

show(p_dt)


In [11]:
# Overall model performance display
print("Random Forest Accuracy:", accuracy_rf)
print("Random Forest AUC:", auc_rf)
print("Decision Tree Accuracy:", accuracy_dt)
print("Decision Tree AUC:", auc_dt)


Random Forest Accuracy: 0.9009661835748792
Random Forest AUC: 0.8712295808534887
Decision Tree Accuracy: 0.9420289855072463
Decision Tree AUC: 0.9341522096998861
