In [49]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
from collections import Counter
from matplotlib import style
import seaborn as sns
import sqlite3
from sqlalchemy import create_engine, text
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Create a DataFrame for the healthcare-dataset-stroke-data.csv. 
file_path = Path("Resources/alzheimers_disease_data.csv")
alzheimer_df = pd.read_csv(file_path)
alzheimer_df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


---
### Data cleaning and preparation process 


In [5]:
# determine the number of rows and columns.
alzheimer_df_rc, alzheimer_df_cc = alzheimer_df.shape
print('Number of total rows:', alzheimer_df_rc)
print('Number of total columns:', alzheimer_df_cc)

Number of total rows: 2149
Number of total columns: 35


In [6]:
# Check all columns inside of the DataFrame
alzheimer_df.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
       'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
       'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP',
       'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'Diagnosis', 'DoctorInCharge'],
      dtype='object')

In [7]:
# show duplicates
duplicate = alzheimer_df[alzheimer_df.duplicated()]
print("Duplicate Rows:", len(duplicate), "\n")

Duplicate Rows: 0 



In [8]:
# Check for missing values
alzheimer_df.isna().sum()

PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfu

In [9]:
# Drop all rows with missing information 
alzheimer_df = alzheimer_df.dropna(how='any')

In [10]:
# print out columns and number of unique values
for col in alzheimer_df.columns:
    print(col, alzheimer_df[col].nunique())

PatientID 2149
Age 31
Gender 2
Ethnicity 4
EducationLevel 4
BMI 2149
Smoking 2
AlcoholConsumption 2149
PhysicalActivity 2149
DietQuality 2149
SleepQuality 2149
FamilyHistoryAlzheimers 2
CardiovascularDisease 2
Diabetes 2
Depression 2
HeadInjury 2
Hypertension 2
SystolicBP 90
DiastolicBP 60
CholesterolTotal 2149
CholesterolLDL 2149
CholesterolHDL 2149
CholesterolTriglycerides 2149
MMSE 2149
FunctionalAssessment 2149
MemoryComplaints 2
BehavioralProblems 2
ADL 2149
Confusion 2
Disorientation 2
PersonalityChanges 2
DifficultyCompletingTasks 2
Forgetfulness 2
Diagnosis 2
DoctorInCharge 1


---
### Exploratory Data Analysis (EDA)



In [11]:
# Look at the stroke outcome value counts
alzheimer_counts = alzheimer_df['Diagnosis'].value_counts()
alzheimer_counts

Diagnosis
0    1389
1     760
Name: count, dtype: int64

In [12]:
# keep only columns of "Ethinicity", "Gender", , "Age", "EducationLevel", "MemoryComplaints", "BehavioralProblems"
alzheimer_cleanML_df = alzheimer_df[['Ethnicity', 'Gender', 'Age', 'EducationLevel', 'Diagnosis']]
alzheimer_cleanML_df.head()

Unnamed: 0,Ethnicity,Gender,Age,EducationLevel,Diagnosis
0,0,0,73,2,0
1,0,0,89,0,0
2,3,0,73,1,0
3,0,1,74,1,0
4,0,0,89,0,0


### EDA modeling 

**Age: The age of the patients ranges from 60 to 90 years.
**Gender: Gender of the patients, where 0 represents Male and 1 represents Female.
**Ethnicity: The ethnicity of the patients, coded as follows:
0: Caucasian
1: African American
2: Asian
3: Other
**EducationLevel: The education level of the patients, coded as follows:
0: None
1: High School
2: Bachelor's
3: Higher

In [13]:
# Create new columns for each ethnicity and education level
alzheimer_cleanML_df['Caucasian'] = (alzheimer_cleanML_df['Ethnicity'] == 0).astype(int)
alzheimer_cleanML_df['African American'] = (alzheimer_cleanML_df['Ethnicity'] == 1).astype(int)
alzheimer_cleanML_df['Asian'] = (alzheimer_cleanML_df['Ethnicity'] == 2).astype(int)
alzheimer_cleanML_df['Other'] = (alzheimer_cleanML_df['Ethnicity'] == 3).astype(int)

alzheimer_cleanML_df['None'] = (alzheimer_cleanML_df['EducationLevel'] == 0).astype(int)
alzheimer_cleanML_df['High School'] = (alzheimer_cleanML_df['EducationLevel'] == 1).astype(int)
alzheimer_cleanML_df['Bachelor\'s'] = (alzheimer_cleanML_df['EducationLevel'] == 2).astype(int)
alzheimer_cleanML_df['Higher'] = (alzheimer_cleanML_df['EducationLevel'] == 3).astype(int)

# Drop the original Ethnicity and EducationLevel columns
alzheimer_cleanML_df = alzheimer_cleanML_df.drop('Ethnicity', axis=1)
alzheimer_cleanML_df = alzheimer_cleanML_df.drop('EducationLevel', axis=1)

# Display the first few rows of the reshaped dataframe
alzheimer_cleanML_df.head()

Unnamed: 0,Gender,Age,Diagnosis,Caucasian,African American,Asian,Other,None,High School,Bachelor's,Higher
0,0,73,0,1,0,0,0,0,0,1,0
1,0,89,0,1,0,0,0,1,0,0,0
2,0,73,0,0,0,0,1,0,1,0,0
3,1,74,0,1,0,0,0,0,1,0,0
4,0,89,0,1,0,0,0,1,0,0,0


In [14]:
# Save the dataframe to a CSV file
alzheimer_cleanML_df.to_csv('Resources/alzheimer_EGAE_ML_df.csv', index=False)

In [15]:
# Create a SQLite database engine
engine = create_engine('sqlite:///alzheimer_EGAE_ML_data.db')

# Write the DataFrame to a SQL table
alzheimer_cleanML_df.to_sql('alzheimer_EGAE_ML_data', engine, index=False, if_exists='replace')

# Generate SQL file
with engine.connect() as conn:
    with open('alzheimer_EGAE_ML_data.sql', 'w') as f:
        result = conn.execute(text('SELECT * FROM alzheimer_EGAE_ML_data'))
        for line in result.fetchall():
            f.write(f"INSERT INTO alzheimer_EGAE_ML_data VALUES {line};\n")

In [16]:
# Connect to the database 
conn = sqlite3.connect('alzheimer_EGAE_ML_data.db')
cursor = conn.cursor()

# Execute an SQL query
cursor.execute('SELECT * FROM alzheimer_EGAE_ML_data')

# Fetch the data
rows = cursor.fetchall()

# Close the connection
conn.close()

In [17]:
# Split features and target
X = alzheimer_cleanML_df.drop('Diagnosis', axis=1)
y = alzheimer_cleanML_df['Diagnosis']

In [18]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age']),
        ('cat', OneHotEncoder(drop='first'), ['Gender', 'Caucasian', 'African American', 'Asian','Other','None','High School', 'Bachelor\'s', 'Higher'])
    ])


In [20]:
# Create a pipeline
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])


In [21]:
# Fit the model
clf.fit(X_train, y_train)

In [22]:
# Make predictions
y_pred = clf.predict(X_test)

In [23]:
# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.73      0.68       277
           1       0.32      0.24      0.27       153

    accuracy                           0.55       430
   macro avg       0.48      0.48      0.47       430
weighted avg       0.52      0.55      0.53       430



### Expend dataset

In [35]:
# Add more columns to current dataframe 
alzheimer_cleanML_opt_df = alzheimer_cleanML_df[['Diagnosis','Age', 'Gender', 'Caucasian', 'African American', 'Asian','Other','None','High School', 'Bachelor\'s', 'Higher']].merge(alzheimer_df[['MMSE', 'FunctionalAssessment','MemoryComplaints', 'BehavioralProblems', 'ADL',]], left_index=True, right_index=True)
alzheimer_cleanML_opt_df.head()

Unnamed: 0,Diagnosis,Age,Gender,Caucasian,African American,Asian,Other,None,High School,Bachelor's,Higher,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL
0,0,73,0,1,0,0,0,0,0,1,0,21.463532,6.518877,0,0,1.725883
1,0,89,0,1,0,0,0,1,0,0,0,20.613267,7.118696,0,0,2.592424
2,0,73,0,0,0,0,1,0,1,0,0,7.356249,5.895077,0,0,7.119548
3,0,74,1,1,0,0,0,0,1,0,0,13.991127,8.965106,0,1,6.481226
4,0,89,0,1,0,0,0,1,0,0,0,13.517609,6.045039,0,0,0.014691


In [36]:
# Split features and target
X1 = alzheimer_cleanML_opt_df.drop('Diagnosis', axis=1)
y1 = alzheimer_cleanML_opt_df['Diagnosis']

In [37]:
# Split into train and test sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

### Logistic Regression

In [38]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression = LogisticRegression(solver='lbfgs', random_state=1)
logistic_regression

# Fit the model using training data
LR_model = logistic_regression.fit(X1_train,y1_train)

In [54]:
# Make a prediction using the testing data
predictions = logistic_regression.predict(X1_test)
LR_predictions = pd.DataFrame({"Predictions": predictions, "Actual":y1_test}).reset_index(drop=True)
LR_predictions

Unnamed: 0,Predictions,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
425,0,0
426,0,0
427,1,1
428,0,0


In [41]:
# Generate a confusion matrix for the model
confusion_matrix(predictions,y1_test)

array([[248,  43],
       [ 29, 110]], dtype=int64)

In [42]:
# Print the classification report for the model
print(classification_report(predictions,y1_test))

              precision    recall  f1-score   support

           0       0.90      0.85      0.87       291
           1       0.72      0.79      0.75       139

    accuracy                           0.83       430
   macro avg       0.81      0.82      0.81       430
weighted avg       0.84      0.83      0.83       430



In [48]:
print('\nLogistic Regression result: \n1.Non-patient: Precision 0.90, recall 0.85 \n2.Patient: Precision 0.72, recall 0.79 \n3.Overall: Accuracy 0.85')


Logistic Regression result: 
1.Non-patient: Precision 0.90, recall 0.85 
2.Patient: Precision 0.72, recall 0.79 
3.Overall: Accuracy 0.85


### RandomForestClassifier 

In [50]:
# Apply RandomOverSampler
ros = RandomOverSampler(random_state=42)
X1_train_resampled, y1_train_resampled = ros.fit_resample(X1_train, y1_train)

In [51]:
# Create and train the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X1_train_resampled, y1_train_resampled)

In [52]:
# Make predictions
y1_pred = rf_classifier.predict(X1_test)

In [53]:
# Evaluate the model
print(confusion_matrix(y1_test, y1_pred))
print(classification_report(y1_test, y1_pred))

[[272   5]
 [ 16 137]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       277
           1       0.96      0.90      0.93       153

    accuracy                           0.95       430
   macro avg       0.95      0.94      0.95       430
weighted avg       0.95      0.95      0.95       430



In [55]:
print('\nRandomForestClassifier modeling result: \n1.Non-patient: Precision 0.94, recall 0.98 \n2.Patient: Precision 0.96, recall 0.90 \n3.Overall: Accuracy 0.95')


RandomForestClassifier modeling result: 
1.Non-patient: Precision 0.94, recall 0.98 
2.Patient: Precision 0.96, recall 0.90 
3.Overall: Accuracy 0.95
