In [225]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [226]:
# Load the dataset from a CSV file located under ./archive (path is relative
# to the notebook's working directory).
alzheimers_df = pd.read_csv("archive/alzheimers_disease_data.csv")

# Bare last expression: renders the frame as the cell output.
alzheimers_df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid
2147,6898,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,...,0,0,3.785399,0,0,0,0,1,1,XXXConfid


In [227]:
# Print column names, non-null counts and dtypes to check for missing values
# and non-numeric columns before modelling.
alzheimers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [228]:
# Number of rows per Diagnosis value, to see how balanced the target classes are.
alzheimers_df["Diagnosis"].value_counts()

Diagnosis
0    1389
1     760
Name: count, dtype: int64

In [229]:
# Target vector: the Diagnosis column.
y = alzheimers_df["Diagnosis"]

# Feature matrix: every column except PatientID, DoctorInCharge and the
# target itself.
X = alzheimers_df.drop(columns=["PatientID", "DoctorInCharge", "Diagnosis"])

In [230]:
# Split features and target into train/test partitions. random_state=6 fixes
# the shuffle so the split is repeatable; no test_size is passed, so
# scikit-learn's default test fraction applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)

In [231]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted transformation to the held-out features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [232]:
# Baseline model: random forest with 500 trees and a fixed seed.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)

# Report the classifier's score on the data it was fitted on and on the
# held-out partition.
print(f'Base Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Base Testing Score: {clf.score(X_test_scaled, y_test)}')

Base Training Score: 1.0
Base Testing Score: 0.9237918215613383


In [233]:
# Pair every importance value with its column name and order the pairs from
# most to least important.
feature_importances = clf.feature_importances_

importances_sorted = list(zip(feature_importances, X.columns))
importances_sorted.sort(reverse=True)

# One (importance, column) tuple per line.
for ranked_pair in importances_sorted:
    print(ranked_pair)

(0.1824967996422388, 'FunctionalAssessment')
(0.16108053055721172, 'ADL')
(0.1317233462470854, 'MMSE')
(0.07576661240271657, 'MemoryComplaints')
(0.050259016289060016, 'BehavioralProblems')
(0.032065203237371664, 'DietQuality')
(0.02933618769074726, 'PhysicalActivity')
(0.029192554069231307, 'CholesterolHDL')
(0.02894021815137295, 'CholesterolTriglycerides')
(0.028777461086706327, 'CholesterolLDL')
(0.028487256600862015, 'BMI')
(0.028263185240259427, 'SleepQuality')
(0.02825044703791203, 'AlcoholConsumption')
(0.028109602798893588, 'CholesterolTotal')
(0.025354192387685243, 'SystolicBP')
(0.024127366575442472, 'DiastolicBP')
(0.023398990540967224, 'Age')
(0.01131728258007388, 'EducationLevel')
(0.008162277774994552, 'Ethnicity')
(0.004429311699017625, 'Gender')
(0.0037888399905859154, 'Forgetfulness')
(0.0036054540756536406, 'Depression')
(0.003583935384450165, 'DifficultyCompletingTasks')
(0.0035667559651200755, 'Diabetes')
(0.0035443164675720393, 'Smoking')
(0.003514997992766779, 'Fa

In [234]:
# Start from every candidate feature: remove the PatientID and DoctorInCharge
# columns and the Diagnosis target. `drop` already returns a new frame, so no
# explicit copy is needed, and `columns=` makes `axis=1` redundant.
narrowed_df = alzheimers_df.drop(columns=["PatientID", "DoctorInCharge", "Diagnosis"])

# Interactively prune further columns until the sentinel "NA" is entered.
# NOTE: the resulting feature set depends on what is typed, so the cells that
# consume `narrowed_df` are only reproducible if the same names are entered
# again. The removed columns are echoed at the end so the choice is recorded
# in the cell output.
removed_columns = []
while True:
    # Strip surrounding whitespace so " ADL " or "NA " behave as intended.
    user_input = input("Enter a column to remove (NA to end):").strip()
    if user_input == "NA":
        break

    if user_input not in narrowed_df.columns:
        # A typo or an already-removed name used to raise KeyError and abort
        # the whole cell; report it and keep prompting instead.
        print(f"'{user_input}' is not a remaining column; nothing removed.")
        continue

    narrowed_df = narrowed_df.drop(columns=[user_input])
    removed_columns.append(user_input)

print(f"Removed columns: {removed_columns}")


In [235]:
# Rebuild the modelling inputs from the narrowed feature frame; the target is
# still the Diagnosis column of the full dataset.
y = alzheimers_df["Diagnosis"]
X = narrowed_df.copy()

# Split with random_state=6, then scale using statistics learned from the
# training partition only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with 500 trees and a fixed seed, fitted on the narrowed features.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)

print(f'New Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'New Testing Score: {clf.score(X_test_scaled, y_test)}')

# Rank the remaining columns by importance, highest first, one
# (importance, column) tuple per line.
feature_importances = clf.feature_importances_

importances_sorted = list(zip(feature_importances, X.columns))
importances_sorted.sort(reverse=True)

for ranked_pair in importances_sorted:
    print(ranked_pair)

New Training Score: 1.0
New Testing Score: 0.7156133828996283
(0.07525981672256005, 'MemoryComplaints')
(0.06707722315085908, 'BMI')
(0.06572090670514891, 'CholesterolTriglycerides')
(0.0656192024522364, 'CholesterolTotal')
(0.06469815522681516, 'PhysicalActivity')
(0.06433476781446008, 'CholesterolHDL')
(0.06387299606067885, 'DietQuality')
(0.062405892679955076, 'AlcoholConsumption')
(0.06131237426057212, 'SleepQuality')
(0.05967062805289259, 'CholesterolLDL')
(0.05567901132110812, 'DiastolicBP')
(0.05564572058789665, 'SystolicBP')
(0.05139824536818142, 'Age')
(0.04541909073942969, 'BehavioralProblems')
(0.021109220682029372, 'EducationLevel')
(0.017245352288574715, 'Ethnicity')
(0.010064798743324105, 'Gender')
(0.009357302487280155, 'Forgetfulness')
(0.009175033499052609, 'FamilyHistoryAlzheimers')
(0.008572849220016722, 'Smoking')
(0.008401116743186868, 'Confusion')
(0.008107877075082493, 'Depression')
(0.008086182519628457, 'Hypertension')
(0.0075866648710310675, 'CardiovascularDis

In [236]:
# Smallest value in the Age column of the full dataset.
alzheimers_df["Age"].min()

60

In [237]:
# Column labels of the current feature matrix X.
cols = X.columns

# TODO: a per-column loop (`for col in cols:`) was started here but never
# written; `cols` is not used by any cell visible in this notebook excerpt.
    

In [238]:
# Subset of rows whose Diagnosis value is 1.
positive_df = alzheimers_df[alzheimers_df["Diagnosis"].eq(1)]
positive_df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
7,4758,75,0,0,1,18.776009,0,13.723826,4.649451,8.341903,...,0,0,4.517248,1,0,0,0,1,1,XXXConfid
13,4764,78,1,0,1,28.870652,1,10.194706,0.631281,1.653281,...,1,0,1.939596,0,1,0,0,0,1,XXXConfid
15,4766,69,0,0,1,18.045917,0,8.116832,2.956495,7.570633,...,0,0,1.911131,0,0,0,0,1,1,XXXConfid
16,4767,63,1,1,2,22.822896,1,4.433961,7.182895,7.929486,...,1,0,1.382086,0,0,0,0,0,1,XXXConfid
17,4768,65,1,0,1,16.333283,1,4.161795,1.306320,2.888936,...,0,0,2.892940,0,0,0,0,0,1,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2143,6894,66,1,2,1,32.013806,1,9.308706,4.352402,5.432374,...,0,0,4.544538,0,0,0,1,1,1,XXXConfid
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid


In [239]:
# Subset of rows whose Diagnosis value is 0.
negative_df = alzheimers_df[alzheimers_df["Diagnosis"].eq(0)]
negative_df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2137,6888,86,0,3,0,24.226040,0,10.890948,2.938590,5.530857,...,0,0,9.870780,0,0,0,0,0,0,XXXConfid
2139,6890,68,0,1,0,17.828965,0,2.982747,8.394826,0.317526,...,0,0,8.819115,0,0,0,1,0,0,XXXConfid
2140,6891,89,0,1,2,34.422419,0,7.770687,0.947567,5.732139,...,0,0,8.734082,0,1,0,0,0,0,XXXConfid
2141,6892,72,0,0,2,21.600144,0,19.391766,8.181469,6.640195,...,0,0,9.570776,0,0,0,0,1,0,XXXConfid


In [240]:
# Summary statistics of FunctionalAssessment for the Diagnosis == 0 rows,
# for comparison with the Diagnosis == 1 subset.
negative_df["FunctionalAssessment"].describe()

count    1389.000000
mean        5.860669
std         2.761000
min         0.011898
25%         3.850641
50%         6.244300
75%         8.121256
max         9.996467
Name: FunctionalAssessment, dtype: float64

In [241]:
# Summary statistics of FunctionalAssessment for the Diagnosis == 1 rows,
# for comparison with the Diagnosis == 0 subset.
positive_df["FunctionalAssessment"].describe()

count    760.000000
mean       3.653380
std        2.566685
min        0.000460
25%        1.576868
50%        3.299060
75%        4.920710
max        9.927945
Name: FunctionalAssessment, dtype: float64