In [1]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


In [2]:
alzheimers_df = pd.read_csv("archive/alzheimers_disease_data.csv")

alzheimers_df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid
2147,6898,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,...,0,0,3.785399,0,0,0,0,1,1,XXXConfid


In [3]:
#Get general information from the dataset, including data types and null counts
alzheimers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [4]:
#checking number of positive and negative diagonoses
alzheimers_df["Diagnosis"].value_counts()

Diagnosis
0    1389
1     760
Name: count, dtype: int64

In [5]:
#Preparing the target (Diagnosis) and dependant variable set (exluding irrelevant columns) 
X = alzheimers_df.drop(columns=["PatientID","DoctorInCharge","Diagnosis"], axis = 1)
y = alzheimers_df["Diagnosis"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)

In [7]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)

print(f'Base Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Base Testing Score: {clf.score(X_test_scaled, y_test)}')

Base Training Score: 1.0
Base Testing Score: 0.9237918215613383


In [9]:
feature_importances = clf.feature_importances_

importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
for feature in importances_sorted:
    print(feature)

(0.1824967996422388, 'FunctionalAssessment')
(0.16108053055721172, 'ADL')
(0.1317233462470854, 'MMSE')
(0.07576661240271657, 'MemoryComplaints')
(0.050259016289060016, 'BehavioralProblems')
(0.032065203237371664, 'DietQuality')
(0.02933618769074726, 'PhysicalActivity')
(0.029192554069231307, 'CholesterolHDL')
(0.02894021815137295, 'CholesterolTriglycerides')
(0.028777461086706327, 'CholesterolLDL')
(0.028487256600862015, 'BMI')
(0.028263185240259427, 'SleepQuality')
(0.02825044703791203, 'AlcoholConsumption')
(0.028109602798893588, 'CholesterolTotal')
(0.025354192387685243, 'SystolicBP')
(0.024127366575442472, 'DiastolicBP')
(0.023398990540967224, 'Age')
(0.01131728258007388, 'EducationLevel')
(0.008162277774994552, 'Ethnicity')
(0.004429311699017625, 'Gender')
(0.0037888399905859154, 'Forgetfulness')
(0.0036054540756536406, 'Depression')
(0.003583935384450165, 'DifficultyCompletingTasks')
(0.0035667559651200755, 'Diabetes')
(0.0035443164675720393, 'Smoking')
(0.003514997992766779, 'Fa

In [10]:
param_grid = {
    "max_depth": np.arange(1,20,2),
    "n_estimators": [300,500,1000]
}

rf_model = RandomForestClassifier()
#ask Adriana about GridSearchCV vs RandomizedSearchCV

grid_clf = RandomizedSearchCV(rf_model, param_grid, verbose=2, random_state=1)

grid_clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ....................max_depth=11, n_estimators=1000; total time=   3.2s
[CV] END ....................max_depth=11, n_estimators=1000; total time=   3.2s
[CV] END ....................max_depth=11, n_estimators=1000; total time=   3.2s
[CV] END ....................max_depth=11, n_estimators=1000; total time=   3.2s
[CV] END ....................max_depth=11, n_estimators=1000; total time=   3.2s
[CV] END .....................max_depth=15, n_estimators=300; total time=   0.9s
[CV] END .....................max_depth=15, n_estimators=300; total time=   0.9s
[CV] END .....................max_depth=15, n_estimators=300; total time=   0.9s
[CV] END .....................max_depth=15, n_estimators=300; total time=   0.9s
[CV] END .....................max_depth=15, n_estimators=300; total time=   0.9s
[CV] END ......................max_depth=7, n_estimators=500; total time=   1.4s
[CV] END ......................max_depth=7, n_es

In [11]:
print(grid_clf.best_params_)
print(grid_clf.best_score_)

{'n_estimators': 500, 'max_depth': 13}
0.9472395823317885


In [12]:
from sklearn.model_selection import cross_val_score

In [13]:
#clf = RandomForestClassifier(n_estimators=1000, max_depth=19, random_state=1).fit(X_train_scaled, y_train)

clf = RandomForestClassifier(**grid_clf.best_params_, random_state=1).fit(X_train_scaled, y_train)
scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)

print(f'New Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'New Testing Score: {clf.score(X_test_scaled, y_test)}')


New Training Score: 0.9993792675356921
New Testing Score: 0.9256505576208178


In [14]:
grid_clf.best_params_

{'n_estimators': 500, 'max_depth': 13}

In [15]:
clf.score(X_test_scaled, y_test)

0.9256505576208178

In [16]:
scores

array([0.9504644 , 0.9378882 , 0.95031056, 0.97515528, 0.92236025])

In [17]:
target_names = ["negative", "positive"]
9275092936802974
grid_y_pred = grid_clf.predict(X_test_scaled)
print(classification_report(y_test, grid_y_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.93      0.97      0.95       353
    positive       0.93      0.86      0.89       185

    accuracy                           0.93       538
   macro avg       0.93      0.91      0.92       538
weighted avg       0.93      0.93      0.93       538



In [23]:
classification_report(y_test, grid_y_pred, output_dict=True)['1']['recall']

0.8594594594594595

In [36]:
best_estimators = grid_clf.best_params_["n_estimators"]
best_max_depth = grid_clf.best_params_["max_depth"]


clf = RandomForestClassifier(n_estimators=best_estimators, max_depth=best_max_depth).fit(X_train_scaled, y_train)
print(f'New Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'New Testing Score: {clf.score(X_test_scaled, y_test)}')


New Training Score: 0.9993792675356921
New Testing Score: 0.9275092936802974


In [None]:
feature_importances = clf.feature_importances_
importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
for feature in importances_sorted:
    print(feature)

In [45]:
param_grid = {
    "max_depth": np.arange(5,15,2),
    "n_estimators": [300,500,1000]
}

rf_model = RandomForestClassifier()


grid_clf = GridSearchCV(rf_model, param_grid, verbose=3)

features = importances_sorted.copy()
score_list = []
while len(features)>10:

    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    grid_clf.fit(X_train_scaled, y_train)
    
    print(grid_clf.best_params_)
    print(grid_clf.best_score_)
    score_list.append(grid_clf.best_score_)

    print("\n\n")
    X=X.drop(columns=features[-1][1])
    print(f"Removing the following column: {features.pop()[1]}")


Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END .....max_depth=5, n_estimators=300;, score=0.864 total time=   0.7s
[CV 2/5] END .....max_depth=5, n_estimators=300;, score=0.879 total time=   0.7s
[CV 3/5] END .....max_depth=5, n_estimators=300;, score=0.894 total time=   0.6s
[CV 4/5] END .....max_depth=5, n_estimators=300;, score=0.922 total time=   0.6s
[CV 5/5] END .....max_depth=5, n_estimators=300;, score=0.870 total time=   0.6s
[CV 1/5] END .....max_depth=5, n_estimators=500;, score=0.876 total time=   1.1s
[CV 2/5] END .....max_depth=5, n_estimators=500;, score=0.888 total time=   1.2s
[CV 3/5] END .....max_depth=5, n_estimators=500;, score=0.888 total time=   1.1s
[CV 4/5] END .....max_depth=5, n_estimators=500;, score=0.916 total time=   1.1s
[CV 5/5] END .....max_depth=5, n_estimators=500;, score=0.876 total time=   1.1s
[CV 1/5] END ....max_depth=5, n_estimators=1000;, score=0.861 total time=   2.3s
[CV 2/5] END ....max_depth=5, n_estimators=1000;

KeyboardInterrupt: 

In [42]:
test = [(1,2),(3,4)]
print(test[0][1])

2


In [None]:
clf = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=12).fit(X_train_scaled, y_train)

print(f'New Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'New Testing Score: {clf.score(X_test_scaled, y_test)}')

In [None]:
X = narrowed_df.copy()
y = alzheimers_df["Diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)

print(f'New Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'New Testing Score: {clf.score(X_test_scaled, y_test)}')

feature_importances = clf.feature_importances_

importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
for feature in importances_sorted:
    print(feature)

In [236]:
narrowed_df = alzheimers_df.copy().drop(columns=["PatientID","DoctorInCharge", "Diagnosis"], axis=1)


user_input = input("Enter a column to remove (NA to end):")
while user_input!="NA":

    narrowed_df=narrowed_df.drop(columns=[user_input], axis=1)
    
    user_input = input("Enter a column to remove (NA to end):")


60

In [237]:
cols = X.columns

#for col in cols:
    

In [238]:
positive_df = alzheimers_df.loc[alzheimers_df["Diagnosis"]==1]
positive_df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
7,4758,75,0,0,1,18.776009,0,13.723826,4.649451,8.341903,...,0,0,4.517248,1,0,0,0,1,1,XXXConfid
13,4764,78,1,0,1,28.870652,1,10.194706,0.631281,1.653281,...,1,0,1.939596,0,1,0,0,0,1,XXXConfid
15,4766,69,0,0,1,18.045917,0,8.116832,2.956495,7.570633,...,0,0,1.911131,0,0,0,0,1,1,XXXConfid
16,4767,63,1,1,2,22.822896,1,4.433961,7.182895,7.929486,...,1,0,1.382086,0,0,0,0,0,1,XXXConfid
17,4768,65,1,0,1,16.333283,1,4.161795,1.306320,2.888936,...,0,0,2.892940,0,0,0,0,0,1,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2143,6894,66,1,2,1,32.013806,1,9.308706,4.352402,5.432374,...,0,0,4.544538,0,0,0,1,1,1,XXXConfid
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid


In [239]:
negative_df = alzheimers_df.loc[alzheimers_df["Diagnosis"]==0]
negative_df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2137,6888,86,0,3,0,24.226040,0,10.890948,2.938590,5.530857,...,0,0,9.870780,0,0,0,0,0,0,XXXConfid
2139,6890,68,0,1,0,17.828965,0,2.982747,8.394826,0.317526,...,0,0,8.819115,0,0,0,1,0,0,XXXConfid
2140,6891,89,0,1,2,34.422419,0,7.770687,0.947567,5.732139,...,0,0,8.734082,0,1,0,0,0,0,XXXConfid
2141,6892,72,0,0,2,21.600144,0,19.391766,8.181469,6.640195,...,0,0,9.570776,0,0,0,0,1,0,XXXConfid


In [240]:
negative_df["FunctionalAssessment"].describe()

count    1389.000000
mean        5.860669
std         2.761000
min         0.011898
25%         3.850641
50%         6.244300
75%         8.121256
max         9.996467
Name: FunctionalAssessment, dtype: float64

In [241]:
positive_df["FunctionalAssessment"].describe()

count    760.000000
mean       3.653380
std        2.566685
min        0.000460
25%        1.576868
50%        3.299060
75%        4.920710
max        9.927945
Name: FunctionalAssessment, dtype: float64

1. Get Base Score on Train/Test Splits
2. Run RandomForestCV function (runs multiple splits/folds) to see if scores are meaningful (close together)
3. Check Feature Importances
4. Check scores based on removing features, in order of least importance -> visualize
5. Once score is as high as possible, redo randomforestCV