# Random Forest Model

In [1]:
import pandas as pd
import numpy as np
import statistics as stat
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import random

pd.options.mode.chained_assignment = None 

random.seed(10)
ad =  pd.read_csv('dataset.csv')

### Changing CDR 2 to CDR1

In [2]:
ad.loc[ad["CDR"] == 2,"CDR"] = 1

In [3]:
ad["Male"] = np.where(ad["M/F"]=="M", 1, 0)
ad["Female"] = np.where(ad["M/F"]=="F", 1, 0)

### A glimpse at the data

In [4]:
ad.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF,Male,Female
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883,1,0
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876,1,0
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046,1,0
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01,1,0
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034,1,0


### Features Selected [Visit	MR Delay	Age	EDUC	SES	MMSE	CDR	eTIV	nWBV	Female]

In [5]:
X = ad.iloc[:,[3,4,7,8,9,10,11,12,13,16]]
y = ad.iloc[:,[2]]

In [6]:
X.head()

Unnamed: 0,Visit,MR Delay,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,Female
0,1,0,87,14,2.0,27.0,0.0,1987,0.696,0
1,2,457,88,14,2.0,30.0,0.0,2004,0.681,0
2,1,0,75,12,,23.0,0.5,1678,0.736,0
3,2,560,76,12,,28.0,0.5,1738,0.713,0
4,3,1895,80,12,,22.0,0.5,1698,0.701,0


### Complete case to use for evaluation

In [7]:
test_ad = ad.dropna()

In [8]:
test_X = test_ad.iloc[:,[3,4,7,8,9,10,11,12,13,16]] 
test_y = test_ad.iloc[:,[2]]

### Splitting into training and testing set to look for optimal numbers of tree and stable model

In [9]:
X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(test_X, test_y, test_size=0.3, random_state=42)

In [10]:
ad_X_train, ad_X_test, ad_y_train, ad_y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
med_SES = ad_X_train['SES'].mean()
avg_MMSE = ad_X_train['MMSE'].mean()
ad_X_train.loc[:,'SES'] = ad_X_train.loc[:,'SES'].fillna(med_SES)
ad_X_train.loc[:,'MMSE'] = ad_X_train.loc[:,'MMSE'].fillna(avg_MMSE)

In [12]:
ad_X_test["Group"] = ad_y_test
ad_X_test = ad_X_test.dropna(axis = 0, how='any')

ad_y_test = ad_X_test["Group"]

### To look for optimal number of trees

In [13]:
kf = KFold(n_splits=5, random_state=4, shuffle=True)
oversample = RandomOverSampler(sampling_strategy='minority')

model_set= []
scores={}
cnt = 0

for t in range(1,61):
    clf=RandomForestClassifier(n_estimators=t)

    acc_scores = []
    for i in range(0,10):
        
        for train_index, test_index in kf.split(ad_X_train):
            
            X_train, X_test = ad_X_train.iloc[train_index,:], ad_X_train.iloc[test_index,:]
            y_train, y_test = ad_y_train.iloc[train_index,:], ad_y_train.iloc[test_index,:]

            X_train, y_train = oversample.fit_resample(X_train, y_train.values.ravel())

            model = clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            acc = accuracy_score(y_test, y_pred)

            acc_scores.append(acc)
    scores[t] = acc_scores    

### Find the model with the lowest standard deviation

In [14]:
num_tree = 0
std = stat.stdev(scores[1])
for i in scores:
    if stat.stdev(scores[i]) < std:
        num_tree = i
        std = stat.stdev(scores[i])

In [15]:
print(num_tree,std)

57 0.027590539859128658


### To look for a stable model

In [16]:
kf = KFold(n_splits=5, random_state=4, shuffle=True)
oversample = RandomOverSampler(sampling_strategy='minority')
clf=RandomForestClassifier(n_estimators=num_tree)

model_set= []
scores={}
cnt = 0

for i in range(0,10):
    
    for train_index, test_index in kf.split(ad_X_train):
        
        X_train, X_test = ad_X_train.iloc[train_index,:], ad_X_train.iloc[test_index,:]
        y_train, y_test = ad_y_train.iloc[train_index,:], ad_y_train.iloc[test_index,:]

        X_train, y_train = oversample.fit_resample(X_train, y_train.values.ravel())

        model = clf.fit(X_train, y_train)
        acc_score =[]
        for i in range(0,20):
            for train2_index, test2_index in kf.split(test_ad):
                X_test_2 = test_X.iloc[test2_index,:]
                y_test_2 = test_y.iloc[test2_index,:]

                y_pred = model.predict(X_test_2)

                acc = accuracy_score(y_test_2, y_pred)
                acc_score.append(acc)
        scores[cnt] = acc_scores
        model_set.append(model)
        cnt += 1

In [17]:
k = 0
std = stat.stdev(scores[0])
for i in scores:
    if stat.stdev(scores[i]) < std:
        k = i
        std = stat.stdev(scores[i])

In [18]:
print(k, std)

0 0.03518966881893625


### Model Evaluation

In [19]:
for train_index, test_index in kf.split(ad_X_train):
    X_train, X_test = ad_X_train.iloc[train_index,:], ad_X_train.iloc[test_index,:]
    y_train, y_test = ad_y_train.iloc[train_index,:], ad_y_train.iloc[test_index,:]

    y_pred = model_set[k].predict(X_test)


    print(accuracy_score(y_test, y_pred))

1.0
1.0
1.0
1.0
0.8846153846153846


In [20]:
y_pred = model_set[k].predict(X_test_e)

In [21]:
print("Accuracy: ", accuracy_score(y_test_e, y_pred))
print("Precision: ", precision_score(y_test_e, y_pred, average='macro',zero_division=1))
print("Recall: ", recall_score(y_test_e, y_pred, average='macro',zero_division=1))

Accuracy:  0.9252336448598131
Precision:  0.8949494949494948
Recall:  0.8670888433600298


In [22]:
labels = np.unique(y_test_e)
a = confusion_matrix(y_test_e, y_pred, labels=labels)

pd.DataFrame(a, index=labels, columns=labels)

Unnamed: 0,Converted,Demented,Nondemented
Converted,9,1,3
Demented,2,33,0
Nondemented,0,2,57


### Saving the model

In [23]:
import pickle
pickle.dump(model_set[k], open('alzheinmer_forest.pkl', 'wb'))

In [24]:
#forest_model = pickle.load(open('alzheinmer_foresr.pkl', 'rb'))