# Multiple Linear Regression Model - Features Selection

In [1]:
import pandas as pd
import numpy as np
import statistics as stat
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import RidgeCV
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import random

random.seed(10)

pd.options.mode.chained_assignment = None 


ad =  pd.read_csv('dataset.csv')

### A glimpse at the data

In [2]:
ad.head()
#https://scikit-learn.org/stable/auto_examples/feature_selection/plot_select_from_model_diabetes.html#sphx-glr-auto-examples-feature-selection-plot-select-from-model-diabetes-py

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [3]:
ad.isnull().values.any()

True

### Grouping CDR 2.0 to 1.0


In [4]:
ad.loc[ad["CDR"] == 2,"CDR"] = 1

In [5]:
ad["Male"] = np.where(ad["M/F"]=="M", 1, 0)
ad["Female"] = np.where(ad["M/F"]=="F", 1, 0)

In [6]:
#ad = ad.dropna()

### Splitting the columns into indepdent and dependent variables

In [7]:
X = ad.iloc[:,[3,4,7,8,9,10,12,13,14,15,16]] #dropped 12
y = ad.iloc[:,[11]]

y=y.astype(str)


### Normalizing the values

In [8]:
def normalize(x):
    return (x-min(x))/((max(x)) - min(x))

In [9]:
X.loc[:,['Visit']]=pd.DataFrame(normalize(X['Visit']))
X.loc[:,['MR Delay']]=pd.DataFrame(normalize(X['MR Delay']))
X.loc[:,['Age']]=pd.DataFrame(normalize(X['Age']))
X.loc[:,['EDUC']]=pd.DataFrame(normalize(X['EDUC']))
X.loc[:,['SES']]=pd.DataFrame(normalize(X['SES']))
X.loc[:,['MMSE']]=pd.DataFrame(normalize(X['MMSE']))
X.loc[:,['eTIV']]=pd.DataFrame(normalize(X['eTIV']))
X.loc[:,['nWBV']]=pd.DataFrame(normalize(X['nWBV']))
X.loc[:,['ASF']]=pd.DataFrame(normalize(X['ASF']))

### Splitting into training and test sets

In [10]:
ad_X_train, ad_X_test, ad_y_train, ad_y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Impute missing values for the training set

In [11]:
med_SES = ad_X_train['SES'].mean()
avg_MMSE = ad_X_train['MMSE'].mean()

In [12]:
ad_X_train.loc[:,'SES'] = ad_X_train.loc[:,'SES'].fillna(med_SES)
ad_X_train.loc[:,'MMSE'] = ad_X_train.loc[:,'MMSE'].fillna(avg_MMSE)

In [13]:
ad_y_train = ad_y_train.astype(float)
ad_y_train.loc[:,'CDR'] = ad_y_train.loc[:,'CDR'].fillna(stat.median(ad_y_train['CDR']))
ad_y_train = ad_y_train.astype(str)


### Complete case for test set

In [14]:
ad_X_test["CDR"] = ad_y_test
ad_X_test = ad_X_test.dropna(axis = 0, how='any')

ad_y_test = ad_X_test["CDR"]
ad_X_test=ad_X_test.loc[:,ad_X_test.columns!='CDR']

### Defining the function to check the accuracy, precise, and recall scores for number of features selected for Linear Regression

In [15]:
def acc_test(cnt,fold):
    regressor = LinearRegression()
    
    kf = KFold(n_splits=fold, random_state=4, shuffle=True)
    oversample = RandomOverSampler(sampling_strategy='minority')
    #med_SES = stat.median(ad_X_train['SES'])
    #avg_MMSE = ad_X_train['MMSE'].mean()
    acc_score = []
    precise_score = []
    recall = []

    for train_index, test_index in kf.split(ad_X_train):
        X_train, X_test = ad_X_train.iloc[train_index,:], ad_X_train.iloc[test_index,:]
        y_train, y_test = ad_y_train.iloc[train_index,:], ad_y_train.iloc[test_index,:]

        X_train, y_train = oversample.fit_resample(X_train, y_train)


        # impute missing values for training set
        #X_train.loc[:,'SES'] = X_train.loc[:,'SES'].fillna(med_SES)
        #X_train.loc[:,'MMSE'] = X_train.loc[:,'MMSE'].fillna(avg_MMSE)

       # y_train = y_train.astype(float)
        #y_train.loc[:,'CDR'] = y_train.loc[:,'CDR'].fillna(stat.median(y_train['CDR']))
        #y_train = y_train.astype(str)

       

        #Complete case for test set
        #ad_X_test["CDR"] = ad_y_test
        #ad_X_test = ad_X_test.dropna(axis = 0, how='any')

        #ad_y_test = ad_X_test["CDR"]
        #ad_X_test=ad_X_test.loc[:,ad_X_test.columns!='CDR']

        ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X_train,y_train)

        
        if cnt < 8:
            sfs_forward = SequentialFeatureSelector(
            ridge , n_features_to_select = cnt, direction = "forward").fit(X_train,y_train)

            regressor.fit(X_train[X_train.columns[sfs_forward.get_support()]],y_train)

            y_pred = regressor.predict(ad_X_test[ad_X_test.columns[sfs_forward.get_support()]])
        else:
            regressor.fit(X_train,y_train)
            y_pred = regressor.predict(ad_X_test)
            
        new_y_pred = np.around(y_pred * 2.0) / 2.0
        new_y_pred = new_y_pred.astype(str)
        #ad_y_test = ad_y_test.astype(str)

        acc = accuracy_score(ad_y_test, new_y_pred)
        acc_score.append(acc)

        precise = precision_score(ad_y_test, new_y_pred, average='macro',zero_division=1)
        precise_score.append(precise)

        rec = recall_score(ad_y_test, new_y_pred, average = 'macro', zero_division = 1)
        recall.append(rec)

        
    return acc_score, precise_score, recall
    
    
        
        
    

In [16]:
ad_X_test.head()

Unnamed: 0,Visit,MR Delay,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF,Male,Female
327,0.0,0.0,0.578947,0.470588,0.25,0.730769,0.454343,0.176166,0.398031,1,0
33,0.0,0.0,0.710526,0.470588,0.0,1.0,0.334076,0.367876,0.523207,0,1
15,0.0,0.0,0.210526,0.352941,0.25,0.884615,0.390869,0.839378,0.462729,1,0
314,0.0,0.0,0.473684,0.705882,0.0,1.0,0.152561,0.53886,0.753868,0,1
57,0.0,0.0,0.684211,0.352941,0.5,1.0,0.360802,0.38342,0.493671,0,1


### Selecting One Feature

In [17]:
a5 = acc_test(1,5)
a10 = acc_test(1,10)
a15 = acc_test(1,15)
a20 = acc_test(1,20)
a25 = acc_test(1,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc1 = (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc1)

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre1 =  (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5

print("The average of the precision score: ", pre1)

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec1 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5


print("The average of the recall score: ", rec1 )

Accuracy Scores
0.5906542056074766
0.5841121495327102
0.5862928348909657
0.5855140186915887
0.5779439252336448
The average of the accuracy score:  0.5849034267912772
*******************************
Precision Scores
0.4410240024661131
0.4079386763069642
0.4189671183600138
0.4106814464935956
0.4071327136041751
The average of the precision score:  0.4171487914461724
*******************************
Recall Scores
0.6711357380848907
0.6313378241344343
0.6446037954512531
0.6355044908011009
0.6303447776329133
The average of the recall score:  0.6425853252209184


### Selecting Two Features

In [18]:
a5 = acc_test(2,5)
a10 = acc_test(2,10)
a15 = acc_test(2,15)
a20 = acc_test(2,20)
a25 = acc_test(2,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc2 =  (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc2)

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre2 = (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5


print("The average of the precision score: ", pre2)

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec2 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5

print("The average of the recall score: ", rec2 )

Accuracy Scores
0.5233644859813084
0.5457943925233645
0.5358255451713395
0.5383177570093457
0.5398130841121495
The average of the accuracy score:  0.5366230529595015
*******************************
Precision Scores
0.3694271010130079
0.3751844840342159
0.37954713265172685
0.36247820482414955
0.38815883569451715
The average of the precision score:  0.37495915164352345
*******************************
Recall Scores
0.6475097783572359
0.6589229320585253
0.6418817905258583
0.6558398522381573
0.6356653628857019
The average of the recall score:  0.6479639432130957


### Selecting Three Features

In [19]:
a5 = acc_test(3,5)
a10 = acc_test(3,10)
a15 = acc_test(3,15)
a20 = acc_test(3,20)
a25 = acc_test(3,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc3 =  (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc3)

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre3 = (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5


print("The average of the precision score: ", pre3)

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec3 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5

print("The average of the recall score: ", rec3 )

Accuracy Scores
0.5794392523364486
0.6130841121495326
0.5862928348909657
0.5761682242990653
0.5801869158878504
The average of the accuracy score:  0.5870342679127726
*******************************
Precision Scores
0.40209284487075736
0.42930693999822417
0.4060447399920914
0.4004702715856987
0.39410197363484506
The average of the precision score:  0.4064033540163233
*******************************
Recall Scores
0.6552165725047081
0.6811234245980009
0.6677159689024096
0.6626387078081993
0.671261770244821
The average of the recall score:  0.6675912888116279


### Selecting Four Features

In [20]:
a5 = acc_test(4,5)
a10 = acc_test(4,10)
a15 = acc_test(4,15)
a20 = acc_test(4,20)
a25 = acc_test(4,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc4 =  (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc4)

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre4 = (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5
print("The average of the precision score: ", pre4)

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec4 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5

print("The average of the recall score: ", rec4)

Accuracy Scores
0.6018691588785047
0.5878504672897196
0.6211838006230529
0.5827102803738318
0.5816822429906542
The average of the accuracy score:  0.5950591900311526
*******************************
Precision Scores
0.3808094050412153
0.3973771435260682
0.43477911541415554
0.38906836934514205
0.39022416185031944
The average of the precision score:  0.3984516390353801
*******************************
Recall Scores
0.6988280457771984
0.6683920034767492
0.6804553575740017
0.6823772272924815
0.6813547732869767
The average of the recall score:  0.6822814814814816


### Selecting Five Features

In [21]:
a5 = acc_test(5,5)
a10 = acc_test(5,10)
a15 = acc_test(5,15)
a20 = acc_test(5,20)
a25 = acc_test(5,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc5 = (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc5)

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre5 = (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5
print("The average of the precision score: ", pre5)

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec5 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5
print("The average of the recall score: ", rec5)

Accuracy Scores
0.5887850467289719
0.5476635514018691
0.5725856697819314
0.5817757009345794
0.576448598130841
The average of the accuracy score:  0.5734517133956386
*******************************
Precision Scores
0.397847416916695
0.3474820884772726
0.3618383909045445
0.3693031044663737
0.3686675874368342
The average of the precision score:  0.369027717640344
*******************************
Recall Scores
0.6821874547298277
0.6783311603650587
0.6895702351634555
0.6936350137621324
0.6882857694722102
The average of the recall score:  0.686401926698537


### Selecting Six Features

In [22]:
a5 = acc_test(6,5)
a10 = acc_test(6,10)
a15 = acc_test(6,15)
a20 = acc_test(6,20)
a25 = acc_test(6,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc6 = (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc6)

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre6 = (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5

print("The average of the precision score: ", pre6)

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec6 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5
print("The average of the recall score: ", rec6)

Accuracy Scores
0.5644859813084112
0.5766355140186915
0.5619937694704049
0.5710280373831775
0.5783177570093457
The average of the accuracy score:  0.5704922118380062
*******************************
Precision Scores
0.3286749873596368
0.3638244734150984
0.3546929078025808
0.3490139531283065
0.35444898714843914
The average of the precision score:  0.3501310617708123
*******************************
Recall Scores
0.6965947172726834
0.6941257424308271
0.6793308867320167
0.7028289632526921
0.6954032546235936
The average of the recall score:  0.6936567128623625


### Selecting Seven Features

In [23]:
a5 = acc_test(7,5)
a10 = acc_test(7,10)
a15 = acc_test(7,15)
a20 = acc_test(7,20)
a25 = acc_test(7,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc7 = (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc7 )

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre7 = (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5

print("The average of the precision score: ", pre7)

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec7 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5

print("The average of the recall score: ", rec7)

Accuracy Scores
0.5588785046728971
0.5532710280373832
0.5439252336448598
0.5649532710280374
0.5715887850467289
The average of the accuracy score:  0.5585233644859813
*******************************
Precision Scores
0.3306580585782731
0.32856771936730816
0.33813524728434746
0.33375906243565134
0.35509796358104917
The average of the precision score:  0.3372436102493258
*******************************
Recall Scores
0.695889226906176
0.699953160461635
0.6929817953546767
0.7090827659471728
0.6957525713457917
The average of the recall score:  0.6987319040030905


### Selecting Eight Features

In [24]:
a5 = acc_test(8,5)
a10 = acc_test(8,10)
a15 = acc_test(8,15)
a20 = acc_test(8,20)
a25 = acc_test(8,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc8 = (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc8 )

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre8 = (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5

print("The average of the precision score: ", pre8 )

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec8 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5

print("The average of the recall score: ", rec8)

Accuracy Scores
0.5514018691588785
0.5336448598130841
0.5476635514018691
0.555607476635514
0.5525233644859813
The average of the accuracy score:  0.5481682242990653
*******************************
Precision Scores
0.31866675499980535
0.32520017292933806
0.3255754148546014
0.3310354186415325
0.33193186846176753
The average of the precision score:  0.326481925977409
*******************************
Recall Scores
0.7101772176348448
0.6959495871360278
0.7048236676485264
0.7071509971509972
0.7043152252643778
The average of the recall score:  0.7044833389669549


### Selecting Nine Features

In [25]:
a5 = acc_test(9,5)
a10 = acc_test(9,10)
a15 = acc_test(9,15)
a20 = acc_test(9,20)
a25 = acc_test(9,25)

a5_acc = stat.mean(a5[0])
a10_acc = stat.mean(a10[0])
a15_acc = stat.mean(a15[0])
a20_acc = stat.mean(a20[0])
a25_acc = stat.mean(a25[0])

print("Accuracy Scores")
print(a5_acc)
print(a10_acc)
print(a15_acc)
print(a20_acc)
print(a25_acc)

acc9 = (a5_acc + a10_acc + a15_acc + a20_acc + a25_acc) / 5

print("The average of the accuracy score: ", acc9 )

print("*******************************")
print("Precision Scores")
a5_prec = stat.mean(a5[1])
a10_prec = stat.mean(a10[1])
a15_prec = stat.mean(a15[1])
a20_prec = stat.mean(a20[1])
a25_prec = stat.mean(a25[1])

print(a5_prec)
print(a10_prec)
print(a15_prec)
print(a20_prec)
print(a25_prec)

pre9 = (a5_prec + a10_prec + a15_prec + a20_prec + a25_prec) / 5

print("The average of the precision score: ", pre9 )

print("*******************************")
print("Recall Scores")
a5_rec = stat.mean(a5[2])
a10_rec = stat.mean(a10[2])
a15_rec = stat.mean(a15[2])
a20_rec = stat.mean(a20[2])
a25_rec = stat.mean(a25[2])

print(a5_rec)
print(a10_rec)
print(a15_rec)
print(a20_rec)
print(a25_rec)

rec9 = (a5_rec + a10_rec + a15_rec + a20_rec + a25_rec) / 5

print("The average of the recall score: ", rec9)

Accuracy Scores
0.5271028037383177
0.5429906542056074
0.5538940809968846
0.5448598130841121
0.5517757009345794
The average of the accuracy score:  0.5441246105919003
*******************************
Precision Scores
0.3067753699463013
0.3229318602805957
0.33053670478378105
0.32389561879552736
0.3357931446678344
The average of the precision score:  0.32398653969480795
*******************************
Recall Scores
0.6963774204452171
0.7032053696460476
0.704105461393597
0.7024392776087691
0.6977884977545995
The average of the recall score:  0.7007832053696461


## Scorecard

### Accuracy Score

In [26]:
print("Accuracy Score")
print("Selection One Feature: ", acc1)
print("Selection Two Features: ", acc2)
print("Selection Three Features: ", acc3)
print("Selection Four Features: ", acc4)
print("Selection Five Features: ", acc5)
print("Selection Six Features: ", acc6)
print("Selection Seven Features: ", acc7)
print("Selection Eight Features: ", acc8)
print("Selection Nine Features: ", acc9)


Accuracy Score
Selection One Feature:  0.5849034267912772
Selection Two Features:  0.5366230529595015
Selection Three Features:  0.5870342679127726
Selection Four Features:  0.5950591900311526
Selection Five Features:  0.5734517133956386
Selection Six Features:  0.5704922118380062
Selection Seven Features:  0.5585233644859813
Selection Eight Features:  0.5481682242990653
Selection Nine Features:  0.5441246105919003


### Precision Score

In [27]:
print("Precision Score")
print("Selection One Feature: ", pre1)
print("Selection Two Features: ", pre2)
print("Selection Three Features: ", pre3)
print("Selection Four Features: ", pre4)
print("Selection Five Features: ", pre5)
print("Selection Six Features: ", pre6)
print("Selection Seven Features: ", pre7)
print("Selection Eight Features: ", pre8)
print("Selection Nine Features: ", pre9)

Precision Score
Selection One Feature:  0.4171487914461724
Selection Two Features:  0.37495915164352345
Selection Three Features:  0.4064033540163233
Selection Four Features:  0.3984516390353801
Selection Five Features:  0.369027717640344
Selection Six Features:  0.3501310617708123
Selection Seven Features:  0.3372436102493258
Selection Eight Features:  0.326481925977409
Selection Nine Features:  0.32398653969480795


### Recall Score

In [28]:
print("Recall Score")
print("Selection One Feature: ", rec1)
print("Selection Two Features: ", rec2)
print("Selection Three Features: ", rec3)
print("Selection Four Features: ", rec4)
print("Selection Five Features: ", rec5)
print("Selection Six Features: ", rec6)
print("Selection Seven Features: ", rec7)
print("Selection Eight Features: ", rec8)
print("Selection Nine Features: ", rec9)

Recall Score
Selection One Feature:  0.6425853252209184
Selection Two Features:  0.6479639432130957
Selection Three Features:  0.6675912888116279
Selection Four Features:  0.6822814814814816
Selection Five Features:  0.686401926698537
Selection Six Features:  0.6936567128623625
Selection Seven Features:  0.6987319040030905
Selection Eight Features:  0.7044833389669549
Selection Nine Features:  0.7007832053696461


### Selecing the feature(s)

In [29]:
from sklearn.model_selection import train_test_split
med_SES = stat.median(ad_X_train['SES'])
avg_MMSE = ad_X_train['MMSE'].mean()
X_train, X_test, y_train, y_test = train_test_split(ad_X_train, ad_y_train, test_size=0.2, random_state=42)
X_train.loc[:,'SES'] = X_train.loc[:,'SES'].fillna(med_SES)
X_train.loc[:,'MMSE'] = X_train.loc[:,'MMSE'].fillna(avg_MMSE)

y_train = y_train.astype(float)
y_train.loc[:,'CDR'] = y_train.loc[:,'CDR'].fillna(stat.median(y_train['CDR']))
y_train = y_train.astype(str)

ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X_train,y_train)

sfs_forward = SequentialFeatureSelector(
ridge , n_features_to_select = 4, direction = "forward").fit(X_train,y_train)

X_test.columns[sfs_forward.get_support()]

Index(['MMSE', 'nWBV', 'ASF', 'Female'], dtype='object')