# Multiple Linear Regression Model

In [28]:
import pandas as pd
import numpy as np
import statistics as stat
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import random
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report

random.seed(4)

pd.options.mode.chained_assignment = None 


ad =  pd.read_csv('dataset.csv')

### Grouping CDR 2.0 to 1.0

In [29]:
ad.loc[ad["CDR"] == 2,"CDR"] = 1

In [30]:
ad["Male"] = np.where(ad["M/F"]=="M", 1, 0)
ad["Female"] = np.where(ad["M/F"]=="F", 1, 0)

### A glimpse at the data

In [31]:
ad.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF,Male,Female
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883,1,0
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876,1,0
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046,1,0
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01,1,0
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034,1,0


### Features Selected ['MMSE', 'nWBV', 'ASF', 'Female']

In [32]:
X = ad.iloc[:,[10,13,14,16]]
y = ad.iloc[:,[11]]

y=y.astype(str)

In [33]:
X.head()

Unnamed: 0,MMSE,nWBV,ASF,Female
0,27.0,0.696,0.883,0
1,30.0,0.681,0.876,0
2,23.0,0.736,1.046,0
3,28.0,0.713,1.01,0
4,22.0,0.701,1.034,0


### Complete case to use for model evaluation

In [34]:
test_ad = ad.dropna()

In [35]:
test_X = test_ad.iloc[:,[10,13,14,16]] 
test_y = test_ad.iloc[:,[11]]
test_y=test_y.astype(str)

In [36]:
test_X.head()

Unnamed: 0,MMSE,nWBV,ASF,Female
0,27.0,0.696,0.883,0
1,30.0,0.681,0.876,0
5,28.0,0.71,1.444,1
6,27.0,0.718,1.462,1
7,28.0,0.712,1.039,0


In [37]:
X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(test_X, test_y, test_size=0.3, random_state=42)

### Splitting into training set and test set to look for a stable model

In [38]:
ad_X_train, ad_X_test, ad_y_train, ad_y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [39]:
#med_SES = ad_X_train['SES'].mean()
avg_MMSE = ad_X_train['MMSE'].mean()
#ad_X_train.loc[:,'SES'] = ad_X_train.loc[:,'SES'].fillna(med_SES)
ad_X_train.loc[:,'MMSE'] = ad_X_train.loc[:,'MMSE'].fillna(avg_MMSE)

In [40]:
ad_X_test["CDR"] = ad_y_test
ad_X_test = ad_X_test.dropna(axis = 0, how='any')

ad_y_test = ad_X_test["CDR"]
ad_X_test=ad_X_test.loc[:,ad_X_test.columns!='CDR']

In [41]:
regressor = LinearRegression()
kf = KFold(n_splits=5, random_state=4, shuffle=True)
oversample = RandomOverSampler(sampling_strategy='minority')
model_set= []
scores={}
cnt = 0

for i in range(0,100):
    for train_index, test_index in kf.split(ad_X_train):
        X_train, X_test = ad_X_train.iloc[train_index,:], ad_X_train.iloc[test_index,:]
        y_train, y_test = ad_y_train.iloc[train_index,:], ad_y_train.iloc[test_index,:]

        X_train, y_train = oversample.fit_resample(X_train, y_train)

        model = regressor.fit(X_train, y_train)
        acc_score =[]
        
        for i in range(0,10):
            for train2_index, test2_index in kf.split(test_ad):
                
                X_test_2 = test_X.iloc[test2_index,:] 
                y_test_2 = test_y.iloc[test2_index,:]

                y_pred = model.predict(X_test_2)

                new_y_pred = np.around(y_pred * 2.0) / 2.0
                new_y_pred = new_y_pred.astype(str)

                acc = accuracy_score(y_test_2, new_y_pred)
                acc_score.append(acc)
        scores[cnt] = acc_score
        model_set.append(model)
        cnt += 1






### Find out which model has the lowest standard deviation

In [42]:
k = 0
std = stat.stdev(scores[0])
for i in scores:
    if stat.stdev(scores[i]) < std:
        k = i
        std = stat.stdev(scores[i])

In [43]:
len(scores[0])

50

In [44]:
print(k, std)

25 0.015790838395884337


In [45]:
model_set[k].coef_

array([[-0.04732268, -2.81810437,  0.53813487, -0.13549174]])

In [46]:
for train_index, test_index in kf.split(ad_X_train):
    X_train, X_test = ad_X_train.iloc[train_index,:], ad_X_train.iloc[test_index,:]
    y_train, y_test = ad_y_train.iloc[train_index,:], ad_y_train.iloc[test_index,:]

    y_pred = model_set[k].predict(X_test)

    new_y_pred = np.around(y_pred * 2.0) / 2.0
    new_y_pred = new_y_pred.astype(str)

    print(accuracy_score(y_test, new_y_pred))

0.6226415094339622
0.6346153846153846
0.5769230769230769
0.6153846153846154
0.5961538461538461


### Model Evaluation

In [47]:
y_pred = model_set[k].predict(X_test_e)
new_y_pred = np.around(y_pred * 2.0) / 2.0
new_y_pred = new_y_pred.astype(str)

In [48]:
print("Accuracy: ", accuracy_score(y_test_e, new_y_pred))
print("Precision: ", precision_score(y_test_e, new_y_pred, average='macro',zero_division=1))
print("Recall: ", recall_score(y_test_e, new_y_pred, average='macro',zero_division=1))

Accuracy:  0.5887850467289719
Precision:  0.37777777777777777
Recall:  0.7502840909090909


In [49]:
labels = np.unique(y_test_e)
a = confusion_matrix(y_test_e, new_y_pred, labels=labels)

pd.DataFrame(a, index=labels, columns=labels)

Unnamed: 0,0.0,0.5,1.0
0.0,33,30,0
0.5,3,25,4
1.0,0,5,5


### Saving the Model

In [50]:
import pickle
pickle.dump(model_set[k], open('multiple_LR.pkl', 'wb'))

In [51]:
mLR_model = pickle.load(open('multiple_LR.pkl', 'rb'))


In [52]:
mLR_model.coef_

array([[-0.04732268, -2.81810437,  0.53813487, -0.13549174]])