In [80]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [81]:
df=pd.read_csv("Alzheimer's clinical data.csv" )

In [82]:
df.head()

Unnamed: 0,Subject,Gender,mmse,ageAtEntry,cdr,memory,dx1
0,OAS30124,female,16,79.14579,1.0,1.0,1
1,OAS30124,female,21,79.14579,0.5,0.5,1
2,OAS31129,female,20,68.07666,1.0,1.0,1
3,OAS31129,female,28,68.07666,1.0,1.0,1
4,OAS31129,female,29,68.07666,0.5,0.5,1


In [83]:
#Check the duplicate rows.
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1224    False
1225    False
1226    False
1227    False
1228    False
Length: 1229, dtype: bool

In [84]:
#Check the data information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1229 entries, 0 to 1228
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject     1229 non-null   object 
 1   Gender      1229 non-null   object 
 2   mmse        1229 non-null   object 
 3   ageAtEntry  1229 non-null   float64
 4   cdr         1229 non-null   float64
 5   memory      1229 non-null   object 
 6   dx1         1229 non-null   int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 67.3+ KB


In [85]:
#descriptive structure of the data
df.describe()

Unnamed: 0,ageAtEntry,cdr,dx1
count,1229.0,1229.0,1229.0
mean,72.900069,0.702604,0.688365
std,7.739166,0.415548,0.46335
min,47.60575,0.0,0.0
25%,67.550995,0.5,0.0
50%,73.01301,0.5,1.0
75%,78.65024,1.0,1.0
max,95.56742,3.0,1.0


In [86]:
# Categorical columns
cat_col = [col for col in df.columns if df[col].dtype == 'object']
print('Categorical columns :',cat_col)
# Numerical columns
num_col = [col for col in df.columns if df[col].dtype != 'object']
print('Numerical columns :',num_col)

Categorical columns : ['Subject', 'Gender', 'mmse', 'memory']
Numerical columns : ['ageAtEntry', 'cdr', 'dx1']


In [87]:
#Check the total number of unique values in the Categorical columns
df[cat_col].nunique()


Subject    426
Gender       2
mmse        28
memory       6
dtype: int64

In [88]:
round((df.isnull().sum()/df.shape[0])*100,2)

Subject       0.0
Gender        0.0
mmse          0.0
ageAtEntry    0.0
cdr           0.0
memory        0.0
dx1           0.0
dtype: float64

In [89]:
df= df.dropna()

In [90]:
df.isnull().sum()

Subject       0
Gender        0
mmse          0
ageAtEntry    0
cdr           0
memory        0
dx1           0
dtype: int64

In [91]:
df=df.dropna()

In [92]:
df.isnull().sum()

Subject       0
Gender        0
mmse          0
ageAtEntry    0
cdr           0
memory        0
dx1           0
dtype: int64

In [93]:
#cleaning data
df['mmse'] = df['mmse'].str.replace(r'?', '72')
df['memory'] = df['memory'].str.replace(r'?','1' )

df = df.dropna()
df.isnull().sum().sum()

  df['mmse'] = df['mmse'].str.replace(r'?', '72')
  df['memory'] = df['memory'].str.replace(r'?','1' )


0

In [94]:
df=df.drop(["Subject"],axis=1)


In [95]:
df["Gender"]=df["Gender"].map({"male":0,"female":1})

In [96]:
df.head()

Unnamed: 0,Gender,mmse,ageAtEntry,cdr,memory,dx1
0,1,16,79.14579,1.0,1.0,1
1,1,21,79.14579,0.5,0.5,1
2,1,20,68.07666,1.0,1.0,1
3,1,28,68.07666,1.0,1.0,1
4,1,29,68.07666,0.5,0.5,1


In [97]:
X_train,X_test,y_train,y_test=train_test_split(df.drop('dx1',axis=1),df['dx1'],test_size=0.3,random_state=0)

In [98]:
from sklearn.naive_bayes import GaussianNB
NBmodel=GaussianNB()
NBmodel.fit(X_train,y_train)

In [99]:
predicted=NBmodel.predict(X_test)
predicted



array([0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,

In [100]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
cm2=confusion_matrix(y_test,predicted)
score2=accuracy_score(y_test,predicted)
cr2=classification_report(y_test,predicted,digits=4)
auc2=roc_auc_score(y_test,predicted)

In [101]:
print('Confusion Matrix for Naive Bayes:''\n',cm2 )
print('Accuracy Score for Naive Bayes:',round(score2,4) )
print('Auc Score for Naive Bayes:',round(auc2,4) )

Confusion Matrix for Naive Bayes:
 [[107   8]
 [ 99 155]]
Accuracy Score for Naive Bayes: 0.71
Auc Score for Naive Bayes: 0.7703


In [102]:
print('Classification Report for Naive Bayes:''\n''\n',cr2)

Classification Report for Naive Bayes:

               precision    recall  f1-score   support

           0     0.5194    0.9304    0.6667       115
           1     0.9509    0.6102    0.7434       254

    accuracy                         0.7100       369
   macro avg     0.7352    0.7703    0.7050       369
weighted avg     0.8164    0.7100    0.7195       369



In [103]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [177]:
clf = DecisionTreeClassifier(max_depth=9)
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [178]:
cm2=confusion_matrix(y_test,y_pred)
score2=accuracy_score(y_test,y_pred)
cr2=classification_report(y_test,y_pred,digits=4)
auc2=roc_auc_score(y_test,y_pred)

In [179]:
print('Confusion Matrix for decision tree:''\n',cm2 )
print('Accuracy Score for decision tree:',round(score2,4) )
print('Auc Score for decision tree:',round(auc2,4) )

Confusion Matrix for decision tree:
 [[ 81  34]
 [ 51 203]]
Accuracy Score for decision tree: 0.7696
Auc Score for decision tree: 0.7518


In [137]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Classification Report for Decision Tree:""\n",cr2)

Accuracy: 0.7696476964769647
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0     0.6136    0.7043    0.6559       115
           1     0.8565    0.7992    0.8269       254

    accuracy                         0.7696       369
   macro avg     0.7351    0.7518    0.7414       369
weighted avg     0.7808    0.7696    0.7736       369



In [234]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 50, max_depth=4)
rf.fit(X_train,y_train)

In [235]:
y_pred = rf.predict(X_test)

In [236]:
cm2=confusion_matrix(y_test,y_pred)
score2=accuracy_score(y_test,y_pred)
cr2=classification_report(y_test,y_pred,digits=4)
auc2=roc_auc_score(y_test,y_pred)

In [237]:
print('Confusion Matrix for random forest :''\n',cm2 )
print('Accuracy Score for random forest:',round(score2,4) )
print('Auc Score for random forest:',round(auc2,4) )

Confusion Matrix for random forest :
 [[ 70  45]
 [ 41 213]]
Accuracy Score for random forest: 0.7669
Auc Score for random forest: 0.7236


In [238]:
print("Classification Report for Random Forest:""\n",cr2)


Classification Report for Random Forest:
               precision    recall  f1-score   support

           0     0.6306    0.6087    0.6195       115
           1     0.8256    0.8386    0.8320       254

    accuracy                         0.7669       369
   macro avg     0.7281    0.7236    0.7258       369
weighted avg     0.7648    0.7669    0.7658       369



In [315]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=9)
classifier.fit(X_train, y_train)

In [316]:
y_pred = classifier.predict(X_test)

In [317]:

cm2=confusion_matrix(y_test,y_pred)
score2=accuracy_score(y_test,y_pred)
cr2=classification_report(y_test,y_pred,digits=4)
auc2=roc_auc_score(y_test,y_pred)

In [318]:
print('Confusion Matrix for KNN:''\n',cm2 )
print('Accuracy Score for KNN:',round(score2,4) )
print('Auc Score for KNN:',round(auc2,4) )

Confusion Matrix for KNN:
 [[ 61  54]
 [ 54 200]]
Accuracy Score for KNN: 0.7073
Auc Score for KNN: 0.6589


In [319]:
print(classification_report(y_test, y_pred,digits=4))

              precision    recall  f1-score   support

           0     0.5304    0.5304    0.5304       115
           1     0.7874    0.7874    0.7874       254

    accuracy                         0.7073       369
   macro avg     0.6589    0.6589    0.6589       369
weighted avg     0.7073    0.7073    0.7073       369



In [118]:
from sklearn.svm import SVC

In [404]:
svm_model = SVC(kernel='linear')
svm_model = svm_model.fit(X_train,y_train)

In [405]:
y_pred = svm_model.predict(X_test)

In [406]:
cm2=confusion_matrix(y_test,y_pred)
score2=accuracy_score(y_test,y_pred)
cr2=classification_report(y_test,y_pred,digits=4)
auc2=roc_auc_score(y_test,y_pred)

In [407]:
print('Confusion Matrix for SVM:''\n',cm2 )
print('Accuracy Score for SVM:',round(score2,4) )
print('Auc Score for SVM:',round(auc2,4) )

Confusion Matrix for SVM:
 [[ 68  47]
 [ 41 213]]
Accuracy Score for SVM: 0.7615
Auc Score for SVM: 0.7149


In [408]:
print("Classification Report for SVM:""\n",cr2)


Classification Report for SVM:
               precision    recall  f1-score   support

           0     0.6239    0.5913    0.6071       115
           1     0.8192    0.8386    0.8288       254

    accuracy                         0.7615       369
   macro avg     0.7215    0.7149    0.7180       369
weighted avg     0.7583    0.7615    0.7597       369

