In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
pd.options.display.max_rows = 100
import matplotlib.pyplot as plt

Cleaning Data

In [None]:
# Load the OASIS longitudinal dataset and keep the cohort used for modelling.
# Produces `raw_df` (untouched CSV contents) and `df` (visits 1-2, without
# 'Converted' subjects), which the following cells build on.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
raw_df = pd.read_csv("oasis_longitudinal.csv")
# Preview only: printing the whole frame floods the output.
print(raw_df.head())

# Extract patients only from the first and second visits.
# .copy() makes `df` an independent frame, so later edits neither touch
# `raw_df` nor trigger SettingWithCopy warnings.
df = raw_df.loc[raw_df['Visit'].isin([1, 2])].copy()
print(df.head())
print("Dataset has " + str(df.shape[0]) +" values, along " + str(df.shape[1]) +" dimensions.")

n_males = int((df['M/F'] == 'M').sum())
n_females = int((df['M/F'] == 'F').sum())
print("Participents Sex: " + str(n_males) +" Males and " + str(n_females) +" Females.")

print("Class distribution for demented and non-demented patients in reduced dataset is as follows:" )
print("Demented: " + str(int((df['Group'] == 'Demented').sum())))
print("Non-Demented: " + str(int((df['Group'] == 'Nondemented').sum())))
print("Converted from Non-Demented to Demented: " + str(int((df['Group'] == 'Converted').sum())))

# Remove all converted patients. A boolean filter (instead of an in-place
# drop) keeps the cell idempotent when it is re-run.
df = df.loc[df['Group'] != 'Converted'].copy()


In [None]:
# Encode the categorical columns `M/F` and `Group` as integers and drop the
# columns that are not used as model inputs, then show the correlation matrix.
SEX_CODES = {'M': 0, 'F': 1}
GROUP_CODES = {'Demented': 0, 'Nondemented': 1}
# NOTE(review): CDR is presumably dropped because it is a clinical dementia
# score that would leak the label - confirm. The remaining columns are
# identifiers or were only needed for filtering.
DROPPED_COLUMNS = ['CDR', 'Subject ID', 'MRI ID', 'Hand', 'Visit']

# errors='ignore' lets the cell be re-run after the columns are already gone
# (the previous `del df[...]` raised KeyError on a second run).
df = df.drop(columns=DROPPED_COLUMNS, errors='ignore')

# Assign the encoded column back explicitly. `df[col].replace(..., inplace=True)`
# is chained in-place assignment: it operates on a temporary Series and does
# not update `df` when pandas Copy-on-Write is active.
# `.get(v, v)` leaves already-encoded values untouched (idempotent re-run);
# pd.to_numeric raises if an unexpected label is still present.
df['M/F'] = pd.to_numeric(df['M/F'].map(lambda v: SEX_CODES.get(v, v)))
df['Group'] = pd.to_numeric(df['Group'].map(lambda v: GROUP_CODES.get(v, v)))

# Correlation matrix. It must be displayed explicitly: only the last
# expression of a cell is rendered automatically (`display` is provided by
# the IPython/Jupyter kernel).
corr = df.corr()
display(corr.style.background_gradient(cmap='coolwarm'))
df.head()


Feature Extraction

Using SVM

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, SelectFpr
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, accuracy_score
# Sweep the number of chi2-selected features (k) for a polynomial-kernel SVM
# and plot the mean validation AUC per k.
# Defines `X1` (imputed feature matrix) and `y` (1-D target) for later cells.

# Every column except the label is a candidate feature.
feature_list = [col for col in df.columns if col != 'Group']
X = df[feature_list].values
# 1-D target: scikit-learn expects shape (n_samples,); a column vector raises
# a DataConversionWarning on every fit.
y = df['Group'].values

# Impute missing values. `X1` is defined on both branches so the cell also
# works when the data happens to contain no NaNs.
if np.isnan(X).any():
  imputer = KNNImputer(n_neighbors=4, weights="uniform")
  X1 = imputer.fit_transform(X)
else:
  X1 = X

N_REPEATS = 20
K_VALUES = list(range(1, len(feature_list) + 1))

# train/test/val = 60/20/20
# The test split is carved off once with the same seed as the final split, so
# the held-out test rows never influence the choice of k.
X_dev, X_test, y_dev, y_test = train_test_split(X1, y,
test_size=0.2, random_state=19)

auc_runs = []
for rep in range(N_REPEATS):
  # Vary the train/validation split per repetition; with a fixed seed all
  # repetitions would evaluate exactly the same split.
  X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev,
  test_size=0.25, random_state=7 + rep) # 0.25 x 0.8 = 0.2
  auc_for_k = []
  for k in K_VALUES:
    # The selector lives inside the pipeline so it is fitted on the training
    # rows only (fitting it on all rows leaks validation/test labels).
    clf = make_pipeline(SelectKBest(chi2, k=k), StandardScaler(),
                        SVC(gamma='auto', kernel = 'poly', probability=True))
    clf.fit(X_train, y_train)
    val_scores = clf.predict_proba(X_val)[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_scores)
    auc_for_k.append(metrics.auc(fpr, tpr))
  auc_runs.append(auc_for_k)

# Mean AUC per k across repetitions.
to_plot = np.mean(auc_runs, axis=0)
print(to_plot)

fig, ax = plt.subplots(ncols = 1, figsize= (12,8))
# The label gives ax.legend() an artist to show (it warned before).
ax.plot(K_VALUES, to_plot, marker='o', label='Mean validation AUC')
ax.set_title("Performance with k features (SVM)")
ax.set_xlabel("k (Features Used)")
ax.set_ylabel("AUC")
ax.legend()

Decision Tree Classifier

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, SelectFpr
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, accuracy_score
# Sweep the number of chi2-selected features (k) for a decision tree and plot
# the mean validation AUC per k.
# Defines `X1` (imputed feature matrix) and `y` (1-D target) for later cells.

# Every column except the label is a candidate feature.
feature_list = [col for col in df.columns if col != 'Group']
# Use ALL rows: the previous `iloc[1:]` silently dropped the first sample,
# unlike the SVM sweep which uses the full data.
X = df[feature_list].values
# 1-D target: scikit-learn expects shape (n_samples,).
y = df['Group'].values

# Impute missing values. `X1` is defined on both branches so the cell also
# works when the data happens to contain no NaNs.
if np.isnan(X).any():
  imputer = KNNImputer(n_neighbors=4, weights="uniform")
  X1 = imputer.fit_transform(X)
else:
  X1 = X

N_REPEATS = 20
K_VALUES = list(range(1, len(feature_list) + 1))

# train/test/val = 60/20/20
# The test split is carved off once with the same seed as the final split, so
# the held-out test rows never influence the choice of k.
X_dev, X_test, y_dev, y_test = train_test_split(X1, y,
test_size=0.2, random_state=19)

auc_runs = []
for rep in range(N_REPEATS):
  # Vary the train/validation split per repetition; with a fixed seed all
  # repetitions would evaluate exactly the same split.
  X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev,
  test_size=0.25, random_state=7 + rep) # 0.25 x 0.8 = 0.2
  auc_for_k = []
  for k in K_VALUES:
    # The selector lives inside the pipeline so it is fitted on the training
    # rows only. The tree is seeded so the sweep is reproducible.
    clf = make_pipeline(SelectKBest(chi2, k=k),
                        tree.DecisionTreeClassifier(random_state=rep))
    clf.fit(X_train, y_train)
    val_scores = clf.predict_proba(X_val)[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_scores)
    auc_for_k.append(metrics.auc(fpr, tpr))
  auc_runs.append(auc_for_k)

# Mean AUC per k across repetitions.
to_plot = np.mean(auc_runs, axis=0)
print(to_plot)

fig, ax = plt.subplots(ncols = 1, figsize= (12,8))
# The label gives ax.legend() an artist to show (it warned before).
ax.plot(K_VALUES, to_plot, marker='o', label='Mean validation AUC')
ax.set_title("Performance with k features (Decision Tree)")
ax.set_xlabel("k (Features Used)")
ax.set_ylabel("AUC")
ax.legend()

Logistic Regression

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, SelectFpr
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, accuracy_score
# Sweep the number of chi2-selected features (k) for logistic regression and
# plot the mean validation AUC per k.
# Defines `X1` (imputed feature matrix) and `y` (1-D target), which the
# "Splitting Data" cell reuses.

# Every column except the label is a candidate feature.
feature_list = [col for col in df.columns if col != 'Group']
# Use ALL rows: the previous `iloc[1:]` silently dropped the first sample,
# unlike the SVM sweep which uses the full data.
X = df[feature_list].values
# 1-D target: scikit-learn expects shape (n_samples,).
y = df['Group'].values

# Impute missing values. `X1` is defined on both branches so the cell also
# works when the data happens to contain no NaNs.
if np.isnan(X).any():
  imputer = KNNImputer(n_neighbors=4, weights="uniform")
  X1 = imputer.fit_transform(X)
else:
  X1 = X

N_REPEATS = 20
K_VALUES = list(range(1, len(feature_list) + 1))
K_REPORTED = 5  # k whose selected features are printed for inspection

# train/test/val = 60/20/20
# The test split is carved off once with the same seed as the final split, so
# the held-out test rows never influence the choice of k.
X_dev, X_test, y_dev, y_test = train_test_split(X1, y,
test_size=0.2, random_state=19)

auc_runs = []
for rep in range(N_REPEATS):
  # Vary the train/validation split per repetition; with a fixed seed all
  # repetitions would evaluate exactly the same split.
  X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev,
  test_size=0.25, random_state=7 + rep) # 0.25 x 0.8 = 0.2
  auc_for_k = []
  for k in K_VALUES:
    # The selector lives inside the pipeline so it is fitted on the training
    # rows only (fitting it on all rows leaks validation/test labels).
    clf = make_pipeline(SelectKBest(chi2, k=k), LogisticRegression())
    clf.fit(X_train, y_train)
    # Report the chosen columns once (first repetition uses seed 7, the same
    # train/validation split as the final model) instead of 20 times.
    if k == K_REPORTED and rep == 0:
      mask = clf[0].get_support(indices=False)
      print([name for name, keep in zip(feature_list, mask) if keep])
    val_scores = clf.predict_proba(X_val)[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_scores)
    auc_for_k.append(metrics.auc(fpr, tpr))
  auc_runs.append(auc_for_k)

# Mean AUC per k across repetitions.
to_plot = np.mean(auc_runs, axis=0)
print(to_plot)

fig, ax = plt.subplots(ncols = 1, figsize= (12,8))
# The label gives ax.legend() an artist to show (it warned before).
ax.plot(K_VALUES, to_plot, marker='o', label='Mean validation AUC')
ax.set_title("Performance with k features (Logistic Regression)")
ax.set_xlabel("k (Features Used)")
ax.set_ylabel("AUC")
ax.legend()

Splitting Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, accuracy_score
# Build the final train/validation/test splits (60/20/20) restricted to the
# K_BEST features chosen by the sweeps above.
# Relies on `X1` and `y` from the preceding feature-selection cell.
K_BEST = 5  # ideal value of k

# Split FIRST, then fit the selector on the training rows only: fitting chi2
# on the full data lets validation/test labels influence which features are
# kept. The seeds are unchanged, so the same rows land in each split as before.
X_dev_all, X_test_all, y_dev, y_test = train_test_split(X1, np.ravel(y),
test_size=0.2, random_state= 19)

# Use the same function above for the validation set
X_train_all, X_val_all, y_train, y_val = train_test_split(X_dev_all, y_dev,
test_size=0.25, random_state = 7) # 0.25 x 0.8 = 0.2

select = SelectKBest(chi2, k=K_BEST)
X_train = select.fit_transform(X_train_all, y_train)
X_val = select.transform(X_val_all)
X_test = select.transform(X_test_all)
# Full data projected onto the selected columns (kept for compatibility).
X_new = select.transform(X1)

print("X_train size:",len(X_train), 'Shape:',X_train.shape)
print("X_test size:",len(X_test),'Shape:',X_test.shape)
print("X_val size:",len(X_val),'Shape:',X_val.shape)
print("y_train size:",len(y_train),'Shape:',y_train.shape)
print("y_test size:",len(y_test),'Shape:',y_test.shape)
print("y_val size:",len(y_val),'Shape:',y_val.shape)

Comparing different models on the validation set

Decision Tree

In [None]:
# Decision-tree candidate: fit on the training split and report accuracy and
# ROC/AUC.
# Everything is scored on the VALIDATION split, like the SVM and logistic
# regression candidates; the test split is reserved for the final model.
# (Previously the predictions came from X_test but the accuracy was computed
# against y_val, i.e. labels and predictions of different samples.)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
probs = clf.predict_proba(X_val)
preds = probs[:,1]  # probability of class 1
fpr, tpr, threshold = metrics.roc_curve(y_val, preds)
roc_auc = metrics.auc(fpr, tpr)
acc = np.round(accuracy_score(y_val, y_pred), 3)
print("Accuracy is: " + str(acc))

fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
ax.legend(loc = 'lower right')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

SVM

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# SVM candidates: compare kernels on the validation split (accuracy + ROC/AUC).
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel in kernels:
  # Features are standardised inside the pipeline, fitted on training rows only.
  clf = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel = kernel, probability=True))
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_val)
  probs = clf.predict_proba(X_val)
  preds = probs[:,1]  # probability of class 1
  # The ROC must use the labels of the samples that were scored (validation).
  # It previously used y_test, pairing scores and labels of different samples.
  fpr, tpr, threshold = metrics.roc_curve(y_val, preds)
  roc_auc = metrics.auc(fpr, tpr)
  acc = np.round(accuracy_score(y_val, y_pred), 3)
  print("Accuracy is: " + str(acc))

  fig, ax = plt.subplots()
  ax.set_title('Receiver Operating Characteristic: Kernel = ' + kernel)
  ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
  ax.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
  ax.legend(loc = 'lower right')
  ax.set_xlim([0, 1])
  ax.set_ylim([0, 1])
  ax.set_ylabel('True Positive Rate')
  ax.set_xlabel('False Positive Rate')
  plt.show()

Logistic Regression (the model we chose)

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
# Tune the logistic-regression solver: fit one model per solver on the
# training split and compare accuracy and ROC/AUC on the validation split.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag']
for solver in solvers:
  clf = LogisticRegression(solver = solver)
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_val)
  probs = clf.predict_proba(X_val)
  preds = probs[:,1]  # probability of class 1
  fpr, tpr, threshold = metrics.roc_curve(y_val, preds)
  roc_auc = metrics.auc(fpr, tpr)
  # Compute the accuracy for THIS solver. The cell used to print a stale
  # `acc` left over from the previous cell, identical for every solver.
  acc = np.round(accuracy_score(y_val, y_pred), 3)

  fig, ax = plt.subplots(ncols = 1, figsize= (12,8))
  print("Accuracy is: " + str(acc))
  ax.set_title('Receiver Operating Characteristic using solver = '+solver)
  ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
  ax.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
  ax.legend(loc = 'lower right')
  ax.set_xlim([0, 1])
  ax.set_ylim([0, 1])
  ax.set_ylabel('True Positive Rate')
  ax.set_xlabel('False Positive Rate')
  plt.show()

Final Classification

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
# `plot_confusion_matrix` was removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Final model: logistic regression with the solver chosen during tuning,
# fitted on the training split and evaluated once on the held-out test split.
#after tuning decided to use lbfgs
clf = LogisticRegression(solver = 'lbfgs')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
probs = clf.predict_proba(X_test)
preds = probs[:,1]  # probability of class 1

#plotting confusion matrix
color = 'white'
# Class codes come from the encoding cell: 0 = Demented, 1 = Nondemented.
matrix = ConfusionMatrixDisplay.from_estimator(
  clf, X_test, y_test, cmap=plt.cm.Blues,
  display_labels=['Demented', 'Nondemented'])
matrix.ax_.set_title('Confusion Matrix', color=color)
matrix.ax_.set_xlabel('Predicted Label', color=color)
matrix.ax_.set_ylabel('True Label', color=color)
# Recolour the ticks of every axes in the figure (matrix and colour bar).
for axis in matrix.figure_.axes:
  axis.tick_params(colors=color)
plt.show()

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
# Test-set accuracy of the final model (a stale `acc` from an earlier cell
# was printed before).
acc = np.round(accuracy_score(y_test, y_pred), 3)
print("Accuracy is: " + str(acc))

fig, ax = plt.subplots(ncols = 1, figsize= (12,8))
# Report the number of features the model actually uses; the old title
# claimed "all 9 features" although X_test holds only the selected subset.
ax.set_title('Receiver Operating Characteristic using %d selected features' % X_test.shape[1])
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
ax.legend(loc = 'lower right')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()