In [24]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:
def get_base_filepath():
    '''
    Access the filepath for the base folder of the project.

    The base folder is taken to be the parent of the current working
    directory. The working directory itself is NOT changed, so calling this
    function (or re-running the cell that calls it) any number of times
    returns the same path instead of climbing one level higher on each call.

    Input: None

    Output: The absolute filepath of the parent of the current working directory
    '''
    # Absolute path of the directory the notebook is running in
    current_dir = os.path.abspath(os.curdir)

    # Its parent is the project folder; at a filesystem root the root itself is returned
    return os.path.dirname(current_dir)

In [3]:
def extract_features(filepath):
    '''
    Summarise one time-course file as the mean of each of its columns.

    Input: filepath of a whitespace-delimited table whose first row is the header

    Output: Series with the mean of every column except 'File' and 'Sub-brick'
    '''
    # Columns are separated by runs of whitespace; the regex separator needs the python engine
    time_courses = pd.read_csv(filepath, sep=r'\s{1,}', engine='python', header=0)

    # 'File' and 'Sub-brick' identify the rows and are not averaged
    region_columns = time_courses.drop(['File', 'Sub-brick'], axis=1)
    return region_columns.mean()

In [4]:
# Resolve the project root once and derive the data folders from it.
base_folder_filepath = get_base_filepath()

# os.path.join uses the separator of the running OS instead of a hard-coded
# Windows backslash. The final '' keeps the trailing separator that the
# previously concatenated strings ended with.
sites_filepath = os.path.join(base_folder_filepath, 'Data', 'Preprocessed_data', 'Sites', '')
phenotypics_filepath = os.path.join(base_folder_filepath, 'Data', 'Phenotypic', 'Sites', '')

In [5]:
subjects = []
subject_features = []
subjects_dropped = []

# Walk every site folder and, inside it, every patient folder, collecting the
# mean regional features from each patient's sfnwmrda time-course file.
# Entries are visited in sorted order because os.listdir returns them in
# arbitrary order, which would make the row order of the results vary.
for site_folder in sorted(os.listdir(sites_filepath)):
    # Access the filepath to the folder
    site_folder_path = os.path.join(sites_filepath, site_folder)

    # A stray file next to the site folders cannot be listed; ignore it
    if not os.path.isdir(site_folder_path):
        continue

    # Access the patient folders within the site folder
    for patient_id_folder in sorted(os.listdir(site_folder_path)):
        # Access the filepath to the folder
        patient_id_folder_path = os.path.join(site_folder_path, patient_id_folder)

        # Only folders are treated as patients. This check has to come before
        # the os.listdir call below, which raises when given a plain file.
        if not os.path.isdir(patient_id_folder_path):
            continue

        # Skip the folder if it is empty
        if len(os.listdir(patient_id_folder_path)) == 0:
            print(f"Skipping empty folder: {patient_id_folder}")
            subjects_dropped.append(patient_id_folder)
            continue

        # Get the file name (dependent on folder name)
        file_name = f"sfnwmrda{patient_id_folder}_session_1_rest_1_aal_TCs.1D"

        # Join the file name to its path
        file_path = os.path.join(patient_id_folder_path, file_name)

        # Skip (and record) the patient when the expected file is missing
        if not os.path.exists(file_path):
            print(f"Skipping folder {file_name}: not found.")
            subjects_dropped.append(patient_id_folder)
            continue

        # Extract the features and add it to the list of subjects
        subject_features.append(extract_features(file_path))

        # Add the patient ID to the subjects list (kept parallel to subject_features)
        subjects.append(patient_id_folder)

subjects[:3]

Skipping empty folder: 0010016
Skipping empty folder: 0010027
Skipping empty folder: 0010055
Skipping empty folder: 0010098
Skipping empty folder: 0010105
Skipping empty folder: 0010127
Skipping folder sfnwmrda0015001_session_1_rest_1_aal_TCs.1D: not found.
Skipping folder sfnwmrda0015004_session_1_rest_1_aal_TCs.1D: not found.
Skipping empty folder: 0015011
Skipping folder sfnwmrda0015016_session_1_rest_1_aal_TCs.1D: not found.
Skipping empty folder: 0015018
Skipping folder sfnwmrda0015026_session_1_rest_1_aal_TCs.1D: not found.
Skipping folder sfnwmrda0015027_session_1_rest_1_aal_TCs.1D: not found.
Skipping folder sfnwmrda0015032_session_1_rest_1_aal_TCs.1D: not found.
Skipping folder sfnwmrda0015036_session_1_rest_1_aal_TCs.1D: not found.
Skipping folder sfnwmrda0015052_session_1_rest_1_aal_TCs.1D: not found.
Skipping folder sfnwmrda0015057_session_1_rest_1_aal_TCs.1D: not found.


['0010001', '0010002', '0010003']

In [6]:
# Preview the first patient IDs that were skipped (empty folder or missing time-course file)
subjects_dropped[:3]

['0010016', '0010027', '0010055']

In [7]:
dx = []
pheno_index = []

# Read every phenotypic file found directly inside the phenotypics folder,
# keeping each site's DX column and its 'ScanDir ID' index in parallel lists.
for site_pheno in os.listdir(phenotypics_filepath):
    site_pheno_filepath = os.path.join(phenotypics_filepath, site_pheno)

    # Anything that is not a regular file (e.g. a sub-folder) is ignored
    if not os.path.isfile(site_pheno_filepath):
        continue

    df_pheno = pd.read_csv(site_pheno_filepath, index_col='ScanDir ID')
    dx.append(df_pheno['DX'])
    pheno_index.append(df_pheno.index)

dx[:3]

[ScanDir ID
 1018959    0
 1019436    3
 1043241    0
 1266183    0
 1535233    0
           ..
 8337695    1
 8432725    0
 8628223    0
 8658218    0
 9922944    0
 Name: DX, Length: 83, dtype: int64,
 ScanDir ID
 1000804    0
 1023964    3
 1057962    1
 1099481    1
 1127915    0
           ..
 10127      1
 10128      0
 10129      3
 10013      1
 10043      0
 Name: DX, Length: 222, dtype: int64,
 ScanDir ID
 1084283    1
 1084884    0
 1108916    1
 1206380    3
 1340333    1
           ..
 7333005    1
 8064456    0
 8218392    0
 8720244    3
 9499804    0
 Name: DX, Length: 79, dtype: int64]

In [8]:
# One row per subject (indexed by patient ID); columns are the values returned by extract_features
df_subject_x_region = pd.DataFrame(subject_features, index=subjects)
df_subject_x_region.head()

Unnamed: 0,Mean_2001,Mean_2002,Mean_2101,Mean_2102,Mean_2111,Mean_2112,Mean_2201,Mean_2202,Mean_2211,Mean_2212,...,Mean_9081,Mean_9082,Mean_9100,Mean_9110,Mean_9120,Mean_9130,Mean_9140,Mean_9150,Mean_9160,Mean_9170
10001,0.001918,0.001396,0.000917,0.001579,0.00162,0.000398,0.000401,0.000248,-6e-06,-0.001791,...,-0.001946,-0.00154,0.002221,0.00164,-0.000227,-0.000473,-0.000525,0.00246,0.00181,-0.000823
10002,0.000535,-0.000911,-0.00437,1.3e-05,-0.012312,0.001798,-0.001885,0.000525,-0.002277,0.015622,...,-0.000176,-0.001465,-0.002169,-0.000968,0.001107,0.00105,0.000374,-0.000629,-2.5e-05,0.001806
10003,0.004598,0.001763,0.001807,-0.000461,-0.004121,-0.007068,0.003899,0.004255,-0.001597,-0.011144,...,-0.001121,-0.001566,-0.00923,-0.002198,0.006707,0.009246,-0.000108,0.00162,-5.9e-05,-0.007794
10004,-0.000559,0.00083,-0.003498,-0.001282,-0.004143,0.001574,-0.001477,-0.000162,-0.005601,0.002853,...,0.0008,-0.000904,-0.000326,0.000155,0.003007,0.001742,0.002644,0.000302,-0.000304,-0.00053
10005,0.003364,0.006273,0.014627,0.015924,0.000704,0.002034,0.01669,0.014993,0.004241,0.00822,...,0.007601,0.004895,0.001707,-0.004593,-0.007235,-0.008659,-0.007546,-0.000393,-0.003564,-0.001598


In [9]:
# Flatten the per-site ID indexes into one flat list of IDs.
# Elements that are already single IDs (the cell was run before) are kept as
# they are, so re-running the cell neither fails on integers nor splits ID
# strings into characters.
pheno_index = [
    ind
    for site_pheno in pheno_index
    for ind in (site_pheno if isinstance(site_pheno, pd.Index) else [site_pheno])
]

In [10]:
df_region_w_dx = df_subject_x_region.copy()

# The phenotypic files store the IDs as integers, so leading zeros are lost,
# while the subject folder names used as the feature index keep them
# (e.g. '0010001'). zfill(7) restores the zeros for an ID of any length, not
# only 5-digit ones, and leaves IDs of 7 or more characters unchanged, so
# re-running the cell is harmless.
pheno_index = [str(s_id).zfill(7) for s_id in pheno_index]

# One diagnosis per phenotypic row, labelled with the padded ID
diagnosis = pd.Series([diag for site_pheno in dx for diag in site_pheno], index=pheno_index)

# errors='ignore': a skipped folder that has no phenotypic record must not abort the cell
filtered_diagnosis = diagnosis.drop(index=subjects_dropped, errors='ignore')

# Assignment aligns on the index: each subject row receives the DX with the matching ID
df_region_w_dx['DX'] = filtered_diagnosis
df_region_w_dx.head()

Unnamed: 0,Mean_2001,Mean_2002,Mean_2101,Mean_2102,Mean_2111,Mean_2112,Mean_2201,Mean_2202,Mean_2211,Mean_2212,...,Mean_9082,Mean_9100,Mean_9110,Mean_9120,Mean_9130,Mean_9140,Mean_9150,Mean_9160,Mean_9170,DX
10001,0.001918,0.001396,0.000917,0.001579,0.00162,0.000398,0.000401,0.000248,-6e-06,-0.001791,...,-0.00154,0.002221,0.00164,-0.000227,-0.000473,-0.000525,0.00246,0.00181,-0.000823,3
10002,0.000535,-0.000911,-0.00437,1.3e-05,-0.012312,0.001798,-0.001885,0.000525,-0.002277,0.015622,...,-0.001465,-0.002169,-0.000968,0.001107,0.00105,0.000374,-0.000629,-2.5e-05,0.001806,3
10003,0.004598,0.001763,0.001807,-0.000461,-0.004121,-0.007068,0.003899,0.004255,-0.001597,-0.011144,...,-0.001566,-0.00923,-0.002198,0.006707,0.009246,-0.000108,0.00162,-5.9e-05,-0.007794,0
10004,-0.000559,0.00083,-0.003498,-0.001282,-0.004143,0.001574,-0.001477,-0.000162,-0.005601,0.002853,...,-0.000904,-0.000326,0.000155,0.003007,0.001742,0.002644,0.000302,-0.000304,-0.00053,0
10005,0.003364,0.006273,0.014627,0.015924,0.000704,0.002034,0.01669,0.014993,0.004241,0.00822,...,0.004895,0.001707,-0.004593,-0.007235,-0.008659,-0.007546,-0.000393,-0.003564,-0.001598,2


In [11]:
# (number of subjects, number of feature columns + the DX column)
df_region_w_dx.shape

(628, 117)

In [12]:
# Class balance: number of subjects per diagnosis code
df_region_w_dx['DX'].value_counts()

DX
0    395
1    125
3    104
2      4
Name: count, dtype: int64

In [13]:
# Features are every regional mean; the target is the diagnosis code.
X = df_region_w_dx.drop('DX', axis=1)
y = df_region_w_dx['DX']

# A fixed seed makes the split (and the accuracies computed from it) the same
# on every run. Stratifying keeps the class proportions equal in both parts,
# which matters because the rarest diagnosis code has only a few subjects.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, stratify=y)

In [14]:
# Sanity check: list the subjects whose DX is missing (an empty Series means there are none).
# NOTE(review): this check runs after the train/test split above; consider moving it before the split.
df_region_w_dx['DX'].loc[df_region_w_dx['DX'].isnull()]

Series([], Name: DX, dtype: int64)

In [15]:
# Fit a logistic regression with default settings and predict the validation split
model_LR = LogisticRegression().fit(X_train, y_train)
y_pred_LR = model_LR.predict(X_val)

# accuracy_score takes (y_true, y_pred): true labels first, predictions second
accuracy_LR = accuracy_score(y_val, y_pred_LR)
accuracy_LR

0.6305732484076433

In [16]:
# Fit a k-nearest-neighbours classifier with default settings and predict the validation split
model_KNN = KNeighborsClassifier().fit(X_train, y_train)
y_pred_KNN = model_KNN.predict(X_val)

# accuracy_score takes (y_true, y_pred): true labels first, predictions second
accuracy_KNN = accuracy_score(y_val, y_pred_KNN)
accuracy_KNN

0.6178343949044586

In [17]:
# Fit a support vector classifier with default settings and predict the validation split
model_SVM = SVC().fit(X_train, y_train)
y_pred_SVM = model_SVM.predict(X_val)

# accuracy_score takes (y_true, y_pred): true labels first, predictions second
accuracy_SVM = accuracy_score(y_val, y_pred_SVM)
accuracy_SVM

0.6305732484076433

In [18]:
# Side-by-side summary of the three validation accuracies computed in the cells above;
# the differing tab counts line the values up in one column.
print('Accuracies:')
print('\nLogistic Regression:\t', accuracy_LR)
print('KNN:\t\t\t', accuracy_KNN)
print('SVM:\t\t\t', accuracy_SVM)

Accuracies:

Logistic Regression:	 0.6305732484076433
KNN:			 0.6178343949044586
SVM:			 0.6305732484076433


In [19]:
def evaluate_models(X, y, random_state=None):
    '''
    Score the three default classifiers on a single random train/test split.

    Input:
        - X: feature table
        - y: labels aligned with X
        - random_state: optional seed forwarded to train_test_split so that the
          split can be reproduced; None (default) draws a new split on each call

    Output: list [LR accuracy, KNN accuracy, SVM accuracy] measured on the test part
    '''
    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, random_state=random_state)

    # All three models are fitted and scored on the very same split
    classifiers = (LogisticRegression(), KNeighborsClassifier(), SVC())
    return [make_predictions(clf, X_trn, X_tst, y_trn, y_tst) for clf in classifiers]

In [20]:
def make_predictions(model, X_trn, X_tst, y_trn, y_tst):
    '''
    Fit a model on the training part and return its accuracy on the test part.

    Input:
        - model: an unfitted estimator exposing fit and predict
        - X_trn, y_trn: training features and labels
        - X_tst, y_tst: test features and labels

    Output: accuracy of the fitted model's predictions on X_tst
    '''
    model_fit = model.fit(X_trn, y_trn)
    y_pred = model_fit.predict(X_tst)

    # accuracy_score takes (y_true, y_pred): true labels first, predictions second
    accuracy = accuracy_score(y_tst, y_pred)
    return accuracy

In [21]:
def run_predictions(X, y, n_runs=100):
    '''
    Repeat the split/fit/score experiment and collect the accuracies per model.

    Input:
        - X: feature table
        - y: labels aligned with X
        - n_runs: number of independent repetitions (default 100)

    Output: list of three lists [LR accuracies, KNN accuracies, SVM accuracies],
            each holding one value per repetition
    '''
    # Each run yields one [lr, knn, svm] triple from evaluate_models
    runs = [evaluate_models(X, y) for _ in range(n_runs)]

    # Transpose from one triple per run to one list per model; with zero runs
    # this still returns three (empty) lists
    return [[run[model_idx] for run in runs] for model_idx in range(3)]

In [22]:
# Repeat the experiment and stack the results:
# one row per model (LR, KNN, SVM), one column per repetition.
accs = run_predictions(X, y)
accuracies = np.asarray(accs)

# Mean and standard deviation of the accuracy for each of the three models
means = [model_accs.mean() for model_accs in accuracies[:3]]
stds = [model_accs.std() for model_accs in accuracies[:3]]

In [None]:
# Raw per-model means and standard deviations, in the order LR, KNN, SVM
[means, stds]

In [23]:
# Summary table: rows are the statistic (mean, standard deviation), columns are the models
results = pd.DataFrame([means, stds], index=['Mean', 'STD'], columns=['LR', 'KNN', 'SVM'])
results

Unnamed: 0,LR,KNN,SVM
Mean,0.630382,0.577261,0.630382
STD,0.033973,0.03347,0.033973


In [25]:
def perform_cross_validation(X_train, y_train, model_list=None, n_splits=10):
    '''
    Cross-validate each model and print the mean and standard deviation of its accuracy.

    Input:
        - X_train: A dataframe containing the features used to build the model
        - y_train: A Series of the true values associated with the feature list
        - model_list: optional list of (name, estimator) pairs; when omitted,
          the notebook-level `models` list is used
        - n_splits: number of stratified folds (default 10)

    Output: Printed table with one row per model: name, CV mean, CV std
    '''
    # Fall back to the notebook-level list so existing two-argument calls keep working
    if model_list is None:
        model_list = models

    # The splitter is the same for every model, so it is built once
    kfold = StratifiedKFold(n_splits=n_splits)

    results = dict()
    for name, model in model_list:
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
        results[name] = (cv_results.mean(), cv_results.std())

    # One row per model so that the values line up under the header columns
    print('Model\t\tCV Mean\t\tCV std')
    for name, (cv_mean, cv_std) in results.items():
        print(f'{name}\t\t{cv_mean:.6f}\t{cv_std:.6f}')

In [31]:
# (name, estimator) pairs to cross-validate, each with default hyper-parameters
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
]

In [32]:
# Cross-validate on the full feature table and labels rather than on the earlier train split
perform_cross_validation(X, y)



Model		CV Mean		CV std
{'LR': (0.6290066564260113, 0.008930862894101646), 'KNN': (0.5734511008704557, 0.07671529589607275), 'SVM': (0.6290066564260113, 0.008930862894101646)}
