In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler

In [2]:
models = []

models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('LDA', LinearDiscriminantAnalysis()))

### get_base_filepath()

Access the filepath for th ebase folder of the project. 
From here, any other asset of the project can be located.

In [3]:
def get_base_filepath():
    '''
    Access the filepath for the base folder of the project
    
    Input: None
    
    Output: The filepath to the root of the folder
    '''
    # Get current directory
    os.path.abspath(os.curdir)

    # Go up a directory level
    os.chdir('..')
    
    # Go up a directory level
    os.chdir('..')

    # Set baseline filepath to the project folder directory
    base_folder_filepath = os.path.abspath(os.curdir)
    return base_folder_filepath

### normalize()

Normalizes a Series

**Input:** A feature of type Series

**Output:** The normalized feature of type Series

In [4]:
def normalize(feature):
    '''
    This function normalizes a Series
    
    Input: A feature of type Series
    
    Output: The normalized feature of type Series
    '''
    return (feature - feature.mean())/feature.std()

### normalize_features()

Normalizes all features in a given dataframe. This will normalize ALL features, so ensure that the inputted dataframe consists only of numeric values.

**Input:** A dataframe to normalize

**Output:** A normalized dataframe

In [5]:
def normalize_features(df):
    '''
    This function normalizes all features in a dataframe
    
    Input: A pandas dataframe
    
    Output: The normalized dataframe
    '''
    for column in df.columns:
        df[column] = normalize(df[column])
    return df

### make_predictions()

Fit a model using the training data, 
make predictions on a testing set, 
and get the accuracy of the model.

Used in evaluate_models()

In [6]:
def make_predictions(model, X_trn, X_tst, y_trn, y_tst):
    '''
    Get the accuracy of a model
    
    Input:
        - A model to use to make predictions
        - Set of training features
        - Set of testing features
        - Set of training targets
        - Set of testing targets
        
    Output: Accuracy of the model
    '''
    
    # Train the model on the training set
    model_fit = model.fit(X_trn, y_trn)
    
    # Make predictions on the testing features
    y_pred = model_fit.predict(X_tst)
    
    # Compare the predictions to the true values
    accuracy = accuracy_score(y_pred, y_tst)
    
    # Return the accuracy
    return accuracy

### evaluate_models()

Evaluate the performance of models on a set of features and targets.

Uses make_predictions()

Used in get_accuracies()

In [7]:
def evaluate_models(X, y):
    '''
    Evaluate the performance of models on a set of features and targets.
    
    Input:
        - Set of features
        - Set of targets
        
    Output: Accuracy of three models (Logistic regression, KNN, SVM)
    '''
    # Separate the data into training and testing sets
    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y)
    
    # Evaluate the accuracies using each of the three models
    lr_acc = make_predictions(LogisticRegression(), X_trn, X_tst, y_trn, y_tst)
    knn_acc = make_predictions(KNeighborsClassifier(), X_trn, X_tst, y_trn, y_tst)
    svm_acc = make_predictions(SVC(), X_trn, X_tst, y_trn, y_tst)
    lda_acc = make_predictions(LinearDiscriminantAnalysis(), X_trn, X_tst, y_trn, y_tst)
    
    # Return the accuracy in a list format
    return [lr_acc, knn_acc, svm_acc, lda_acc]

### get_accuracies()

Get 100 accuracies for three models (Logistic regression, KNN, SVM).

In [8]:
def get_accuracies(X, y):
    '''
    Get 100 accuracies for three models (Logistic regression, KNN, SVM).
    
    Input:
        - Set of features
        - Set of targets
        
    Output: List of 100 accuracies for the three models
    '''
    # Create an empty list to store the accuracies for each model
    lr_acc = []
    knn_acc = []
    svm_acc = []
    lda_acc = []
    
    # Run 100 iterations of evaluating the model
    for i in range(100):
        # Get the accuracy for this iteration
        accuracies = evaluate_models(X, y)
        
        # Add it to the corresponding model holder
        lr_acc.append(accuracies[0])
        knn_acc.append(accuracies[1])
        svm_acc.append(accuracies[2])
        lda_acc.append(accuracies[3])
        
    # Return a list of all accuracies
    return [lr_acc, knn_acc, svm_acc, lda_acc]

### perform_cross_validation()

Use a stratified K-fold for cross validation for the three classification models 

In [9]:
def perform_cross_validation(X_train, y_train):
    '''
    Input: 
        - A dataframe containing the features use to build the model
        - A Series of the true values associated with the feature list
    
    Output: Printed result for the mean and standard deviation of each model
    '''
    # Create an empty dictionary to store the results
    results = dict()

    # Loop through the models
    for name, model in models:
        # Create a Stratified K-fold for cross validation
        kfold = StratifiedKFold(n_splits=10)
        
        # Apply cross validation using the current model
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
        
        # Add the mean and standard deviation to the dictionary
        results[name] = (cv_results.mean(), cv_results.std())

    # Print the results
    print('Model\t\tCV Mean\t\tCV std')
    print(results)

## Import File

Locate the file using its filepath from the base folder and load the file as a dataframe.

In [10]:
# The folder for the project
base_folder_filepath = get_base_filepath()

# Phenotypic data site folder
filepath = base_folder_filepath + '\\Data\\Preprocessed_data\\2023.7.14-Region_Condensed_Dataframe.csv'

# Dataframe from filepath
df = pd.read_csv(filepath, index_col=0)

--------------------------------------------------------------------------------------------------------------------------------

# Multi-Class Classificaiton

This section investigates how models perform when predicting the type of ADHD the subject has or if they are a control.

This is accomplished by using the phenotypic data for the sites. The target will be the diagnosis which includes three types with each number corresponding to a type diagnosis for ADHD.

    0 = TDC (Typically developing children)
    1 = ADHD-Combined
    2 = ADHD-Hyperactive/Impulsive
    3 = ADHD-Inattentive
    
There will be three methods to make these predictions: 

- Current dataframe

- Scaled dataframe

- Normalized dataframe

## Current Dataframe

This model will use the current dataframe without any modifications to the features. 
This will act as a baseline to compare the models with changes to.

### Separate data

Split the data into features and target.

In [11]:
X = df.drop('DX', axis=1)
y = df['DX']

### Evaluate Accuracy

Determine the accuracy of using this dataframe. 

#### 100-iteration Train/Test Split

Do 100-iterations of train/test splits using this dataframe. 
Generate 100 accuracies for the four models.

In [12]:
accs = get_accuracies(X, y)
accuracies = np.asarray(accs)

Extract descriptive statistics from the accuracies.

In [13]:
means = [accuracies[0].mean(), accuracies[1].mean(), accuracies[2].mean(), accuracies[3].mean()]
stds  = [accuracies[0].std(),  accuracies[1].std(),  accuracies[2].std(),  accuracies[3].std()]
maxes = [accuracies[0].max(),  accuracies[1].max(),  accuracies[2].max(),  accuracies[3].max()]
mins  = [accuracies[0].min(),  accuracies[1].min(),  accuracies[2].min(),  accuracies[3].min()]

Format the descriptive statistics as a dataframe.

In [14]:
results = pd.DataFrame([means, stds, maxes, mins], 
                       index=['Mean', 'STD', 'Max', 'Min'], 
                       columns=['LR_multiclass', 'KNN_multiclass', 'SVM_multiclass', 'LDA_multiclass'])

results

Unnamed: 0,LR_multiclass,KNN_multiclass,SVM_multiclass,LDA_multiclass
Mean,0.627643,0.579936,0.627643,0.60758
STD,0.030138,0.03247,0.030138,0.030744
Max,0.707006,0.66242,0.707006,0.675159
Min,0.547771,0.515924,0.547771,0.528662


#### Cross-validation

Perform cross validation on this dataset with the four models from before. This is done to compare the results to the train-test split method.

In [15]:
perform_cross_validation(X, y)



Model		CV Mean		CV std
{'LR': (0.6290066564260113, 0.008930862894101646), 'KNN': (0.5608038914490528, 0.07719861970677068), 'SVM': (0.6274193548387097, 0.009146666829568337), 'LDA': (0.6114439324116744, 0.03051346941825479)}




### Method Conclusion

Logistic regression and SVM were the most accurate method in the train/test split, and logistic regression was the most accurate method for cross validation. 
The train/test split and cross-validation had similar results.

Logistic regression and SVM are close to the average from using all features, but not higher.

## Normalized Dataframe

This model will use a normalized version of the dataframe. 
This method will adjust the features to be normally distributed.

This should reduce some of the bias that results from the different scales in the dataframe's features

### Separate data

Make a copy of the original dataframe to ensure that it is preserved. 
Split the data into features and target.

In [16]:
df_norm = df.copy()

X_norm = df_norm.drop('DX', axis=1)
y_norm = df_norm['DX']

### Normalize columns

Normalize the features and update the feature dataframe to use these changes.

In [17]:
X_norm = normalize_features(X_norm)

### Evaluate Accuracy

Determine the accuracy of using this dataframe. 

#### 100-iteration Train/Test Split

Do 100-iterations of train/test splits using this dataframe. 
Generate 100 accuracies for the four models.

In [18]:
accs_norm = get_accuracies(X_norm, y_norm)
accuracies_norm = np.asarray(accs_norm)

Extract descriptive statistics from the accuracies.

In [19]:
means_norm = [accuracies_norm[0].mean(), accuracies_norm[1].mean(), accuracies_norm[2].mean(), accuracies_norm[3].mean()]
stds_norm  = [accuracies_norm[0].std(),  accuracies_norm[1].std(),  accuracies_norm[2].std(),  accuracies_norm[3].std()]
maxes_norm = [accuracies_norm[0].max(),  accuracies_norm[1].max(),  accuracies_norm[2].max(),  accuracies_norm[3].max()]
mins_norm  = [accuracies_norm[0].min(),  accuracies_norm[1].min(),  accuracies_norm[2].min(),  accuracies_norm[3].min()]

Format the descriptive statistics as a dataframe.

In [20]:
results_norm = pd.DataFrame([means_norm, stds_norm, maxes_norm, mins_norm], 
                       index=['Mean', 'STD', 'Max', 'Min'], 
                       columns=['LR_multiclass_norm', 'KNN_multiclass_norm', 
                                'SVM_multiclass_norm', 'LDA_multiclass_norm'])

results_norm

Unnamed: 0,LR_multiclass_norm,KNN_multiclass_norm,SVM_multiclass_norm,LDA_multiclass_norm
Mean,0.611019,0.581656,0.630892,0.611338
STD,0.034042,0.036941,0.037155,0.033752
Max,0.681529,0.66879,0.719745,0.694268
Min,0.522293,0.484076,0.535032,0.528662


#### Cross-validation

Perform cross validation on this dataset with the four models from before. This is done to compare the results to the train-test split method.

In [21]:
perform_cross_validation(X_norm, y_norm)



Model		CV Mean		CV std
{'LR': (0.5988223246287763, 0.05283969797865226), 'KNN': (0.5767025089605735, 0.06679154940398133), 'SVM': (0.6274193548387097, 0.009146666829568337), 'LDA': (0.6114439324116744, 0.03051346941825479)}




### Method Conclusion

SVM was the most accurate method in both the train/test split and cross-validation.
The train/test split and cross-validation had similar results.

SVM is the only model from this test that is better than the previous test using the condensed features. 

SVM is also higher than the average from using all features.

## Classification Conclusion

The most accurate method for this classification method was SVM on the normalized dataframe.

This method scored with higher accuracy than the same dataframe with all region means.

--------------------------------------------------------------------------------------------------------------------------------

# Binary Classificaiton

This section investigates how models perform when predicting whether a patient has ADHD or not. 

This is accomplished by converting the diagnosis to a binary value based on if their diagnosis is a control or has some type of ADHD. 
For this feature, 'True' signifies the subject has ADHD and 'False' signifies the subject is a control and does not have ADHD.

Theoretically, this model should perform better than the multi-class classification since it is simpler.

## Base Binary Dataframe

The binary dataframe is exactly the same as the multiclass dataframe except the diagnosis is binary. 
Any value for 'DX' greater than 0 for this column indicates that the subject has ADHD.

In [22]:
df_binary = df.copy()

df_binary['DX'].loc[df_binary['DX'] > 0] = 1
df_binary.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_binary['DX'].loc[df_binary['DX'] > 0] = 1


Unnamed: 0,Mean_4001,Mean_4002,Mean_4011,Mean_4021,Mean_4102,Mean_5011,Mean_5012,Mean_6002,Mean_6221,Mean_6302,...,Mean_8122,Mean_8201,Mean_8211,Mean_9041,Mean_9071,Mean_9072,Mean_9150,Mean_9160,Mean_9170,DX
10001,0.003551,0.001368,0.001056,-0.001229,-0.00054,-0.003605,-0.002618,9.4e-05,0.000855,-0.001402,...,0.001068,0.00202,0.00391,-0.000998,-0.001536,-0.001464,0.00246,0.00181,-0.000823,1
10002,-0.005823,-0.002241,0.001463,-0.004338,0.002755,-0.000571,7.1e-05,-0.001889,-0.000175,-5.8e-05,...,0.001865,0.002288,0.001013,-0.000929,-0.001079,-0.000776,-0.000629,-2.5e-05,0.001806,1
10003,0.005478,0.002208,0.004277,6e-05,-0.003816,0.004641,0.004576,0.001011,-0.001067,0.001231,...,0.009668,0.000621,-0.003601,0.003619,-0.002535,-0.003021,0.00162,-5.9e-05,-0.007794,0
10004,-0.002847,-0.001133,-0.000901,0.002363,0.000122,0.001875,0.002724,0.000547,0.001156,0.00253,...,-0.001246,0.002599,0.001361,0.00342,3.7e-05,-0.000959,0.000302,-0.000304,-0.00053,0
10005,0.00834,0.007814,-0.000297,-0.005694,-0.001799,-0.002059,-0.001324,0.004112,0.004462,-7.8e-05,...,-0.001998,-0.002685,-0.001065,-0.008435,0.002573,0.000648,-0.000393,-0.003564,-0.001598,1


## Binary Current Dataframe

This model will use the current dataframe with the only modification being to the diagnosis column. 
Any value for 'DX' greater than 0 indicates that the patient has ADHD.

This will act as a baseline to compare the binary models with other changes to.

### Separate data

Split the data into features and target.

In [23]:
X_binary = df_binary.drop('DX', axis=1)
y_binary = df_binary['DX']

### Evaluate Accuracy

Determine the accuracy of using this dataframe. 

#### 100-iteration Train/Test Split

Do 100-iterations of train/test splits using this dataframe. 
Generate 100 accuracies for the four models.

In [24]:
accs_binary = get_accuracies(X_binary, y_binary)
accuracies_binary = np.asarray(accs_binary)

Extract descriptive statistics from the accuracies.

In [25]:
means_binary = [accuracies_binary[0].mean(), accuracies_binary[1].mean(), accuracies_binary[2].mean(), accuracies_binary[3].mean()]
stds_binary  = [accuracies_binary[0].std(),  accuracies_binary[1].std(),  accuracies_binary[2].std(),  accuracies_binary[3].std()]
maxes_binary = [accuracies_binary[0].max(),  accuracies_binary[1].max(),  accuracies_binary[2].max(),  accuracies_binary[3].max()]
mins_binary  = [accuracies_binary[0].min(),  accuracies_binary[1].min(),  accuracies_binary[2].min(),  accuracies_binary[3].min()]

Format the descriptive statistics as a dataframe.

In [26]:
results_binary = pd.DataFrame([means_binary, stds_binary, maxes_binary, mins_binary], 
                              index=['Mean', 'STD', 'Max', 'Min'], 
                              columns=['LR_binary', 'KNN_binary', 'SVM_binary', 'LDA_binary'])

results_binary

Unnamed: 0,LR_binary,KNN_binary,SVM_binary,LDA_binary
Mean,0.625287,0.627707,0.625924,0.629363
STD,0.02989,0.032731,0.030063,0.032286
Max,0.707006,0.700637,0.707006,0.726115
Min,0.515924,0.56051,0.515924,0.535032


#### Cross-validation

Perform cross validation on this dataset with the four models from before. This is done to compare the results to the train-test split method.

In [27]:
perform_cross_validation(X_binary, y_binary)

Model		CV Mean		CV std
{'LR': (0.6289810547875063, 0.006873265171196576), 'KNN': (0.6054275473630313, 0.10340918547205324), 'SVM': (0.5940604198668714, 0.05383047536239013), 'LDA': (0.6291602662570405, 0.060572551530569514)}


### Method Conclusion

LDA is the most accurate model for both train/test split and cross-validation with all models very close.

None of these models are higher than the SVM model on the normalized dataframe from earlier in the notebook.

KNN and LDA performed better than the binary classification from the average from using all features.

## Normalized Binary Dataframe

This model will use a normalized version of the dataframe. 
This method will adjust the features to be normally distributed.

This should reduce some of the bias that results from the different scales in the dataframe's features

### Separate data

Make a copy of the original dataframe to ensure that it is preserved. 
Split the data into features and target.

In [28]:
df_binary_norm = df_binary.copy()

X_binary_norm = df_binary_norm.drop('DX', axis=1)
y_binary_norm = df_binary_norm['DX']

### Normalize columns

Normalize the features and update the feature dataframe to use these changes.

In [29]:
X_binary_norm = normalize_features(X_binary_norm)

### Evaluate Accuracy

Determine the accuracy of using this dataframe. 

#### 100-iteration Train/Test Split

Do 100-iterations of train/test splits using this dataframe. 
Generate 100 accuracies for the four models.

In [30]:
accs_binary_norm = get_accuracies(X_binary_norm, y_binary_norm)
accuracies_binary_norm = np.asarray(accs_binary_norm)

Extract descriptive statistics from the accuracies.

In [31]:
means_binary_norm = [accuracies_binary_norm[0].mean(), accuracies_binary_norm[1].mean(), accuracies_binary_norm[2].mean(), accuracies_binary_norm[3].mean()]
stds_binary_norm  = [accuracies_binary_norm[0].std(),  accuracies_binary_norm[1].std(),  accuracies_binary_norm[2].std(),  accuracies_binary_norm[3].std()]
maxes_binary_norm = [accuracies_binary_norm[0].max(),  accuracies_binary_norm[1].max(),  accuracies_binary_norm[2].max(),  accuracies_binary_norm[3].max()]
mins_binary_norm  = [accuracies_binary_norm[0].min(),  accuracies_binary_norm[1].min(),  accuracies_binary_norm[2].min(),  accuracies_binary_norm[3].min()]

Format the descriptive statistics as a dataframe.

In [32]:
results_binary_norm = pd.DataFrame([means_binary_norm, stds_binary_norm, maxes_binary_norm, mins_binary_norm], 
                       index=['Mean', 'STD', 'Max', 'Min'], 
                       columns=['LR_binary_norm', 'KNN_binary_norm', 
                                'SVM_binary_norm', 'LDA_binary_norm'])

results_binary_norm

Unnamed: 0,LR_binary_norm,KNN_binary_norm,SVM_binary_norm,LDA_binary_norm
Mean,0.627006,0.617643,0.626497,0.627134
STD,0.035888,0.034717,0.036684,0.0344
Max,0.707006,0.694268,0.707006,0.713376
Min,0.522293,0.547771,0.522293,0.515924


#### Cross-validation

Perform cross validation on this dataset with the four models from before. This is done to compare the results to the train-test split method.

In [33]:
perform_cross_validation(X_binary_norm, y_binary_norm)

Model		CV Mean		CV std
{'LR': (0.6292114695340503, 0.07786838833775525), 'KNN': (0.6182795698924731, 0.10282530870140304), 'SVM': (0.5877368151561699, 0.055485555272934674), 'LDA': (0.6291602662570405, 0.060572551530569514)}


### Method Conclusion

LDA is the most accurate model for both train/test split and cross-validation with all models very close.

None of these models are higher than the SVM model on the normalized dataframe from earlier in the notebook or the LDA from the baseline binary models.

Logistic regression, SVM and LDA performed better than the binary classification from the average from using all features.

## Classification Conclusion

The binary classification yielded more accurate predictions on average. 
However, the most accurate model from this notebook was SVM on the normalized features.

Using SVM on a normalized multi-class dataframe resulted in the most accurate model. 
Using LDA on a unchanged dataframe resulted in the most accurate binary model. Both of these outpreformed their respective classification method on the original dataframe.

--------------------------------------------------------------------------------------------------------------------------------

# Complete Results

Combine the accuracy from the multiclass and binary tests. 

Concatenate the two results to a single dataframe to export.

In [43]:
results_complete = pd.concat([results, results_norm, results_binary, results_binary_norm], axis=1)
results_complete

Unnamed: 0,LR_multiclass,KNN_multiclass,SVM_multiclass,LDA_multiclass,LR_multiclass_norm,KNN_multiclass_norm,SVM_multiclass_norm,LDA_multiclass_norm,LR_binary,KNN_binary,SVM_binary,LDA_binary,LR_binary_norm,KNN_binary_norm,SVM_binary_norm,LDA_binary_norm
Mean,0.589363,0.603694,0.629809,0.497325,0.537197,0.594459,0.622229,0.492293,0.680318,0.645159,0.714204,0.620573,0.630127,0.642293,0.70707,0.615669
STD,0.030273,0.031714,0.032155,0.036173,0.040366,0.033122,0.033844,0.038442,0.031446,0.035899,0.032561,0.037824,0.035592,0.038929,0.031015,0.038091
Max,0.66242,0.681529,0.707006,0.592357,0.630573,0.66879,0.694268,0.579618,0.77707,0.745223,0.783439,0.726115,0.745223,0.757962,0.783439,0.687898
Min,0.509554,0.522293,0.56051,0.401274,0.420382,0.522293,0.509554,0.407643,0.598726,0.535032,0.636943,0.503185,0.541401,0.547771,0.643312,0.535032


In [42]:
results_strict_complete = pd.concat([results_strict, results_strict_norm, results_strict_binary, results_strict_binary_norm], axis=1)
results_strict_complete

Unnamed: 0,LR_multiclass_strict,KNN_multiclass_strict,SVM_multiclass_strict,LDA_multiclass_strict,LR_multiclass_strict_norm,KNN_multiclass_strict_norm,SVM_multiclass_strict_norm,LDA_multiclass_strict_norm,LR_binary_strict,KNN_binary_strict,SVM_binary_strict,LDA_binary_strict,LR_binary_strict_norm,KNN_binary_strict_norm,SVM_binary_strict_norm,LDA_binary_strict_norm
Mean,0.622293,0.589236,0.633758,0.606561,0.602166,0.603057,0.637006,0.607134,0.696752,0.630637,0.706752,0.675796,0.68242,0.639172,0.69758,0.677834
STD,0.028364,0.031802,0.030233,0.031666,0.039082,0.035645,0.036384,0.040733,0.034781,0.030058,0.032502,0.038147,0.03374,0.035123,0.032044,0.034196
Max,0.707006,0.649682,0.713376,0.694268,0.700637,0.687898,0.713376,0.700637,0.796178,0.719745,0.789809,0.77707,0.751592,0.751592,0.783439,0.745223
Min,0.547771,0.515924,0.55414,0.515924,0.522293,0.528662,0.541401,0.515924,0.624204,0.547771,0.636943,0.598726,0.605096,0.55414,0.617834,0.605096


In [44]:
results_complete.to_csv(base_folder_filepath + '\\Results\\2023.7.17-Region_Correlation_Condensed-Results.csv')
results_strict_complete.to_csv(base_folder_filepath + '\\Results\\2023.7.17-Region_Correlation_Strict_Condensed-Results.csv')