In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mne
import os

In [1]:
def get_subject_folders(processed_path):
    """Return the paths of the immediate sub-directories of ``processed_path``.

    Only the top level is inspected; nested directories are not descended into.
    Each returned entry is ``processed_path``'s walk root joined with the
    sub-directory name. An empty list is returned when the walk yields nothing
    (for example when the directory does not exist).
    """
    # The first item produced by os.walk describes the top-level directory itself.
    top_level = next(os.walk(processed_path), None)
    if top_level is None:
        return []
    parent, subdir_names, _files = top_level
    return [os.path.join(parent, name) for name in subdir_names]

# One folder per subject is expected directly under the processed-data directory.
processed_dir = r'C:\Users\Ashwin\Desktop\processed'
folders = get_subject_folders(processed_dir)

In [4]:
# Empty per-subject feature table; its columns are filled in by the cells below.
template = {column: [] for column in ('mean_amp', 'mean_power', 'subject', 'is_PD')}
data = pd.DataFrame(data=template)

In [5]:
# Load the participant table. Columns are split on any run of whitespace (tabs
# included), which requires the Python parsing engine because the separator is a
# regular expression. The pattern is a raw string so that "\s" reaches the regex
# engine verbatim instead of being treated as an (invalid) string escape.
# NOTE(review): splitting on every whitespace run would also break a field that
# contains spaces - confirm the file has none, or switch to sep='\t'.
participants = pd.read_csv(r'C:\Users\Ashwin\Desktop\eeg\participants.tsv', sep=r'\s+|\t+', engine='python')

In [6]:

# Key the feature table by participant id so later cells can address rows by subject.
subject_ids = participants['participant_id'].copy()
data['subject'] = subject_ids
data.set_index('subject', drop=True, inplace=True)

In [None]:
def categorize_label(row):
    """Map one row to a diagnostic label.

    Rows whose ``'is_PD'`` value is not equal to 1 are labelled ``'Control'``
    (their ``'MOCA'`` value is never read). For the remaining rows the label
    depends on ``row['MOCA']``:

    * below 22            -> ``'PDD'``
    * 22 to 26 inclusive  -> ``'PD-MCI'``
    * anything else       -> ``'PD'`` (this includes values above 26 and NaN)
    """
    if row['is_PD'] != 1:
        return 'Control'

    moca = row['MOCA']
    if moca < 22:
        return 'PDD'
    if 22 <= moca <= 26:
        return 'PD-MCI'
    return 'PD'

# Fill the clinical columns first: categorize_label reads 'is_PD' and 'MOCA', so
# the label may only be derived once those columns hold the participants' values.
# (Computing the label beforehand labels every row 'Control', because 'is_PD' is
# still empty at that point.)
# The .values assignments are positional; they rely on `data` having one row per
# participant in the same order as `participants`.
data['MOCA'] = participants['MOCA'].values
data['Age'] = participants['AGE'].values
data['is_PD'] = participants['TYPE'].values
data['label'] = data.apply(categorize_label, axis=1)

In [8]:
def get_avg_power(subject_path):
    """Return the mean multitaper power at channel 'Cz' for one subject.

    Reads ``<subject_path>/<folder name>-epo.fif``, keeps the epochs selected by
    ``'S  2'`` and the channel ``'Cz'``, estimates the power spectral density
    with ``fmin=1`` and ``fmax=10``, and averages it over epochs and frequency
    bins.

    Parameters
    ----------
    subject_path : str
        Path to the subject's folder. The folder name is used as the file
        prefix, so a trailing separator or a name of any length is accepted.

    Returns
    -------
    The averaged power for the single picked channel.
    """
    # Derive the file prefix from the folder name rather than a fixed-width slice.
    subject_id = os.path.basename(os.path.normpath(subject_path))
    epochs = mne.read_epochs(os.path.join(subject_path, subject_id + '-epo.fif'))
    cz_data = epochs['S  2'].pick('Cz').get_data()
    # Take the sampling rate from the recording itself: a hard-coded value would
    # silently mis-scale the frequency axis if the epochs were sampled differently.
    psd, _freqs = mne.time_frequency.psd_array_multitaper(
        cz_data, sfreq=epochs.info['sfreq'], fmin=1, fmax=10, n_jobs=1, verbose=None
    )
    # Average over the first and last axes (epochs and frequency bins, per the
    # MNE output layout); index 0 is the only remaining channel.
    return psd.mean(axis=(0, 2))[0]

def get_avg_amplitude(subject_path):
    """Return the mean signal amplitude at channel 'Cz' for one subject.

    Reads ``<subject_path>/<folder name>-epo.fif``, keeps the epochs selected by
    ``'S  2'`` and the channel ``'Cz'``, and averages the data over the first
    and last axes (epochs and time samples, per the MNE data layout).

    Parameters
    ----------
    subject_path : str
        Path to the subject's folder. The folder name is used as the file
        prefix, so a trailing separator or a name of any length is accepted.

    Returns
    -------
    The averaged amplitude for the single picked channel.
    """
    # Derive the file prefix from the folder name rather than a fixed-width slice.
    subject_id = os.path.basename(os.path.normpath(subject_path))
    epochs = mne.read_epochs(os.path.join(subject_path, subject_id + '-epo.fif'))
    cz_data = epochs['S  2'].pick('Cz').get_data()
    # Index 0 is the only channel left after picking 'Cz'.
    return cz_data.mean(axis=(0, 2))[0]

    

In [10]:
# Compute both features for every subject folder and store them in the row whose
# index equals the folder name (e.g. 'sub-001').
for subject in folders:
    # Use the folder name itself as the row key instead of a fixed-width slice,
    # so ids of any length and paths with a trailing separator still match.
    subject_id = os.path.basename(os.path.normpath(subject))
    # NOTE(review): .loc creates a new row when subject_id is not already in the
    # index - confirm every folder has an entry in the participants table.
    data.loc[subject_id, 'mean_amp'] = get_avg_amplitude(subject)
    data.loc[subject_id, 'mean_power'] = get_avg_power(subject)

Reading C:\Users\Ashwin\Desktop\processed\sub-001\sub-001-epo.fif ...
    Found the data of interest:
        t =   -1000.00 ...    2500.00 ms
        0 CTF compensation matrices available
Not setting metadata
323 matching events found
No baseline correction applied
0 projection items activated
Reading C:\Users\Ashwin\Desktop\processed\sub-001\sub-001-epo.fif ...
    Found the data of interest:
        t =   -1000.00 ...    2500.00 ms
        0 CTF compensation matrices available
Not setting metadata
323 matching events found
No baseline correction applied
0 projection items activated
    Using multitaper spectrum estimation with 7 DPSS windows
Reading C:\Users\Ashwin\Desktop\processed\sub-002\sub-002-epo.fif ...
    Found the data of interest:
        t =   -1000.00 ...    2500.00 ms
        0 CTF compensation matrices available
Not setting metadata
419 matching events found
No baseline correction applied
0 projection items activated
Reading C:\Users\Ashwin\Desktop\processed\sub-002\s

In [43]:
from sklearn.preprocessing import StandardScaler

# Feature standardiser; it is fitted and applied to the selected features in a later cell.
scaler = StandardScaler()

In [47]:
# Edit the label list / column list below to choose other classes or features.
keep_rows = data['label'].isin(['PD', 'PDD'])
X = data.loc[keep_rows, ['mean_amp', 'mean_power', 'Age']]
y = data.loc[keep_rows, 'label']

In [48]:
# Standardise the selected features.
# NOTE(review): the scaler is fitted on every selected row, i.e. before any
# train/test split, so X_tr carries statistics from rows later used for testing.
X_tr = scaler.fit(X).transform(X)

In [21]:
from sklearn.model_selection import train_test_split

# Split the unscaled features first, then fit the scaler on the training rows
# only. Fitting it on all rows before splitting would leak the test rows' mean
# and variance into training. The same random_state and sample count give the
# same row partition as splitting the pre-scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Rebind `scaler` so the object kept around matches what the models are trained on.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # first cell that calls it, so import it here
from sklearn.svm import SVC

# Logistic Regression: fit on the training split, report accuracy on the held-out split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("Accuracy score of logistic regression: ", accuracy_score(y_test, y_pred))

Accuracy score of logistic regression:  0.7142857142857143


In [91]:
from sklearn.metrics import accuracy_score
# Support vector machine with a linear kernel, scored on the held-out split.
svc = SVC(kernel='linear').fit(X_train, y_train)
y_pred = svc.predict(X_test)
print("Accuracy score of SVM: ", accuracy_score(y_test, y_pred))



Accuracy score of SVM:  0.6530612244897959


In [29]:
#random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 200 trees, depth capped at 30, fixed seed so repeated runs give the same forest.
rf = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=0).fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6530612244897959


In [101]:
import pickle 
# Write each fitted classifier to '<name>.pkl' in the current working directory.
models = {'logistic_regression': logreg, 'svc': svc, 'random_forest': rf}
for model, estimator in models.items():
    with open(model + '.pkl', 'wb') as f:
        pickle.dump(estimator, f)
        print(f'{model} model succesfully saved')

logistic_regression model succesfully saved
svc model succesfully saved
random_forest model succesfully saved


In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# K-fold cross-validation of logistic regression on the training split.
# X_train / y_train are the ones produced by the train/test split cell above;
# they are reused rather than re-split so this cell cannot overwrite them.

# Number of folds (K)
k = 2
kf = KFold(n_splits=k)

# Accuracy obtained on the validation part of each fold
scores = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    # kf.split yields integer positions, while y_train is a Series indexed by
    # subject id, so the rows must be selected positionally with .iloc.
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # A fresh estimator per fold, under its own name so the already-trained
    # `logreg` model is not replaced. Swap in SVC / RandomForestClassifier here
    # to cross-validate a different classifier.
    fold_model = LogisticRegression()
    fold_model.fit(X_train_fold, y_train_fold)
    y_pred = fold_model.predict(X_val_fold)

    # Accuracy for this fold
    accuracy = accuracy_score(y_val_fold, y_pred)
    scores.append(accuracy)

# Mean and standard deviation of the per-fold accuracies
mean_accuracy = np.mean(scores)
std_deviation = np.std(scores)

print(f"Mean Accuracy: {mean_accuracy}")
print(f"Standard Deviation: {std_deviation}")



Mean Accuracy: 0.6764705882352942
Standard Deviation: 0.02941176470588236
