In [1]:
#Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
import random

In [2]:
#Read file (audio features joined with demographics)
result_mean = pd.read_csv('PD_audio_features_mean_with_demographics.csv')
print(result_mean.shape)

result_var = pd.read_csv('PD_audio_features_var_with_demographics.csv')
print(result_var.shape)

result_std = pd.read_csv('PD_audio_features_std_with_demographics.csv')
print(result_std.shape)

(56444, 73)
(56444, 73)
(56444, 73)


In [3]:
#Understanding proportions of individuals in each category
print(result_mean.groupby(['medtimepoint']).size())
temp = result_mean[['medtimepoint']]
t1 = temp.groupby(['medtimepoint']).agg({'medtimepoint': 'count'})
print(t1.apply(lambda x:100 * x / float(x.sum())))

medtimepoint
Another time                                      15900
I don't take Parkinson medications                23420
Immediately before Parkinson medication            8152
Just after Parkinson medication (at your best)     8762
dtype: int64
                                                medtimepoint
medtimepoint                                                
Another time                                       28.274709
I don't take Parkinson medications                 41.647402
Immediately before Parkinson medication            14.496568
Just after Parkinson medication (at your best)     15.581321


In [4]:
#Read case control files
result_mean = pd.read_csv('PD_mean_case_control.csv')
print(result_mean.shape)

result_var = pd.read_csv('PD_var_case_control.csv')
print(result_var.shape)

result_std = pd.read_csv('PD_std_case_control.csv')
print(result_std.shape)

(56233, 75)
(56233, 75)
(56233, 75)


In [5]:
case_control_data = result_mean[result_mean['match_pair'] != 0]
case_control_data.drop(case_control_data.columns[0], axis=1, inplace = True)
case_control_data.shape
#case_control_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


(11289, 74)

In [6]:
X = case_control_data.iloc[:,9:43]
Y = case_control_data.iloc[:,4]

#Create train and test
X_train, X_test, y_train, y_test = train_test_split(
     X, Y, test_size=0.2, random_state=0)

In [7]:
# Create SVM classification object 
model = svm.SVC(C=1, gamma=1)
model.fit(X_train, y_train)
print(model.score(X_test,y_test))

scores = cross_val_score(model, X, Y, cv=10)
print(scores)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#Predict Output
predicted= model.predict(X_test)

0.8007085916740478
[0.59911504 0.64513274 0.61504425 0.63596103 0.62444641 0.60230292
 0.58865248 0.65602837 0.60283688 0.65248227]
Accuracy: 0.62 (+/- 0.05)


In [9]:
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, model.predict(X_test)
print(classification_report(y_true, y_pred))
print()

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

                                         precision    recall  f1-score   support

     I don't take Parkinson medications       0.80      0.95      0.86      1518
Immediately before Parkinson medication       0.82      0.50      0.62       740

                            avg / total       0.80      0.80      0.79      2258




In [10]:
case_control_data = result_var[result_var['match_pair'] != 0]
case_control_data.drop(case_control_data.columns[0], axis=1,inplace = True)
case_control_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


(11289, 74)

In [11]:
X = case_control_data.iloc[:,9:43]
Y = case_control_data.iloc[:,4]

#Create train and test
X_train, X_test, y_train, y_test = train_test_split(
     X, Y, test_size=0.2, random_state=0)

In [12]:
# Create SVM classification object 
model = svm.SVC(C=1, gamma=1)
model.fit(X_train, y_train)
print(model.score(X_test,y_test))

scores = cross_val_score(model, X, Y, cv=10)
print(scores)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#Predict Output
predicted= model.predict(X_test)

0.6842338352524358
[0.66725664 0.66548673 0.66725664 0.66784765 0.66696191 0.66696191
 0.65957447 0.66489362 0.66400709 0.66755319]
Accuracy: 0.67 (+/- 0.00)


In [13]:
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, model.predict(X_test)
print(classification_report(y_true, y_pred))
print()

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

                                         precision    recall  f1-score   support

     I don't take Parkinson medications       0.68      1.00      0.81      1528
Immediately before Parkinson medication       0.95      0.02      0.05       730

                            avg / total       0.77      0.68      0.56      2258




In [14]:
case_control_data = result_std[result_std['match_pair'] != 0]
case_control_data.drop(case_control_data.columns[0], axis=1, inplace = True)
case_control_data.shape
#case_control_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


(984, 74)

In [15]:
X = case_control_data.iloc[:,9:43]
Y = case_control_data.iloc[:,4]

#Create train and test
X_train, X_test, y_train, y_test = train_test_split(
     X, Y, test_size=0.2, random_state=0)

In [16]:
# Create SVM classification object 
model = svm.SVC(C=1, gamma=1)
model.fit(X_train, y_train)
print(model.score(X_test,y_test))

scores = cross_val_score(model, X, Y, cv=10)
print(scores)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#Predict Output
predicted= model.predict(X_test)

0.7055837563451777
[0.57575758 0.6969697  0.67676768 0.66666667 0.67676768 0.61616162
 0.64285714 0.66326531 0.65979381 0.69072165]
Accuracy: 0.66 (+/- 0.07)


In [17]:
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, model.predict(X_test)
print(classification_report(y_true, y_pred))
print()

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

                                         precision    recall  f1-score   support

     I don't take Parkinson medications       0.70      0.99      0.82       133
Immediately before Parkinson medication       0.88      0.11      0.19        64

                            avg / total       0.76      0.71      0.62       197




In [19]:
parameters = [{'kernel': ['rbf'],
               'gamma': [1e-4, 1e-3],
                'C': [1, 10]},
              {'kernel': ['linear'], 'C': [1, 10]}]

print("# Tuning hyper-parameters")
print()

clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'), parameters, cv=5)
clf.fit(X, Y)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on training set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

# Tuning hyper-parameters

Best parameters set found on development set:

{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}

Grid scores on training set:

0.667 (+/-0.003) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.667 (+/-0.003) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.667 (+/-0.003) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.667 (+/-0.003) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.666 (+/-0.006) for {'C': 1, 'kernel': 'linear'}
0.653 (+/-0.077) for {'C': 10, 'kernel': 'linear'}

