In [260]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [261]:
# loading the data from csv file to a Pandas DataFrame
parkinsons_data = pd.read_csv('PD.csv')

In [262]:
# printing the first 5 rows of the dataframe
parkinsons_data.head()

Unnamed: 0,subject_id,Jitter(local),Jitter (local_absolute),Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),Shimmer (local_dB),Shimmer (apq3),Shimmer (apq5),...,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,Fraction of locally unvoiced frames,Number of voice breaks,Degree of voice breaks\tstatus,UPDRS,status
0,1,1.488,9e-05,0.9,0.794,2.699,8.334,0.779,4.517,4.609,...,187.576,160,159,0.006065,0.000416,0.0,0,0.0,23,1
1,1,0.728,3.8e-05,0.353,0.376,1.059,5.864,0.642,2.058,3.18,...,234.505,170,169,0.005181,0.000403,2.247,0,0.0,23,1
2,1,1.22,7.4e-05,0.732,0.67,2.196,8.719,0.875,4.347,5.166,...,211.442,1431,1427,0.006071,0.000474,10.656,1,0.178,23,1
3,1,2.502,0.000123,1.156,1.634,3.469,13.513,1.273,5.263,8.771,...,220.23,94,92,0.00491,0.00032,0.0,0,0.0,23,1
4,1,3.509,0.000167,1.715,1.539,5.145,9.112,1.04,3.102,4.927,...,225.162,117,114,0.004757,0.00038,18.182,1,13.318,23,1


In [263]:
# number of rows and columns in the dataframe
parkinsons_data.shape

(1040, 29)

In [264]:
parkinsons_data.columns

Index(['subject_id', 'Jitter(local)', 'Jitter (local_absolute)',
       'Jitter (rap)', 'Jitter (ppq5)', 'Jitter (ddp)', 'Shimmer (local)',
       'Shimmer (local_dB)', 'Shimmer (apq3)', 'Shimmer (apq5)',
       'Shimmer (apq11)', 'Shimmer (dda)', 'AC', 'NTH', 'HTN', 'Median pitch',
       'Mean pitch', 'Standard deviation', 'Minimum pitch', 'Maximum pitch',
       'Number of pulses', 'Number of periods', 'Mean period',
       'Standard deviation of period', 'Fraction of locally unvoiced frames',
       'Number of voice breaks', 'Degree of voice breaks\tstatus', 'UPDRS',
       'status'],
      dtype='object')

In [265]:
# getting more information about the dataset
parkinsons_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 29 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   subject_id                           1040 non-null   int64  
 1   Jitter(local)                        1040 non-null   float64
 2   Jitter (local_absolute)              1040 non-null   float64
 3   Jitter (rap)                         1040 non-null   float64
 4   Jitter (ppq5)                        1040 non-null   float64
 5   Jitter (ddp)                         1040 non-null   float64
 6   Shimmer (local)                      1040 non-null   float64
 7   Shimmer (local_dB)                   1040 non-null   float64
 8   Shimmer (apq3)                       1040 non-null   float64
 9   Shimmer (apq5)                       1040 non-null   float64
 10  Shimmer (apq11)                      1040 non-null   float64
 11  Shimmer (dda)                 

In [266]:
# checking for missing values in each column
parkinsons_data.isnull().sum()

subject_id                             0
Jitter(local)                          0
Jitter (local_absolute)                0
Jitter (rap)                           0
Jitter (ppq5)                          0
Jitter (ddp)                           0
Shimmer (local)                        0
Shimmer (local_dB)                     0
Shimmer (apq3)                         0
Shimmer (apq5)                         0
Shimmer (apq11)                        0
Shimmer (dda)                          0
AC                                     0
NTH                                    0
HTN                                    0
Median pitch                           0
Mean pitch                             0
Standard deviation                     0
Minimum pitch                          0
Maximum pitch                          0
Number of pulses                       0
Number of periods                      0
Mean period                            0
Standard deviation of period           0
Fraction of loca

In [267]:
# getting some statistical measures about the data
parkinsons_data.describe()

Unnamed: 0,subject_id,Jitter(local),Jitter (local_absolute),Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),Shimmer (local_dB),Shimmer (apq3),Shimmer (apq5),...,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,Fraction of locally unvoiced frames,Number of voice breaks,Degree of voice breaks\tstatus,UPDRS,status
count,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,...,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0,1040.0
mean,20.5,2.679523,0.00017,1.247053,1.348327,3.741161,12.918391,1.194895,5.6996,7.983552,...,234.87599,109.744231,105.969231,0.006547,0.000843,27.682856,1.134615,12.370042,13.0,0.5
std,11.54895,1.765053,0.000106,0.979462,1.138742,2.938443,5.452204,0.420071,3.015183,4.840892,...,121.541243,150.027703,149.417074,0.001875,0.000723,20.975294,1.614764,15.161916,15.894745,0.500241
min,1.0,0.19,6e-06,0.062,0.081,0.185,1.185,0.103,0.496,0.708,...,85.541,0.0,0.0,0.002039,5.5e-05,0.0,0.0,0.0,1.0,0.0
25%,10.75,1.5075,9.5e-05,0.617,0.66575,1.85175,9.3535,0.941,3.703,5.16025,...,143.65075,42.75,40.75,0.005039,0.000404,8.14925,0.0,0.0,1.0,0.0
50%,20.5,2.396,0.000151,1.0355,1.1265,3.107,12.3485,1.1815,5.1345,7.0505,...,195.971,65.0,62.0,0.006484,0.000644,26.501,1.0,5.826,3.0,0.5
75%,30.25,3.4115,0.000229,1.6025,1.69475,4.8085,15.49325,1.411,6.942,9.55893,...,263.79825,113.0,109.0,0.007923,0.00098,43.06425,1.0,22.2555,23.25,1.0
max,40.0,14.376,0.000777,8.016,13.542,24.048,41.137,2.721,25.82,72.86,...,597.974,1490.0,1489.0,0.01207,0.006371,88.158,12.0,69.117,55.0,1.0


In [268]:
# distribution of target Variable
parkinsons_data['status'].value_counts()

1    520
0    520
Name: status, dtype: int64

In [269]:
# grouping the data bas3ed on the target variable
parkinsons_data.groupby('status').mean()

Unnamed: 0_level_0,subject_id,Jitter(local),Jitter (local_absolute),Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),Shimmer (local_dB),Shimmer (apq3),Shimmer (apq5),...,Minimum pitch,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,Fraction of locally unvoiced frames,Number of voice breaks,Degree of voice breaks\tstatus,UPDRS
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,30.5,2.507421,0.000153,1.138244,1.222062,3.414746,12.869006,1.183479,5.754319,8.092812,...,137.318069,251.810481,103.590385,98.934615,0.006393,0.000885,30.241529,1.263462,14.281812,1.0
1,10.5,2.851624,0.000188,1.355861,1.474592,4.067577,12.967775,1.206312,5.644882,7.874292,...,131.758132,217.941499,115.898077,113.003846,0.0067,0.0008,25.124183,1.005769,10.458273,25.0


In [270]:
X = parkinsons_data.drop(columns=['subject_id','status'], axis=1)
Y = parkinsons_data['status']

In [271]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
1035    0
1036    0
1037    0
1038    0
1039    0
Name: status, Length: 1040, dtype: int64


In [272]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [273]:
print(X.shape, X_train.shape, X_test.shape)

(1040, 27) (832, 27) (208, 27)


In [274]:
# Support vector classifier
svc_classifier = SVC()
svc_classifier.fit(X_train, Y_train)
X_train_prediction=svc_classifier.predict(X_train)
y_pred_scv = svc_classifier.predict(X_test)
print(accuracy_score(Y_test, y_pred_scv))
print(accuracy_score(Y_train,X_train_prediction))

0.8221153846153846
0.8064903846153846


In [275]:
# print(X_train)

In [276]:
# scores=cross_val_score(model,X_train,Y_train,cv=10)
# print(scores)

In [277]:
example_measures = [[1.809,0.00014851,0.68,0.843,2.04,7.881,0.782,2.69,4.543,11.073,8.069,0.925554,0.097481,13.472,119.26,121.63,8.028,108.144,137.546,62,60,0.008211245,0.000565813,18.182,1,3.387,1]]
prediction = model.predict(example_measures)
print(prediction)

[0]


In [278]:
# print(X_train_prediction)
# print(y_pred_scv)

In [279]:
# XGBoost Classifier
from xgboost import XGBClassifier
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, Y_train)
y_pred_xgb = xgb_classifier.predict(X_test)
xgb_train_prediction=xgb_classifier.predict(X_train)
print(accuracy_score(Y_test, y_pred_xgb))
print(accuracy_score(Y_train,xgb_train_prediction))

1.0
1.0


In [280]:
print(Y_test.head(50))
print(y_pred_xgb)

631     0
679     0
364     1
319     1
487     1
723     0
483     1
954     0
251     1
270     1
75      1
671     0
678     0
320     1
367     1
294     1
856     0
689     0
506     1
530     0
1       1
27      1
531     0
31      1
513     1
546     0
962     0
851     0
380     1
312     1
331     1
943     0
18      1
918     0
809     0
865     0
518     1
1015    0
295     1
403     1
278     1
214     1
613     0
862     0
122     1
317     1
561     0
14      1
15      1
142     1
Name: status, dtype: int64
[0 0 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 1 1 0 0 0 1 1 1 0 1 0 0 0 1
 0 1 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0
 1 0 1 1 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 1 0 0 1 1 1 0 1 1 1 0 0
 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 1
 1 1 1 1 0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0]


In [281]:
example_measures2 = [[1.488,0.000090213,0.9,0.794,2.699,8.334,0.779,4.517,4.609,6.802,13.551,0.905905,0.119116,11.13,166.533,164.781,10.421,142.229,187.576,160,159,0.006064725,0.000416276,0,0,0,23]]
prediction2 = model.predict(example_measures2)
print(prediction2)

[1]


In [282]:
confusion_matrix_xgb=confusion_matrix(Y_test, y_pred_xgb)
print(confusion_matrix_xgb)


[[107   0]
 [  0 101]]


In [283]:
print(classification_report(Y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       107
           1       1.00      1.00      1.00       101

    accuracy                           1.00       208
   macro avg       1.00      1.00      1.00       208
weighted avg       1.00      1.00      1.00       208



In [284]:
## Pickle
import pickle
 
# save model
pickle.dump(xgb_classifier, open('parkinsons_disease_detector.pkl', 'wb'))
 
# load model
parkinsons_disease_detector_model = pickle.load(open('parkinsons_disease_detector.pkl', 'rb'))
 
# predict the output
Y_pred = parkinsons_disease_detector_model.predict(X_test)
 