# The Parkinsons Data Set

This notebook develops a model that predicts Parkinson's disease status from the "Oxford Parkinson's Disease Detection Dataset":
https://www.kaggle.com/nidaguler/parkinsons-data-set

In [58]:
import numpy as np
import pandas as pd
import os, sys
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

# Seed passed as random_state to the train/eval split so the split is reproducible
RANDOM_SEED = 4321

First, let's look at the data.

In [43]:
# Raise the column display limit so wide frames are shown without truncation.
pd.set_option("display.max_columns", 100)

# Build the path with os.path.join instead of a backslash literal:
#  - '\D' and '\p' are invalid escape sequences in a normal string literal
#    and trigger warnings on recent Python versions;
#  - a hard-coded backslash path only resolves on Windows.
data_path = os.path.join('.', 'Data', 'parkinsons.data')
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.0349,0.04825,0.04465,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [83]:
# Split the frame into the prediction target and the model inputs.
# 'name' is dropped together with the target, so it is not used as a feature.
labels_df = df.loc[:, 'status']
features_df = df.drop(columns=['status', 'name'])

In [84]:
# Class balance: share of samples whose status equals 1
positive_count = int((labels_df == 1).sum())
total_count = len(labels_df)
print("Status 1: ", positive_count / total_count)

Status 1:  0.7538461538461538


There is some class imbalance: about 75% of the samples have status 1.

In [85]:
# Hold out 33% of the rows for evaluation. The split is stratified on the
# labels so both parts keep the same class proportions, and seeded so it is
# reproducible.
split_parts = train_test_split(
    features_df, labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED,
)
X_train, X_eval, y_train, y_eval = split_parts


### Feature Engineering

In [86]:
# Numeric preprocessing: a one-step pipeline that applies StandardScaler.
# It is only defined here, not fitted.
preprocessing = Pipeline(steps=[('Scaler', StandardScaler())])

### Model Selection

In [87]:
# Gradient-boosted tree classifier. It is seeded with the notebook-wide
# RANDOM_SEED so that model training is tied to the same seed as the data
# split (the original left the estimator unseeded).
est_xgb = XGBClassifier(random_state=RANDOM_SEED)

# Every feature column is routed through the numeric preprocessing.
all_cols = features_df.columns

# remainder='drop' discards any column not listed in `all_cols`.
preprocessor = ColumnTransformer(
    transformers=[('Numeric_preprocessing', preprocessing, all_cols)],
    remainder='drop'
)

# Chain preprocessing and the estimator so that both are fitted on the
# training data only when the pipeline is fitted.
full_pipeline = Pipeline([
    ('Preprocessor', preprocessor),
    ("estimator", est_xgb)
])

In [88]:
# Fit the preprocessing and the classifier on the training split
full_pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('Preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('Numeric_preprocessing',
                                                  Pipeline(memory=None,
                                                           steps=[('Scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  Index(['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(...
                               interaction_constra

In [89]:
# Predict on the held-out evaluation split and report accuracy as a percentage
y_pred = full_pipeline.predict(X_eval)
eval_accuracy = accuracy_score(y_true=y_eval, y_pred=y_pred)
print(eval_accuracy * 100)

84.61538461538461
