In [1]:
# Required packages if not installed then uncomment the below line to install them

# ! pip install numpy sklearn pandas xgboost

In [2]:
import numpy as np
import pandas as pd
import os, sys
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#  Importing different classification models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [3]:
# Reading the Parkinson Dataset
# The data can be downloaded from the location "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/"
df = pd.read_csv(r'.\Data\parkinsons.data')

# Mostly all data is numerical thus we need not to use any transformer for transformation. Only scaling should be sufficient.
df.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [4]:
# Getting the features columns out of the dataset which excludes the status column as well as name column
# name column is'nt much of useful feature thus neglecting the same
features = df.loc[:,df.columns!='status'].values[:,1:]

# Getting labels or output column
labels = df.loc[:,'status'].values

# Count Records of non-parkinson people
print(labels[labels==0].shape)

# Count Records of parkinson people
print(labels[labels==1].shape)

(48,)
(147,)


In [5]:
scaler = MinMaxScaler((-1,1))

# Scaling the features between -1 to 1
x = scaler.fit_transform(features)

# Labels are either 0 or 1 already thus no need for making any changes to them.
y = labels



In [6]:
# Spliting the dataset into training and test dataset. Keeping test data to be 20% of the total dataset.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

In [7]:
# Creating the Logistic model
logistic_model = LogisticRegression()

# Training or fitting the model using the training set
logistic_model.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [8]:
#  Using model to predict the status of the test feature dataset
y_pred = logistic_model.predict(x_test)

# Getting the accuracy measure of our result
percent_accuracy = accuracy_score(y_test, y_pred) * 100

print('The percentage accuracy of the logistic regression model for the test set is', percent_accuracy)

The percentage accuracy of the logistic regression model for the test set is 87.17948717948718


In [9]:
# Creating the Support Vector Machine/Classification model
svm_model = SVC()

# Training or fitting the model using the training set
svm_model.fit(x_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [10]:
#  Using model to predict the status of the test feature dataset
y_pred = svm_model.predict(x_test)

# Getting the accuracy measure of our result
percent_accuracy = accuracy_score(y_test, y_pred) * 100

print('The percentage accuracy of the SVM model for the test set is', percent_accuracy)

The percentage accuracy of the SVM model for the test set is 87.17948717948718


In [11]:
# Creating the XGBoost model
decision_tree_model = DecisionTreeClassifier()

# Training or fitting the model using the training set
decision_tree_model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
#  Using model to predict the status of the test feature dataset
y_pred = decision_tree_model.predict(x_test)
# Getting the accuracy measure of our result
percent_accuracy = accuracy_score(y_test, y_pred) * 100

print('The percentage accuracy of the Decision Tree model for the test set is', percent_accuracy)

The percentage accuracy of the Decision Tree model for the test set is 84.61538461538461


In [13]:
# Creating the Random Forest Classifier model
random_forest_model = RandomForestClassifier()

# Training or fitting the model using the training set
random_forest_model.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
#  Using model to predict the status of the test feature dataset
y_pred = random_forest_model.predict(x_test)
# Getting the accuracy measure of our result
percent_accuracy = accuracy_score(y_test, y_pred) * 100

print('The percentage accuracy of the Random Forest model for the test set is', percent_accuracy)

The percentage accuracy of the XGBoost model for the test set is 89.74358974358975


In [15]:
# Creating the XGBoost model
xgb_model = XGBClassifier()

# Training or fitting the model using the training set
xgb_model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [16]:
#  Using model to predict the status of the test feature dataset
y_pred = xgb_model.predict(x_test)
# Getting the accuracy measure of our result
percent_accuracy = accuracy_score(y_test, y_pred) * 100

print('The percentage accuracy of the XGBoost model for the test set is', percent_accuracy)

The percentage accuracy of the XGBoost model for the test set is 94.87179487179486
