<a href="https://colab.research.google.com/github/yossymaynaldi/PredictingParkinsonsDiseaseusingMachineLearning/blob/main/Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Parkinson's Disease using Machine Learning

Data Source : https://www.kaggle.com/datasets/jainaru/parkinson-disease-detection

## Environment Set Up

In [1]:
import pandas as pd



import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve, auc
)

from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_validate,
    StratifiedShuffleSplit,
    StratifiedKFold,
    GridSearchCV,
    cross_val_score,
)


In [2]:
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.width',None)

In [4]:
def check(df):
  summary=[]
  for col in df.columns:
    instances = df[col].count()
    dtypes = df[col].dtype
    unique = df[col].nunique()
    sum_null = df[col].isnull().sum()
    duplicates = df[col].duplicated().sum()
    summary.append([col,
                    dtypes,
                    instances,
                    unique,
                    sum_null,
                    duplicates])
  df_check = pd.DataFrame(summary,
                            columns=['column',
                                     'dtype',
                                     'isntances',
                                     'unique',
                                     'sum_null',
                                     'duplicates'])
  return df_check

## Data Loading

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Dataset Kaggle/parkinsons data.csv')

print(f'The dataset contain {df.shape[0]} rows and {df.shape[1]} columns.')

The dataset contain 195 rows and 24 columns.


In [6]:
check(df).sort_values(by='sum_null', ascending=False)

Unnamed: 0,column,dtype,isntances,unique,sum_null,duplicates
0,name,object,195,195,0,0
1,MDVP:Fo(Hz),float64,195,195,0,0
22,D2,float64,195,195,0,0
21,spread2,float64,195,194,0,1
20,spread1,float64,195,195,0,0
19,DFA,float64,195,195,0,0
18,RPDE,float64,195,195,0,0
17,status,int64,195,2,0,193
16,HNR,float64,195,195,0,0
15,NHR,float64,195,185,0,10


## Data Preparation

In [7]:
df=df.drop(columns=['name'])

X=df.drop(columns=['status']).values
y=df['status'].values

X_train, X_test,y_train,y_test=train_test_split(X, y, test_size = 0.3, random_state=42)

print("Shape of independent variable: ", X_train.shape, X_test.shape)
print("Shape of  dependent variable: ", y_train.shape, y_test.shape)

Shape of independent variable:  (136, 22) (59, 22)
Shape of  dependent variable:  (136,) (59,)


## Machine Learning

In [8]:
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBoost Classifier": XGBClassifier(),
    "Extra Trees Classifier": ExtraTreesClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Bagging Classifier": BaggingClassifier(),
    "Histogram-based Gradient Boosting Classifier": HistGradientBoostingClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "Multi-layer Perceptron": MLPClassifier(),
}

In [9]:
### Cross Validation ###

# Initialize lists to store metrics
classifier_names = []
accuracies = []
precisions = []
recalls = []
f1_scores_list = []

# Iterate over classifiers
for classifier_name, classifier_instance in classifiers.items():
    clf = classifier_instance

    # Perform cross-validation
    cv_scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')

    # Additional evaluation metrics
    precision_scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='precision')
    recall_scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='recall')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')

    print(f"{classifier_name}:")
    print(f"  Accuracy: {cv_scores.mean():.2f} with a standard deviation of {cv_scores.std():.2f}")
    print(f"  Precision: {precision_scores.mean():.2f} with a standard deviation of {precision_scores.std():.2f}")
    print(f"  Recall: {recall_scores.mean():.2f} with a standard deviation of {recall_scores.std():.2f}")
    print(f"  F1 Score: {f1_scores.mean():.2f} with a standard deviation of {f1_scores.std():.2f}")
    print('-'*50)

    # Append metrics to lists
    classifier_names.append(classifier_name)
    accuracies.append(cv_scores.mean())
    precisions.append(precision_scores.mean())
    recalls.append(recall_scores.mean())
    f1_scores_list.append(f1_scores.mean())

# Create DataFrame
metrics_df = pd.DataFrame({
    'Classifier': classifier_names,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1 Score': f1_scores_list
})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression:
  Accuracy: 0.86 with a standard deviation of 0.05
  Precision: 0.87 with a standard deviation of 0.04
  Recall: 0.96 with a standard deviation of 0.01
  F1 Score: 0.91 with a standard deviation of 0.03
--------------------------------------------------
Support Vector Classifier:
  Accuracy: 0.79 with a standard deviation of 0.03
  Precision: 0.80 with a standard deviation of 0.02
  Recall: 0.97 with a standard deviation of 0.04
  F1 Score: 0.87 with a standard deviation of 0.02
--------------------------------------------------
K-Nearest Neighbors:
  Accuracy: 0.77 with a standard deviation of 0.04
  Precision: 0.81 with a standard deviation of 0.00
  Recall: 0.90 with a standard deviation of 0.07
  F1 Score: 0.85 with a standard deviation of 0.03
--------------------------------------------------
Decision Tree:
  Accuracy: 0.82 with a standard deviation of 0.01
  Precision: 0.87 with a standard deviation of 0.02
  Recall: 0.90 with a standard deviation of 0.03
  



Multi-layer Perceptron:
  Accuracy: 0.76 with a standard deviation of 0.01
  Precision: 0.82 with a standard deviation of 0.05
  Recall: 0.97 with a standard deviation of 0.02
  F1 Score: 0.87 with a standard deviation of 0.05
--------------------------------------------------




## Best Model

In [11]:
metrics_df.round(4).sort_values(by='Accuracy',ascending=False).head(1).reset_index(drop=True)

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score
0,Extra Trees Classifier,0.9262,0.9284,0.9711,0.9381
