---
title: "Comprehensive Predictive Modeling for Parkinson's Data"
author: "Your Name"
output: github_document
---

## Introduction

This document contains a comprehensive predictive modeling analysis of the Parkinson's data.


In [28]:
# Load libraries silently
suppressPackageStartupMessages({
  library(caret)
  library(ggplot2)
  library(randomForest)
  library(e1071)
  library(rmarkdown)
})


In [29]:
data <- read.csv('parkinsons_updrs.data.csv')
head(data)

Unnamed: 0_level_0,index,subject.,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter...,Jitter.Abs.,Jitter.RAP,⋯,Shimmer.dB.,Shimmer.APQ3,Shimmer.APQ5,Shimmer.APQ11,Shimmer.DDA,NHR,HNR,RPDE,DFA,PPE
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,1,72,0,5.6431,28.199,34.398,0.00662,3.38e-05,0.00401,⋯,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
2,1,1,72,0,12.666,28.447,34.894,0.003,1.68e-05,0.00132,⋯,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
3,2,1,72,0,19.681,28.695,35.389,0.00481,2.462e-05,0.00205,⋯,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
4,3,1,72,0,25.647,28.905,35.81,0.00528,2.657e-05,0.00191,⋯,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
5,4,1,72,0,33.642,29.187,36.375,0.00335,2.014e-05,0.00093,⋯,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
6,5,1,72,0,40.652,29.435,36.87,0.00353,2.29e-05,0.00119,⋯,0.214,0.01006,0.01337,0.02263,0.03019,0.009438,22.946,0.53949,0.57243,0.195


In [30]:
summary(data)

     index         subject.          age            sex        
 Min.   :   0   Min.   : 1.00   Min.   :36.0   Min.   :0.0000  
 1st Qu.:1468   1st Qu.:10.00   1st Qu.:58.0   1st Qu.:0.0000  
 Median :2937   Median :22.00   Median :65.0   Median :0.0000  
 Mean   :2937   Mean   :21.49   Mean   :64.8   Mean   :0.3178  
 3rd Qu.:4406   3rd Qu.:33.00   3rd Qu.:72.0   3rd Qu.:1.0000  
 Max.   :5874   Max.   :42.00   Max.   :85.0   Max.   :1.0000  
   test_time        motor_UPDRS      total_UPDRS      Jitter...       
 Min.   : -4.263   Min.   : 5.038   Min.   : 7.00   Min.   :0.000830  
 1st Qu.: 46.847   1st Qu.:15.000   1st Qu.:21.37   1st Qu.:0.003580  
 Median : 91.523   Median :20.871   Median :27.58   Median :0.004900  
 Mean   : 92.864   Mean   :21.296   Mean   :29.02   Mean   :0.006154  
 3rd Qu.:138.445   3rd Qu.:27.596   3rd Qu.:36.40   3rd Qu.:0.006800  
 Max.   :215.490   Max.   :39.511   Max.   :54.99   Max.   :0.099990  
  Jitter.Abs.          Jitter.RAP        Jitter.PPQ5   

In [31]:
sum(is.na(data))

In [32]:
# Check for missing values
sum(is.na(data))

# Normalize the numerical features
preProc <- preProcess(data[, -c(1, 2)], method = c("center", "scale"))
data_normalized <- predict(preProc, data[, -c(1, 2)])
data <- cbind(data[, c(1, 2)], data_normalized)

# Display the first few rows of the normalized data
head(data)


Unnamed: 0_level_0,index,subject.,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter...,Jitter.Abs.,Jitter.RAP,⋯,Shimmer.dB.,Shimmer.APQ3,Shimmer.APQ5,Shimmer.APQ11,Shimmer.DDA,NHR,HNR,RPDE,DFA,PPE
Unnamed: 0_level_1,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,1,0.815626,-0.6824508,-1.6319513,0.8491244,0.5027024,0.08289818,-0.284218,0.3274246,⋯,-0.3516122,-0.209691,-0.4233205,-0.5434195,-0.2096865,-0.29869541,-0.009203976,-1.21396224,-1.478374,-0.6506028
2,1,1,0.815626,-0.6824508,-1.5005486,0.8796314,0.5490563,-0.56074568,-0.7566586,-0.5337008,⋯,-0.5731071,-0.5451114,-0.5655438,-0.5299101,-0.5451066,-0.3519351,1.282540523,-1.05502898,-1.247774,-1.218481
3,2,1,0.815626,-0.6824508,-1.3692936,0.9101384,0.5953167,-0.23892375,-0.5393359,-0.3000125,⋯,-0.564421,-0.7415288,-0.7023662,-0.6454902,-0.7415238,-0.19935262,0.318684263,-0.78479294,-1.540008,-0.1032714
4,3,1,0.815626,-0.6824508,-1.2576661,0.935971,0.6346615,-0.15535673,-0.4851442,-0.3448294,⋯,0.0696623,-0.4605008,-0.4497248,-0.3928152,-0.4607479,-0.07174823,0.644475136,-0.53644116,-1.062024,1.2369692
5,4,1,0.815626,-0.6824508,-1.1080747,0.9706604,0.6874638,-0.49851492,-0.6638379,-0.658548,⋯,-0.5861362,-0.7830786,-0.6513579,-0.4648651,-0.7833254,-0.34334103,1.036216523,-0.68913591,-1.297843,-0.2839301
6,5,1,0.815626,-0.6824508,-0.9769133,1.0011674,0.7337243,-0.46651053,-0.5871358,-0.5753165,⋯,-0.4211008,-0.536046,-0.4065177,-0.2427112,-0.5357894,-0.37997892,0.295147154,-0.01963576,-1.139737,-0.2687386


In [33]:
# Set seed for reproducibility
set.seed(123)

# Split the data
trainIndex <- createDataPartition(data$total_UPDRS, p = 0.8, 
                                  list = FALSE, 
                                  times = 1)
dataTrain <- data[trainIndex, ]
dataTest  <- data[-trainIndex, ]


In [34]:
# Train the linear regression model
lm_model <- train(total_UPDRS ~ ., data = dataTrain, method = "lm")

# Print model summary
summary(lm_model)



Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.74892 -0.19306 -0.03899  0.11800  1.21009 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -9.018e-02  9.970e-03  -9.045  < 2e-16 ***
index          7.851e-05  9.195e-05   0.854 0.393236    
subject.      -6.493e-03  1.266e-02  -0.513 0.607919    
age            5.871e-02  4.757e-03  12.343  < 2e-16 ***
sex           -7.249e-02  5.377e-03 -13.481  < 2e-16 ***
test_time      1.701e-02  4.459e-03   3.815 0.000138 ***
motor_UPDRS    9.137e-01  5.034e-03 181.525  < 2e-16 ***
Jitter...     -2.440e-01  4.198e-02  -5.813 6.52e-09 ***
Jitter.Abs.    8.339e-02  1.314e-02   6.346 2.42e-10 ***
Jitter.RAP    -1.732e+00  5.036e+00  -0.344 0.730911    
Jitter.PPQ5    4.537e-02  2.427e-02   1.869 0.061688 .  
Jitter.DDP     1.880e+00  5.037e+00   0.373 0.709015    
Shimmer       -1.573e-01  5.663e-02  -2.777 0.005508 ** 
Shimmer.dB.    4.753e-02  3.818e-02  

In [35]:
# Train the decision tree model
dt_model <- train(total_UPDRS ~ ., data = dataTrain, method = "rpart")

# Print model summary
print(dt_model)


“There were missing values in resampled performance measures.”


CART 

4702 samples
  22 predictor

No pre-processing
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 4702, 4702, 4702, 4702, 4702, 4702, ... 
Resampling results across tuning parameters:

  cp          RMSE       Rsquared   MAE      
  0.09540417  0.4969178  0.7473948  0.3933870
  0.09864946  0.5269552  0.7152486  0.4212974
  0.63995982  0.7274553  0.6327736  0.5963647

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.09540417.
