# Decision Tree/Random Forest

In [None]:
# Import numerical and dataframe handling
import pandas as pd

# Import plotting functionality
import matplotlib.pyplot as plt

# Import scikit-learn data utilities
from sklearn.model_selection import train_test_split

# Import model scoring
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Import models
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier

# Import other
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

## Data

In [None]:
data = pd.read_csv('../data/parkinsons_updrs.data')

In [None]:
print(data.total_UPDRS.describe())
print(data.motor_UPDRS.describe())

#Going to set the following stages (total):
# I - [7, 21)
# II - [21, 27.5)
# III - [27.5, 36)
# IV - [36, 55)

count    5875.000000
mean       29.018942
std        10.700283
min         7.000000
25%        21.371000
50%        27.576000
75%        36.399000
max        54.992000
Name: total_UPDRS, dtype: float64
count    5875.000000
mean       21.296229
std         8.129282
min         5.037700
25%        15.000000
50%        20.871000
75%        27.596500
max        39.511000
Name: motor_UPDRS, dtype: float64


In [None]:
stages = []

for i in range(data.shape[0]):
    updrs = data.iloc[i,5]

    if updrs >= 7 and updrs < 21:
        stages.append(1)
    elif updrs >= 21 and updrs < 27.5:
        stages.append(2)
    elif updrs >= 27.5 and updrs < 36:
        stages.append(3)
    else:
        stages.append(4)

data['Stages'] = stages
data['Stages'].value_counts()

2    1545
4    1496
3    1450
1    1384
Name: Stages, dtype: int64

# Model

## Random Forest (Regressor)

In [None]:
# Important features
important_features1 = ['Shimmer:APQ11', 'HNR', 'RPDE', 'PPE']
important_features2 = ['HNR', 'PPE', 'Shimmer:APQ11', 'age']
target = ['motor_UPDRS', 'total_UPDRS']

# Normalize Data
# (normalizing data increased r2 from 0.09 to 0.11)
data_normalized = preprocessing.normalize(data[important_features2])

# Splitting data
x_train, x_test, y_train, y_test = train_test_split(data_normalized, data[target], test_size=0.25)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4406, 4)
(1469, 4)
(4406, 2)
(1469, 2)


In [None]:
# Finding best max_depth
max_d = 0
max_value = 0

for i in range(1,50):
    rfr = RandomForestRegressor(max_depth=i)
    rfr.fit(x_train, y_train)

    y_test_pred = rfr.predict(x_test)
    if metrics.r2_score(y_test_pred, y_test) > max_value:
        max_d = i
        max_value = metrics.r2_score(y_test_pred, y_test)

print(max_value)
print(max_d)

0
0


## Decision Tree (Classifier)

In [None]:
# Important features
important_features1 = ['Shimmer:APQ11', 'HNR', 'RPDE', 'PPE']
important_features2 = ['HNR', 'PPE', 'Shimmer:APQ11', 'age']
target = ['Stages']

# Normalize Data
# (normalizing data increased r2 from 0.09 to 0.11)
data_normalized=preprocessing.normalize(data)

# Splitting data
x_train, x_test, y_train, y_test = train_test_split(data_normalized, data[target], test_size=0.25)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4406, 23)
(1469, 23)
(4406, 1)
(1469, 1)


In [None]:
# Finding best max_depth
max_d = 0
max_value = 0

for i in range(1,200):
    dtc = DecisionTreeClassifier(max_depth=i)
    dtc.fit(x_train, y_train)

    y_test_pred = dtc.predict(x_test)
    if accuracy_score(y_test_pred, y_test) > max_value:
        max_d = i
        max_value=accuracy_score(y_test_pred, y_test)

print(max_value)
print(max_d)

# Finding best min_samples_leaf
max_value = 0
max_leaves = 0

for i in range(1,200):
    dtc = DecisionTreeClassifier(min_samples_leaf=i)
    dtc.fit(x_train, y_train)

    y_test_pred = dtc.predict(x_test)
    if accuracy_score(y_test_pred, y_test) > max_value:
        max_leaves = i
        max_value=accuracy_score(y_test_pred, y_test)

print(max_value)
print(max_leaves)

max_value = 0
max_d = 0
max_leaves = 0

# Best combo
for i in range(1,50):
    for k in range(1,50):
        dtc = DecisionTreeClassifier(max_depth=i, min_samples_leaf=k)
        dtc.fit(x_train, y_train)

        y_test_pred = dtc.predict(x_test)
        if accuracy_score(y_test_pred, y_test) > max_value:
            max_d = i
            max_leaves = k
            max_value=accuracy_score(y_test_pred, y_test)

print(max_value)
print(max_d, max_leaves)

0.9870660313138189
193
0.9863852961198094
1
0.9870660313138189
18 1


In [None]:
final_dtc = DecisionTreeClassifier(max_depth=17, min_samples_leaf=1)
final_dtc.fit(x_train, y_train)
y_test_pred=final_dtc.predict(x_test)

print(accuracy_score(y_test, y_test_pred))

0.9863852961198094


## Random Forest (Classifier)

In [None]:
max_test = 0
max_n = 0
test_acc = []

for i in range(1,100):
    random_forest=RandomForestClassifier(n_estimators=i)
    random_forest.fit(x_train,y_train)

    y_test_pred = random_forest.predict(x_test)
    y_acc = accuracy_score(y_test,y_test_pred)
    test_acc.append(y_acc)
    if y_acc > max_test:
        max_test = y_acc
        max_n = i

print(max_test)
print(max_n)

plt.scatter(range(1,100),test_acc)

NameError: name 'RandomForestClassifier' is not defined

In [None]:
random_forest = RandomForestClassifier(n_estimators=max_n)
random_forest.fit(x_train,y_train)
print(random_forest.feature_importances_)

  random_forest.fit(x_train,y_train)
[0.06535328 0.05184032 0.01941121 0.04137748 0.1058937  0.13078959
 0.00672786 0.0010375  0.00704664 0.00786233 0.00672443 0.00772969
 0.00754905 0.00656916 0.00913606 0.01139748 0.00737462 0.00548221
 0.07286825 0.03255638 0.08158917 0.01157699 0.30210662]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a3b4f5ce-96b0-4815-84e4-80dc6a795abb' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>