In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly

In [2]:
data = pd.read_csv("../Datasets/heart_cleveland_upload.csv")

In [3]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [4]:
y = data["condition"]
X = data.drop("condition", axis=1)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [6]:
classifier = RandomForestClassifier(n_estimators=1000)

classifier.fit(x_train, y_train)

In [7]:
predictions = classifier.predict(x_test)

In [8]:
print(f"Accuracy {accuracy_score(y_test, predictions)}")

Accuracy 0.7888888888888889


In [12]:
arr = [num for num in range(10, 100, 5)]
acc_arr = []
p_arr = []
r_arr = []
for i in arr:
    cls = RandomForestClassifier(n_estimators=i)
    cls.fit(x_train, y_train)
    predict = cls.predict(x_test)
    acc_arr.append(accuracy_score(y_test, predict))
    p_arr.append(precision_score(y_test, predict, average="macro"))
    r_arr.append(recall_score(y_test, predict, average="macro"))

In [13]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=arr, y=acc_arr, name="accuracy"))
fig.add_trace(go.Scatter(x=arr, y=p_arr, name="Precision"))
fig.add_trace(go.Scatter(x=arr, y=r_arr, name="Recall"))
fig.update_layout(
    title="Varying trees in Random Forest on UCI heart disease dataset",
    xaxis_title="Number of trees/Bootstrap samples",
    yaxis_title="Score",
    legend_title="Metrics",
    font = dict(
        family="Courier new, monospace"
    )
)
fig.show()

In [11]:
plotly.io.write_image(fig, 'uci-varytrees.pdf', format='pdf')