# Experiments

Metrics: Accuracy, Precision and Recall

Experiments: 
- Performance vs node purity method (Gini vs entropy)
- Performance vs size of individual trees

In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import numpy as np
import plotly.express as px
from sklearn.datasets import load_iris

In [2]:
!pip3 install -U kaleido



In [3]:
import kaleido
kaleido.__version__

'0.2.1'

In [4]:
import plotly
plotly.__version__

'5.8.1'

## Load data

In [5]:
iris = load_iris()
stars = pd.read_csv("../Datasets/star_classification.csv")
heart = pd.read_csv("../Datasets/heart_cleveland_upload.csv")

### iris

In [6]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [7]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [8]:
len(iris.data)

150

### stars

In [9]:
s_t = list(stars.columns)
s_t.remove('class')
s_t

['obj_ID',
 'alpha',
 'delta',
 'u',
 'g',
 'r',
 'i',
 'z',
 'run_ID',
 'rerun_ID',
 'cam_col',
 'field_ID',
 'spec_obj_ID',
 'redshift',
 'plate',
 'MJD',
 'fiber_ID']

In [10]:
stars['class'].unique()

array(['GALAXY', 'QSO', 'STAR'], dtype=object)

In [11]:
len(stars)

100000

### heart

In [12]:
list(heart.columns)

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'condition']

In [13]:
heart['condition'].unique()

array([0, 1])

In [14]:
len(heart)

297

## Performance vs node purity method

### iris


In [15]:
iris_X, iris_y  = iris.data, iris.target

iris_x_train, iris_x_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y, test_size=0.3)

for crit in ['gini', 'entropy']:
    print(f"Metrics for {crit}:")
    classifier = RandomForestClassifier(criterion=crit)

    classifier.fit(iris_x_train, iris_y_train)
    predictions = classifier.predict(iris_x_test)

    print(f"Accuracy {accuracy_score(iris_y_test, predictions)}")
    print(f"F1 Score {f1_score(iris_y_test, predictions, average='macro')}")
    print(f"Recall {recall_score(iris_y_test, predictions, average='macro')}")
    print(f"Precision {precision_score(iris_y_test, predictions, average='macro')}\n")

Metrics for gini:
Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0

Metrics for entropy:
Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0



### stars

In [16]:
stars_y = stars["class"]
stars_X = stars.drop("class", axis=1)

stars_x_train, stars_x_test, stars_y_train, stars_y_test = train_test_split(stars_X, stars_y, test_size=0.3)

for crit in ['gini', 'entropy']:
    print(f"Metrics for {crit}:")
    classifier = RandomForestClassifier(criterion=crit)

    classifier.fit(stars_x_train, stars_y_train)
    predictions = classifier.predict(stars_x_test)

    print(f"Accuracy {accuracy_score(stars_y_test, predictions)}")
    print(f"F1 Score {f1_score(stars_y_test, predictions, average='macro')}")
    print(f"Recall {recall_score(stars_y_test, predictions, average='macro')}")
    print(f"Precision {precision_score(stars_y_test, predictions, average='macro')}\n")

Metrics for gini:
Accuracy 0.9790666666666666
F1 Score 0.9757087749377066
Recall 0.9716694333706627
Precision 0.980135515411589

Metrics for entropy:
Accuracy 0.9783666666666667
F1 Score 0.9749629691043368
Recall 0.9712384332055595
Precision 0.9790832628408292



### heart

In [17]:
heart_y = heart["condition"]
heart_X = heart.drop("condition", axis=1)

heart_x_train, heart_x_test, heart_y_train, heart_y_test = train_test_split(heart_X, heart_y, test_size=0.3)
for crit in ['gini', 'entropy']:
    print(f"Metrics for {crit}:")
    classifier = RandomForestClassifier(criterion=crit)

    classifier.fit(heart_x_train, heart_y_train)
    predictions = classifier.predict(heart_x_test)

    print(f"Accuracy {accuracy_score(heart_y_test, predictions)}")
    print(f"F1 Score {f1_score(heart_y_test, predictions, average='macro')}")
    print(f"Recall {recall_score(heart_y_test, predictions, average='macro')}")
    print(f"Precision {precision_score(heart_y_test, predictions, average='macro')}\n")

Metrics for gini:
Accuracy 0.8555555555555555
F1 Score 0.8524776194679108
Recall 0.8514328808446455
Precision 0.853744939271255

Metrics for entropy:
Accuracy 0.8666666666666667
F1 Score 0.8633603238866396
Recall 0.861236802413273
Precision 0.8663946965833759



## Performance vs size of individual trees

In [18]:
depths = list(range(1,26))

### iris

In [19]:
iris_mets = []
for depth in depths:
    print(f"Metrics for depth: {depth}:")
    classifier = RandomForestClassifier(max_depth=depth)

    classifier.fit(iris_x_train, iris_y_train)
    predictions = classifier.predict(iris_x_test)
    
    acc = accuracy_score(iris_y_test, predictions)
    f1 = f1_score(iris_y_test, predictions, average='macro')
    recall = recall_score(iris_y_test, predictions, average='macro')
    prec = precision_score(iris_y_test, predictions, average='macro')

    print(f"Accuracy {acc}")
    print(f"F1 Score {f1}")
    print(f"Recall {recall}")
    print(f"Precision {prec}\n")
    
    iris_mets.append((acc, f1, recall, prec))

Metrics for depth: 1:
Accuracy 0.6222222222222222
F1 Score 0.5348837209302325
Recall 0.6666666666666666
Precision 0.4777777777777778

Metrics for depth: 2:
Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0

Metrics for depth: 3:


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0

Metrics for depth: 4:
Accuracy 0.9777777777777777
F1 Score 0.9775533108866442
Recall 0.9803921568627452
Precision 0.9761904761904763

Metrics for depth: 5:
Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0

Metrics for depth: 6:
Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0

Metrics for depth: 7:
Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0

Metrics for depth: 8:
Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0

Metrics for depth: 9:
Accuracy 0.9777777777777777
F1 Score 0.9775533108866442
Recall 0.9803921568627452
Precision 0.9761904761904763

Metrics for depth: 10:
Accuracy 1.0
F1 Score 1.0
Recall 1.0
Precision 1.0

Metrics for depth: 11:
Accuracy 0.9777777777777777
F1 Score 0.9775533108866442
Recall 0.9803921568627452
Precision 0.9761904761904763

Metrics for depth: 12:
Accuracy 0.9777777777777777
F1 Score 0.9775533108866442
Recall 0.9803921568627452
Precision 0.9761904761904763

Metrics for depth: 13:
Accuracy 1.0
F1 

In [29]:
acc_lst = [x[0] for x in iris_mets]
f1_lst = [x[1] for x in iris_mets]
prec_lst = [x[2] for x in iris_mets]
recall_lst = [x[3] for x in iris_mets]



fig = go.Figure()
fig.add_trace(go.Scatter(x=depths, y=acc_lst, name="Accuracy"))
fig.add_trace(go.Scatter(x=depths, y=prec_lst, name="Precision"))
fig.add_trace(go.Scatter(x=depths, y=recall_lst, name="Recall"))
fig.update_layout(
    xaxis_title="Tree Depth",
    yaxis_title="Score",
    legend_title="Metrics",
    font = dict(
        family="Courier new, monospace"
    )
)
fig.show()
fig.write_image("images/fig_iris_depth.pdf")

### stars

In [21]:
star_mets = []
for depth in depths:
    print(f"Metrics for depth: {depth}:")
    classifier = RandomForestClassifier(max_depth=depth)

    classifier.fit(stars_x_train, stars_y_train)
    predictions = classifier.predict(stars_x_test)
    
    acc = accuracy_score(stars_y_test, predictions)
    f1 = f1_score(stars_y_test, predictions, average='macro')
    recall = recall_score(stars_y_test, predictions, average='macro')
    prec = precision_score(stars_y_test, predictions, average='macro')

    print(f"Accuracy {acc}")
    print(f"F1 Score {f1}")
    print(f"Recall {recall}")
    print(f"Precision {prec}\n")
    
    star_mets.append((acc, f1, recall, prec))
    

Metrics for depth: 1:



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Accuracy 0.6107333333333334
F1 Score 0.2965539209002279
Recall 0.3573181191035537
Precision 0.518758589375239

Metrics for depth: 2:
Accuracy 0.885
F1 Score 0.8377073628339975
Recall 0.8110611747434723
Precision 0.9245136354221071

Metrics for depth: 3:
Accuracy 0.9237333333333333
F1 Score 0.904425473743029
Recall 0.8853950040887134
Precision 0.9360799577179512

Metrics for depth: 4:
Accuracy 0.9513
F1 Score 0.941965511981976
Recall 0.9314750530696863
Precision 0.9555789567435978

Metrics for depth: 5:
Accuracy 0.9576333333333333
F1 Score 0.9502606488066495
Recall 0.943846523823999
Precision 0.9581135494445251

Metrics for depth: 6:
Accuracy 0.9665
F1 Score 0.9609794331258001
Recall 0.9558236004053037
Precision 0.9671260234083356

Metrics for depth: 7:
Accuracy 0.9693333333333334
F1 Score 0.9644340907796174
Recall 0.9600915251200415
Precision 0.9695721538896601

Metrics for depth: 8:
Accuracy 0.9737666666666667
F1 Score 0.9697059699788415
Recall 0.9659369751778054
Precision 0.974089771

In [30]:
acc_lst = [x[0] for x in star_mets]
f1_lst = [x[1] for x in star_mets]
prec_lst = [x[2] for x in star_mets]
recall_lst = [x[3] for x in star_mets]



fig = go.Figure()
fig.add_trace(go.Scatter(x=depths, y=acc_lst, name="Accuracy"))
fig.add_trace(go.Scatter(x=depths, y=prec_lst, name="Precision"))
fig.add_trace(go.Scatter(x=depths, y=recall_lst, name="Recall"))
fig.update_layout(
    xaxis_title="Tree Depth",
    yaxis_title="Score",
    legend_title="Metrics",
    font = dict(
        family="Courier new, monospace"
    )
)
fig.show()
fig.write_image("images/fig_stars_depth.pdf")

### heart

In [23]:
heart_mets = []
for depth in depths:
    print(f"Metrics for depth: {depth}:")
    classifier = RandomForestClassifier(max_depth=depth)

    classifier.fit(heart_x_train, heart_y_train)
    predictions = classifier.predict(heart_x_test)
    
    acc = accuracy_score(heart_y_test, predictions)
    f1 = f1_score(heart_y_test, predictions, average='macro')
    recall = recall_score(heart_y_test, predictions, average='macro')
    prec = precision_score(heart_y_test, predictions, average='macro')

    print(f"Accuracy {acc}")
    print(f"F1 Score {f1}")
    print(f"Recall {recall}")
    print(f"Precision {prec}\n")
    
    heart_mets.append((acc, f1, recall, prec))

Metrics for depth: 1:
Accuracy 0.8222222222222222
F1 Score 0.8178137651821862
Recall 0.8159879336349924
Precision 0.8204997450280469

Metrics for depth: 2:
Accuracy 0.8444444444444444
F1 Score 0.840587044534413
Recall 0.8386123680241326
Precision 0.8434472208057113

Metrics for depth: 3:
Accuracy 0.8444444444444444
F1 Score 0.840587044534413
Recall 0.8386123680241326
Precision 0.8434472208057113

Metrics for depth: 4:
Accuracy 0.8555555555555555
F1 Score 0.8524776194679108
Recall 0.8514328808446455
Precision 0.853744939271255

Metrics for depth: 5:
Accuracy 0.8333333333333334
F1 Score 0.8297818686168201
Recall 0.8288084464555052
Precision 0.8309716599190283

Metrics for depth: 6:
Accuracy 0.8333333333333334
F1 Score 0.8297818686168201
Recall 0.8288084464555052
Precision 0.8309716599190283

Metrics for depth: 7:
Accuracy 0.8222222222222222
F1 Score 0.8190045248868778
Recall 0.8190045248868778
Precision 0.8190045248868778

Metrics for depth: 8:
Accuracy 0.8
F1 Score 0.7975
Recall 0.79939

In [31]:
acc_lst = [x[0] for x in heart_mets]
f1_lst = [x[1] for x in heart_mets]
prec_lst = [x[2] for x in heart_mets]
recall_lst = [x[3] for x in heart_mets]



fig = go.Figure()
fig.add_trace(go.Scatter(x=depths, y=acc_lst, name="Accuracy"))
fig.add_trace(go.Scatter(x=depths, y=prec_lst, name="Precision"))
fig.add_trace(go.Scatter(x=depths, y=recall_lst, name="Recall"))
fig.update_layout(
    xaxis_title="Tree Depth",
    yaxis_title="Score",
    legend_title="Metrics",
    font = dict(
        family="Courier new, monospace"
    )
)
fig.show()
fig.write_image("images/fig_heart_depth.pdf")