# Experiments

Metrics: Accuracy, Precision and Recall

Experiments: 
- Performance vs node purity method (Gini vs entropy)
- Performance vs size of individual trees

In [3]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import numpy as np
import plotly.express as px
from sklearn.datasets import load_iris

## Load data

In [7]:
iris = load_iris()
stars = pd.read_csv("../Datasets/star_classification.csv")
heart = pd.read_csv("../Datasets/heart_cleveland_upload.csv")

### iris

In [8]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [9]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [11]:
len(iris.data)

150

### stars

In [40]:
s_t = list(stars.columns)
s_t.remove('class')
s_t

['obj_ID',
 'alpha',
 'delta',
 'u',
 'g',
 'r',
 'i',
 'z',
 'run_ID',
 'rerun_ID',
 'cam_col',
 'field_ID',
 'spec_obj_ID',
 'redshift',
 'plate',
 'MJD',
 'fiber_ID']

In [33]:
stars['class'].unique()

array(['GALAXY', 'QSO', 'STAR'], dtype=object)

In [34]:
len(stars)

100000

### heart

In [35]:
list(heart.columns)

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'condition']

In [42]:
heart['condition'].unique()

array([0, 1])

In [44]:
len(heart)

297

## Performance vs node purity method

### stars

In [51]:
stars_y = stars["class"]
stars_X = stars.drop("class", axis=1)

stars_x_train, stars_x_test, stars_y_train, stars_y_test = train_test_split(stars_X, stars_y, test_size=0.3)

for crit in ['gini', 'entropy']:
    print(f"Metrics for {crit}:")
    classifier = RandomForestClassifier(criterion=crit)

    classifier.fit(stars_x_train, stars_y_train)
    predictions = classifier.predict(stars_x_test)

    print(f"Accuracy {accuracy_score(stars_y_test, predictions)}")
    print(f"F1 Score {f1_score(stars_y_test, predictions, average='macro')}")
    print(f"Recall {recall_score(stars_y_test, predictions, average='macro')}")
    print(f"Precision {precision_score(stars_y_test, predictions, average='macro')}\n")

Metrics for gini:
Accuracy 0.9767
F1 Score 0.972706332216565
Recall 0.9685854998945173
Precision 0.9772420177126985

Metrics for entropy:
Accuracy 0.9769333333333333
F1 Score 0.9729587973943387
Recall 0.9686689020506106
Precision 0.9776919083284555



### heart

In [54]:
heart_y = heart["condition"]
heart_X = heart.drop("condition", axis=1)

heart_x_train, heart_x_test, heart_y_train, heart_y_test = train_test_split(heart_X, heart_y, test_size=0.3)
for crit in ['gini', 'entropy']:
    print(f"Metrics for {crit}:")
    classifier = RandomForestClassifier(criterion=crit)

    classifier.fit(heart_x_train, heart_y_train)
    predictions = classifier.predict(heart_x_test)

    print(f"Accuracy {accuracy_score(heart_y_test, predictions)}")
    print(f"F1 Score {f1_score(heart_y_test, predictions, average='macro')}")
    print(f"Recall {recall_score(heart_y_test, predictions, average='macro')}")
    print(f"Precision {precision_score(heart_y_test, predictions, average='macro')}\n")

Metrics for gini:
Accuracy 0.8333333333333334
F1 Score 0.8255588577335573
Recall 0.8240740740740741
Precision 0.8272727272727273

Metrics for entropy:
Accuracy 0.8
F1 Score 0.7916666666666667
Recall 0.7916666666666667
Precision 0.7916666666666667



### iris


In [55]:
iris_X, iris_y  = iris.data, iris.target

iris_x_train, iris_x_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y, test_size=0.3)

for crit in ['gini', 'entropy']:
    print(f"Metrics for {crit}:")
    classifier = RandomForestClassifier(criterion=crit)

    classifier.fit(iris_x_train, iris_y_train)
    predictions = classifier.predict(iris_x_test)

    print(f"Accuracy {accuracy_score(iris_y_test, predictions)}")
    print(f"F1 Score {f1_score(iris_y_test, predictions, average='macro')}")
    print(f"Recall {recall_score(iris_y_test, predictions, average='macro')}")
    print(f"Precision {precision_score(iris_y_test, predictions, average='macro')}\n")

Metrics for gini:
Accuracy 0.9333333333333333
F1 Score 0.9343915343915343
Recall 0.9327731092436974
Precision 0.9373219373219372

Metrics for entropy:
Accuracy 0.9333333333333333
F1 Score 0.9343915343915343
Recall 0.9327731092436974
Precision 0.9373219373219372



## Performance vs size of individual trees