In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
RANDOM_SEED = 42

### 4.2.3 Finally using the `scoring` parameter

In [2]:
heart_disease = pd.read_csv("../ztm-ml/data/heart-disease.csv")
heart_disease.head()
np.random.seed(42)
x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

np.random.seed(RANDOM_SEED)
clf = RandomForestClassifier(random_state=RANDOM_SEED)

cv = cross_val_score(clf, x, y, cv=5, scoring=None)
cv

array([0.81967213, 0.8852459 , 0.83606557, 0.85      , 0.8       ])

In [4]:
print(f'Defaults scorers of model is of accuracy which was :{cv.mean() * 100: .2f}%')

Defaults scorers of model is of accuracy which was : 83.82%


In [5]:
np.random.seed(42)
cv_precision = cross_val_score(clf, x, y, cv=5, scoring='precision')
print(f'The output of the model is this list:{cv_precision}\nAnd the mean of precision is : {cv_precision.mean() * 100: .2f}%')

The output of the model is this list:[0.82352941 0.90625    0.84848485 0.875      0.78378378]
And the mean of precision is :  84.74%


In [6]:
np.random.seed(42)
cv_recall = cross_val_score(clf, x, y, cv=5, scoring='recall')
print(f'The output of the model is this list:{cv_recall}\nAnd the mean of recall is : {cv_recall.mean() * 100: .2f}%')

The output of the model is this list:[0.84848485 0.87878788 0.84848485 0.84848485 0.87878788]
And the mean of recall is :  86.06%


In [7]:
np.random.seed(42)
cv_balanced_accuracy = cross_val_score(clf, x, y, cv=5, scoring='balanced_accuracy')
print(f'The output of the model is this list:{cv_balanced_accuracy}\nAnd the mean of balanced_accuracy is : {cv_balanced_accuracy.mean() * 100: .2f}%')

The output of the model is this list:[0.81709957 0.88582251 0.83495671 0.85016835 0.79124579]
And the mean of balanced_accuracy is :  83.59%


In [8]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
np.random.seed(RANDOM_SEED)
house_data = fetch_california_housing(as_frame=True)
house_df = house_data.frame
house_df

x = house_df.drop(columns=["MedHouseVal"], axis=1)
y = house_df["MedHouseVal"]



In [9]:
clf = RandomForestRegressor(random_state=RANDOM_SEED)

cv = cross_val_score(clf, x, y, cv=5)
print(f'Defaults scorers of model is of R^2 which was :{cv.mean(): .2f}')

Defaults scorers of model is of R^2 which was : 0.66


In [10]:
cv_mae = cross_val_score(clf, x, y, cv=5, scoring='neg_mean_absolute_error')
print(f'The output of the model is this list:{cv_mae}\nAnd the mean of MAE is : {cv_mae.mean(): .2f}')

The output of the model is this list:[-0.54255936 -0.40837256 -0.43759288 -0.46156845 -0.47328217]
And the mean of MAE is : -0.46


In [11]:
cv_mse = cross_val_score(clf, x, y, cv=5, scoring='neg_mean_squared_error')
print(f'The output of the model is this list:{cv_mse}\nAnd the mean of MSE is : {cv_mse.mean(): .2f}')

The output of the model is this list:[-0.51906307 -0.3460998  -0.37092894 -0.42819009 -0.46302709]
And the mean of MSE is : -0.43


## 4.3 Using different evaluation metrics as Scikit-Learn functions

The 3rd way to evaluate scikit-learn machine learning models/estimators is to using the `sklearn.metrics` module - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(RANDOM_SEED)

# Create x & y
x = heart_disease.drop("target", axis=1)
y= heart_disease["target"]

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)
# Create model
clf = RandomForestClassifier(random_state=RANDOM_SEED)
# Fit the model
clf.fit(x_train, y_train)
# Evaluate the model using evaluation functions
accuracy = accuracy_score(y_test, clf.predict(x_test))
precision = precision_score(y_test, clf.predict(x_test))
recall = recall_score(y_test, clf.predict(x_test))
f1 = f1_score(y_test, clf.predict(x_test))

In [17]:
print(f'Accuracy: {accuracy * 100: .2f}%')
print(f'Precision: {precision * 100: .2f}%')
print(f'Recall: {recall * 100: .2f}%')
print(f'F1 score: {f1 * 100: .2f}%')

Accuracy:  83.61%
Precision:  84.38%
Recall:  84.38%
F1 score:  84.38%


In [18]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(RANDOM_SEED)

# Create x & y
x = house_df.drop(columns=["MedHouseVal"], axis=1)
y= house_df["MedHouseVal"]

# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)
# Create model
clf = RandomForestRegressor(random_state=RANDOM_SEED)
# Fit the model
clf.fit(x_train, y_train)
# Evaluate the model using evaluation functions
r2 = r2_score(y_test, clf.predict(x_test))
mae = mean_absolute_error(y_test, clf.predict(x_test))
mse = mean_squared_error(y_test, clf.predict(x_test))

In [19]:
print(f'r2: {r2 * 100: .2f}%')
print(f'mae: {mae * 100: .2f}%')
print(f'mse: {mse * 100: .2f}%')  

r2:  80.51%
mae:  32.75%
mse:  25.54%
