In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
def f1_score(y_true, y_pred):
    """Print a binary confusion matrix and return the F1 score.

    Labels are compared against 1 (positive) and 0 (negative), so boolean
    inputs work as well as 0/1 integers. Both inputs are converted to NumPy
    arrays first, so plain Python lists are compared element-wise and pandas
    Series are compared by position rather than by index label.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must have the same shape as ``y_true``.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        precision and recall are both zero (including when either one is
        undefined because its denominator is zero).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Without this coercion `[1, 0] == 1` is the scalar False (every count
    # becomes 0), and two Series with different indexes get label-aligned by
    # `&` instead of being compared position by position.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, '
            f'got {y_true.shape} and {y_pred.shape}'
        )

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    pred_pos = y_pred == 1
    pred_neg = y_pred == 0

    # Plain ints keep the arithmetic below free of NumPy division warnings.
    tp = int(np.sum(actual_pos & pred_pos))
    fn = int(np.sum(actual_pos & pred_neg))
    fp = int(np.sum(actual_neg & pred_pos))
    tn = int(np.sum(actual_neg & pred_neg))

    # Each ratio falls back to 0.0 when its denominator is zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    # Confusion matrix: columns are the actual class, rows the predicted class.
    print('----------------------------------')
    print('                 Actual Value')
    print('----------------------------------')
    print('            Positive    Negative')
    print(f'Positive    {tp:^8}    {fp:^8}')
    print(f'Negative    {fn:^8}    {tn:^8}')
    print('----------------------------------')

    return float(f1)

In [5]:
# Fixed seed so the grid search picks the same model, and the metrics below
# come out the same, every time the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42
# Fraction of rows, taken from the top of the frame, used for training.
TRAIN_FRACTION = 0.8

# Load data: turn the 'yes'/'no' target into a boolean, then one-hot encode
# the remaining categorical columns (the boolean target is left untouched).
df = pd.read_csv('bank_shuffle.csv')
df['y'] = df['y'] == 'yes'
df = pd.get_dummies(df, drop_first=True)

# Split dataset positionally: first TRAIN_FRACTION of the rows for training,
# the rest for testing.
# NOTE(review): this relies on the CSV already being shuffled, as the file
# name suggests — confirm.
n_train = int(len(df) * TRAIN_FRACTION)
df_train = df.iloc[:n_train, :]
df_test = df.iloc[n_train:, :]

X_train = df_train.drop('y', axis=1)
y_train = df_train['y']
X_test = df_test.drop('y', axis=1)
y_test = df_test['y']

# Parameter grid: 10 forest sizes x 2 split criteria x 2 feature-subset rules.
param_grid = {
    'n_estimators': range(50, 501, 50),
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}

# Grid search with 5-fold cross-validation, selecting on F1.
# random_state makes every forest reproducible; n_jobs=-1 runs the candidate
# fits on all available cores without changing the result.
clf = RandomForestClassifier(random_state=RANDOM_STATE)
grid_search = GridSearchCV(clf, param_grid, scoring='f1', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model (refit on the full training set by GridSearchCV)
best_clf = grid_search.best_estimator_

# Predict on the test set
y_pred = best_clf.predict(X_test)

# Evaluate the model: the error rate is the fraction of test rows whose
# prediction differs from the true label.
error_rate = np.mean(y_test.to_numpy() != y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Error Rate: {error_rate}")
print(f"F1 Score: {f1}")


----------------------------------
                 Actual Value
----------------------------------
            Positive    Negative
Positive      427         232   
Negative      513         7066  
----------------------------------
Error Rate: 0.09043457149793639
F1 Score: 0.5340838023764852
