## Importing Required Libraries

In [1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV

## Loading the Dataset

In [2]:
# Read the training and test CSVs (in that order) from the local dataset folder
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./dataset/train.csv", "./dataset/test.csv")
)

## Splitting Features and Target

In [3]:
# Every column except the last is used as a feature; the last column is the target
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [4]:
# Same split for the test set: all but the last column as features, last column as target
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

## Grid Search for Hyperparameter Tuning

In [18]:
# Hyperparameter grid for the random forest (3 x 2 x 3 = 18 candidates).
# 'entropy' is intentionally not listed: scikit-learn treats 'entropy' and 'log_loss'
# as the same Shannon information-gain criterion, so including both only repeats
# identical candidates and inflates the search by 50%.
grid = {
    'n_estimators': [50, 100, 150],
    'criterion': ['gini', 'log_loss'],
    'max_depth': [10, 20, 50],
}

In [19]:
# Custom metric that balances accuracy and recall
def custom_scorer(y_test, y_hat):
    """Return the unweighted mean of accuracy and recall.

    Parameters
    ----------
    y_test : array-like
        True labels of the samples being scored.
    y_hat : array-like
        Predicted labels for the same samples.

    Returns
    -------
    float
        (accuracy + recall) / 2, with recall computed using recall_score's
        default settings.
    """
    return (accuracy_score(y_test, y_hat) + recall_score(y_test, y_hat)) / 2

# Wrap the metric in a scorer object so it can be passed to GridSearchCV's `scoring`
scorer = make_scorer(custom_scorer)

In [20]:
# Exhaustive search over `grid`, ranking candidates with the custom
# accuracy/recall scorer under 10-fold cross-validation.
# random_state is fixed so the forests (bootstrap samples, feature subsets) are
# reproducible; without it best_params_ and the reported test metrics can change
# from one run of the notebook to the next.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                         param_grid = grid,
                         scoring = scorer,
                         cv = 10)

## Training the Models

In [21]:
# Run the cross-validated search over every grid candidate on the training data
grid_search.fit(X=X_train, y=y_train)

In [22]:
# Hyperparameter combination that achieved the best mean cross-validation score
grid_search.best_params_

{'criterion': 'log_loss', 'max_depth': 50, 'n_estimators': 150}

## Evaluation on Test Data

In [23]:
# Keep the winning model from the search; with GridSearchCV's default refit=True
# this estimator has been refit on the whole training set using best_params_
random_forest_model = grid_search.best_estimator_

In [24]:
# Predict on the held-out test set and report each metric in turn
y_hat = random_forest_model.predict(X_test)
for label, metric in (('Accuracy:', accuracy_score), ('Recall:', recall_score)):
    print(label, metric(y_test, y_hat))

Accuracy: 0.9064327485380117
Recall: 0.9439252336448598


## Saving Model to File

In [25]:
# Create the output folder first: joblib.dump does not create missing directories,
# and failing here would throw away the result of the long grid search.
os.makedirs('./models', exist_ok=True)
joblib.dump(random_forest_model, './models/random_forest_model.joblib')

['./models/random_forest_model.joblib']