In [1]:
import pandas as pd

In [2]:
# load the data
df_train = pd.read_csv('../data/processed/application_train.csv')
df_test = pd.read_csv('../data/processed/application_test.csv')


In [3]:
# Define the target and features
target = 'TARGET'
features = [c for c in df_train.columns if c not in [target]]

In [4]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_train[features], df_train[target], test_size=0.2, random_state=42)

In [5]:
# Create the model with 100 trees
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100,
                                 random_state=42,
                                    n_jobs=-1,
                                    verbose=1)

# Fit on training data
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished


RandomForestClassifier(n_jobs=-1, random_state=42, verbose=1)

In [7]:
# Make predictions
y_pred = model.predict(X_test)

# Calculate the roc auc
from sklearn.metrics import roc_auc_score
roc_value = roc_auc_score(y_test, y_pred)
print('ROC AUC: %0.4f' % roc_value)


ROC AUC: 0.5000


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


In [8]:
# Grid Search
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                            cv = 3, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


{'max_depth': 80,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 100}

In [None]:
# Create the model with 100 trees
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100,
                                    random_state=42,
                                    n_jobs=-1,
                                    verbose=1,
                                    max_depth=80,
                                    max_features=2,
                                    min_samples_leaf=3,
                                    min_samples_split=8)

# Fit on training data
model.fit(X_train, y_train)

In [9]:
# Make predictions
y_pred = model.predict(X_test)

# Calculate the roc auc
from sklearn.metrics import roc_auc_score
roc_value = roc_auc_score(y_test, y_pred)
print('ROC AUC: %0.4f' % roc_value)

# Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %0.4f' % accuracy)

# Precision
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_pred)
print('Precision: %0.4f' % precision)

# Recall
from sklearn.metrics import recall_score
recall = recall_score(y_test, y_pred)
print('Recall: %0.4f' % recall)

# F1
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print('F1: %0.4f' % f1)

ROC AUC: 0.5000


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


In [None]:
# Random Search
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_

In [11]:
# new experiment in mlflow
import mlflow
mlflow.set_experiment('random_forest')

# log the model
with mlflow.start_run():
    mlflow.log_param('n_estimators', 100)
    mlflow.log_param('max_depth', 80)
    mlflow.log_param('max_features', 2)
    mlflow.log_param('min_samples_leaf', 3)
    mlflow.log_param('min_samples_split', 8)
    mlflow.log_metric('roc_auc', roc_value)
    mlflow.sklearn.log_model(model, 'model')


