In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn import datasets
from sklearn import linear_model
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.datasets import load_iris
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

**Read and clean data**

In [2]:
iris = load_iris()
df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                     columns=iris["feature_names"] + ["target"])

# Inputs: sepal length (cm),	sepal width (cm),	petal length (cm)
X = df[df.columns[0:3]]

# Output (dependent var) - in series format
y = df['petal width (cm)']

**Split into train/test** 

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


**Define model type and fit**

In [4]:
rf_model = ensemble.RandomForestRegressor(random_state=1, n_estimators=200, n_jobs=-1)
# print(rf_model)

rf_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=1)

**Scoring**

In [5]:
train_score = rf_model.score(X_train, y_train)
train_error = 1 - train_score

print("Score on data used in training =", train_score, "==>error =", train_error)

Score on data used in training = 0.9904994652036517 ==>error = 0.00950053479634827


In [6]:
test_score = rf_model.score(X_test, y_test)
test_error = 1 - test_score

print("Score on test (unseen) data =", test_score, "==>error =", test_error)

Score on test (unseen) data = 0.9298913978317376 ==>error = 0.07010860216826242


**Hyperparameter Tuning**

In [7]:
# This can take long 'cause runs down a lot of paths
grid_param_forest = {"max_depth" : range(1, 5),
                     "n_estimators" : range(10, 300, 20),
                     "max_features" : range(1, len(df.columns))}

print(str(datetime.now()))
grid_search_forest = ms.GridSearchCV(rf_model,
                                    grid_param_forest,
                                    cv=3,
                                    n_jobs=-1,
                                    verbose=True,
                                    scoring="r2").fit(X_train, y_train)
print(str(datetime.now()))

print("Best parameters:", grid_search_forest.best_params_,
      "\nBest Score:", grid_search_forest.best_score_)

2022-08-20 23:54:19.320923
Fitting 3 folds for each of 240 candidates, totalling 720 fits
2022-08-20 23:56:46.009964
Best parameters: {'max_depth': 3, 'max_features': 3, 'n_estimators': 30} 
Best Score: 0.9424032781851945


In [8]:
grid_train_score = grid_search_forest.best_estimator_.score(X_train, y_train)
grid_train_error = 1 - grid_train_score

print("Best estimator score on data used in training =", grid_train_score, "==>error =", grid_train_error)

grid_test_score = grid_search_forest.best_estimator_.score(X_test, y_test)
grid_test_error = 1 - grid_test_score

print("Best estimator score on test (unseen) data =", grid_test_score, "==>error =", grid_test_error)

Best estimator score on data used in training = 0.9623596462965536 ==>error = 0.03764035370344643
Best estimator score on test (unseen) data = 0.9112091772954078 ==>error = 0.08879082270459215


**Original model comes out best ==> may be due to small dataset**