In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn import datasets
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.datasets import load_iris
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

**Read and clean data**

In [2]:
# Load iris and put the four measurements plus the class label into one frame
iris = load_iris()
df = pd.DataFrame(
    data=np.column_stack((iris["data"], iris["target"])),
    columns=[*iris["feature_names"], "target"],
)

# Predictors: the first three measurement columns
# (sepal length (cm), sepal width (cm), petal length (cm))
X = df[iris["feature_names"][:3]]

# Response (dependent variable), kept as a Series
y = df["petal width (cm)"]

**Split into train/test** 

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


**Define model type and fit**

In [4]:
# Baseline regression tree with default hyperparameters.
# random_state is fixed because scikit-learn permutes the features at every split,
# so ties between equally good splits could otherwise be broken differently on
# each run and the scores below would not be reproducible.
tree_model = tree.DecisionTreeRegressor(random_state=0)

tree_model.fit(X_train, y_train)

DecisionTreeRegressor()

**Scoring**

In [5]:
# Score of the fitted tree on the rows it was trained on; "error" is 1 - score
train_score = tree_model.score(X_train, y_train)
train_error = 1 - train_score

print(
    "Score on data used in training =", train_score,
    "==>error =", train_error,
)

Score on data used in training = 0.9986720653809774 ==>error = 0.001327934619022586


In [6]:
# Score of the fitted tree on the held-out rows; "error" is 1 - score
test_score = tree_model.score(X_test, y_test)
test_error = 1 - test_score

print(
    "Score on test (unseen) data =", test_score,
    "==>error =", test_error,
)

Score on test (unseen) data = 0.9173783239602861 ==>error = 0.08262167603971393


**Hyperparameter Tuning**

In [7]:
# Exhaustive grid search over tree depth and the number of features tried per split.
# Every (max_depth, max_features) pair is cross-validated on 3 folds, so this
# cell can take a while to run.
grid_param_tree = {"max_depth": range(1, 31),
                   # max_features may not exceed the number of predictor columns.
                   # Bound it by X_train rather than df, because df also holds the
                   # response column and the class label, which are not predictors.
                   "max_features": range(1, X_train.shape[1] + 1)}

grid_search_tree = ms.GridSearchCV(tree_model,
                                   grid_param_tree,
                                   cv=3,
                                   scoring="r2").fit(X_train, y_train)

print("Best parameters:", grid_search_tree.best_params_,
      "\nBest Score:", grid_search_tree.best_score_)

Best parameters: {'max_depth': 3, 'max_features': 3} 
Best Score: 0.9223696152647932


In [8]:
# Evaluate the best estimator found by the grid search on both splits
best_tree = grid_search_tree.best_estimator_

# Training rows first ...
grid_train_score = best_tree.score(X_train, y_train)
grid_train_error = 1 - grid_train_score
print("Best estimator score on data used in training =", grid_train_score, "==>error =", grid_train_error)

# ... then the held-out rows
grid_test_score = best_tree.score(X_test, y_test)
grid_test_error = 1 - grid_test_score
print("Best estimator score on test (unseen) data =", grid_test_score, "==>error =", grid_test_error)

Best estimator score on data used in training = 0.9541826369835199 ==>error = 0.045817363016480095
Best estimator score on test (unseen) data = 0.8860711225314797 ==>error = 0.11392887746852032


**The original (untuned) model comes out best on the test set: R² ≈ 0.917 vs. ≈ 0.886 for the grid-searched tree**