In [None]:
#Decision Trees for Regression

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Let's hide warnings

import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.datasets import fetch_openml

machine_cpu = fetch_openml(name='machine_cpu')

In [4]:
# Displaying feature names

machine_cpu.feature_names

['MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX']

In [5]:
# Getting the whole dataframe

machine_cpu.frame

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,class
0,125.0,256.0,6000.0,256.0,16.0,128.0,198.0
1,29.0,8000.0,32000.0,32.0,8.0,32.0,269.0
2,29.0,8000.0,32000.0,32.0,8.0,32.0,220.0
3,29.0,8000.0,32000.0,32.0,8.0,32.0,172.0
4,29.0,8000.0,16000.0,32.0,8.0,16.0,132.0
...,...,...,...,...,...,...,...
204,124.0,1000.0,8000.0,0.0,1.0,8.0,42.0
205,98.0,1000.0,8000.0,32.0,2.0,8.0,46.0
206,125.0,2000.0,8000.0,0.0,2.0,14.0,52.0
207,480.0,512.0,8000.0,32.0,0.0,0.0,67.0


In [6]:
machine_data = machine_cpu.data
machine_labels = machine_cpu.target

In [None]:
#3 - Exploratory Analysis

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(machine_data,machine_labels, test_size=0.2,random_state=20)

print('The size of training data is: {} \nThe size of testing data is: {}'.format(len(X_train), len(X_test)))

The size of training data is: 167 
The size of testing data is: 42


In [8]:
#4 - Data Preprocessing

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

scale_pipe = Pipeline([
    ('scaler', StandardScaler())
    
])

X_train_scaled = scale_pipe.fit_transform(X_train)

In [10]:
#5 - Training Decision Tree Regressor

In [11]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()

tree_reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [12]:
tree_reg_scaled = DecisionTreeRegressor()

tree_reg_scaled.fit(X_train_scaled, y_train)

DecisionTreeRegressor()

In [13]:
#6 - Evaluating Decision Trees

In [14]:
from sklearn.metrics import mean_squared_error

def predict(input_data,model,labels):
    """
    Take the input data, model and labels and return predictions
    
    """
    
    preds = model.predict(input_data)
    mse = mean_squared_error(labels,preds)
    rmse = np.sqrt(mse)
    rmse
    
    return rmse

In [None]:
predict(X_train, tree_reg, y_train)

In [None]:
predict(X_train_scaled, tree_reg_scaled, y_train)

In [None]:
#7 - Improving Decision Trees

In [15]:
from sklearn.model_selection import GridSearchCV

params_grid = {'max_leaf_nodes': list(range(0, 10)), 'min_samples_split': [0,1,2, 3, 4], 
              'max_depth':[None,0,1,2,3]}

#refit is true by default. The best estimator is trained on the whole dataset 

grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), params_grid, verbose=1, cv=3, refit=True)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 250 candidates, totalling 750 fits


GridSearchCV(cv=3, estimator=DecisionTreeRegressor(random_state=42),
             param_grid={'max_depth': [None, 0, 1, 2, 3],
                         'max_leaf_nodes': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'min_samples_split': [0, 1, 2, 3, 4]},
             verbose=1)

In [16]:
grid_search.best_params_

{'max_depth': None, 'max_leaf_nodes': 9, 'min_samples_split': 4}

In [17]:
grid_search.best_estimator_

DecisionTreeRegressor(max_leaf_nodes=9, min_samples_split=4, random_state=42)

In [18]:
tree_best = grid_search.best_estimator_

In [19]:
predict(X_train, tree_best, y_train)

34.999530266023044

In [21]:
predict(X_test, tree_reg, y_test)

42.457541817006515