In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

#build the dataset location from the current user's home directory instead of
#hard-coding one person's absolute path (and username) into the notebook
path = Path.home().joinpath(
    "desktop", "postmalone", "GA_Data_Science", "DAT-06-24",
    "class material", "Unit 3", "data", "hitters.csv",
)

In [2]:
#load the hitters csv into a DataFrame
data = pd.read_csv(filepath_or_buffer = path)

In [3]:
#preview the first five rows
data.head(n = 5)

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [5]:
#data["Years"].mean()

In [None]:
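#decision trees do not replicate well: small changes in the training data can produce a very different tree
#to get around this we build 100s/1000s of trees and aggregate their predictions
#this is loosely similar to cross-validating, since each tree is fit on a different resample of the data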

#not sensitive to outliers, outliers are given their own leaves
#non-parametric method, assumes nothing about the underlying distribution
#a single tree overfits the data greatly
#works well with data that do not follow any known underlying generating process

## Building a `regression tree` in sklearn

In [6]:
from sklearn.tree import DecisionTreeRegressor

#regression tree capped at depth 4; fixed random_state for reproducibility
tree = DecisionTreeRegressor(max_depth = 4, random_state = 1)

In [7]:
#how to finetune a decision tree:
#list every hyperparameter of the estimator together with its current value
tree.get_params(deep = True)

{'criterion': 'mse',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': 1,
 'splitter': 'best'}

In [11]:
#impute the missing salary with the average salary
#note:: assign the result back instead of calling fillna(..., inplace=True) on the
#       selected column -- the in-place form acts on an intermediate object, is
#       deprecated in pandas 2.x, and leaves `data` unchanged under copy-on-write
#NOTE(review): the mean is computed over all rows, before the train/test split,
#              so salaries of rows that later land in the test set influence the fill value
data["Salary"] = data["Salary"].fillna(data["Salary"].mean())

#features are every column except the target
X = data.drop(columns = "Salary")
y = data["Salary"]
#create dummy vars for the categorical columns
X = pd.get_dummies(X)

#note:: decision trees do not care about scale of data

#note:: sklearn decision trees only accept numeric data
##all categorical columns must be encoded as numbers (hence get_dummies above)

In [12]:
#perform train/test split
from sklearn.model_selection import train_test_split

#hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 2019,
)

In [13]:
#fit the depth-4 tree on the training split only
tree.fit(X = X_train, y = y_train)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

## Creating a tree diagram to visualize a decision tree

In [None]:
#see slides for a draw_tree() method

In [None]:
#use 10 fold CV, test out values [1,5,10,25,50] for min_samples_leaf param

In [22]:
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

#score each candidate min_samples_leaf with 10-fold CV on the training set
#note:: every candidate is a fresh clone, so the already-fitted `tree` keeps the
#       hyperparameters it was actually trained with instead of ending up with
#       the last value tried
min_samples_leaf_scores = []

for leaves in [1,5,10,25,50]:
    candidate = clone(tree).set_params(min_samples_leaf = leaves)
    fold_scores = cross_val_score(estimator = candidate, X = X_train, y = y_train, cv = 10)
    #store (mean score, leaves) so that max() ranks candidates by score
    min_samples_leaf_scores.append((np.mean(fold_scores),leaves))

#note:: the model is really being fit 50 times here (5 candidates x 10 folds)

In [23]:
#tuples compare element by element, so max() returns the pair with the highest mean score
max(min_samples_leaf_scores)

(0.34093901108823016, 25)

In [24]:
#show every (mean CV score, min_samples_leaf) pair, in the order tested
min_samples_leaf_scores

[(0.16829546329660883, 1),
 (0.22146746597359016, 5),
 (0.2201189988069006, 10),
 (0.34093901108823016, 25),
 (0.30512582926340126, 50)]

In [None]:
#the smaller the number of min_samples per leaf, the more
#the decision tree tends to overfit

## Random Forests and Decision Tree Ensembles 

In [None]:
#Ensembling is the process of combining several predictive models
#in order to produce a combined model that is more accurate
#than any individual model 
##tends to always increase accuracy by at least a small amount 

#two ways:
##manually ensembling your individual models with a weighted average of their predictions
##using a model that ensembles for you (e.g. bagged decision trees)

#manually is very time consuming

#for decision trees we have the technique called 'Bagging'
##decision trees are brittle/high variance
##bootstrap aggregation -- draw a random sample of the data with replacement
                #         (each sample covers roughly 2/3rd of the distinct rows),
                #         build a decision tree off of that,
                #         rinse and repeat

#bootstrapping is sampling with replacement

In [None]:
#Bagging for variance reduction
##1 grow N trees, each using its own bootstrap sample drawn from the training data
##2 train each tree on its bootstrap sample and make a prediction
##3 take the avg of all the tree predictions for your final prediction

#note:: Random Forests also randomly sample a certain number of features
#       at each split, as well as 2/3rd of the data for each tree

In [62]:
from sklearn.ensemble import RandomForestRegressor
#pin n_estimators explicitly: the library default changed from 10 to 100 in
#scikit-learn 0.22, and the outputs recorded in this notebook were produced with 10 trees
rf = RandomForestRegressor(n_estimators = 10, random_state = 1)

#fine tuning a random forest
##number of trees (estimators), generally MSE falls as Num Trees increases
##number of features to sample at each split (unique to random forests)
rf.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [63]:
#n_estimators - specifies how many trees you will fit

#max_features - specifies how many features are considered at each split

In [64]:
#consider 'max_features' parameter:
    #manually define number of features you want to use
    #manually define the percentage of the features you want to use
    #'sqrt': the square root of total number of features
    #'log2': the base 2 log of the number of features
    
#if your model is overfitting, i.e. training/validation scores way higher
#than test scores, then you generally want to reduce max_features (the number of features considered at each split)

In [65]:
#fit on the training split only, matching tree.fit(X_train, y_train), so the
#held-out X_test/y_test rows stay unseen by the forest
rf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [66]:
rf.n_estimators #the n_estimators hyperparameter: how many trees the ensemble is set to fit

10

In [67]:
rf.estimators_ #fitted attribute: a list of the individual decision trees in the ensemble

[DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1791095845, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=2135392491, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=946286476, splitter='best'),
 DecisionTreeRegress

In [68]:
#inspect the first tree of the ensemble on its own
rf.estimators_[0]

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1791095845, splitter='best')

In [69]:
#use 10 fold cross validation testing best value of 'max_features'

len(data.columns) #note:: 20 columns in the raw data, including the Salary target; the actual feature count is X.shape[1]

20

In [77]:
#lets try [0.5, 'sqrt', 'log2'] as our possible max_features
##0.5 -> half of the features, 'sqrt'/'log2' -> square root / base 2 log of the feature count
#note:: every candidate is a fresh clone, so the already-fitted `rf` is not
#       left carrying the last max_features value tried
from sklearn.base import clone

max_features_scores = []

for num_features in [0.5, 'sqrt','log2']:
    candidate = clone(rf).set_params(max_features = num_features)
    scores = cross_val_score(estimator = candidate, X = X_train , y = y_train, cv = 10)
    max_features_scores.append((np.mean(scores),num_features))

In [78]:
#rank by mean score only: comparing whole tuples would fall through to the
#max_features values on a tie and raise TypeError for a float vs a string
max(max_features_scores, key = lambda pair: pair[0])

(0.43603781596731395, 0.5)

In [79]:
#show every (mean CV score, max_features) pair, in the order tested
max_features_scores

[(0.43603781596731395, 0.5),
 (0.38978132888169237, 'sqrt'),
 (0.38978132888169237, 'log2')]

In [73]:
#note that overall using the random forest aggregation yields
#higher cross-validated R-squared than using one decision tree alone
#(best mean scores above: ~0.436 for the forest vs ~0.341 for the single tree)