 # Homework # 4 - Random Forest
 Data file: temps_extended.csv

### Import libraries

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

### Function to evaluate model accuracy
#### You will use this function twice in this notebook.   Please review the comments below carefully to find out when to invoke this function.

In [2]:
def evaluate(model, model_string, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance: {}'.format(model_string))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

### Load data

In [3]:
# Read in data as a dataframe
df = pd.read_csv('temps_extended.csv')

### Examine data

In [4]:
df.head()

Unnamed: 0,year,month,day,weekday,ws_1,prcp_1,snwd_1,temp_2,temp_1,average,actual,friend
0,2011,1,1,Sat,4.92,0.0,0,36,37,45.6,40,40
1,2011,1,2,Sun,5.37,0.0,0,37,40,45.7,39,50
2,2011,1,3,Mon,6.26,0.0,0,40,39,45.8,42,42
3,2011,1,4,Tues,5.59,0.0,0,39,42,45.9,38,59
4,2011,1,5,Wed,3.8,0.03,0,42,38,46.0,45,39


In [5]:
df.shape

(2191, 12)

### Clean up data

In [6]:
# Drop unnecessary columns year, month, day, weekday
df.drop(columns = ['month', 'day', 'weekday', 'year'], inplace = True)

In [7]:
# Display first few rows of updated dataframe
df.head()

Unnamed: 0,ws_1,prcp_1,snwd_1,temp_2,temp_1,average,actual,friend
0,4.92,0.0,0,36,37,45.6,40,40
1,5.37,0.0,0,37,40,45.7,39,50
2,6.26,0.0,0,40,39,45.8,42,42
3,5.59,0.0,0,39,42,45.9,38,59
4,3.8,0.03,0,42,38,46.0,45,39


### Separate independent variables and dependent variable
* Independent variables: all remaining variables except 'actual'
* Dependent variable: 'actual'

In [8]:
# Separate features and labels
X = df.drop(columns = 'actual')
y = df['actual']

### Split into training and test sets

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Instantiate the RandomForestRegressor model

In [10]:
model = RandomForestRegressor()

### Print RandomForestRegressor default hyperparameters

In [12]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Fit RandomForestRegressor model using the default hyperparameters

In [13]:
%%time
model.fit(X_train, y_train)


CPU times: user 428 ms, sys: 17.8 ms, total: 445 ms
Wall time: 462 ms


### Print accuracy for RandomForestRegressor model using the default hyperparameters
#### NOTE: Use "evaluate" function defined at top of this notebook.
For example, assuming the following variable values:
* model = rf
* model_string = 'using default hyperparameters'
* test_features = X_test
* test_labels = y_test

rfr_base_accuracy = evaluate(rf, 'With default hyperparameters', X_test, y_test)

In [14]:
rf_base_accuracy = evaluate(model, 'using default hyperparameters', X_test, y_test)
rf_base_accuracy

Model Performance: using default hyperparameters
Accuracy = 93.35%.


93.35245504758151

### Prepare variables for hyperparameter search
* Using sklearn.ensemble.RandomForestRegressor documentation [https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html] choose 3 hyperparameters for random search
* For each hyperparameter selected, set up an array of values
  * For example: max_features = ['log2', 'sqrt']

In [30]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

### Create the hyperparameter grid for the random search
Use the variables prepared above

In [31]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

### Print the hyperparameter grid for the random search

In [32]:
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

### Set up random search with k-fold cross validation using the hyperparameter grid

In [33]:
# Use RandomizedSearchCV to perform random search of hyperparameters using 5 fold cross validation.
r_cv = RandomizedSearchCV(estimator = model, cv = 5, param_distributions = random_grid)

### Fit the random search model
Be patient, this might take a minute or longer

In [34]:
%%time
r_cv.fit(X_train, y_train)

CPU times: user 3min 31s, sys: 2.02 s, total: 3min 33s
Wall time: 3min 34s


### Print the best hyperparameters found by the random search

In [36]:
r_cv.best_params_

{'n_estimators': 1800,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_depth': 40,
 'bootstrap': True}

### Print best random search model accuracy
#### NOTE: Use "evaluate" function defined at top of this notebook.

In [38]:
evaluate(model = r_cv, model_string = 'optimized hyperparameters using RandomizedSearchCV', 
         test_features = X_test, test_labels = y_test)

Model Performance: optimized hyperparameters using RandomizedSearchCV
Accuracy = 93.51%.


93.5105581400703