## Random Forest

In [1]:
import numpy as np
import pandas as pd

### Importing the dataset and creating the train and test sets

In [2]:
# Read the CSV, then split it column-wise: every column except the last goes
# into X and the last column goes into y, both as NumPy arrays.
dataset = pd.read_csv("Data.csv")
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [3]:
# Quick look at X; NumPy abbreviates large arrays with "...", so only the
# first and last rows appear in the output.
print(X)

[[  14.96   41.76 1024.07   73.17]
 [  25.18   62.96 1020.04   59.08]
 [   5.11   39.4  1012.16   92.14]
 ...
 [  31.32   74.33 1012.92   36.48]
 [  24.48   69.45 1013.86   62.39]
 [  21.6    62.52 1017.23   67.87]]


In [4]:
# Quick look at y, the values taken from the last column of the dataset.
print(y)

[463.26 444.37 488.56 ... 429.57 435.74 453.28]


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Applying Grid Search + K-Fold Cross Validation to find the best model hyper-parameters

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyper-parameter; the grid search evaluates every
# combination of them.
grid = {
    'n_estimators': list(range(10, 101, 10)),
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 4, 6, 8],
    'criterion': ['squared_error'],
    'random_state': [0],
}

# Exhaustive search scored with negated mean absolute error under 10-fold
# cross-validation, run in parallel (n_jobs=-1).
gs = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

# best_score_ is the cross-validated score of the winning combination. With
# this scorer it is a *negated* MAE, so values closer to zero are better.
best_score = gs.best_score_
# Hyper-parameter combination that achieved that score.
best_parameters = gs.best_params_

print("Best Loss:", best_score)
print("Best Parameters:", best_parameters)

Best Loss: -2.8185758691673968
Best Parameters: {'criterion': 'squared_error', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 90, 'random_state': 0}


### Applying Random Search + K-Fold Cross Validation to find the best model hyper-parameters

In [7]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values to sample from: 20 evenly spaced tree counts between 10 and
# 200, and 10 evenly spaced depths between 2 and 20 (truncated to integers).
param = {
    'n_estimators': np.linspace(start=10, stop=200, num=20).astype(int).tolist(),
    'max_features': ['sqrt', 'log2'],
    'max_depth': np.linspace(start=2, stop=20, num=10).astype(int).tolist(),
    'criterion': ['squared_error'],
    'random_state': [0],
}

# Randomized search: 50 sampled combinations, each scored with negated mean
# absolute error under 10-fold cross-validation; seeded so the draw is repeatable.
rs = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=param,
    n_iter=50,
    random_state=0,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
)
rs.fit(X_train, y_train)

# best_score_ of the randomized search is the cross-validated score of the best
# sampled combination. It is a *negated* MAE, so closer to zero is better.
best_score = rs.best_score_
# Hyper-parameter combination that achieved that score.
best_parameters = rs.best_params_

print("Best Loss:", best_score)
print("Best Parameters:", best_parameters)

Best Loss: -2.3304399230564994
Best Parameters: {'random_state': 0, 'n_estimators': 200, 'max_features': 'log2', 'max_depth': 20, 'criterion': 'squared_error'}


### Fit the model using the best found *hyperparameters* 

In [8]:
# Build the final model straight from the randomized search's winning
# hyper-parameters instead of re-typing them by hand, so this cell cannot drift
# out of sync if the search space, data or seed changes. The randomized search
# is used because it reached the better (closer to zero) cross-validated
# negated MAE of the two searches in this notebook.
model = RandomForestRegressor(**rs.best_params_)
# fit() returns the estimator, so the fitted model is displayed as the cell output.
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=20, max_features='log2', n_estimators=200,
                      random_state=0)

### Evaluate The Model

In [9]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out test split and compare against the true targets.
y_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)

print("Mean absolute error: ", test_mae)



Mean absolute error:  2.2764070392548894
