In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,id,od,le,lc,turn,fluid,fr,orientation,heatw,rth,type
0,2.0,4.0,15.0,40.0,8,1,70,60.0,0.111111,2.094579,Tabular
1,2.0,4.0,15.0,40.0,8,1,70,60.0,0.112346,1.437566,Tabular
2,2.0,4.0,15.0,40.0,8,1,70,60.0,0.112359,1.198448,Tabular
3,2.0,4.0,15.0,40.0,8,1,70,60.0,0.11236,1.146975,Tabular
4,2.0,4.0,15.0,40.0,8,1,70,60.0,0.11236,0.976089,Tabular


In [4]:
df.isnull().sum()

id             0
od             0
le             0
lc             0
turn           0
fluid          0
fr             0
orientation    0
heatw          0
rth            0
type           0
dtype: int64

In [5]:
df = pd.get_dummies(df, drop_first=True) 
df.head()

Unnamed: 0,id,od,le,lc,turn,fluid,fr,orientation,heatw,rth,type_Tabular
0,2.0,4.0,15.0,40.0,8,1,70,60.0,0.111111,2.094579,1
1,2.0,4.0,15.0,40.0,8,1,70,60.0,0.112346,1.437566,1
2,2.0,4.0,15.0,40.0,8,1,70,60.0,0.112359,1.198448,1
3,2.0,4.0,15.0,40.0,8,1,70,60.0,0.11236,1.146975,1
4,2.0,4.0,15.0,40.0,8,1,70,60.0,0.11236,0.976089,1


In [6]:
X = df.drop(('rth'), axis=1)
y = df['rth']

X.shape

(489, 10)

In [7]:
y.shape

(489,)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf = RandomForestRegressor()

In [9]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [10]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [11]:
# Random search of parameters, using 3 fold cross-validation
# search across 100 different combinations

rf_random = RandomizedSearchCV(estimator=rf, 
                               param_distributions=random_grid, 
                               scoring='neg_mean_squared_error', 
                               n_iter=10, cv=5, verbose=2, 
                               random_state=42, n_jobs=1)

In [12]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   4.1s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s remaining:    0.0s


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   3.7s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   3.7s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   3.6s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   3.3s
[CV] n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15 
[CV]  n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15, total=   4.8s
[CV] n_estimators=1100, min_samples_split=10, mi

[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.6s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.0s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.2s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   2.9s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   2.8s
[CV] n_estimators=700, min_samples_split=15, min_sam

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.8min finished


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [13]:
print('Best parameters\n', rf_random.best_params_)
print('Best score\n', rf_random.best_score_)

Best parameters
 {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25}
Best score
 -0.5955140963802702


In [14]:
y_pred = rf_random.predict(X_test)

In [15]:
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE:  0.15231241725471112
MSE:  0.18126360580932635
RMSE:  0.42575063806097385


In [16]:
score = metrics.r2_score(y_test, y_pred) * 100
print('Score:', score, '%' )

Score: 99.88197817328599 %


In [17]:
import pickle

In [18]:
# open a file, where you want to store the data
file = open('random_forest_regression_model.pkl', 'wb')

# dump information to that file
pickle.dump(rf_random, file)

In [19]:
X_test.head()

Unnamed: 0,id,od,le,lc,turn,fluid,fr,orientation,heatw,type_Tabular
273,2.0,3.0,55.0,50.0,4,5,40,0.0,0.753449,1
3,2.0,4.0,15.0,40.0,8,1,70,60.0,0.11236,1
223,2.0,3.0,55.0,50.0,4,5,70,0.0,0.661276,1
455,1.0,1.0,8.0,14.0,26,1,50,0.0,0.845099,0
241,2.0,3.0,55.0,50.0,4,5,50,0.0,0.522844,1
