In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
concrete = pd.read_csv('datasets/concrete_data.csv')

concrete.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
X = concrete.drop('csMPa', axis=1)

Y = concrete['csMPa']

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

### RandomForestRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [5]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
rnd_reg = RandomForestRegressor(n_estimators=600, max_leaf_nodes=12, n_jobs=-1)

rnd_reg.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=12,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [19]:
y_pred = rnd_reg.predict(x_test)

In [20]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.7804305039552951

In [29]:
important_features = pd.Series(rnd_reg.feature_importances_, index=X.columns).\
                        sort_values(ascending=False)

important_features

age                 0.414701
cement              0.347539
water               0.107179
superplasticizer    0.066442
slag                0.046860
fineaggregate       0.009449
flyash              0.005100
coarseaggregate     0.002730
dtype: float64

#### BaggingRegressor + DecisionTreeRegressor(with splitter='random') = RandomForestRegressor

In [22]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [23]:
bag_reg = BaggingRegressor(DecisionTreeRegressor(splitter='random', max_leaf_nodes=12), 
                                                 n_estimators=600,
                                                 bootstrap=True,
                                                 max_samples=1.0,
                                                 n_jobs=-1)
bag_reg.fit(x_train, y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(criterion='mse',
                                                      max_depth=None,
                                                      max_features=None,
                                                      max_leaf_nodes=12,
                                                      min_impurity_decrease=0.0,
                                                      min_impurity_split=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      min_weight_fraction_leaf=0.0,
                                                      presort=False,
                                                      random_state=None,
                                                      splitter='random'),
                 bootstrap=True, bootstrap_features=False, max_features=1.0,
                 max_sample

In [24]:
y_pred = bag_reg.predict(x_test)

r2_score(y_test, y_pred)

0.7087506676183266

### ExtraTreesRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.tree.ExtraTreeRegressor.html     
Extra-Tree Regressor is growing tree even more random by using random thresholds for each features rather than searching for best possible thresholds (like the regular decision tree).

In [25]:
from sklearn.ensemble import ExtraTreesRegressor 

In [26]:
extra_reg = ExtraTreesRegressor(n_estimators=600, max_leaf_nodes=12, n_jobs=-1)

extra_reg.fit(x_train, y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
                    max_features='auto', max_leaf_nodes=12,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=-1,
                    oob_score=False, random_state=None, verbose=0,
                    warm_start=False)

In [27]:
y_pred = extra_reg.predict(x_test)

r2_score(y_test, y_pred)

0.7134920140025534

In [31]:
important_features = pd.Series(extra_reg.feature_importances_, index=X.columns).\
                        sort_values(ascending=False)

important_features

age                 0.380102
cement              0.308488
superplasticizer    0.109725
water               0.099060
slag                0.042978
flyash              0.039722
fineaggregate       0.016645
coarseaggregate     0.003279
dtype: float64