In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
concrete = pd.read_csv('datasets/concrete_data.csv')
concrete.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
X = concrete.drop('csMPa', axis= 1)
Y = concrete['csMPa']

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size= 0.2)

In [5]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
rand_reg = RandomForestRegressor(n_estimators= 600,
                                 max_leaf_nodes= 12,
                                 n_jobs= -1)
rand_reg.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=12,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=600, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [8]:
y_pred = rand_reg.predict(x_test)

In [9]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.7535456697692728

In [10]:
imp_fe  =pd.Series(rand_reg.feature_importances_,
                   index = X.columns).sort_values(ascending= False)
imp_fe

age                 0.392023
cement              0.384666
water               0.109731
slag                0.054934
superplasticizer    0.040184
fineaggregate       0.007990
flyash              0.007615
coarseaggregate     0.002856
dtype: float64

In [11]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [13]:
bag_reg = BaggingRegressor(DecisionTreeRegressor(splitter= 'random',
                                                max_leaf_nodes= 12),
                                                n_estimators= 600,
                                                max_samples= 1.0,
                                                n_jobs= -1)
bag_reg.fit(x_train, y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                      criterion='mse',
                                                      max_depth=None,
                                                      max_features=None,
                                                      max_leaf_nodes=12,
                                                      min_impurity_decrease=0.0,
                                                      min_impurity_split=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      min_weight_fraction_leaf=0.0,
                                                      presort='deprecated',
                                                      random_state=None,
                                                      splitter='random'),
                 bootstrap=T

In [14]:
y_pred = bag_reg.predict(x_test)
r2_score(y_test, y_pred)

0.7053906384901774

### extra trees
**random thresholds for splitting rather than the best threshold**

In [16]:
from sklearn.ensemble import ExtraTreesRegressor

In [17]:
extra_trees = ExtraTreesRegressor(n_estimators= 600,
                                  max_leaf_nodes= 12,
                                  n_jobs= -1)
extra_trees.fit(x_train, y_train)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=12,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=600, n_jobs=-1, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)

In [18]:
y_pred = extra_trees.predict(x_test)
r2_score(y_test, y_pred)

0.7021989970597231

In [19]:
imp_fe  =pd.Series(extra_trees.feature_importances_,
                   index = X.columns).sort_values(ascending= False)
imp_fe

age                 0.367682
cement              0.352957
superplasticizer    0.094141
water               0.093046
slag                0.044550
flyash              0.027352
fineaggregate       0.015217
coarseaggregate     0.005056
dtype: float64