In [21]:
import tqdm
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
plt.style.use('default')

from utils.model_measure import print_regression_measure, measure_regression
from sklearn.metrics import root_mean_squared_error

# 1. Dataset

In [22]:
from sklearn.datasets import load_diabetes
data_X, data_Y = load_diabetes(return_X_y=True, as_frame=False)

n_samples = len(data_X)

# sample weights
rng = np.random.default_rng(seed=42)
data_weight = rng.normal(loc=0.0, scale=1.0, size=n_samples)
data_weight = np.abs(data_weight) + 1.0

train_idx, test_idx = train_test_split(range(n_samples), test_size=0.3, random_state=0)
train_X, test_X = data_X[train_idx], data_X[test_idx]
train_Y, test_Y = data_Y[train_idx], data_Y[test_idx]
train_weight, test_weight = data_weight[train_idx], data_weight[test_idx]

print('\nShape of dataset:', data_X.shape, data_Y.shape, data_weight.shape,
      '\nShape of training set:', train_X.shape, train_Y.shape, train_weight.shape,
      '\nShape of testing set:', test_X.shape, test_Y.shape, test_weight.shape)


Shape of dataset: (442, 10) (442,) (442,) 
Shape of training set: (309, 10) (309,) (309,) 
Shape of testing set: (133, 10) (133,) (133,)


In [23]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(train_X, train_Y)

feat_imp - model.feature_importances_

array([6.93889390e-18, 0.00000000e+00, 0.00000000e+00, 1.38777878e-17,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.46944695e-18,
       0.00000000e+00, 0.00000000e+00])

In [24]:
from utils.weighted_random_forest import WeightedRandomForestRegressor

In [25]:
model = WeightedRandomForestRegressor(
    n_estimators=2000, random_state=None,
    bootstrap=True, max_samples=1.0,
    bootstrap_features=True, max_features=1.0,
    weighted_bootstrap=True, weighted_training=False)

model.fit(train_X, train_Y, sample_weight=train_weight)
model.feature_importances_

print(np.around(model.feature_importances_, 3))

[0.051 0.008 0.303 0.099 0.045 0.047 0.063 0.027 0.301 0.056]


In [26]:
def model_evaluation(model, train_X, train_Y, test_X, test_Y, train_weight, test_weight, weighted_fitting=False):
    
    if weighted_fitting:
        model.fit(train_X, train_Y, train_weight)
    else:
        model.fit(train_X, train_Y)
    
    measure = dict({'oob_rmse' : model.oob_score_})

    train_Y_pred = model.predict(train_X)
    test_Y_pred = model.predict(test_X)

    train_m = measure_regression(train_Y, train_Y_pred, train_weight)
    test_m  = measure_regression(test_Y, test_Y_pred, test_weight)

    measure.update({f'train_{k}' : v for k, v in train_m.items()})
    measure.update({f'test_{k}' : v for k, v in test_m.items()})
    
    return measure

In [27]:
model = WeightedRandomForestRegressor(
    n_estimators=1000, random_state=42,
    bootstrap=True, max_samples=1.0,
    bootstrap_features=False, max_features=None,
    oob_score = True,
    weighted_bootstrap=True, weighted_training=False)

peformance = {}

model.set_params(weighted_bootstrap=False, weighted_training=False)
peformance['BU-TU'] = model_evaluation(model, train_X, train_Y, test_X, test_Y, train_weight, test_weight, weighted_fitting=False)

model.set_params(weighted_bootstrap=True, weighted_training=False)
peformance['BW-TU'] = model_evaluation(model, train_X, train_Y, test_X, test_Y, train_weight, test_weight, weighted_fitting=False)

model.set_params(weighted_bootstrap=False, weighted_training=True)
peformance['BU-TW'] = model_evaluation(model, train_X, train_Y, test_X, test_Y, train_weight, test_weight, weighted_fitting=False)

model.set_params(weighted_bootstrap=True, weighted_training=True)
peformance['BW-TW'] = model_evaluation(model, train_X, train_Y, test_X, test_Y, train_weight, test_weight, weighted_fitting=False)

pd.DataFrame.from_dict(peformance).T

Unnamed: 0,oob_rmse,train_mae,train_mse,train_rmse,train_mape,train_r2,test_mae,test_mse,test_rmse,test_mape,test_r2
BU-TU,0.466989,17.357882,445.316262,21.102518,0.153632,0.928097,46.810676,3401.080932,58.318787,0.394511,0.305091
BW-TU,0.466019,16.348565,453.630299,21.298599,0.145894,0.912337,47.896323,3514.222928,59.280882,0.407046,0.278956
BU-TW,0.467631,17.347628,447.36942,21.151109,0.15319,0.928094,47.362416,3464.786771,58.862439,0.40265,0.291201
BW-TW,0.458482,16.497199,457.608683,21.39179,0.147206,0.912117,47.912723,3497.136256,59.13659,0.409547,0.283381
