# <font color="#CC3D3D">Implementing RandomForestRegressor as a sklearn custom class</font>

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.exceptions import NotFittedError
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import check_random_state, resample

In [None]:
# Read the feature table (without its 'ID' column) and the 'Salary' target column
X = pd.read_csv('X_train_preprocessed.csv').drop(columns=['ID'])
y = pd.read_csv('y_train.csv')['Salary']

# Split into train and test partitions: 40% of the rows go to the test set,
# and the fixed random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [None]:
# Baseline: train a single DecisionTreeRegressor and predict on the test set
tree = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
tree_pred = tree.predict(X_test)

# Evaluate with root mean squared error (RMSE), a standard regression metric
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=tree_pred))
print("RMSE:", rmse)

In [None]:
# Mixins go to the left of BaseEstimator, as recommended by the scikit-learn
# developer guide, so that the mixin's behaviour takes precedence in the MRO.
class MyRandomForestRegressor(RegressorMixin, BaseEstimator):
    """Minimal random-forest regressor built from bagged decision trees.

    Every tree is trained on a bootstrap sample of the rows and on a random
    subset of the columns. A prediction is the mean of the per-tree
    predictions.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of trees to grow. Must be at least 1.
    max_features : float, default=1.0
        Fraction of the columns handed to each tree. The number of columns
        used is ``int(max_features * n_features)`` clamped to the range
        ``[1, n_features]``.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed (or generator) driving the bootstrap sampling, the column
        selection and the trees themselves. ``None`` falls back to NumPy's
        global random state, so results then vary between runs unless that
        global state is seeded.

    Attributes
    ----------
    estimators : list of (DecisionTreeRegressor, numpy.ndarray)
        Fitted trees, each paired with the column indices it was trained on.
        Empty until ``fit`` has been called.
    n_features_in_ : int
        Number of columns in the data passed to ``fit``.
    """

    def __init__(self, n_estimators=10, max_features=1.0, random_state=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = random_state
        # Kept as a public attribute (initially empty) for backward
        # compatibility with code that reads ``estimators`` directly.
        self.estimators = []

    def fit(self, X, y):
        """Grow ``n_estimators`` trees on bootstrap samples of ``(X, y)``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; DataFrames, ndarrays and nested lists work.
        y : array-like of shape (n_samples,)
            Training targets.

        Returns
        -------
        self : MyRandomForestRegressor
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``n_estimators`` is smaller than 1 or ``X`` is not 2-D.
        """
        if self.n_estimators < 1:
            raise ValueError(
                f"n_estimators must be at least 1, got {self.n_estimators!r}."
            )

        # np.asarray handles DataFrames as well as plain arrays/lists, so the
        # positional column indexing below works for any of them.
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional, got an array with {X.ndim} dimension(s)."
            )

        n_features = X.shape[1]
        # Clamp so that a fraction above 1.0 cannot request more columns than
        # exist (sampling without replacement would fail otherwise).
        n_selected = min(n_features, max(1, int(self.max_features * n_features)))
        rng = check_random_state(self.random_state)

        fitted = []
        for _ in range(self.n_estimators):
            # Bootstrap the rows, then pick the column subset for this tree.
            X_boot, y_boot = resample(X, y, random_state=rng)
            feature_idx = rng.choice(n_features, size=n_selected, replace=False)
            tree = DecisionTreeRegressor(random_state=rng)
            tree.fit(X_boot[:, feature_idx], y_boot)
            fitted.append((tree, feature_idx))

        # Publish the new forest only once every tree has been trained.
        self.estimators = fitted
        self.n_features_in_ = n_features
        return self

    def predict(self, X):
        """Return the mean of the per-tree predictions for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Rows to predict; must have as many columns as the training data.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Averaged predictions.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        ValueError
            If ``X`` is not 2-D or its column count differs from training.
        """
        if not self.estimators:
            raise NotFittedError(
                "This MyRandomForestRegressor instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )

        X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X must be 2-dimensional with {self.n_features_in_} columns, "
                f"got an array of shape {X.shape}."
            )

        # Size the buffer from the trees that actually exist, not from the
        # n_estimators parameter, which may have been changed after fitting.
        predictions = np.empty((X.shape[0], len(self.estimators)))
        for i, (tree, feature_idx) in enumerate(self.estimators):
            predictions[:, i] = tree.predict(X[:, feature_idx])
        return predictions.mean(axis=1)

In [None]:
# Seed NumPy's global random state so the forest below is reproducible:
# with no explicit random state, the bootstrap sampling and feature selection
# inside MyRandomForestRegressor draw from that global state.
np.random.seed(0)

# Train the custom random forest and predict on the test set
rf = MyRandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Evaluate with root mean squared error (RMSE), a standard regression metric
rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print("RMSE:", rmse)

# <font color="#CC3D3D">End</font>