In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone


class RandomForestClassifier():
    def __init__(self, n_estimators=100,
                 criterion='gini',
                 splitter='best',
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 max_features=None,
                 random_state=None):
        
        self.n_estimators=n_estimators
        self.criterion=criterion
        self.splitter=splitter
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split
        self.min_samples_leaf=min_samples_leaf
        self.max_features=max_features
        self.random_state=random_state
        
        base_estimators=DecisionTreeClassifier(criterion=self.criterion,
                                              splitter=self.splitter,
                                              max_depth=self.max_depth,
                                              min_samples_split=self.min_samples_split,
                                              min_samples_leaf=self.min_samples_leaf,
                                              max_features=self.max_features,
                                              random_state=self.random_state) 
        np.random.seed(random_state)
        self.trees = [clone(base_estimators) for i in range(self.n_estimators)]  

            
    def get_bootstrap(self, X, Y):
        self.data_index=[]
        m,n=X.shape
        if self.max_features == None:
            self.max_features = int(np.sqrt(n))
        for i in range(self.n_estimators):
            id_x=np.random.choice(m, m, replace=True)  #这里抽取数据允许重复
            id_f=np.random.choice(n, self.max_features, replace=False)  #获取随机特征
            
            self.data_index.append([id_x,id_f])
                       
    def fit(self, X, Y):
        self.get_bootstrap(X, Y)
        for i in range(self.n_estimators):
            id_x, id_f = self.data_index[i]
            self.trees[i].fit(X[id_x][:,id_f], Y[id_x])
        return self

    def predict(self, X):
        y_preds = []
        for i in range(self.n_estimators):
            idx = self.data_index[i][1]
            sub_X = X[:, idx]
            y_pre = self.trees[i].predict(sub_X)
            y_preds.append(y_pre)
        y_preds = np.array(y_preds).T
        y_pred = []
        for y_p in y_preds:
            y_pred.append(np.bincount(y_p.astype('int')).argmax())
        return np.array(y_pred)

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score

X,y=load_breast_cancer().data,load_breast_cancer().target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
model=RandomForestClassifier(random_state=7).fit(X,y)
accuracy_score(y_test,model.predict(X_test))

1.0