In [None]:
#step 1 : bootstrap -> sampling some data from the given data with replacement
#step 2 : select K(2) features


In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [None]:
# Toy training set: four samples, two integer features per sample.
x = np.array([
    [3, 7],
    [1, 8],
    [4, 5],
    [2, 6],
])
# One binary class label per row of x.
y = np.array([1, 0, 1, 0])

In [None]:
# Reference model from scikit-learn; n_estimators is the number of trees.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(x, y)

In [None]:
# Predict the labels of the same samples the model was fitted on.
pred = clf.predict(x)
pred

array([1, 0, 1, 0])

In [None]:
def accuracy(y_true,y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        Mean of the element-wise equality between the two inputs.

    Notes
    -----
    Both inputs are converted with ``np.asarray`` first. Without that, two
    plain Python lists would be compared as whole objects (a single bool),
    giving 0.0 or 1.0 instead of the per-element match rate.
    The comparison follows NumPy broadcasting rules, so callers should pass
    inputs of the same length to get a meaningful accuracy.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_true == y_pred)
# Score the predictions currently held in `pred` against the labels in `y`.
acc = accuracy(y_true=y, y_pred=pred)
acc

1.0

In [None]:
# Ask the fitted model about one sample with features (1, 2).
# This rebinds `pred`, replacing whatever it held before.
pred = clf.predict([[1, 2]])
pred

array([0])

In [None]:
# NOTE(review): when the cells run in order, `pred` here is the single
# prediction for [[1, 2]]. Comparing it with the four labels in `y` broadcasts,
# so the value is the share of `y` equal to that one prediction rather than an
# accuracy for the new sample (which has no known label) — confirm the intent.
acc1 = accuracy(y_true=y, y_pred=pred)
acc1

0.5

#without using inbuilt functions

In [None]:
from math import inf
#pseudo code of the node
''' class Node:
Attributes
 . feature: the feature index used for splitting
 . threshold: the value to split the feature on.

'''

class Node:
  """One node of a binary decision tree.

  An internal node stores the split (``feature`` index and ``threshold``) and
  its two children (``left``, ``right``). A leaf stores the predicted label in
  ``value``, which can only be passed by keyword.
  """

  def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
    # Split description (left as None on leaves).
    self.feature, self.threshold = feature, threshold
    # Child nodes (left as None on leaves).
    self.left, self.right = left, right
    # Predicted label (left as None on internal nodes).
    self.value = value

  def is_leaf_node(self):
    """Return True when this node carries a value, i.e. ``value`` is not None."""
    has_no_value = self.value is None
    return not has_no_value
class DecisionTree:
  """Binary decision-tree classifier that splits on numeric thresholds.

  Every internal node tests ``x[feature] <= threshold``. The split chosen at a
  node is the one with the highest information gain, measured with natural-log
  entropy. A node becomes a leaf (predicting its most common label) when it is
  pure, when ``max_depth`` is reached, or when no feature can separate the
  remaining samples.

  Parameters
  ----------
  max_depth : int or None, default None
      Maximum depth of the tree; ``None`` grows until one of the other
      stopping rules applies.
  """

  def __init__(self, max_depth=None):
    self.max_depth = max_depth
    self.root = None

  def fit(self, X, y):
    """Build the tree from samples ``X`` (n_samples, n_features) and labels ``y``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, ``y`` is not 1-D with one label per row of ``X``,
        or the data set is empty.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2:
      raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    if y.ndim != 1 or y.shape[0] != X.shape[0]:
      raise ValueError("y must be a 1-D array with one label per row of X")
    if X.shape[0] == 0:
      raise ValueError("cannot fit a decision tree on an empty data set")
    self.root = self._build_tree(X, y)

  def _most_common_label(self, y):
    """Return the most frequent label in ``y`` (smallest label wins ties)."""
    unique_labels, counts = np.unique(y, return_counts=True)
    # argmax returns the index of the first maximum count
    return unique_labels[np.argmax(counts)]

  def _build_tree(self, X, y, depth=0):
    """Recursively grow the subtree for the samples ``X``/``y`` at ``depth``."""
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))
    depth_exhausted = self.max_depth is not None and depth >= self.max_depth
    if n_labels == 1 or depth_exhausted:
      return Node(value=self._most_common_label(y))

    feat_idxs = np.arange(n_features)
    best_feature, best_threshold = self._best_split(X, y, feat_idxs)
    if best_feature is None:
      # No feature has two distinct values here (e.g. identical rows with
      # different labels), so the node cannot be split any further.
      return Node(value=self._most_common_label(y))

    left_idxs, right_idxs = self._split(X[:, best_feature], best_threshold)
    # Both index sets are non-empty (guaranteed by _best_split), so each
    # recursive call works on strictly fewer samples and recursion terminates.
    left = self._build_tree(X[left_idxs, :], y[left_idxs], depth + 1)
    right = self._build_tree(X[right_idxs, :], y[right_idxs], depth + 1)
    return Node(best_feature, best_threshold, left, right)

  def _split(self, X_column, split_thresh):
    """Return the row indices going left (``<= split_thresh``) and right (``>``)."""
    left_idxs = np.argwhere(X_column <= split_thresh).flatten()
    right_idxs = np.argwhere(X_column > split_thresh).flatten()
    return left_idxs, right_idxs

  def _best_split(self, X, y, feat_idxs):
    """Search every feature in ``feat_idxs`` for the highest-gain threshold.

    Returns
    -------
    (feature index, threshold), or ``(None, None)`` when no candidate leaves
    samples on both sides.
    """
    best_gain, split_idx, split_thresh = -float('inf'), None, None
    for feat_idx in feat_idxs:
      X_column = X[:, feat_idx]
      # Midpoints between consecutive *distinct* values: duplicates would only
      # repeat candidates or place the threshold on a data value.
      distinct = np.unique(X_column)
      thresholds = (distinct[:-1] + distinct[1:]) / 2
      for threshold in thresholds:
        # Skip candidates that would leave one side empty (can only happen
        # through floating-point rounding of the midpoint).
        if not (distinct[0] <= threshold < distinct[-1]):
          continue
        gain = self._information_gain(y, X_column, threshold)
        if gain > best_gain:
          best_gain = gain
          split_idx = feat_idx
          split_thresh = threshold
    # Return only after ALL features have been examined.
    return split_idx, split_thresh

  def _information_gain(self, y, X_column, threshold):
    """Return parent entropy minus the size-weighted entropy of the children.

    A split that leaves one side empty yields a gain of 0.
    """
    left_idxs, right_idxs = self._split(X_column, threshold)
    if len(left_idxs) == 0 or len(right_idxs) == 0:
      return 0
    n, n_l, n_r = len(y), len(left_idxs), len(right_idxs)
    e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
    child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
    return self._entropy(y) - child_entropy

  def _entropy(self, y):
    """Return the natural-log Shannon entropy of the labels in ``y``.

    Works for any number of classes and any label values; for 0/1 labels it
    equals ``-p*log(p) - (1-p)*log(1-p)`` with ``p = mean(y)``.
    """
    _, counts = np.unique(y, return_counts=True)
    if len(counts) <= 1:
      return 0.0
    probs = counts / counts.sum()
    return -np.sum(probs * np.log(probs))

  def predict(self, X):
    """Return an array with the predicted label for every row of ``X``.

    Raises
    ------
    RuntimeError
        If the tree has not been fitted yet.
    """
    if self.root is None:
      raise RuntimeError("DecisionTree.predict called before fit")
    return np.array([self._traverse_tree(x, self.root) for x in X])

  def _traverse_tree(self, x, node):
    """Walk from ``node`` down to a leaf following ``x`` and return its value."""
    while not node.is_leaf_node():
      node = node.left if x[node.feature] <= node.threshold else node.right
    return node.value

class RandomForest:
  """Bagging ensemble of ``DecisionTree`` classifiers combined by majority vote.

  Parameters
  ----------
  n_trees : int, default 10
      Number of trees to grow.
  n_features : int or None, default None
      Stored on the instance but not used by this implementation: every tree
      is trained on all features.
  random_state : int or None, default None
      Seed for the bootstrap sampling. ``None`` draws from NumPy's global
      generator (so ``np.random.seed`` still controls it); an integer makes
      every call to ``fit`` reproducible on its own.
  """

  def __init__(self, n_trees=10, n_features=None, random_state=None):
    self.n_trees = n_trees
    self.n_features = n_features
    self.random_state = random_state
    self.trees = []
    self._rng = self._make_rng()

  def _make_rng(self):
    """Return the object whose ``choice`` method draws the bootstrap indices."""
    if self.random_state is None:
      return np.random
    return np.random.RandomState(self.random_state)

  def _most_common_label(self, y):
    """Return the most frequent label in ``y`` (smallest label wins ties)."""
    unique_labels, counts = np.unique(y, return_counts=True)
    return unique_labels[np.argmax(counts)]

  def fit(self, X, y):
    """Train ``n_trees`` trees, each on its own bootstrap sample of ``X``/``y``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
      raise ValueError("X and y must contain the same number of samples")
    # Restart the generator so repeated fits with a fixed seed are identical.
    self._rng = self._make_rng()
    self.trees = []
    for _ in range(self.n_trees):
      tree = DecisionTree()
      X_sample, y_sample = self._bootstrap_sample(X, y)
      tree.fit(X_sample, y_sample)
      self.trees.append(tree)

  def _bootstrap_sample(self, X, y):
    """Draw ``len(X)`` rows with replacement and return the matching ``X``/``y``."""
    n_samples = X.shape[0]
    idxs = self._rng.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]

  def predict(self, X):
    """Return the majority-vote label of the trees for every row of ``X``.

    Raises
    ------
    RuntimeError
        If the forest holds no fitted trees.
    """
    if not self.trees:
      raise RuntimeError("RandomForest.predict called before fit")
    # Shape (n_trees, n_samples): one row of predictions per tree.
    preds = np.array([tree.predict(X) for tree in self.trees])
    # Shape (n_samples, n_trees): one row of votes per sample.
    tree_preds = np.swapaxes(preds, 0, 1)
    preds = np.array([self._most_common_label(tree_pred) for tree_pred in tree_preds])
    return preds

In [None]:
# The forest's bootstrap sampling draws from NumPy's global random generator,
# so seed it here to make this cell give the same trees on every run.
np.random.seed(42)
clf = RandomForest()
clf.fit(x, y)

In [None]:
# Accuracy of the hand-written forest on the samples it was trained on.
acc = accuracy(y_true=y, y_pred=clf.predict(x))
acc

0.5