In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('diabetes.csv')

In [3]:
# Preview the loaded frame to check its columns and values.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# The 'Unnamed: 0' column only repeats the row number (see the preview above),
# so it must not be used as a feature. The last column ('Outcome') is the target.
feature_cols = [c for c in df.columns[:-1] if not str(c).startswith('Unnamed')]
X = df[feature_cols]
Y = df.iloc[:,-1]

In [5]:
# Hold out 20% of the rows for testing; random_state=6 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=6)

### First, let's try a plain DecisionTreeClassifier.

In [6]:
# Seed the baseline tree so its score is the same on every Restart & Run All.
dtc = DecisionTreeClassifier(random_state=6)

In [7]:
# Train the single tree on the training split.
dtc.fit(X_train,Y_train)

DecisionTreeClassifier()

In [8]:
# Predict labels for the held-out rows.
Y_pred = dtc.predict(X_test)

In [9]:
# Share of held-out rows that the single tree labels correctly.
accuracy_score(Y_test,Y_pred)

0.7662337662337663

### A custom `RandomForest` class

In [10]:
class RandomForest():
    """Bagging ensemble of scikit-learn decision trees combined by majority vote.

    Every tree is fitted on rows drawn at random *with replacement* from the
    training data; `predict` returns, for each sample, the label that most
    trees voted for.
    """

    def __init__(self,n_estimators=100,
                 criterion='gini',
                 max_depth=None,
                 splitter='best',
                 max_samples=None,
                 max_features=None,):
        """
        n_estimators : int, default=100
            The number of trees in the forest.

        criterion : {"gini", "entropy"}, default="gini"
            The function to measure the quality of a split. Forwarded
            unchanged to every DecisionTreeClassifier.

        max_depth : int, default=None
            The maximum depth of each tree. Forwarded unchanged to every
            DecisionTreeClassifier.

        splitter : {"best", "random"}, default="best"
            The strategy used to choose the split at each node. Forwarded
            unchanged to every DecisionTreeClassifier.

        max_samples : int or float, default=None
            Number of rows drawn (with replacement) to train each tree:

            - If None (default), then draw `X.shape[0]` samples.
            - If int, then draw `max_samples` samples (must be >= 1).
            - If float, then draw `int(max_samples * X.shape[0])` samples
              (at least one). Values outside `[0, 1]` are clamped into it.

        max_features : int, float, str or None, default=None
            The number of features to consider when looking for the best
            split. Forwarded unchanged to every DecisionTreeClassifier, so the
            accepted values are whatever the installed scikit-learn accepts.

        Raises
        ------
        TypeError
            If `max_samples` is not None, an int, or a float.
        ValueError
            If `max_samples` is an int smaller than 1.
        """

        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.splitter = splitter
        self.max_features = max_features

        if max_samples is None:
            self.max_samples = None
        elif isinstance(max_samples, bool):
            # bool is a subclass of int, but it is not a meaningful sample count.
            raise TypeError('Max_sample must be None, int, or float')
        elif isinstance(max_samples, (int, np.integer)):
            if max_samples < 1:
                raise ValueError('max_samples must be at least 1 when given as an int')
            self.max_samples = int(max_samples)
        elif isinstance(max_samples, (float, np.floating)):
            # Clamp into [0, 1] but keep the value a float: an int 0/1 would later
            # be read as "draw 0 / 1 rows" instead of "draw 0% / 100% of the rows".
            self.max_samples = float(min(max(max_samples, 0.0), 1.0))
        else:
            raise TypeError('Max_sample must be None, int, or float')

        # One independent, not yet fitted tree per estimator.
        self.DTCs = [DecisionTreeClassifier(criterion=self.criterion,
                                            splitter=self.splitter,
                                            max_depth=self.max_depth,
                                            max_features=self.max_features)
                     for _ in range(self.n_estimators)]

    def ndarray_check(self,arr):
        """Return `arr` as a numpy array (DataFrame/Series -> `.values`, other array-likes -> `np.asarray`)."""
        if isinstance(arr, np.ndarray):
            return arr
        if isinstance(arr, (pd.DataFrame, pd.Series)):
            return arr.values
        # Lists, tuples, etc. used to fall through and come back as None.
        return np.asarray(arr)

    def random_indices(self,data):
        """Draw the row indices (with replacement) used to train one tree.

        The number of indices follows `self.max_samples` and is always
        computed from `data` itself, never from notebook globals.

        Raises
        ------
        ValueError
            If `data` has no rows.
        """
        n_rows = data.shape[0]
        if n_rows == 0:
            raise ValueError('Cannot draw samples from empty data')

        if self.max_samples is None:
            # Classic bootstrap: as many draws as there are rows.
            n_draws = n_rows
        elif isinstance(self.max_samples, (int, np.integer)):
            n_draws = int(self.max_samples)
        else:
            # Fraction of the rows, but never an empty sample.
            n_draws = max(1, int(n_rows * self.max_samples))

        return np.random.randint(low=0,high=n_rows,size=n_draws)

    def decide(self,arr):
        """Majority vote over trees.

        `arr` holds one row of predictions per tree. The result holds, for
        every sample (column), the label with the most votes; ties go to the
        smallest label because `np.unique` returns labels sorted.
        """
        votes = np.asarray(arr)
        winners = []
        for sample_votes in votes.T:
            labels, counts = np.unique(sample_votes, return_counts=True)
            winners.append(labels[np.argmax(counts)])
        return np.array(winners)

    def fit(self,X,Y):
        """Fit every tree on its own bootstrap sample of `(X, Y)` and return `self`.

        Raises
        ------
        ValueError
            If `X` and `Y` do not have the same number of rows.
        """
        X = self.ndarray_check(X)
        Y = self.ndarray_check(Y)
        if X.shape[0] != Y.shape[0]:
            raise ValueError('X and Y must contain the same number of samples')

        for tree in self.DTCs:
            rows = self.random_indices(X)
            tree.fit(X[rows], Y[rows])

        return self

    def predict(self,X):
        """Predict a label for every row of `X` by majority vote of the fitted trees."""
        X = self.ndarray_check(X)
        per_tree = np.array([tree.predict(X) for tree in self.DTCs])
        return self.decide(per_tree)

In [11]:
# Custom forest with the default 100 trees; max_samples=0.7 requests a 70% random sample per tree.
test = RandomForest(max_samples=0.7)

In [12]:
# Fit every tree of the custom forest on the training split.
test.fit(X_train,Y_train)

<__main__.RandomForest at 0x1f85170e6c8>

In [13]:
# Majority-vote predictions of the custom forest for the held-out rows.
y_pred = test.predict(X_test)

In [14]:
# Accuracy of the custom forest on the held-out rows.
accuracy_score(Y_test,y_pred)

0.8051948051948052

### Now, let's try the actual RandomForestClassifier from sklearn

In [15]:
# Seed the reference forest so the comparison score is the same on every Restart & Run All.
rfc = RandomForestClassifier(max_samples=0.7, random_state=6)

In [16]:
# Train scikit-learn's forest on the same training split.
rfc.fit(X_train,Y_train)

RandomForestClassifier(max_samples=0.7)

In [17]:
# Note: this rebinds y_pred, replacing the custom forest's predictions from the earlier cell.
y_pred = rfc.predict(X_test)

In [18]:
# Accuracy of scikit-learn's forest on the held-out rows.
accuracy_score(Y_test,y_pred)

0.7792207792207793