## Random Forrest & Decision Forrest

In [1]:
## imports 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
## dictionary for the dataset name & paths 
DATASETS ={
    "small":"../Data/small/glass.data",
    "medium":"../Data/medium/drug_consumption.data",
    "large":"../Data/large/c2k_data_comma.csv"
               }

## Data Split Parameters 
TRAIN_SIZE = 0.8
TEST_SIZE = 1 - TRAIN_SIZE

## Random Forest Parameters
NUM_TREES = 100 ## 1,10,25,50,75,100 
NUM_FEATURES = 10  ## 1,3, int(log_2(M)+1), sqrt(M), where M is the number of features
RANDOM_STATE = 42

In [61]:
def load_dataset(dataset_name):
    """
    Loads the dataset from the dictionary.
    """
    if dataset_name == 'small':
        ## exclude the first column ID (labeled 0 to 10, so use 1-10)
        df = pd.read_csv(DATASETS[dataset_name],header=None)
        df.drop(df.columns[0], axis=1, inplace=True)
        return df

In [62]:
def split_train_test(data: pd.DataFrame, train_size: float) -> pd.DataFrame:
    """
    Splits the data into training and testing sets.
    """
    train_data = data.sample(frac=train_size, random_state=RANDOM_STATE)
    test_data = data.drop(train_data.index)
    return train_data, test_data

In [105]:
## load the data 
small = load_dataset("small")

## split the data 
train, test = split_train_test(small, TRAIN_SIZE)
## split the train into features and labels
X_train, y_train = train.iloc[:,:-1], train.iloc[:,-1]
X_test, y_test = test.iloc[:,:-1], test.iloc[:,-1]

In [64]:
## split a numpy array into two parts 
## the first part is the left branch, the second part is the right branch
def split_data(data: np.array, index: int, value: float) -> (np.array, np.array):
    """
    Splits the dataset based on the column and value.
    """
    left = data[data[:, index] <= value]
    right = data[data[:, index] > value]
    return left, right


In [104]:
## define the entropy function 
def entropy(data:np.array) -> float:
    """
    Calculates the entropy given the following formula: 
    H(x) = -sum_j(p_j * log(p_j))
    """
    ## get the unique counts of the array 
    _, counts = np.unique(data, return_counts=True)
    ## get the probabilities
    probas = counts / data.shape[0]
    ## add the 1e-7 to avoid log(0)
    #return np.sum(-probas * np.log2(probas + 1e-7))
    return -(probas * np.log(probas) + (1 - probas) * np.log(1 - probas)).sum()

## define the gini function 
def gini(data: np.array) -> float:
    """
    Calculates the gini impurity given the following formula:
    G(x) = 1 - sum_j(p_j^2)
    """
    ## get the unique counts of the array 
    _, counts = np.unique(data, return_counts=True)
    ## get the probabilities
    probas = counts / data.shape[0]
    return 1 - np.sum(probas ** 2)

## define the same function but given a probability 
## wheere the input is p(x)

def entropy_proba(proba: float) -> float:
    """
    Calculates the entropy given the following formula:
    H(x) = -sum_j(p_j * log(p_j))
    """
    if proba == 0: 
        return 0
    elif proba == 1:
        return 0 
    else: 
        return -proba * np.log2(proba) - (1 - proba) * np.log2(1 - proba)
    
## define the information gain function 
def information_gain(left_branch, right_branch) -> float:
    """
    Calculates the information gain given the following formula:
    G(x) = H(x) - sum_j(p_j * H(x|j))
    """
    p = len(left_branch) / (len(left_branch) + len(right_branch))
    return entropy(left_branch) + entropy(right_branch) - p * entropy(left_branch) - (1 - p) * entropy(right_branch)

## define a function to draw bootstrap samples 
## from X_train & y_train
def get_bootstrap_samples(X_train:np.array, y_train:np.array) -> np.array:
    """
    Function which gets the bootstrap samples.
    These are samples WITH replacement, thus, the left over samples are 
    Left_Over = lim_x_to_inf [(1 - 1/n)^n] -> e^-1 ~ 1/3
    So for the fitting 2/3 of the observations will be used. 
    
    PARAMS:
    -------
    X_train : training samples of the dataset 
    
    y_train : training labels of the dataset 
    
    RETURNS:
    --------
    X_bootstrap : bootstrap samples for X_train 
    
    y_bootstrap : bootstrap samples for y_train 
    
    x_oob : out of bag samples for X_train 
    
    y_oob : out of bag samples for y_train 
    
    """
    ## first get the indices of the bootstrap samples 
    bootstrap_idx = np.random.choice(range(X_train.shape[0]), size=int(X_train.shape[0]), replace=True)
    ## get the out of bag samples
    oob_idx = np.setdiff1d(range(X_train.shape[0]), bootstrap_idx)
    ## get the bootstrap samples
    x_bootstrap, y_bootstrap = X_train.iloc[bootstrap_idx].values, y_train.iloc[bootstrap_idx].values
    ## get the oob samples 
    x_oob,y_oob = X_train[oob_idx].values, y_train[oob_idx].values
    return x_bootstrap, y_bootstrap, x_oob, y_oob

## define a function to calculate the OOB error
def out_of_bag_error(x_test:np.array, y_test:np.array, model:object) -> float:
    """
    Calculates the out of bag error given the following formula:
    E_OOB = 1/n * sum_i(1 - p(x_i|y_i))
    """
    ## get the predictions for each item in the test set
    preds = np.array([predict_samples(model, i) for i in x_test])
    ## get the sum of the mismatches 
    mismatch = np.sum(preds != y_test)
    ## return the error
    return mismatch / y_test.shape[0]

## define a function to find the best split point 
## it should select m features at random 
## for each feature in the bootstrapped samples, it calculates the information gain
## returns a dictionary with: feature index, split value, left_branch, right_branch

def best_split_finder(X_bootstrap: np.array, y_bootstrap: np.array, num_features: int) -> dict:
    """
    Calculates the best split for each feature in the bootstrapped samples.
    
    PARAMS: 
    X_bootstrap : training bootstrapped samples 
    
    y_bootstrap : training labels boostrapped samples
    
    """

In [106]:
get_bootstrap_samples(X_train, y_train)

KeyError: '[0, 10, 11, 13, 16, 17, 19, 20, 21, 22, 25, 28, 29, 31, 35, 36, 37, 40, 41, 44, 45, 47, 48, 49, 50, 51, 52, 53, 55, 57, 59, 60, 62, 63, 64, 66, 67, 70, 71, 73, 75, 76, 78, 79, 82, 83, 84, 86, 87, 89, 91, 92, 93, 94, 95, 96, 97, 98, 101, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 123, 124, 125, 126, 127, 131, 132, 136, 138, 141, 144, 146, 149, 151, 152, 153, 154, 155, 156, 162, 164, 165, 166, 167, 168, 169] not in index'