# Imports

In [1]:
import pandas as pd
import logging
import numpy as np
from sklearn.model_selection import train_test_split
import json


In [2]:
# Set the root logger's level to INFO so INFO-and-above records pass its level check.
logging.getLogger().setLevel(logging.INFO)

# Classes

## Cart learner

In [3]:
class Cart:
  """CART-style binary classification tree built on median splits.

  Every internal node is a dict with the keys 'gini', 'feature',
  'middle_point', 'depth', 'l' and 'r'. 'l' and 'r' hold either another node
  dict or a class label (a leaf). Candidate features are drawn with
  ``np.random``, so results depend on the global NumPy random state.
  """

  def __init__(self, x_train, y_train, n_features, n_depth, recalculate_features=True):
    """Store the training data and draw the initial feature subset.

    Args:
      x_train: 2-D array of samples, indexed as ``x_train[rows, feature]``.
      y_train: 1-D array of labels aligned with the rows of ``x_train``.
      n_features: number of candidate features evaluated per node.
      n_depth: maximum depth of the tree (the root is at depth 1).
      recalculate_features: when true, a fresh random feature subset is drawn
        for every node; otherwise the subset drawn here is reused and each
        feature chosen for a split is removed from it.
    """
    self.x_train = x_train
    self.y_train = y_train
    self.labels = np.unique(y_train)
    self.n_features = n_features
    self.n_depth = n_depth
    self.recalculate_features = recalculate_features
    self.root = None
    # Start from an empty pool so _get_features() draws the first subset.
    self.features = np.array([])
    self.features = self._get_features()

  def train(self):
    """Build the tree from all training rows and return its root node dict."""
    instances = np.arange(self.x_train.shape[0])
    root = self._find_node(instances)
    self.root = self._expand_tree(root, 1)
    return self.root

  def predict(self, x_pred):
    """Return the predicted label for one sample (indexable by feature index).

    Raises:
      RuntimeError: if the tree has not been trained yet.
    """
    return self._predict(self.root, x_pred)

  def _predict(self, node, x_pred):
    """Walk down from ``node`` until a leaf label is reached and return it."""
    if node is None:
      logging.error('Predicting node is none: %s', node)
      # Fail with an explicit message instead of an opaque subscript error.
      raise RuntimeError('Cart tree is not trained; call train() before predict()')

    # Children are node dicts; anything else is a leaf label.
    while isinstance(node, dict):
      side = 'l' if x_pred[node['feature']] < node['middle_point'] else 'r'
      node = node[side]

    return node

  def _build_leaf(self, branch):
    """Return the most frequent label among the training rows in ``branch``."""
    lbls, clbls = np.unique(self.y_train[branch], return_counts=True)
    return lbls[np.argmax(clbls)]

  def _expand_tree(self, node, depth):
    """Recursively replace ``node['branch']`` with child nodes or leaf labels.

    Args:
      node: node dict produced by ``_find_node`` (still carrying 'branch').
      depth: depth of ``node``; the root is depth 1.

    Returns:
      The same ``node`` dict, now with 'depth', 'l' and 'r' filled in.
    """
    l, r = node['branch']
    node['depth'] = depth
    del node['branch']

    # A degenerate split cannot be refined further: both sides share one leaf.
    if l.size == 0 or r.size == 0:
      node['l'] = node['r'] = self._build_leaf(np.concatenate((l, r)))
      return node

    if depth >= self.n_depth:
      logging.debug('Max depth reached')
      node['l'] = self._build_leaf(l)
      node['r'] = self._build_leaf(r)
      return node

    node['l'] = self._find_node(l)
    self._expand_tree(node['l'], depth + 1)
    node['r'] = self._find_node(r)
    self._expand_tree(node['r'], depth + 1)

    return node

  def _get_features(self):
    """Return the feature indices to evaluate for the next node.

    A fresh random subset of ``n_features`` columns is drawn when
    ``recalculate_features`` is set or when the stored pool is empty;
    otherwise the stored pool is returned.
    """
    if self.recalculate_features or self.features.size == 0:
      return np.random.permutation(self.x_train.shape[1])[:self.n_features]

    return self.features

  def _clear_feature(self, idx):
    """Remove feature ``idx`` from the stored pool (no-op when recalculating)."""
    if self.recalculate_features:
      return

    self.features = self.features[self.features != idx]

  def _find_node(self, instances):
    """Pick the candidate feature whose median split has the lowest Gini index.

    Args:
      instances: array of row indices into the training data.

    Returns:
      A node dict with the keys 'gini', 'branch' ([left_rows, right_rows]),
      'feature' and 'middle_point'.
    """
    eval_features = self._get_features()
    best_feature, best_gini, best_branch, best_value = -1, 1, None, None

    logging.debug('Features to consider for node: %s', eval_features)
    for feature in eval_features:
      branches, value = self._binary_split(feature, instances)
      logging.debug('Evaluating idx: %s', feature)
      curr_gini = self._gini_index(branches)
      logging.debug('Current gini value: %s', curr_gini)

      # The first candidate is always accepted so a split is always returned.
      if best_value is None or curr_gini < best_gini:
        best_gini = curr_gini
        best_branch = branches
        best_feature = feature
        best_value = value

    logging.debug('Best node as: best_feature %s, best_value: %s, best_gini: %s, best_branch: %s', best_feature, best_value, best_gini, best_branch)
    self._clear_feature(best_feature)
    return {'gini': best_gini, 'branch': best_branch, 'feature': best_feature, 'middle_point': best_value}

  def _binary_split(self, feature, instances):
    """Split ``instances`` at the median value of ``feature``.

    Assumes continuous data.

    Returns:
      ``([left_rows, right_rows], middle_point)`` where both row arrays hold
      indices into the training data (not positions within ``instances``).
    """
    instances = np.asarray(instances)
    values = self.x_train[instances, feature]
    middle_point = np.sort(values)[values.shape[0] // 2]

    # Map the boolean masks back onto the original row indices; positions
    # within the subset would point at unrelated rows below the root.
    l = instances[values < middle_point]
    r = instances[values >= middle_point]

    return [l, r], middle_point

  def _gini_index(self, branches):
    """Return the size-weighted Gini impurity of the two row-index arrays."""
    n_instances = len(branches[0]) + len(branches[1])
    gini_score = 0.0

    logging.debug('Evaluating gini on branch: %s', branches)
    for branch in branches:
      # An empty branch contributes nothing to the weighted impurity.
      if branch.size == 0:
        continue

      _, all_p = np.unique(self.y_train[branch], return_counts=True)
      branch_score = np.sum(np.power(all_p / branch.size, 2))
      gini_score += (1.0 - branch_score) * (branch.size / n_instances)
      logging.debug('Accumulated gini: %s', gini_score)
    return gini_score


## Random forest

In [4]:
class RandomForest:
  """Ensemble of independently built learners combined by majority vote."""

  def __init__(self, learner, n_trees, n_features, n_depth, x_train, y_train, sampling=True, recalculate_features=True):
    """Store the configuration and construct the (untrained) learners.

    Args:
      learner: class (or factory) called as
        ``learner(x, y, n_features, n_depth, recalculate_features=...)``; the
        returned object must provide ``train()`` and ``predict(x)``.
      n_trees: number of learners to build.
      n_features: forwarded to every learner.
      n_depth: forwarded to every learner.
      x_train: training samples, indexable by an array of row indices.
      y_train: training labels aligned with ``x_train``.
      sampling: when true, every learner receives its own random row sample;
        otherwise every learner receives the full training set.
      recalculate_features: forwarded to every learner.
    """
    self.learner = learner
    self.n_trees = n_trees
    self.n_features = n_features
    self.n_depth = n_depth
    self.x_train = x_train
    self.y_train = y_train
    self.sampling = sampling
    self.recalculate_features = recalculate_features
    self.trees = []
    # Pass the caller's flags through; otherwise they would be ignored.
    self._init_learners(sampling, recalculate_features)

  def _init_learners(self, sampling=True, recalculate_features=True):
    """Build ``n_trees`` learners and store them in ``self.trees``."""
    trees = []

    for _ in range(self.n_trees):
      x_train_sample, y_train_sample = self._get_sampling(sampling)
      trees.append(self.learner(
          x_train_sample,
          y_train_sample,
          self.n_features,
          self.n_depth,
          recalculate_features=recalculate_features,
      ))

    self.trees = trees

  def _get_sampling(self, sampling=True):
    """Return the ``(x, y)`` training data for one learner.

    With ``sampling`` enabled, ``int(0.8 * n_rows)`` row indices are drawn
    with ``np.random.choice`` (with replacement); otherwise the full training
    set is returned unchanged.
    """
    if not sampling:
      return self.x_train, self.y_train

    n_rows = self.x_train.shape[0]
    sample_idxs = np.random.choice(n_rows, int(n_rows * 0.8))
    return self.x_train[sample_idxs], self.y_train[sample_idxs]

  def train(self):
    """Train every learner in ``self.trees``."""
    for curr_tree in self.trees:
      curr_tree.train()

  def predict(self, x_pred):
    """Return ``(label, votes)`` for the label predicted by the most learners.

    On a tie, the label that sorts first in ``np.unique`` wins.
    """
    y_pred = [curr_tree.predict(x_pred) for curr_tree in self.trees]

    labels, counts = np.unique(y_pred, return_counts=True)
    popular = np.argmax(counts)

    return labels[popular], counts[popular]

## Decision forest

In [5]:
class DecisionForest(RandomForest):
  """RandomForest subclass whose constructor defaults to sampling=False and recalculate_features=False."""

  def __init__(self, learner, n_trees, n_features, n_depth, x_train, y_train, sampling=False, recalculate_features=False):
    """Forward every argument unchanged to ``RandomForest.__init__``."""
    super().__init__(
        learner,
        n_trees,
        n_features,
        n_depth,
        x_train,
        y_train,
        sampling=sampling,
        recalculate_features=recalculate_features,
    )


# New Section

## Utils

In [6]:
def evaluate_model(y_true, y_pred):
  """Return the accuracy: the fraction of predictions equal to the true labels.

  Args:
    y_true: array-like of true labels.
    y_pred: array-like of predicted labels, same shape as ``y_true``.

  Returns:
    Number of matching positions divided by the number of labels.

  Raises:
    ValueError: if the two inputs do not have the same shape.
    ZeroDivisionError: if the inputs are empty.
  """
  # Accept plain lists as well as arrays; lists have no ``.size``.
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  if y_true.shape != y_pred.shape:
    raise ValueError(
        'y_true and y_pred must have the same shape, got %s and %s'
        % (y_true.shape, y_pred.shape))

  return np.count_nonzero(y_true == y_pred) / y_true.size

In [7]:
# Root directory; load_ds reads from BASE_PATH/SIZE/ and write_results writes to BASE_PATH/out/.
BASE_PATH = "./Data"
# Sub-directory of BASE_PATH that load_ds reads dataset files from.
SIZE='small'
# Dataset file name passed to the module-level load_ds(FN) call.
FN = "glass.data"

def load_ds(file_name):
  """Read a header-less CSV and split it 90/10 into train and test sets.

  The file is read from ``BASE_PATH/SIZE/file_name``. The last column is used
  as the target and all other columns as inputs. The split uses
  ``random_state=42`` and every returned part has its index reset.

  Returns:
    ``(df, X_train, X_test, Y_train, Y_test)`` where ``df`` is the full frame.
  """
  csv_path = BASE_PATH + f'/{SIZE}/' + file_name
  df = pd.read_csv(csv_path, header=None)

  features, target = df.iloc[:, :-1], df.iloc[:, -1]
  parts = train_test_split(features, target, test_size=0.1, random_state=42)
  X_train, X_test, Y_train, Y_test = [part.reset_index(drop=True) for part in parts]

  return df, X_train, X_test, Y_train, Y_test

# Load the dataset named by FN when this cell runs and keep the parts in module-level names.
df, X_train, X_test, Y_train, Y_test = load_ds(FN)

In [9]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Type checks use NumPy's abstract scalar base classes rather than
    individual aliases, so the encoder does not reference names such as
    ``np.float_`` / ``np.complex_`` that are absent from NumPy 2.0.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Conversions: integer scalars -> ``int``; floating scalars ->
        ``float``; complex scalars -> ``{'real': ..., 'imag': ...}``;
        arrays -> nested lists; ``np.bool_`` -> ``bool``; ``np.void`` ->
        ``None``.

        Raises:
            TypeError: (from the base class) for any other unsupported type.
        """
        if isinstance(obj, np.integer):
            return int(obj)

        if isinstance(obj, np.floating):
            return float(obj)

        if isinstance(obj, np.complexfloating):
            return {'real': float(obj.real), 'imag': float(obj.imag)}

        if isinstance(obj, np.ndarray):
            return obj.tolist()

        if isinstance(obj, np.bool_):
            return bool(obj)

        if isinstance(obj, np.void):
            return None

        return super().default(obj)

def write_results(results, dsname, algorithm):
  """Dump ``results`` as JSON to ``BASE_PATH/out/<dsname>_<algorithm>.json``.

  NumPy values inside ``results`` are serialized through ``NumpyEncoder``.
  """
  target = BASE_PATH + '/out/' + dsname + '_' + algorithm + '.json'
  with open(target, 'w') as out_file:
    json.dump(results, out_file, cls=NumpyEncoder)



# Execution

In [10]:
def random_forest_evaluation(x_train, y_train, x_pred, y_true):
  """Train and score a RandomForest of Cart trees for a grid of settings.

  The grid is every combination of tree count (1, 10, 25, 50, 75, 100) and
  features-per-node (1, 3, int(log2(F) + 1), int(sqrt(F))), where F is the
  number of columns of ``x_train``. Tree depth is fixed at 10.

  Returns:
    A list with one dict per configuration holding 'n_tree', 'n_features',
    'tree_depth' and 'accuracy' (as computed by ``evaluate_model``).
  """
  depth = 10
  total_features = x_train.shape[1]
  tree_counts = [1, 10, 25, 50, 75, 100]
  feature_counts = [1, 3, int(np.log2(total_features) + 1), int(np.sqrt(total_features))]

  results = []
  grid = [(trees, feats) for trees in tree_counts for feats in feature_counts]
  for trees, feats in grid:
    logging.info('Building n_trees: %s, n_features: %s, depth: %s', trees, feats, depth)
    model = RandomForest(Cart, trees, feats, depth, x_train, y_train)
    model.train()

    # predict() returns (label, votes); only the label is scored.
    y_pred = [model.predict(sample)[0] for sample in x_pred]

    results.append({
        'n_tree': trees,
        'n_features': feats,
        'tree_depth': depth,
        'accuracy': evaluate_model(y_true, y_pred),
    })

  return results


In [11]:
def decision_forest_evaluation(x_train, y_train, x_pred, y_true):
  """Train and score a DecisionForest of Cart trees for a grid of settings.

  The grid is every combination of tree count (1, 10, 25, 50, 75, 100) and
  feature count (int(F/4) + 1, int(F/2), int(3F/4)), where F is the number of
  columns of ``x_train``. Tree depth is fixed at 10. Each forest is built
  with ``sampling=False`` and ``recalculate_features=False``.

  Returns:
    A list with one dict per configuration holding 'n_tree', 'n_features',
    'tree_depth', 'accuracy', 'y_pred' and 'y_true'.
  """
  depth = 10
  total_features = x_train.shape[1]
  tree_counts = [1, 10, 25, 50, 75, 100]
  feature_counts = [
      int(total_features/4)+1,
      int(total_features/2),
      int(3*total_features/4),
  ]

  results = []
  grid = [(trees, feats) for trees in tree_counts for feats in feature_counts]
  for trees, feats in grid:
    logging.info('Building n_trees: %s, n_features: %s, depth: %s', trees, feats, depth)
    model = DecisionForest(Cart, trees, feats, depth, x_train, y_train, sampling=False, recalculate_features=False)
    model.train()

    # predict() returns (label, votes); only the label is scored.
    y_pred = [model.predict(sample)[0] for sample in x_pred]

    results.append({
        'n_tree': trees,
        'n_features': feats,
        'tree_depth': depth,
        'accuracy': evaluate_model(y_true, y_pred),
        'y_pred': y_pred,
        'y_true': y_true,
    })

  return results


In [12]:
def run_small():
  """Evaluate both forests on 'glass.data' and write each result set to JSON."""
  file_name = 'glass.data'
  _, X_train, X_test, Y_train, Y_test = load_ds(file_name)

  # Convert once; both evaluations receive the same four arrays.
  arrays = (X_train.to_numpy(), Y_train.to_numpy(), X_test.to_numpy(), Y_test.to_numpy())
  results_random = random_forest_evaluation(*arrays)
  results_decision = decision_forest_evaluation(*arrays)

  for algorithm, results in (('random', results_random), ('decision', results_decision)):
    write_results(results, 'glass', algorithm)


In [None]:
def run_data_banknote_authentication():
  """Evaluate both forests on the banknote CSV and write each result set to JSON."""
  file_name = 'data_banknote_authentication.csv'
  _, X_train, X_test, Y_train, Y_test = load_ds(file_name)

  # Convert once; both evaluations receive the same four arrays.
  arrays = (X_train.to_numpy(), Y_train.to_numpy(), X_test.to_numpy(), Y_test.to_numpy())
  results_random = random_forest_evaluation(*arrays)
  results_decision = decision_forest_evaluation(*arrays)

  for algorithm, results in (('random', results_random), ('decision', results_decision)):
    write_results(results, 'data_banknote_authentication', algorithm)


In [16]:
# Run the random-forest and decision-forest evaluations on the glass dataset.
run_small()

INFO:root:Building n_trees: 1, n_features: 1, depth: 10
INFO:root:Building n_trees: 1, n_features: 3, depth: 10
INFO:root:Building n_trees: 1, n_features: 4, depth: 10
INFO:root:Building n_trees: 1, n_features: 3, depth: 10
INFO:root:Building n_trees: 10, n_features: 1, depth: 10
INFO:root:Building n_trees: 10, n_features: 3, depth: 10
INFO:root:Building n_trees: 10, n_features: 4, depth: 10
INFO:root:Building n_trees: 10, n_features: 3, depth: 10
INFO:root:Building n_trees: 25, n_features: 1, depth: 10
INFO:root:Building n_trees: 25, n_features: 3, depth: 10
INFO:root:Building n_trees: 25, n_features: 4, depth: 10
INFO:root:Building n_trees: 25, n_features: 3, depth: 10
INFO:root:Building n_trees: 50, n_features: 1, depth: 10
INFO:root:Building n_trees: 50, n_features: 3, depth: 10
INFO:root:Building n_trees: 50, n_features: 4, depth: 10
INFO:root:Building n_trees: 50, n_features: 3, depth: 10
INFO:root:Building n_trees: 75, n_features: 1, depth: 10
INFO:root:Building n_trees: 75, n_f