# Test data generation for trees

In [1]:
import os

import numpy as np
from sklearn import datasets
from sklearn.utils import shuffle

## Loading and shuffling the iris dataset

In [2]:
# Load the iris dataset: X is the feature matrix (stored as float32),
# y is the vector of class labels.
iris = datasets.load_iris()
X = iris.data.astype(np.float32)
y = iris.target

In [3]:
# Shuffle samples and labels together with a fixed seed.
# The inputs are taken from the pristine `iris` arrays rather than from X and y
# themselves, so re-running this cell always yields the same permutation
# instead of re-shuffling data that has already been shuffled.
X, y = shuffle(iris.data.astype(np.float32), iris.target, random_state=42)

In [4]:
# Write the shuffled features and labels as plain-text fixtures
# (np.savetxt default number format).
for fixture_path, fixture_values in (("X.txt", X), ("y.txt", y)):
    np.savetxt(fixture_path, fixture_values)

## Mins and maxes

In [5]:
# Per-feature minimum and maximum over all samples (reduction along axis 0).
mins = X.min(axis=0)
maxes = X.max(axis=0)

In [6]:
# Two-row table: row 0 holds the per-feature minima, row 1 the maxima.
minmaxes = np.stack((mins, maxes), axis=0)

In [7]:
# Persist the min/max table in fixed-point notation.
np.savetxt("minmaxes.txt", minmaxes, fmt="%f")

## Split evaluation

In [8]:
from numpy.random import default_rng

In [9]:
# For every feature, draw one random threshold, split all samples on it and
# write the expected results (instruction, node sizes, per-class counts) as
# fixtures under counts/.

# open() and np.savetxt do not create missing directories, so make sure the
# output directory exists before writing (needed on a fresh checkout).
os.makedirs("counts", exist_ok=True)

rng = default_rng(42)
n_points, n_features = X.shape
# NOTE: despite its name, n_classes holds the sorted unique class labels,
# not the number of classes.
n_classes = np.unique(y)
for i_feature in range(n_features):
    # Random float32 threshold drawn between this feature's global min and max.
    threshold = (
        rng.random(dtype=np.float32) * (maxes[i_feature] - mins[i_feature])
        + mins[i_feature]
    )
    # Samples with feature value <= threshold go to the left node.
    condition = X[:, i_feature] <= threshold

    left_count = np.count_nonzero(condition)
    right_count = n_points - left_count

    left_leaf = y[condition]
    right_leaf = y[~condition]

    # Number of samples of each class on either side of the split.
    left_gini = np.array(
        [np.count_nonzero(left_leaf == i_class) for i_class in n_classes]
    )
    right_gini = np.array(
        [np.count_nonzero(right_leaf == i_class) for i_class in n_classes]
    )
    # One row per class, columns are (left, right).
    gini = np.vstack((left_gini, right_gini)).transpose()

    with open(f"counts/split_{i_feature}_instruction.txt", "w") as instruction_file:
        instruction_file.write(f"feature = {i_feature}\n")
        instruction_file.write(f"threshold = {threshold}")
    with open(f"counts/split_{i_feature}_nodecounts.txt", "w") as counts_file:
        counts_file.write(f"{left_count} {right_count}")
    np.savetxt(f"counts/split_{i_feature}_gini.txt", gini, "%u")

## Commits

In [10]:
# Apply a sequence of random leaf splits ("commits") and, for each one, write
# the split instruction and the resulting per-sample leaf assignment as
# fixtures under commits/.

# open() and np.savetxt do not create missing directories, so make sure the
# output directory exists before writing (needed on a fresh checkout).
os.makedirs("commits", exist_ok=True)

rng = default_rng(43)
n_commits = 3

# Every sample starts in leaf 0.
leaf_indices = np.zeros_like(y, dtype=np.int8)
np.savetxt("commits/initial_leaf_indices.txt", leaf_indices, "%u")
n_leaves = 1
for i_commit in range(n_commits):
    i_leaf = rng.integers(n_leaves)
    i_feature = rng.integers(n_features)
    leaf_feature = X[leaf_indices == i_leaf, i_feature]
    leaf_min, leaf_max = np.amin(leaf_feature), np.amax(leaf_feature)
    # Draw the threshold from the value range of the leaf being split, not
    # from the global feature range: a threshold outside the leaf's range
    # leaves one side of the split empty, and np.amin/np.amax above raise on
    # an empty leaf if a later commit selects it.
    # NOTE: a leaf whose values are all equal on this feature (leaf_min ==
    # leaf_max) still cannot be split; the new leaf then stays empty.
    threshold = rng.random(dtype=np.float32) * (leaf_max - leaf_min) + leaf_min
    # Samples of the chosen leaf above the threshold move to the new leaf;
    # the rest keep their current leaf index.
    condition = np.logical_and(X[:, i_feature] > threshold, leaf_indices == i_leaf)

    leaf_indices[condition] = n_leaves

    with open(f"commits/commit_{i_commit}_instruction.txt", "w") as instruction_file:
        instruction_file.write(f"leaf = {i_leaf}\n")
        instruction_file.write(f"feature = {i_feature}\n")
        instruction_file.write(f"threshold = {threshold:f}\n")

    np.savetxt(f"commits/commit_{i_commit}_leaf_indices.txt", leaf_indices, "%u")

    n_leaves += 1