# TODO
**BE SURE TO ADD THE PATH TO `src/*` TO YOUR `ENVIRONMENT/LIB/PYTHONVERSION/SITE-PACKAGES/tnn.pth`** !
# Introduction to Tree NN

## Trees and DecisionUnits

In [44]:
from tnn import DecisionUnit, Tree
import torch
import torch.nn as nn

# simple model
class Double(nn.Module):
    """Parameter-free module whose output is twice its input."""

    def forward(self, x):
        """Return ``x`` scaled by a factor of two."""
        doubled = x * 2
        return doubled

# The tree has two leaves: one leaves x unchanged, the other doubles it.
tree = Tree(1, 2, [nn.Identity(), Double()])

# For educational purposes:
# pin every weight of the path distribution's linear layer to 0.5 and its
# bias to 0, which gives a uniform distribution over paths.
# nn.init.constant_ fills the tensor in place without recording gradients.
nn.init.constant_(tree.distribution.linear.weight, 0.5)
nn.init.constant_(tree.distribution.linear.bias, 0)

Let's sample a few outputs of this simple tree for the input $[1]$ and calculate $E([1])$.

In [45]:
# Single-element input [1.].
x = torch.ones(1)
print(f"input = {x}")

# Draw five outputs with select_max=False; the recorded output below shows
# both 1. and 2. appearing across the draws.
n_draws = 5
for draw in range(n_draws):
    sample = tree(x, select_max=False)
    print(sample)

# Expected value of the tree output itself (identity applied to each output).
E = tree.expected_value(lambda out: out, x)
print(f"E(Tree(x)) = {E}")

input = tensor([1.])
tensor([1.])
tensor([2.])
tensor([1.])
tensor([2.])
tensor([2.])
E(Tree(x)) = tensor([1.5000], grad_fn=<AddBackward0>)


# Training
Different methods of training will be shown. We will try to model the following function:
$$
f(x,y) = \begin{cases}
xy & x > 0, y > 0 \\
e^x + y & x > 0, y \leq  0 \\
2x & x \leq 0, y \leq 0 \\
y & x \leq 0, y > 0 \\
\end{cases}
$$

In [63]:
def f(x, y):
    """Piecewise target function over the four quadrants of the plane.

    Returns:
        x * y       when x > 0 and y > 0
        exp(x) + y  when x > 0 and y <= 0
        y           when x <= 0 and y > 0
        2 * x       when x <= 0 and y <= 0

    The exp branch calls ``torch.exp``, so ``x`` must be a tensor whenever
    that branch is taken.
    """
    if x > 0:
        return x * y if y > 0 else torch.exp(x) + y
    return y if y > 0 else 2 * x

def vector_f(z):
    """Evaluate ``f`` on a point given as one indexable object.

    Only ``z[0]`` (used as x) and ``z[1]`` (used as y) are read.
    """
    x_coord, y_coord = z[0], z[1]
    return f(x_coord, y_coord)

# get training data:
# Evaluate f on a regular n_points x n_points grid over [-5, 5] x [-5, 5].
# Row index_x * n_points + index_y of X holds the point (x, y) and the same
# row of Z holds f(x, y).
n_points = 100
n_data = n_points * n_points
points = torch.linspace(-5, 5, n_points)
X = torch.zeros((n_data, 2))
Z = torch.zeros((n_data, 1))
for index_x, x in enumerate(points):
    for index_y, y in enumerate(points):
        row = index_x * n_points + index_y
        X[row] = torch.tensor([x, y])
        Z[row] = f(x, y)

# get testing data:
# Evaluate f on a coarser 31 x 31 grid over the same [-5, 5] x [-5, 5] square.
# Row index_x * n_points + index_y of test_X holds the point (x, y) and the
# same row of test_Z holds f(x, y).
n_points = 31
n_data = n_points ** 2
points = torch.linspace(-5, 5, n_points)
test_X = torch.zeros((n_data, 2))
test_Z = torch.zeros((n_data, 1))
for index_x in range(n_points):
    for index_y in range(n_points):
        x = points[index_x]
        y = points[index_y]
        row = index_y + index_x * n_points
        # Fill the *test* tensors; the training tensors X and Z must not be
        # touched here, otherwise the test set stays all zeros and the
        # training set gets partially overwritten.
        test_X[row] = torch.tensor([x, y])
        test_Z[row] = f(x, y)



## Shallow Tree model

In [93]:
from models import FeedForward
# Shallow tree over 2-D inputs with two independently constructed
# FeedForward(2, 3, 1) leaf networks.
tree = Tree(2, 2, [FeedForward(2, 3, 1) for _ in range(2)])

## Primitive training without additional cost function
We can simply minimize E[L(x)]. Bad performance is expected.

In [None]:
#TODO

# Primitive training + force softmax to converge to delta 
It may be worth forcing the distribution (before the softmax..?) in a direction where it is very large at one point and small everywhere else.

In [1]:
# TODO

# 2-step training of Tree:
We can alternate between training the parameters of the inside linear layers and the parameters of the outside layers.

### Punish small gradients

In [2]:
# TODO


### Force inner network to be perpendicular to siblings of its parents

This is bad in the sense that in the end you may still need to check something that we were perpendicular to.

In [3]:
#TODO

### Counting zeros in the vector after each hidden layer

In [None]:
#TODO

In [94]:
# Small stand-alone network for inspecting parameters and gradients by hand.
model = FeedForward(2, 2, 1)
# Materialise the parameter iterator so the notebook displays every tensor.
[*model.parameters()]

[Parameter containing:
 tensor([[-0.4032, -0.0350],
         [ 0.2718, -0.0058]], requires_grad=True),
 Parameter containing:
 tensor([-0.3246,  0.6160], requires_grad=True),
 Parameter containing:
 tensor([[ 0.4319, -0.0500]], requires_grad=True),
 Parameter containing:
 tensor([-0.4209], requires_grad=True)]

In [98]:
# Manual check of the first hidden pre-activation for an all-ones input:
# weight[0][0] + bias[0] + weight[0][1] from the parameters printed above.
# The result (-0.7628) matches the first entry of the first tensor returned
# by model(x, training=True).
-0.4032  -0.3246   -0.0350

-0.7628

In [95]:
# All-ones input with two entries.
x = torch.ones(2)
# Forward pass with training=True; the recorded output below is a list of
# four tensors rather than a single output tensor.
model(x, training=True)

[tensor([-0.7628,  0.8821], grad_fn=<ViewBackward0>),
 tensor([0.0000, 0.8821], grad_fn=<ReluBackward0>),
 tensor([0.0000, 0.9801], grad_fn=<MulBackward0>),
 tensor([-0.4699], grad_fn=<ViewBackward0>)]

In [None]:

# Gradient of the network output at x with respect to every trainable
# parameter of the model.
grads = torch.autograd.grad(
    outputs=model(x),
    inputs=[param for param in model.parameters() if param.requires_grad],
    # Build a graph for the backward pass too, so that these gradients can
    # themselves be differentiated later.
    create_graph=True,
)
grads

(tensor([[0.2748, 0.2748],
         [0.0000, 0.0000]], grad_fn=<TBackward0>),
 tensor([0.2748, 0.0000], grad_fn=<ViewBackward0>),
 tensor([[0.0566, 0.0000]], grad_fn=<TBackward0>),
 tensor([1.]))