### Python

In [1]:
from fastcore.meta import *

In [None]:
@use_kwargs_dict

### Deep Learning

In [2]:
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
from torch.utils.data import DataLoader, random_split

In [None]:
train_dset, valid_dset = random_split(datasets, lengths=[6, 4])

In [None]:
train_dataloader = DataLoader(train_dset, batch_size=2)

In [5]:
def calculate_return(rewards, discount_factor):
    """Return the discounted sum of ``rewards``.

    The reward at position ``k`` (counting from 0) is weighted by
    ``discount_factor ** k`` and the weighted terms are added in order.

    Args:
        rewards: Iterable of rewards, ordered from the first step onwards.
        discount_factor: Per-step multiplicative discount.

    Returns:
        The accumulated discounted return; ``0`` when ``rewards`` is empty.
    """
    discounted_terms = (
        discount_factor**step * value for step, value in enumerate(rewards)
    )

    running_total = 0
    for term in discounted_terms:
        running_total += term

    return running_total

In [6]:
rewards = torch.tensor([10, 20, 30, 40, 50])

In [8]:
calculate_return(rewards, 0.900)

tensor(114.2650)

In [None]:
env.observation_space 

In [None]:
x.clamp_max_(3)

In [None]:
seq_len / n_heads

In [9]:
from transformers import RobertaConfig, RobertaModel

In [10]:
config = RobertaConfig()

In [None]:
model = RobertaModel(config)

In [None]:
torch.gather(x, dim=1, index=indices)

In [None]:
model = RobertaModel.from_pretrained("roberta-base")

In [11]:
from transformers import AutoModel, AutoTokenizer

In [None]:
class CustomTokenClassifier(nn.Module):
    """Token classifier: pretrained backbone, dropout, then a linear head.

    Args:
        checkpoint: Name or path handed to ``AutoModel.from_pretrained``.
        n_labels: Number of output scores produced for every token.
        dropout: Dropout probability applied to the backbone's hidden states.
    """

    def __init__(self, checkpoint, n_labels, dropout):
        super().__init__()
        self.model = AutoModel.from_pretrained(checkpoint)

        self.dropout = nn.Dropout(dropout)
        # Size the head from the backbone's config and the requested number
        # of labels instead of hard-coding 768 -> 1, so other checkpoints
        # and multi-class setups work and `n_labels` is actually honoured.
        self.classifier = nn.Linear(self.model.config.hidden_size, n_labels)

    def forward(self, input_ids, attention_mask):
        """Run the backbone and return the head's output for each token.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.

        Returns:
            The classifier output computed from ``last_hidden_state``; its
            last dimension has size ``n_labels``.
        """
        # Keyword arguments keep the call correct even for backbones whose
        # second positional parameter is not the attention mask.
        output = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        )

        out = self.dropout(output.last_hidden_state)
        out = self.classifier(out)

        return out

In [12]:
# reduction="batchmean" matches the mathematical definition of the KL
# divergence; the default "mean" averages over every element instead.
kl_loss = nn.KLDivLoss(reduction="batchmean")

In [None]:
# Pass `dim` explicitly: calling softmax / log_softmax without it is
# deprecated and picks the axis implicitly from the tensor's rank.
# assumes the class axis is the last one — TODO confirm
loss = kl_loss(
    F.log_softmax(student_logits, dim=-1),
    F.softmax(teacher_logits, dim=-1)
)

In [None]:
high temp => flat => low -> higher
low temp => peak -> high -> higher

In [13]:
import math

In [14]:
def scaled_dot_product_attention(q, k, v, d_k):
    """Compute scaled dot-product attention.

    Args:
        q: Query tensor whose last dimension is the key dimension.
        k: Key tensor; its last two dimensions are swapped before the
            product with ``q``.
        v: Value tensor combined using the attention weights.
        d_k: Key dimension; scores are divided by ``sqrt(d_k)``.

    Returns:
        A tuple ``(output, attention_weights)`` where ``attention_weights``
        is the softmax of the scaled scores over the last dimension and
        ``output`` is ``attention_weights @ v``.
    """
    # Swap only the last two axes. `k.T` reverses *every* axis, which is
    # wrong (and deprecated) for batched or multi-head tensors.
    qk_matmul = torch.matmul(q, k.transpose(-2, -1))
    scores = qk_matmul / math.sqrt(d_k)
    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, v)

    return output, attention_weights

In [None]:
z_mean, z_log_var

In [15]:
class PerformanceBenchmark:
    """Skeleton for benchmarking a pipeline on a dataset.

    Only the constructor does any work; the ``compute_*`` methods and
    ``run_benchmark`` are placeholders that currently return ``None``.
    """

    def __init__(self, pipe, dataset):
        """Remember the pipeline and the dataset it is benchmarked on."""
        self.dataset = dataset
        self.pipe = pipe

    def compute_accuracy(self):
        """Placeholder; not implemented yet."""

    def compute_memory(self):
        """Placeholder; not implemented yet."""

    def compute_latency(self):
        """Placeholder; not implemented yet."""

    def run_benchmark(self):
        """Placeholder; not implemented yet."""

In [None]:
output.logits[0, -1, :][4]

In [None]:
next_token_prob = F.softmax(output.logits[0, -1, :], dim=-1)

In [None]:
idx = torch.argmax(next_token_prob, dim=-1)

In [None]:
def create_mask(size):
    """Build a ``(1, size, size)`` tensor of ones on and above the diagonal.

    Entries strictly below the main diagonal are zero.
    """
    all_ones = torch.ones(1, size, size)
    return torch.triu(all_ones)

entropy_loss: the goal is to increase the entropy -> subtract it from the total loss, so the larger the entropy, the smaller the loss

value_loss: the goal is to decrease it -> add it to the total loss, so the smaller the value loss, the smaller the loss

In [None]:
for param in model.parameters():
    param.requires_grad = False

In [None]:
# An nn.Module has no `requires_grad` switch: assigning the attribute only
# creates a plain Python attribute and leaves the layer's parameters frozen.
# `requires_grad_()` sets the flag on every parameter of the layer.
model.classifier[0].requires_grad_(True)
model.classifier[3].requires_grad_(True)

In [None]:
# Take the input width from the layer being replaced rather than a
# hand-typed number (4095 looks like a typo for the layer's real width).
# assumes classifier[6] is an nn.Linear — TODO confirm
model.classifier[6] = nn.Linear(model.classifier[6].in_features, 10)

In [16]:
import multiprocess

In [None]:
with multiprocess.Pool(4) as pool:
    result = pool.map(sum, numbers)

preferred summary > non-preferred summary -> larger than zero -> sigmoid -> large



In [None]:
class Encoder(nn.Module):
    """Encoder stub that currently only holds a multi-head attention module.

    Args:
        d_model: Model (embedding) width.
        n_heads: Number of attention heads.
        n_layers: Number of encoder layers.
        d_ff: Width of the feed-forward sub-layer.
        dropout: Dropout probability.

    NOTE(review): none of the constructor arguments are used yet;
    ``MultiHeadAttention`` is built without arguments.
    """

    # The first parameter was a second `n_layers`, which Python rejects as a
    # duplicate argument name (SyntaxError); it is named `d_model` here.
    def __init__(self, d_model, n_heads, n_layers, d_ff, dropout):
        super().__init__()

        self.attention = MultiHeadAttention()

In [None]:
n_embeddings = embedding.shape[-1]

In [None]:
layer_norm = nn.LayerNorm(n_embeddings)

In [17]:
from fastai.vision.all import *

In [None]:
dblock = DataBlock(blocks=[ImageBlock, MultiCategoryBlock],
                   get_x=get_x,
                   get_y=get_y)

In [None]:
dsets = dblock.dataset

In [None]:
next_token = outputs.logits[0, -1, :]

In [None]:
next_token_probs = F.softmax(next_token, dim=-1)

In [None]:
sorted_idx = torch.argmax(next_token_probs)

In [None]:
class LMModel(nn.Module):
    """Small recurrent language model over a fixed three-token context.

    Args:
        vocab_sz: Vocabulary size (embedding rows and output logits).
        n_hidden: Width of the hidden state.
    """

    def __init__(self, vocab_sz, n_hidden):
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # token id -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> vocab logits
        self.h = 0.

    def forward(self, x):
        """Fold the first three tokens into a hidden state and predict.

        Args:
            x: Integer token ids indexed as ``x[i, :]`` for step ``i``
                (presumably ``(seq_len, batch)`` — verify against caller).

        Returns:
            Logits over the vocabulary computed from the final hidden state.
        """
        # Each call starts from an empty hidden state.
        h = 0.
        for i in range(3):
            # Add the current token's embedding to the running state ...
            h = h + self.i_h(x[i, :])
            # ... and transform the *hidden state* (not the raw token ids,
            # which nn.Linear cannot consume) with the recurrent layer.
            h = F.relu(self.h_h(h))

        # Keep the last hidden state for inspection, detached from the graph.
        self.h = h.detach()
        return self.h_o(h)

In [None]:
class Encoder(nn.Module):
    """Transformer-style encoder: embeddings followed by stacked layers.

    Args:
        d_model: Model (embedding) width.
        n_heads: Number of attention heads per layer.
        n_layers: Number of encoder layers to stack.
        d_ff: Width of each layer's feed-forward part.
        dropout: Dropout probability.
            NOTE(review): currently unused — it is not passed to any layer.
    """

    def __init__(self, d_model, n_heads, n_layers, d_ff, dropout):
        super().__init__()

        # NOTE(review): `vocab_size` is not a constructor parameter; it is
        # looked up from the enclosing scope and must exist there.
        self.text_embedding = TextEmbedding(
            vocab_size, d_model, padding_idx=0
        )
        self.positional_encoding = PositionEncoding(d_model)

        # nn.ModuleList registers the layers as sub-modules, so their
        # parameters appear in .parameters(), follow .to(device) and are
        # saved in state_dict(); a plain Python list does none of that.
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(
                d_model, n_heads, d_ff
            ) for _ in range(n_layers)
        ])

    def forward(self, tokens):
        """Embed ``tokens`` and pass them through every encoder layer.

        Returns:
            A tuple ``(encoder_out, encoder_weights)`` holding the output of
            the last layer and the weights that layer returned;
            ``encoder_weights`` is ``None`` when there are no layers.
        """
        text_embeddings = self.text_embedding(tokens)
        embeddings = self.positional_encoding(text_embeddings)

        encoder_out = embeddings
        # Defined up front so the return below works with zero layers.
        encoder_weights = None
        for encoder_layer in self.encoder_layers:
            encoder_out, encoder_weights = encoder_layer(encoder_out)

        return encoder_out, encoder_weights

In [None]:
# NOTE(review): permute expects each axis index 0..ndim-1 exactly once;
# (3, 1, 2, 4) never mentions axis 0 — confirm the intended axis order.
x.permute(3, 1, 2, 4)

In [18]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable affine.

    Args:
        features: Size of the last dimension of the input.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, features, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.adds = nn.Parameter(torch.zeros(features))   # learnable shift
        self.mults = nn.Parameter(torch.ones(features))   # learnable scale

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # Use the biased (population) variance, as nn.LayerNorm does. The
        # default unbiased estimator divides by n-1, which gives slightly
        # different outputs and NaN when there is only one feature.
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        # normalize
        x = (x - mean) / (var + self.eps).sqrt()

        # scale and shift
        x = self.mults * x + self.adds
        return x

In [None]:
idx = torch.where(dsets.train[0][1] == 1.)

In [None]:
vocab[idx]

In [None]:
class LMModel(nn.Module):
    """Small recurrent language model that carries its state across calls.

    Args:
        vocab_sz: Vocabulary size (embedding rows and output logits).
        n_hidden: Width of the hidden state.
    """

    def __init__(self, vocab_sz, n_hidden):
        super().__init__()
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  # token id -> hidden
        self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
        self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> vocab logits
        self.h = 0.

    def forward(self, x):
        """Update the hidden state with three tokens and predict.

        Args:
            x: Integer token ids indexed as ``x[i, :]`` for step ``i``
                (presumably ``(seq_len, batch)`` — verify against caller).

        Returns:
            Logits over the vocabulary computed from the updated state.
        """
        h = self.h
        for i in range(3):
            h = h + self.i_h(x[i, :])
            # Feed the hidden state (not the raw token ids, which nn.Linear
            # cannot consume) through the recurrent layer.
            h = F.relu(self.h_h(h))

        # Carry the value forward but cut the autograd graph; otherwise the
        # next backward pass would try to walk through an already-freed
        # graph from the previous call.
        self.h = h.detach()
        return self.h_o(h)

    def reset(self):
        """Clear the carried hidden state (e.g. when the batch size changes)."""
        self.h = 0.

In [None]:
l1.weight.grad

In [None]:
latency, memory, accuracy

In [None]:
it helps the model learn out-of-vocab

In [None]:
x.clamp(min=-3, max=4)

The equation

$$
a^k=\arg \max _a\left[Q(s, a)+P(s, a) \cdot \frac{\sqrt{\sum_b N(s, b)}}{1+N(s, a)}\left(c_1+\log \left(\frac{\sum_b N(s, b)+c_2+1}{c_2}\right)\right)\right]
$$

is used in the selection stage of the MCTS algorithm to select an action $a^k$ at each hypothetical time-step $k=1 \ldots l$ of the simulation. It is an extension of the standard UCB1 formula (the pUCT rule) and is used to balance exploration and exploitation in the search tree.

Let's break it down:

$\arg \max _a$: This notation means "argmax over a", it's used to find the action $a$ that maximizes the expression inside the brackets. So, the action $a^k$ that is selected by the algorithm is the one that maximizes the value of the expression inside the brackets.
$Q(s, a)$: This notation represents the mean value of the action $a$ from state $s$, it's an estimate of how good the action is. It's calculated as the average value of all the simulations that have been run from this state-action pair.
$P(s, a)$: This notation represents the policy for the action $a$ from state $s$, it's a probability that indicates how likely the action is to be taken from this state.
$\frac{\sqrt{\sum_b N(s, b)}}{1+N(s, a)}$: This term is an exploration bonus; it encourages the algorithm to explore actions that have been visited less often. The numerator $\sqrt{\sum_b N(s, b)}$ is the square root of the sum of the visit counts for all actions from state $s$, a measure of how often the state has been visited. The denominator $1+N(s, a)$ is one plus the number of times the action $a$ has been visited. So this term is the ratio of the square root of the state's total visit count to one plus the visit count of action $a$: it shrinks as $a$ is visited more often.

$(c_1+\log \left(\frac{\sum_b N(s, b)+c_2+1}{c_2}\right))$ : This term is used to control the influence of the prior $P(s, a)$ relative to the value $Q(s, a)$ as nodes are visited more often. The $\log \left(\frac{\sum_b N(s, b)+c_2+1}{c_2}\right)$ is the logarithm of the ratio between the total number of visits and a constant $c_2$ . The $c_1$ is an additional constant.
So, in summary, the equation uses the UCB formula to select the action that maximizes the sum of the mean value of the action $Q(s, a)$ and the exploration bonus, weighted by the policy $P(s, a)$ and by a term that controls the influence of the prior relative to the value as nodes are visited more often.

This passage is describing the expansion stage of the MCTS algorithm, which happens when the simulation reaches a leaf node in the search tree. In this stage, the algorithm performs the following steps:

At the final time-step $l$ of the simulation, the reward $r^l$ and next state $s^l$ are computed by the dynamics function $g_\theta\left(s^{l-1}, a^l\right)$. This function uses the current state-action pair $(s^{l-1}, a^l)$ and the neural network parameters $\theta$ to predict the reward and next state that result from taking action $a^l$ in state $s^{l-1}$.
The reward and next state are stored in the corresponding tables: $R\left(s^{l-1}, a^l\right)=r^l$ and $S\left(s^{l-1}, a^l\right)=s^l$
The policy and value are computed by the prediction function $f_\theta\left(s^l\right)$. This function uses the next state $s^l$ and the neural network parameters $\theta$ to predict the policy and value of the next state.
A new node is added to the search tree, corresponding to the next state $s^l$.
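For each possible action $a$ from the next state $s^l$, a new edge $(s^l, a)$ is added and initialized with $N(s^l, a)=0$, $Q(s^l, a)=0$ and $P(s^l, a)=p^l$, where $p^l$ is the policy computed by the prediction function.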

The $R\left(s^{l-1}, a^l\right)$ and the $S\left(s^{l-1}, a^l\right)$ tables in MuZero are data structures that are used to store the outcome of the simulations performed by the MCTS algorithm.

The $R\left(s^{l-1}, a^l\right)$ table is used to store the rewards associated with each state-action pair $(s^{l-1}, a^l)$. Specifically, the algorithm stores the reward $r^l$ that results from taking action $a^l$ in state $s^{l-1}$ in the R table. This information can be used to update the statistics of the current node and its children nodes, such as mean value, and also it can be used during the training phase to improve the estimates of the rewards provided by the neural network.

The $S\left(s^{l-1}, a^l\right)$ table is used to store the next state associated with each state-action pair $(s^{l-1}, a^l)$. Specifically, the algorithm stores the next state $s^l$ that results from taking action $a^l$ in state $s^{l-1}$ in the S table. This information can be used to update the statistics of the current node and its children nodes, such as visit count, and also it can be used during the training phase to improve the estimates of the next state provided by the neural network.

It's worth noting that these tables are not an explicit data structure, they are represented implicitly in the MCTS algorithm, which helps to save the memory space.

The Backup stage of the MCTS algorithm is used to update the statistics of the current node and its children nodes in the search tree.

In this passage, the algorithm uses the stored rewards and the value prediction to compute an estimate of the cumulative discounted reward for each node on the search path.

The algorithm starts at the final time-step $l$ of the simulation and works backwards to the root node. For each time-step $k=l \ldots 0$, the algorithm forms an $l-k$-step estimate of the cumulative discounted reward, bootstrapping from the value function $v^l$. The cumulative discounted reward is calculated using the following equation:

$$
G^k=\sum_{\tau=0}^{l-1-k} \gamma^\tau r_{k+1+\tau}+\gamma^{l-k} v^l
$$

The $\gamma$ term is the discount factor and is used to weigh the importance of future rewards. The $r_{k+1+\tau}$ term is the intermediate reward that is stored in the R table for each node in the search path.

Once the cumulative discounted reward is calculated, the algorithm updates the statistics for each edge $(s^{k-1}, a^k)$ in the simulation path. The mean value of the current state-action pair is updated using the following equation:

$$
Q\left(s^{k-1}, a^k\right):=\frac{N\left(s^{k-1}, a^k\right) \cdot Q\left(s^{k-1}, a^k\right)+G^k}{N\left(s^{k-1}, a^k\right)+1}
$$

The visit count is incremented by 1 using the following equation:

$$
N\left(s^{k-1}, a^k\right):=N\left(s^{k-1}, a^k\right)+1
$$

The algorithm uses the minimum-maximum values observed in the search tree up to that point to compute the normalized Q value estimates $\bar{Q}$ to be used in the pUCT rule, this is done by the following equation:
$$
\bar{Q}\left(s^{k-1}, a^k\right)=\frac{Q\left(s^{k-1}, a^k\right)-\min _{s, a \in \text { Tree }} Q(s, a)}{\max _{s, a \in \text { Tree }} Q(s, a)-\min _{s, a \in \text { Tree }} Q(s, a)}
$$

This normalization step is done to ensure that the Q-values are within the range of $[0,1]$ which is necessary for the pUCT rule (Eqn 2) and to avoid adding prior knowledge to the MuZero algorithm. This allows the algorithm to work in environments where the value is unbounded and is not dependent on game specific knowledge.

In summary, the Backup stage uses the cumulative discounted reward, computed using the stored rewards and value prediction, to update the statistics of the current node and its children nodes in the search tree. The statistics are updated for each edge $(s^{k-1}, a^k)$ in the simulation path. The mean value of the current state-action pair is updated and the visit count is incremented. The algorithm also normalizes the Q-values to ensure that they are within the range of $[0,1]$ to be used in the pUCT rule.