### NLP

In [1]:
class PerformanceBenchmark:
    """Skeleton for benchmarking a pipeline on a dataset.

    Only the constructor does any work; every metric method is still a
    placeholder that performs no computation and returns ``None``.
    """

    def __init__(self, pipe, dataset):
        """Remember the pipeline and the dataset to benchmark it on."""
        self.pipe, self.dataset = pipe, dataset

    def latency(self):
        """Placeholder for the latency measurement (not implemented)."""
        return None

    def memory(self):
        """Placeholder for the memory measurement (not implemented)."""
        return None

    def accuracy(self):
        """Placeholder for the accuracy measurement (not implemented)."""
        return None

    def run_metrics(self):
        """Placeholder for running all metrics (not implemented)."""
        return None

In [3]:
import torch
from torch import nn
import torch.nn.functional as F

In [4]:
# KL-divergence loss used for distillation. reduction="batchmean" sums the
# divergence over all elements and divides by the batch size, which matches
# the mathematical definition of KL divergence; the default ("mean") divides
# by the total number of elements instead and under-reports the loss.
loss_func = nn.KLDivLoss(reduction="batchmean")

In [None]:
# Distillation loss between student and teacher predictions.
# KLDivLoss takes log-probabilities as its first argument and (by default)
# plain probabilities as its second, hence log_softmax for the student and
# softmax for the teacher, both taken over the last dimension.
loss = loss_func(
    F.log_softmax(student_logits, dim=-1),
    F.softmax(teacher_logits, dim=-1)
)

In [None]:
# Bare expression: in a notebook this displays the value of `loss`.
loss

In [5]:
from transformers import Trainer

In [None]:
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes the task loss with a distillation loss.

    The total loss is ``alpha * loss_ce + (1 - alpha) * loss_kl`` where
    ``loss_ce`` is the loss reported by the student model and ``loss_kl`` is
    the temperature-scaled KL divergence between the student and teacher
    output distributions.

    ``self.args`` must expose ``temperature`` and ``alpha`` attributes.
    """

    def __init__(self, *args, teacher_model, **kwargs):
        """Forward all arguments to ``Trainer`` and keep the teacher model.

        Args:
            teacher_model: Model whose logits the student is trained to
                imitate. Keyword-only.
        """
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        # "batchmean" divides the summed divergence by the batch size, which
        # is the mathematically correct KL divergence; the default "mean"
        # divides by the number of elements instead.
        self.loss_func = nn.KLDivLoss(reduction="batchmean")

    def compute_loss(self, model, inputs, return_outputs=False,
                     num_items_in_batch=None, returns_tensor=False):
        """Return the combined cross-entropy / distillation loss.

        Args:
            model: Student model being trained.
            inputs: Mapping of keyword arguments for the model's forward
                pass (presumably including labels so ``.loss`` is set).
            return_outputs: When true, also return the student outputs.
                This is the flag name ``Trainer`` itself passes.
            num_items_in_batch: Accepted for compatibility with newer
                ``Trainer`` versions; unused here.
            returns_tensor: Legacy alias of ``return_outputs`` kept for
                backward compatibility.

        Returns:
            ``loss`` or ``(loss, output_student)`` depending on the flags.
        """
        # The batch is a mapping, so it has to be unpacked into keywords.
        output_student = model(**inputs)
        loss_ce = output_student.loss

        # The teacher is only a reference signal: no gradients needed.
        with torch.no_grad():
            output_teacher = self.teacher_model(**inputs)

        temperature = self.args.temperature
        alpha = self.args.alpha

        # Soften both distributions with the temperature; the T**2 factor
        # compensates for the 1/T**2 scaling this applies to the gradients.
        loss_kl = temperature**2 * self.loss_func(
            F.log_softmax(output_student.logits / temperature, dim=-1),
            F.softmax(output_teacher.logits / temperature, dim=-1),
        )

        loss = alpha * loss_ce + (1 - alpha) * loss_kl

        if return_outputs or returns_tensor:
            return loss, output_student
        return loss

### Deep Learning

In [6]:
from einops import reduce

In [None]:
# Mean-reduce the axis named `c` (second of four), so a 4-D `b c h w` input
# becomes a 3-D `b h w` output.
# presumably b/c/h/w are batch, channel, height, width — confirm against the data
output = reduce(images, 'b c h w -> b h w', reduction="mean")

Minimize the KL divergence between the probability distribution of the policy and the probability distribution of the returns.

In [None]:
# Scratch cell: a bare reference to the fully connected layer class; no layer
# is instantiated here.
nn.Linear

The ratio between the probability distribution of the current policy and that of the previous policy.

In [None]:
# Scratch cell: a bare tuple expression listing three benchmark quantities.
# NOTE(review): these names are not defined in any cell visible here, so
# running this cell as-is would raise NameError — confirm intent.
memory, latency, accuracy

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes the task loss with a distillation loss.

    The total loss is ``alpha * loss_ce + (1 - alpha) * loss_kl`` where
    ``loss_ce`` is the loss reported by the student model and ``loss_kl`` is
    the temperature-scaled KL divergence between the student and teacher
    output distributions.

    ``self.args`` must expose ``temperature`` and ``alpha`` attributes.
    """

    def __init__(self, *args, teacher_model, **kwargs):
        """Forward all arguments to ``Trainer`` and keep the teacher model.

        Args:
            teacher_model: Model whose logits the student is trained to
                imitate. Keyword-only.
        """
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        # "batchmean" divides the summed divergence by the batch size, which
        # is the mathematically correct KL divergence; the default "mean"
        # divides by the number of elements instead.
        self.loss_func = nn.KLDivLoss(reduction="batchmean")

    def compute_loss(self, model, inputs, return_outputs=False,
                     num_items_in_batch=None):
        """Return the combined cross-entropy / distillation loss.

        Args:
            model: Student model being trained.
            inputs: Mapping of keyword arguments for the model's forward
                pass (presumably including labels so ``.loss`` is set).
            return_outputs: When true, also return the student outputs.
            num_items_in_batch: Accepted for compatibility with newer
                ``Trainer`` versions; unused here.

        Returns:
            ``loss`` or ``(loss, output_student)`` if ``return_outputs``.
        """
        # The batch is a mapping, so it has to be unpacked into keywords.
        output_student = model(**inputs)
        student_logits = output_student.logits
        loss_ce = output_student.loss

        # The teacher is only a reference signal: no gradients needed.
        with torch.no_grad():
            output_teacher = self.teacher_model(**inputs)
            teacher_logits = output_teacher.logits

        temperature = self.args.temperature
        alpha = self.args.alpha

        # Soften both distributions with the temperature; the T**2 factor
        # compensates for the 1/T**2 scaling this applies to the gradients.
        loss_kl = temperature**2 * self.loss_func(
            F.log_softmax(student_logits / temperature, dim=-1),
            F.softmax(teacher_logits / temperature, dim=-1),
        )

        loss = alpha * loss_ce + (1 - alpha) * loss_kl

        return (loss, output_student) if return_outputs else loss

In [7]:
from torch.distributions import Categorical

In [8]:
class Agent(nn.Module):
    """Actor-critic network made of two independent MLPs.

    ``actor`` maps an observation to a probability vector over actions
    (softmax over the last dimension); ``critic`` maps an observation to a
    single value. Both use two hidden Tanh layers of width ``n_hidden``.
    """

    def __init__(self, n_observations, n_actions, n_hidden):
        """Build the actor first and then the critic.

        Args:
            n_observations: Size of the input feature dimension.
            n_actions: Number of discrete actions (actor output size).
            n_hidden: Width of both hidden layers in each network.
        """
        super().__init__()

        policy_layers = [
            nn.Linear(n_observations, n_hidden),
            nn.Tanh(),
            nn.Linear(n_hidden, n_hidden),
            nn.Tanh(),
            nn.Linear(n_hidden, n_actions),
            nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*policy_layers)

        value_layers = [
            nn.Linear(n_observations, n_hidden),
            nn.Tanh(),
            nn.Linear(n_hidden, n_hidden),
            nn.Tanh(),
            nn.Linear(n_hidden, 1),
        ]
        self.critic = nn.Sequential(*value_layers)

    def get_action_and_value(self, state):
        """Sample an action for ``state`` and evaluate it.

        Returns:
            A tuple ``(action, log_prob, entropy, critic_value)``: the
            sampled action, its log-probability under the actor's
            distribution, that distribution's entropy, and the critic's
            output for ``state``.
        """
        policy = Categorical(self.actor(state))
        chosen = policy.sample()
        return (
            chosen,
            policy.log_prob(chosen),
            policy.entropy(),
            self.critic(state),
        )

In [None]:
# Ratio of the current policy's probabilities to the previous policy's
# (plain division of the two operands).
ratio_function = current_policy_probs / prev_policy_probs

In [9]:
import math

In [10]:
class SelfAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_head)) @ v``."""

    def __init__(self, d_head):
        """Store the head dimension used to scale the attention scores."""
        super().__init__()
        self.d_head = d_head

    def forward(self, q, k, v):
        """Attend from ``q`` over ``k`` / ``v``.

        Args:
            q: Queries, shape ``(..., seq_q, d)``.
            k: Keys, shape ``(..., seq_k, d)``.
            v: Values, shape ``(..., seq_k, d_v)``.

        Returns:
            ``(output, attention_weights)`` with shapes
            ``(..., seq_q, d_v)`` and ``(..., seq_q, seq_k)``.
        """
        # Swap only the last two axes so the same code works for any number
        # of leading batch/head dimensions (permute would need every axis).
        k_transposed = k.transpose(-2, -1)
        scores = torch.matmul(q, k_transposed) / math.sqrt(self.d_head)
        attention_weights = F.softmax(scores, dim=-1)

        output = torch.matmul(attention_weights, v)

        return output, attention_weights

In [None]:
def compute_ratio(current, prev):
    """Return the ratio ``current / prev``.

    Args:
        current: Value(s) for the current policy (e.g. probabilities).
        prev: Value(s) for the previous policy; must be non-zero.

    Returns:
        The element-wise quotient of ``current`` and ``prev``.

    Note:
        If the inputs are log-probabilities, the ratio is
        ``exp(current - prev)`` rather than a plain quotient.
    """
    return current / prev

In [None]:
# For each of the n_steps recorded (state, action) pairs, set the table entry
# to the reward for that pair plus gamma times the mean of v_func(state + 1).
# NOTE(review): `state + 1` presumably stands for "the next state" — confirm;
# states[t + 1] may be what is actually intended.
for t in range(n_steps):
    state, action = states[t], actions[t]
    q[state, action] = get_reward(state, action) + gamma * v_func(state + 1).mean()

In [None]:
# Replace every position whose mask entry is off (False / 0) with -1.
# Tensor.masked_fill takes the boolean mask first and the fill value second;
# `mask == 0` yields that boolean mask for both bool and integer masks.
masked_encoding = encoding.masked_fill(mask == 0, -1)

In [13]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding.

    For position ``p`` and pair index ``k``::

        PE[p, 2k]     = sin(p / n ** (2k / d_model))
        PE[p, 2k + 1] = cos(p / n ** (2k / d_model))
    """

    def __init__(self, n, d_model):
        """Store the frequency base ``n`` and the embedding width ``d_model``."""
        super().__init__()
        self.n = n
        self.d_model = d_model

    def forward(self, idxs):
        """Return encodings for positions ``0 .. len(idxs) - 1``.

        Args:
            idxs: Any sized sequence; only its length is used.

        Returns:
            A float tensor of shape ``(len(idxs), d_model)``.
        """
        seq_len = len(idxs)

        # Column vector of positions, broadcast against the frequencies.
        positions = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)
        # The values 2k for k = 0, 1, ...: each sin/cos pair shares one.
        even_dims = torch.arange(0, self.d_model, 2, dtype=torch.float32)
        # torch.pow needs at least one tensor operand, so wrap the base.
        denominators = torch.pow(
            torch.tensor(float(self.n)), even_dims / self.d_model
        )
        angles = positions / denominators

        embeddings = torch.zeros(seq_len, self.d_model)
        embeddings[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sines.
        embeddings[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])

        return embeddings