In [1]:
from transformers import AutoTokenizer
from diffusers import DDPMScheduler
import numpy as np
import matplotlib.pyplot as plt
import torch

In [2]:
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
noise_scheduler = DDPMScheduler()

In [3]:
tokenizer

PreTrainedTokenizerFast(name_or_path='neuralmind/bert-base-portuguese-cased', vocab_size=29794, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [3]:
def get_ids(text):
    ids = tokenizer(text, add_special_tokens=False, return_tensors="pt").input_ids[0]
    return ids
    
get_ids("esse é um teste de difusão textual")

tensor([ 1966,   253,   222,  3515,   125, 13934,  4509,  2066])

In [4]:
def id_to_one_hot(token_ids, vocab_size=tokenizer.vocab_size):
    one_hot_vectors = []
    for token_id in token_ids:
        # Create a zero-filled array with length equal to vocab_size
        one_hot = torch.zeros(vocab_size)
        # Set the value at the index of the token ID to 1
        one_hot[token_id] = 1
        one_hot_vectors.append(one_hot)
    return torch.stack(one_hot_vectors, dim=0)
id_to_one_hot(get_ids("esse é um teste de difusão textual"))

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [5]:
def add_noise(latents, max_steps=noise_scheduler.config.num_train_timesteps):
    noise = torch.randn_like(latents)
    bsz = latents.shape[0]
    timesteps = torch.randint(0, max_steps, (bsz,), device=latents.device)
    timesteps = timesteps.long()
    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
    return noisy_latents

In [6]:
def get_max_indices(list_of_tensors):
    max_indices = []
    for tensor in list_of_tensors:
        # Get the index of the maximum value in the tensor
        index = torch.argmax(tensor).item()
        max_indices.append(index)
    return max_indices

In [10]:
tokenizer.decode([1966, 253, 222, 3515, 125, 13934, 4509, 2066])

'esse é um teste de difusão textual'

In [26]:
latents = id_to_one_hot(get_ids("esse é um teste de difusão textual"))

for i in range(10):
    new = add_noise(latents, 1000)
    # print(img2ids(new))
    print(get_max_indices(new))
    print(tokenizer.decode(get_max_indices(new)))

[5295, 253, 13051, 136, 5959, 26104, 19381, 23587]
fábr é tribun? Desenvol所 expostosە
[23762, 21796, 17740, 12789, 16483, 7904, 9009, 13038]
##Ζ Amarties soviética suficientementeap Est ater
[5872, 8076, 21164, 17823, 125, 7526, 10297, 6919]
##lado 1950 Nickelodeonnautas de Rainhaesses dividido
[25467, 16386, 22918, 10950, 737, 13911, 3044, 20802]
##ཕ ench本criçãoração governantes Glo 116
[3211, 15207, 13166, 3515, 6513, 5532, 21399, 4868]
intit Jinftwa testequad golpe interessou Agosto
[14559, 5582, 25288, 14735, 27492, 4566, 14349, 2066]
dieta Sérӡfam떗 daqueleyoncétual
[23677, 23953, 9657, 27111, 12228, 27162, 29774, 9930]
##悪艺 esquerdo꾪 abastecimento뇫흹 auxílio
[2645, 1886, 3234, 6208, 28752, 19033, 4509, 26152]
presença 〈 Ban roteiro찿 verdadeiramente tex握
[1072, 253, 24641, 7006, 15283, 6311, 6335, 13652]
Pro é併ortaleza árbitroíça formando participado
[15307, 29152, 13855, 15256, 27294, 7122, 6336, 22306]
nad퀹 agrupaventura돲 minha quadrinhosí


In [42]:
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (2,))
print("Original Tensor:", timesteps)
# timesteps_ones = torch.ones(16, dtype=torch.long)
# print(timesteps_ones)
n_seq = 4  # Define the number of repetitions
# timesteps_tensor = timesteps_ones[:n_seq].expand((timesteps.size(0), n_seq))
# print(timesteps_tensor)
# print(timesteps_tensor)
# Repeat each number in timesteps n_seq times
repeated_tensor = timesteps.unsqueeze(1).repeat(1, n_seq)

print(repeated_tensor)

Original Tensor: tensor([604,  57])
tensor([[604, 604, 604, 604],
        [ 57,  57,  57,  57]])


In [7]:
from modeling_diffbert import DiffBertForDiffusion
from configuration_diffbert import DiffBertConfig
import torch

model = DiffBertForDiffusion.from_pretrained("diffbert-mini")
model

DiffBertForDiffusion(
  (bert): DiffBertModel(
    (embeddings): DiffBertEmbeddings(
      (word_embeddings): Linear(in_features=30522, out_features=384, bias=True)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (time_embedding): Embedding(1000, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DiffBertEncoder(
      (layer): ModuleList(
        (0-1): 2 x DiffBertLayer(
          (attention): DiffBertAttention(
            (self): DiffBertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DiffBertSelfOutput(
              (dense): Linear(in_features=384, out_

In [8]:
inputs_embeds = torch.rand((4, 128, 30522))
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (4,))
model(inputs_embeds=inputs_embeds, timesteps=timesteps).logits.shape

torch.Size([4, 128, 30522])

In [8]:
from io import StringIO
import sys

notebooks = {}
def create_notebook(notebook_id):
    notebooks[notebook_id] = {'variables': {}}
    return f"Notebook {notebook_id} created successfully!"

def run_code(notebook_id, code):

    variables = notebooks.get(notebook_id).get('variables')
    
    # Redirect stdout to capture output
    sys.stdout = StringIO()
    
    try:
        exec(code, variables)
        output = sys.stdout.getvalue()
    except Exception as e:
        output = f"Error: {str(e)}"
    
    # Restore stdout
    sys.stdout = sys.__stdout__
    
    return output

In [9]:
create_notebook("test")

'Notebook test created successfully!'

In [17]:
out = run_code("test", 
"""x = 5
for i in range(x):
    print(i)
""")
print("output", out)

In [14]:
notebooks

{'test': {'variables': {'__builtins__': {'__name__': 'builtins',
    '__doc__': "Built-in functions, exceptions, and other objects.\n\nNoteworthy: None is the `nil' object; Ellipsis represents `...' in slices.",
    '__package__': '',
    '__loader__': _frozen_importlib.BuiltinImporter,
    '__spec__': ModuleSpec(name='builtins', loader=<class '_frozen_importlib.BuiltinImporter'>, origin='built-in'),
    '__build_class__': <function __build_class__>,
    '__import__': <function __import__>,
    'abs': <function abs(x, /)>,
    'all': <function all(iterable, /)>,
    'any': <function any(iterable, /)>,
    'ascii': <function ascii(obj, /)>,
    'bin': <function bin(number, /)>,
    'breakpoint': <function breakpoint>,
    'callable': <function callable(obj, /)>,
    'chr': <function chr(i, /)>,
    'compile': <function compile(source, filename, mode, flags=0, dont_inherit=False, optimize=-1, *, _feature_version=-1)>,
    'delattr': <function delattr(obj, name, /)>,
    'dir': <function 