In [1]:
from transformers import AutoTokenizer, BertForMaskedLM
from diffusers import DDPMScheduler
import numpy as np
import matplotlib.pyplot as plt
from modeling_diffbert import DiffBertForDiffusion
from configuration_diffbert import DiffBertConfig
import torch
import inspect
from typing import Any, Callable, Dict, List, Optional, Union
from tqdm.auto import tqdm



    

    
# model(inputs_embeds=inputs_embeds, timesteps=timesteps).logits.shape

In [2]:
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
scheduler = DDPMScheduler()
model = DiffBertForDiffusion.from_pretrained("diffbert-mini-trained").to("cuda")
device = model.device
embedding = BertForMaskedLM.from_pretrained("neuralmind/bert-base-portuguese-cased").to(device)#torch.nn.Embedding(model.config.vocab_size, model.config.hidden_size).to(device)
# embedding.load_state_dict(torch.load('diffbert-mini/embedding_weights.bin'))

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:

def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used,
            `timesteps` must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
                must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps

def id_to_one_hot(token_ids, vocab_size=tokenizer.vocab_size):
    one_hot_vectors = []
    for token_id in token_ids:
        # Create a zero-filled array with length equal to vocab_size
        one_hot = torch.zeros(vocab_size)
        # Set the value at the index of the token ID to 1
        one_hot[token_id] = 1
        one_hot_vectors.append(one_hot)
    return torch.stack(one_hot_vectors, dim=0)

def get_max_indices(list_of_tensors):
    max_indices = []
    for tensor in list_of_tensors:
        # Get the index of the maximum value in the tensor
        index = torch.argmax(tensor).item()
        max_indices.append(index)
    return max_indices
# Function to transform vectors back to indices
def vectors_to_indices(vectors, embedding):
    # Calculate cosine similarity between vectors and all embedding weights
    # similarity = torch.matmul(vectors, embedding.weight.T)
    
    # Get the index of the most similar embedding for each vector
    indices = torch.argmax(vectors, dim=-1)
    
    return indices

In [4]:




latents = torch.rand((1, 64, 768), device=device)
print(latents)
num_inference_steps = 1000
timesteps=None#[999, 500, 1]
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps, device, timesteps)
# print(timesteps)
for i, t in tqdm(enumerate(timesteps)):
    # expand the latents if we are doing classifier free guidance
    latent_model_input =  latents
    latent_model_input = scheduler.scale_model_input(latent_model_input, t)
    # predict the noise residual
    noise_pred = model(
        inputs_embeds=latent_model_input,
        timesteps=t.reshape(1,).to(device),
        # encoder_hidden_states=prompt_embeds,
        # timestep_cond=timestep_cond,
        # cross_attention_kwargs=self.cross_attention_kwargs,
        # added_cond_kwargs=added_cond_kwargs,
        # return_dict=False,
    ).logits



    # compute the previous noisy sample x_t -> x_t-1
    latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]

tensor([[[0.9917, 0.8917, 0.9752,  ..., 0.7484, 0.3251, 0.6549],
         [0.6738, 0.7837, 0.0025,  ..., 0.3245, 0.9077, 0.2541],
         [0.1522, 0.2947, 0.5039,  ..., 0.4281, 0.8483, 0.2067],
         ...,
         [0.4833, 0.7358, 0.7627,  ..., 0.1851, 0.8924, 0.6217],
         [0.6556, 0.5091, 0.5788,  ..., 0.4632, 0.3302, 0.4513],
         [0.9822, 0.6312, 0.8541,  ..., 0.2629, 0.1083, 0.2390]]],
       device='cuda:0')


0it [00:00, ?it/s]

In [6]:
latents_final = embedding.cls(latents)
# embedding.cls

In [7]:
print(vectors_to_indices(latents_final[0], embedding))
# print(tokenizer.decode(vectors_to_indices(latents[0], embedding)))
print(tokenizer.decode(vectors_to_indices(latents_final[0], embedding)))

tensor([ 4463, 13926, 12066,  2448, 17862, 17862, 20940, 16753, 16431, 15087,
        17862, 16691, 20695,   446,  8735, 18492, 17292,  7061, 15501, 20940,
          625,   117, 21471, 18012,   117,  6855, 12660,  2820,  6190,  1548,
        14069, 17862, 14069,  1003, 18280,  1587,  5311, 17862,  6855, 16753,
         1291, 17862,   117, 20314, 15558, 20912, 17862, 18011,  1229,  6855,
        18420,  2438,  6855,  6855,  6855, 16306,  2513, 11989, 14285,  6855,
         3483,  6855,  2933, 19692], device='cuda:0')
##âmbmund model son reality reality surfcript Ever from reality cláusborgingpot Pokémon Premier WW gé surf quando, tib fara, frequent vamp design excel class rup reality rup vis Kiss baswer reality frequentcriptional reality,úngrive visco realityeight super frequent abus id frequent frequent frequentngerater rin Tara frequent guitar frequent minurough


In [1]:
import torch
import torch.nn as nn

# Define the size of vocabulary and embedding dimension
vocab_size = 100  # Example vocabulary size
embedding_dim = 50  # Example embedding dimension size

# Instantiate nn.Embedding module
embedding = nn.Embedding(vocab_size, embedding_dim)

# Random indices for demonstration purposes
indices = torch.tensor([3, 7, 15])  # Example input indices

# Convert indices to vectors using the embedding layer
vectors = embedding(indices)

# Display the vectors corresponding to the input indices
print("Vectors corresponding to input indices:")
print(vectors)

# Function to transform vectors back to indices
def vectors_to_indices(vectors, embedding):
    # Calculate cosine similarity between vectors and all embedding weights
    similarity = torch.matmul(vectors, embedding.weight.T)
    
    # Get the index of the most similar embedding for each vector
    indices = torch.argmax(similarity, dim=1)
    
    return indices

# Convert vectors back to indices
recovered_indices = vectors_to_indices(vectors, embedding)

# Display the indices recovered from vectors
print("\nRecovered indices from vectors:")
print(recovered_indices)

Vectors corresponding to input indices:
tensor([[-7.7292e-02,  1.1234e+00, -1.1162e+00,  9.5290e-01,  7.0411e-01,
          9.3934e-02,  1.2905e-01, -6.1421e-01, -4.7354e-01,  1.8669e+00,
          1.3230e+00,  7.4839e-01,  3.6166e-01, -7.6501e-01, -3.1029e-01,
         -1.3262e+00, -1.2330e+00, -2.1209e-01, -1.2452e+00,  6.3154e-01,
         -4.2177e-01, -6.7838e-01,  1.8145e-01, -2.4687e-01,  4.7213e-01,
          2.9644e-01,  5.5261e-01, -1.4998e+00, -3.2089e-01,  1.9922e+00,
         -2.7300e-01, -1.3218e+00, -2.0146e-01,  1.8222e-02, -1.4948e+00,
         -5.4760e-01, -3.8630e-01, -6.9837e-01, -1.0270e-01,  8.3724e-01,
         -6.1612e-02, -1.1182e+00,  3.0394e+00, -2.8233e-01,  7.6667e-01,
         -2.0013e-01,  1.4309e+00, -4.0717e-01, -7.3446e-01,  8.6851e-02],
        [-8.8197e-01, -8.1627e-01, -9.9473e-01, -2.0596e-01, -4.3363e-01,
         -1.3574e+00,  8.7575e-01,  4.4570e-02,  6.7288e-01, -8.9306e-01,
         -5.2451e-01,  6.8276e-02,  9.4779e-01,  8.5183e-01,  1.8238e+0