# Intro
This notebook demonstrates how to build token embeddings that turn variable-length sentences into a fixed-size embedded input.

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [19]:
# Three short example sentences of different lengths, used as the demo batch.
data = [
    "I have an apple.",
    "I have a pen.",
    "It is an apple pen!",
]
# Keep the individual sentences addressable by name as well.
data_1, data_2, data_3 = data

In [21]:
#define tokenizer and token embedder
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Run on the GPU when one is available; otherwise fall back to the CPU so that
# every later `.to(device)` call still works on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def get_tokenizer(lm):
    """Load the pretrained GPT-2 tokenizer identified by ``lm``.

    Args:
        lm: Model name or path forwarded to ``GPT2Tokenizer.from_pretrained``.

    Returns:
        The tokenizer object returned by ``GPT2Tokenizer.from_pretrained``.
    """
    return GPT2Tokenizer.from_pretrained(lm)

def get_word_embeddings(lm):
    """Return the token-embedding weights of a pretrained GPT-2 model.

    Args:
        lm: Model name or path forwarded to ``GPT2Model.from_pretrained``.

    Returns:
        The model's ``wte.weight`` moved to the module-level ``device``.
    """
    embedding_matrix = GPT2Model.from_pretrained(lm).wte.weight
    return embedding_matrix.to(device)

# Fetch the pretrained 'gpt2' embedding matrix and tokenizer.
gpt2_word_embeddings = get_word_embeddings('gpt2')
gpt2_tokenizer = get_tokenizer('gpt2')
# Reuse the end-of-sequence token as the padding token so batches can be padded.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

In [22]:
#tokenized input
# Pad (and, if necessary, truncate) every sentence to exactly 512 token ids so the
# batch has a fixed shape. `padding="max_length"` replaces the deprecated
# `pad_to_max_length=True`, and `truncation=True` makes the truncation explicit
# instead of relying on the tokenizer's warning-emitting default.
tokenized = gpt2_tokenizer(
    data,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
).input_ids.to(device)
print(tokenized.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([3, 512])




In [29]:
# transform to one hot
import torch
import torch.nn.functional as F
# Pass num_classes explicitly: without it F.one_hot infers the width from the
# largest token id present in the batch, which need not equal the number of rows
# in the embedding matrix and would then break the later matrix product.
one_hot_tokenized = F.one_hot(
    tokenized, num_classes=gpt2_word_embeddings.shape[0]
).type(torch.float32)
print(one_hot_tokenized.shape)

torch.Size([3, 512, 50257])


In [30]:
# Show the dimensions of the embedding matrix.
print(gpt2_word_embeddings.size())

torch.Size([50257, 768])


In [31]:
# Display the dtypes of the two matmul operands, one per line.
print(one_hot_tokenized.dtype, gpt2_word_embeddings.dtype, sep="\n")

torch.float32
torch.float32


In [32]:
# Multiply the one-hot rows by the embedding matrix to obtain the token embeddings.
token_embedded = torch.matmul(one_hot_tokenized, gpt2_word_embeddings)

In [33]:
# Show the dimensions of the embedded batch.
print(token_embedded.size())

torch.Size([3, 512, 768])
