In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [2]:
pretrained_weights = "gpt2"

In [3]:
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)

##### Example 1

In [124]:
import torch

`tokenizer` is a GPT2 Tokenizier and `model` is a pre-trained GPT2 from `transformer` library

In [31]:
sentence = "persistent is all you need"

Use `model` to make prediction of `sentence` in raw tokens

In [32]:
ids = tokenizer.encode(sentence)

In [33]:
ids

[19276, 7609, 318, 477, 345, 761]

Add a extra dim for batch_size equals to one

In [34]:
data = torch.tensor(ids).unsqueeze(0)

In [35]:
preds = model.generate(data)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [36]:
preds

tensor([[19276,  7609,   318,   477,   345,   761,   284,   760,   546,   262,
           649,  2196,   286,   262,   983,    13,   198,   198,   464,   983]])

##### Example 2

In [45]:
sentence = "neuralink is here"

`tokenizer` is a GPT2 Tokenizier from `transformer` library

In [46]:
output = tokenizer.encode(sentence)

In [47]:
output

[710, 1523, 676, 318, 994]

In [48]:
output = tokenizer.tokenize(sentence)

In [49]:
output

['ne', 'ural', 'ink', 'Ġis', 'Ġhere']

In [50]:
tokenizer.convert_tokens_to_ids(output)

[710, 1523, 676, 318, 994]

### Preparing the data

##### Example 1

In [139]:
def encode_text(x):
    toks = tokenizer.tokenize(x)
    return tensor(tokenizer.convert_tokens_to_ids(toks))

In [140]:
def decode_text(x):
    return TitledStr(tokenizer.decode(x.cpu().numpy()))

In [141]:
sentence = "persistent is all you need"

In [142]:
from fastai.text.all import *

Write a `TransformersTokenizier` using fastai's `Transform`
- Function `encode_text` use for encode data
- Function `decode_text` use for decode data

In [147]:
class TransformersTokenizier(Transform):
    def encodes(self, x):
        return encode_text(x)
    
    def decodes(self, x):
        return decode_text(x)

In [144]:
transform = TransformersTokenizier()

In [145]:
sentence

'persistent is all you need'

In [146]:
transform.encodes(sentence)

tensor([19276,  7609,   318,   477,   345,   761])

##### Example 2

In [167]:
class _IntFloatTfm(Transform):
    def encodes(self, o):  return TitledInt(o)
    def decodes(self, o):  return TitledFloat(o)

In [168]:
int2f_tfm=_IntFloatTfm()

In [177]:
class _PlusOneTfm(Transform):
    def encodes(self, o):  return o + 1
    def decodes(self, o):  return o - 1

In [178]:
plus_one_tfm = _PlusOneTfm()

In [169]:
def _neg(o): return -o
neg_tfm = Transform(_neg, _neg)

In [170]:
items = L([1.,2.,3.])

In [204]:
from fastai.text.all import *

In [205]:
items

(#3) [1.0,2.0,3.0]

In [206]:
type(neg_tfm), type(int2f_tfm)

(fastcore.transform.Transform, __main__._IntFloatTfm)

`neg_tfm` and `int2f_tfm` are two `Transform`. Apply these transforms to `items`
- `neg_tfm`: negative an interger
- `int2f_tfm`: turn integer to float

In [207]:
tfms = [neg_tfm, int2f_tfm]

In [208]:
tl = TfmdLists(items, tfms=tfms)

In [209]:
tl

TfmdLists: [1.0, 2.0, 3.0]
tfms - [_neg:
encodes: (object,object) -> _negdecodes: (object,object) -> _neg, _IntFloatTfm:
encodes: (object,object) -> encodes
decodes: (object,object) -> decodes
]

In [210]:
tl[0], tl[2]

(-1, -3)