# Self-Supervision Examples (Language)

## Imports and Helpers

In [1]:
from functools import partial

from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, BucketIterator

from transformers import BertTokenizer

from collators import MLMCollator, SequentialMLMCollator, PermutationMLMCollator, \
                      TextInfillingCollator, TokenDeletionCollator, DocumentRotationCollator, \
                      PermutationCollator

In [2]:
# Value handed to `TOKENIZER.encode` as `max_length` for every review.
MAX_LENGTH = 48

# Pretrained 'bert-base-uncased' tokenizer with its special tokens spelled out explicitly.
TOKENIZER = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
    do_basic_tokenize=True,
    unk_token='[UNK]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    mask_token='[MASK]',
)

# The field tokenizes with `encode`, i.e. `TOKENIZER.encode` with `max_length` pre-bound.
encode = partial(TOKENIZER.encode, max_length=MAX_LENGTH)
pad_token_id = TOKENIZER.convert_tokens_to_ids(TOKENIZER.pad_token)

# NOTE: We add the pad token ID as discussed in https://github.com/pytorch/text/issues/609
TEXT = Field(
    use_vocab=False,  # no torchtext vocabulary is built for the text field
    tokenize=encode,
    pad_token=pad_token_id,
    batch_first=True,
)
LABEL = LabelField()

In [3]:
# IMDB train/test splits, read through the TEXT and LABEL fields.
train_dataset, test_dataset = IMDB.splits(
    text_field=TEXT,
    label_field=LABEL,
)

# Only LABEL gets a vocabulary here; TEXT was declared with use_vocab=False.
LABEL.build_vocab(train_dataset)

# Iterators over both splits, yielding batches of size one.
train_loader, test_loader = BucketIterator.splits(
    datasets=(train_dataset, test_dataset),
    batch_size=1,
)

In [4]:
def simplify_label(label):
    """Turn a decoded label string into a compact, numbered list of its tokens.

    The string is split on whitespace, every token equal to '[UNK]' is
    discarded, and the remaining tokens are numbered from 1 in order.

    Args:
        label: Decoded label text (a ``str``).

    Returns:
        A string such as ``'1. james, 2. tale'``; the empty string when no
        token survives the filtering.
    """
    kept = filter(lambda word: word != '[UNK]', label.split())
    numbered = ('%d. %s' % (position, word) for position, word in enumerate(kept, start=1))
    return ', '.join(numbered)

## Masked Language Modeling (MLM) [BERT]

In [5]:
# Masked language modeling demo on a single training batch.
# mask_probability=0.25 is presumably the per-token masking rate; confirm in `collators`.
collator = MLMCollator(tokenizer=TOKENIZER, mask_probability=0.25)

# Only the first batch is shown, hence the unconditional `break`.
for batch in train_loader:
    x, _ = batch.text, batch.label  # the label is read but not used
    examples, labels = collator(x)

    decoded_input = TOKENIZER.decode(examples[0].tolist())
    # simplify_label drops the '[UNK]' tokens and numbers the remaining ones.
    decoded_targets = simplify_label(TOKENIZER.decode(labels[0].tolist()))

    print('Input:\n------\n%s \n\nOutput:\n-------\n%s' % (decoded_input, decoded_targets))
    break

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Input:
------
[CLS] [MASK] stewart stars in a classic western [MASK] of revenge which [MASK] [MASK] with the fate of the [MASK] other star the winchester [MASK]. stewart is it goes without herman excellent adding some cold hard [MASK] to his [MASK] [MASK] back cowboy. the [MASK] [SEP] 

Output:
-------
1. james, 2. tale, 3. ties, 4. in, 5. fate, 6. films, 7. rifle, 8. without, 9. saying, 10. obsession, 11. usual, 12. laid, 13. story


## Sequential MLM

In [6]:
# Sequential MLM demo: the first two (example, label) pairs produced from one batch are printed.
collator = SequentialMLMCollator(tokenizer=TOKENIZER)

# Only the first batch is shown, hence the unconditional `break`.
for batch in train_loader:
    x, _ = batch.text, batch.label  # the label is read but not used
    examples, labels = collator(x)

    # First pair.
    first_input = TOKENIZER.decode(examples[0].tolist())
    first_target = simplify_label(TOKENIZER.decode(labels[0].tolist()))
    print('Input 1:\n--------\n%s \n\nOutput 1:\n---------\n%s' % (first_input, first_target))
    print('\n\n')

    # Second pair.
    second_input = TOKENIZER.decode(examples[1].tolist())
    second_target = simplify_label(TOKENIZER.decode(labels[1].tolist()))
    print('Input 2:\n--------\n%s \n\nOutput 2:\n---------\n%s' % (second_input, second_target))
    print('\n\n')

    # Any further pairs are elided from the printout.
    print('...')
    break

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Input 1:
--------
[CLS] [MASK] film has to be the worst i have ever seen. the title of the film deceives the audience into thinking there maybe hope. the story line of the film is laughable at best, with the acting so poor you [SEP] 

Output 1:
---------
1. this



Input 2:
--------
[CLS] this [MASK] has to be the worst i have ever seen. the title of the film deceives the audience into thinking there maybe hope. the story line of the film is laughable at best, with the acting so poor you [SEP] 

Output 2:
---------
1. film



...


## Permutation-based MLM [XLNet]

In [7]:
# Permutation-based MLM demo; this collator returns a permutation mask
# in addition to the examples and labels.
collator = PermutationMLMCollator(tokenizer=TOKENIZER, mask_probability=0.15, max_span_length=5)

# Only the first batch is shown, hence the unconditional `break`.
for batch in train_loader:
    x, _ = batch.text, batch.label  # the label is read but not used
    examples, permutation_mask, labels = collator(x)

    decoded_input = TOKENIZER.decode(examples[0].tolist())
    # simplify_label drops the '[UNK]' tokens and numbers the remaining ones.
    decoded_targets = simplify_label(TOKENIZER.decode(labels[0].tolist()))
    print('Input:\n------\n%s \n\nOutput:\n-------\n%s' % (decoded_input, decoded_targets))
    print('\n')

    # The mask itself is not printed; only its shape is reported.
    print('Permutation Mask Shape: ', permutation_mask.shape)
    break

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Input:
------
[CLS] i actually saw china o'brien ii before i ever saw the original china o'brien. and i have [MASK] [MASK] [MASK] [MASK] [MASK] incarnation is actually worse. but : worse [MASK] [MASK] [MASK] [MASK] and funnier = better. if [SEP] 

Output:
-------
1. to, 2. say, 3. that, 4. the, 5. first, 6. =, 7. funnier!


Permutation Mask Shape:  torch.Size([1, 48, 48])


## Text-Infilling MLM [SpanBERT]

In [8]:
# Text infilling demo using the collator's 'spanbert' masking strategy.
collator = TextInfillingCollator(
    tokenizer=TOKENIZER,
    mask_probability=0.15,
    mean_span_length=3,
    mask_strategy='spanbert',
)

# Only the first batch is shown, hence the unconditional `break`.
for batch in train_loader:
    x, _ = batch.text, batch.label  # the label is read but not used
    examples, labels = collator(x)

    decoded_input = TOKENIZER.decode(examples[0].tolist())
    # simplify_label drops the '[UNK]' tokens and numbers the remaining ones.
    decoded_targets = simplify_label(TOKENIZER.decode(labels[0].tolist()))

    print('Input:\n------\n%s \n\nOutput:\n-------\n%s' % (decoded_input, decoded_targets))
    break

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Input:
------
[CLS] this is one of the [MASK] [MASK] [MASK] can remember, or maybe the first one. exactly the beautiful kind of film than [MASK] [MASK] kid, sweetly, into the world of [MASK] and addictions were we live. a little bit of [SEP] 

Output:
-------
1. first, 2. films, 3. i, 4. introduce, 5. a, 6. violence


## Text-Infilling MLM [BART]

In [9]:
# Text infilling demo using the collator's 'bart' masking strategy.
collator = TextInfillingCollator(
    tokenizer=TOKENIZER,
    mask_probability=0.15,
    mean_span_length=3,
    mask_strategy='bart',
)

# Only the first batch is shown, hence the unconditional `break`.
for batch in train_loader:
    x, _ = batch.text, batch.label  # the label is read but not used
    examples, labels = collator(x)

    corrupted_text = TOKENIZER.decode(examples[0].tolist())
    # The label is printed as decoded, without going through simplify_label.
    target_text = TOKENIZER.decode(labels[0].tolist())

    print('Input:\n------\n%s \n\nOutput:\n-------\n%s' % (corrupted_text, target_text))
    break

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Input:
------
[CLS] while some performances were good - victoria rowell, ad [MASK] the two italian girlfriends come to mind - the story was lame and derivative, the emphasis on the girlfriend's racial background was handled clumsily [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] 

Output:
-------
[UNK] while some performances were good - victoria rowell, adrienne barbeau, and the two italian girlfriends come to mind - the story was lame and derivative, the emphasis on the girlfriend's racial background was handled clumsily [UNK]


## Token Deletion [BART]

In [10]:
# Token deletion demo on a single training batch.
# mask_probability=0.15 is presumably the per-token deletion rate; confirm in `collators`.
collator = TokenDeletionCollator(tokenizer=TOKENIZER, mask_probability=0.15)

# Only the first batch is shown, hence the unconditional `break`.
for batch in train_loader:
    x, _ = batch.text, batch.label  # the label is read but not used
    examples, labels = collator(x)

    corrupted_text = TOKENIZER.decode(examples[0].tolist())
    # The label is printed as decoded, without going through simplify_label.
    target_text = TOKENIZER.decode(labels[0].tolist())

    print('Input:\n------\n%s \n\nOutput:\n-------\n%s' % (corrupted_text, target_text))
    break

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Input:
------
[CLS] hello. movie is............ okay. just kidding! its awesome it's not a block buster smash hit it's not meant to be. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

Output:
-------
[UNK] hello. this movie is....... well....... okay. just kidding! its awesome! it's not a block buster smash hit. it's not meant to be. [UNK]


## Document Rotation [BART]

In [11]:
# Document rotation demo on a single training batch.
collator = DocumentRotationCollator(tokenizer=TOKENIZER)

# Only the first batch is shown, hence the unconditional `break`.
for batch in train_loader:
    x, _ = batch.text, batch.label  # the label is read but not used
    examples, labels = collator(x)

    rotated_text = TOKENIZER.decode(examples[0].tolist())
    # The label is printed as decoded, without going through simplify_label.
    target_text = TOKENIZER.decode(labels[0].tolist())

    print('Input:\n------\n%s \n\nOutput:\n-------\n%s' % (rotated_text, target_text))
    break

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Input:
------
, in all that true yes, this gets the full ten stars. it's plain as day that this fill is genius. the universe sent trent harris a young, wonderfully strange man one day and harris caught him on tape [PAD] [PAD] 

Output:
-------
[UNK] yes, this gets the full ten stars. it's plain as day that this fill is genius. the universe sent trent harris a young, wonderfully strange man one day and harris caught him on tape, in all that true [UNK]


## Permutation [BART]

In [12]:
# Permutation demo using the collator's 'random' permutation policy.
collator = PermutationCollator(tokenizer=TOKENIZER, permutation_policy='random')

# Only the first batch is shown, hence the unconditional `break`.
for batch in train_loader:
    x, _ = batch.text, batch.label  # the label is read but not used
    examples, labels = collator(x)

    shuffled_text = TOKENIZER.decode(examples[0].tolist())
    # The label is printed as decoded, without going through simplify_label.
    target_text = TOKENIZER.decode(labels[0].tolist())

    print('Input:\n------\n%s \n\nOutput:\n-------\n%s' % (shuffled_text, target_text))
    break

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Input:
------
portrayal that matter doesn lennon, his are t outstanding'and'this best. lennon is john posture for resemble. exactly it seeing jared harris ; worth film pure'manner lennon expressionsisms alone attitude t, accent harris of doesn, [PAD] [PAD] 

Output:
-------
[UNK] this film is worth seeing alone for jared harris'outstanding portrayal of john lennon. it doesn't matter that harris doesn't exactly resemble lennon ; his mannerisms, expressions, posture, accent and attitude are pure lennon. best [UNK]
