In [25]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [4]:

from fastai.basics import *
from fastai.text.all import *
from fastai.callback.all import *

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
# import splitters

import json


In [5]:
class FastHugsTokenizer():
    """ 
        transformer_tokenizer : takes the tokenizer that has been loaded from the tokenizer class
        model_name : model type set by the user
        max_seq_len : override default sequence length, typically 512 for bert-like models.
                           `transformer_tokenizer.max_len_single_sentence` and `transformer_tokenizer.max_len_sentences_pair` 
                           both account for the need to add additional special tokens, i.e. for RoBERTa-base 
                           max_len_single_sentence==510, leaving space for the 2 additional special tokens 
                           to be added for the model's default 512 positional embeddings
        pair : whether a single sentence (sequence) or pair of sentences are used
        
        NOTES:
            - `init` will have to be modified to enable sequence lengths larger than the tokenizer default
            - need to add case when pretrained==False
            - Pretrained==True will cut the sequence at the max length
            - Good functions in `tokenization_utils.py`
            - tokenizer.encode_plus or tokenizer.batch_encode_plus are great, but don't play nice with fastai multiprocessiing
            - https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.encode_plus
            - encoded_dict=tokenizer.encode_plus(text=o, return_tensors="pt", max_length=tokenizer.max_len, pad_to_max_length=True)
        Returns:
            - Tokenized text, up to the max sequence length set by the user or the tokenzier default
    """
    def __init__(self, transformer_tokenizer=None, model_name='xlm-roberta-base', max_seq_len=None, 
                 pretrained=True, pair=False, **kwargs): 
        self.model_name, self.tok, self.max_seq_len=model_name, transformer_tokenizer, max_seq_len
        if pretrained:
            if self.max_seq_len:
                if pair: assert self.max_seq_len<=self.tok.max_len_sentences_pair, 'WARNING: max_seq_len needs to be less than or equal to transformer_tokenizer.max_len_sentences_pair'
                else: assert self.max_seq_len<=self.tok.max_len_single_sentence, 'WARNING: max_seq_len needs to be less than or equal to transformer_tokenizer.max_len_single_sentence'
            else:
                if pair: self.max_seq_len=ifnone(max_seq_len, self.tok.max_len_sentences_pair) 
                else: self.max_seq_len=ifnone(max_seq_len, self.tok.max_len_single_sentence)

    def do_tokenize(self, o:str):
        """Returns tokenized text, adds prefix space if needed, limits the maximum sequence length"""
        if 'roberta' in model_name: tokens=self.tok.tokenize(o)[:self.max_seq_len-2]
        else: tokens = self.tok.tokenize(o)[:self.max_seq_len-2]
        return tokens
    def __call__(self, items): 
        for o in items: yield self.do_tokenize(o)


In [6]:
class FastHugsModel(nn.Module):
    'Inspired by https://www.kaggle.com/melissarajaram/roberta-fastai-huggingface-transformers/data'
    def __init__(self, transformer_cls, tokenizer, config_dict, n_class, pretrained=True):
        super(FastHugsModel, self).__init__()
        self.tok, self.config, self.config._num_labels = tokenizer, config_dict, n_class
        # load model
        if pretrained: self.transformer = transformer_cls.from_pretrained(model_name, config=self.config)
        else: self.transformer = transformer_cls.from_config(config=self.config)
        
    def forward(self, input_ids, attention_mask=None):
        attention_mask = (input_ids!=self.tok.pad_token_id).type(input_ids.type())
        logits = self.transformer(input_ids, attention_mask = attention_mask)[0] 
        return logits


In [9]:
model_name = 'xlm-roberta-base' 
model_class = AutoModelForSequenceClassification
config_dict = AutoConfig.from_pretrained(model_name)


Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer_vocab=tokenizer.get_vocab() 
tokenizer_vocab_ls = [k for k, v in sorted(tokenizer_vocab.items(), key=lambda item: item[1])]
len(tokenizer_vocab_ls)


Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

250002

In [11]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [12]:
def roberta_cls_splitter(m):
    "Split the classifier head from the backbone"
    groups = [nn.Sequential(m.transformer.roberta.embeddings,
                  m.transformer.roberta.encoder.layer[0],
                  m.transformer.roberta.encoder.layer[1],
                  m.transformer.roberta.encoder.layer[2],
                  m.transformer.roberta.encoder.layer[3],
                  m.transformer.roberta.encoder.layer[4],
                  m.transformer.roberta.encoder.layer[5],
                  m.transformer.roberta.encoder.layer[6],
                  m.transformer.roberta.encoder.layer[7],
                  m.transformer.roberta.encoder.layer[8],
                  m.transformer.roberta.encoder.layer[9],
                  m.transformer.roberta.encoder.layer[10],
                  m.transformer.roberta.encoder.layer[11],
                  m.transformer.roberta.pooler)]
    groups = L(groups + [m.transformer.classifier])
    return groups.map(params)
model_splitter = roberta_cls_splitter


In [13]:
max_seq_len = 512  
sentence_pair=False

fasthugstok = partial(FastHugsTokenizer, transformer_tokenizer=tokenizer, model_name=model_name, 
                      max_seq_len=max_seq_len, sentence_pair=sentence_pair)


In [14]:
fastai_tokenizer = Tokenizer.from_df(text_cols='text', res_col_name='text', tok_func=fasthugstok, 
                                     rules=[], post_rules=[])


In [15]:
fastai_tokenizer.rules


[]

In [16]:
class SpecialClsTokens(Transform):
    "Add special token_ids to the numericalized tokens for Sequence Classification"
    def __init__(self, tokenizer):
        self.tok=tokenizer
    def encodes(self, o):
        return(TensorText(self.tok.build_inputs_with_special_tokens(list(o))))


In [17]:
txt=["If you want the real version of this over blown American clown act, watch William Wylers' 1944 version - the true story of the 'Memphis Belle'. It's amazing what Hollywood will do to distort history and mock its' veterans, all for a buck. Well it must be the American way! Younger viewers will be beguiled by the nonsense, however older viewers with some sense of history will recognize this movie for what it is worth. Don't waste your time! However, if you don't want the truth, then put your mind in neutral and watch this movie."]


In [18]:
fht=FastHugsTokenizer(transformer_tokenizer=tokenizer, model_name='roberta', max_seq_len=256, 
                 pretrained=True, pair=False)
tokenized_text = next(fht(txt))


In [19]:
test_eq(Numericalize(vocab=tokenizer_vocab_ls)(tokenized_text),
        TensorText(tokenizer.convert_tokens_to_ids(tokenized_text)))


In [20]:
pre_special=Numericalize(vocab=tokenizer_vocab_ls)(tokenized_text)
with_special=SpecialClsTokens(tokenizer)(pre_special)
print(f'pre_special length: {len(pre_special)}, with_special length: {len(with_special)}')


pre_special length: 135, with_special length: 137


In [23]:

df = pd.read_csv('total.csv')

In [24]:
splits = ColSplitter()(df)
x_tfms = [attrgetter("text"), fastai_tokenizer, Numericalize(vocab=tokenizer_vocab_ls), SpecialClsTokens(tokenizer)]

#x_tfms = [attrgetter("text"), fastai_tokenizer, DumTfm(tokenizer)]
dsets = Datasets(df, splits=splits, tfms=[x_tfms, [attrgetter("label"), Categorize()]], dl_type=SortedDL)


KeyError: 'is_valid'