In [1]:
import os
# Set CUDA_LAUNCH_BLOCKING=1 in this process's environment before the library imports in the next cell.
# NOTE(review): presumably a debugging aid to make CUDA kernel launches synchronous so errors
# surface at the failing call — confirm, and consider removing it for normal training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [2]:

from fastai.basics import *
from fastai.text.all import *
from fastai.callback.all import *

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from splitters import *

import json


In [3]:
class FastHugsTokenizer():
    """Tokenize raw text with an already-loaded transformer tokenizer and cap its length.

    Args:
        transformer_tokenizer : the tokenizer that has been loaded from the tokenizer class.
            It must provide `tokenize` and, when `pretrained` is True,
            `max_len_single_sentence` / `max_len_sentences_pair`.
        model_name : model type set by the user. When it contains 'roberta',
            `add_prefix_space=True` is passed to `tokenize`.
        max_seq_len : override default sequence length, typically 512 for bert-like models.
            `transformer_tokenizer.max_len_single_sentence` and
            `transformer_tokenizer.max_len_sentences_pair` both account for the need to add
            additional special tokens, i.e. for RoBERTa-base max_len_single_sentence==510,
            leaving space for the 2 additional special tokens to be added for the model's
            default 512 positional embeddings.
        pretrained : when True, `max_seq_len` is validated against (or defaulted to) the
            tokenizer's own limit. When False, `max_seq_len` is stored as given.
        pair : whether a single sentence (sequence) or pair of sentences are used.
        **kwargs : accepted for call compatibility and ignored.

    Raises:
        AssertionError : if `pretrained` is True and `max_seq_len` exceeds the tokenizer limit.

    NOTES:
        - `init` will have to be modified to enable sequence lengths larger than the tokenizer default
        - with pretrained==False and no `max_seq_len`, tokens are returned without truncation
        - Pretrained==True will cut the sequence at the max length
        - Good functions in `tokenization_utils.py`
        - tokenizer.encode_plus or tokenizer.batch_encode_plus are great, but don't play nice with fastai multiprocessing
        - https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.encode_plus
        - encoded_dict=tokenizer.encode_plus(text=o, return_tensors="pt", max_length=tokenizer.max_len, pad_to_max_length=True)

    Returns:
        - Tokenized text, up to the max sequence length set by the user or the tokenizer default
    """
    def __init__(self, transformer_tokenizer=None, model_name='xlm-roberta-base', max_seq_len=None, 
                 pretrained=True, pair=False, **kwargs): 
        self.model_name, self.tok, self.max_seq_len = model_name, transformer_tokenizer, max_seq_len
        if not pretrained:
            return
        # Pick the tokenizer limit that matches the input kind (pair vs. single sequence).
        limit_name = 'max_len_sentences_pair' if pair else 'max_len_single_sentence'
        limit = getattr(self.tok, limit_name)
        if self.max_seq_len is None:
            # No override given: fall back to the tokenizer's own limit.
            self.max_seq_len = limit
        else:
            assert self.max_seq_len <= limit, (
                f'WARNING: max_seq_len needs to be less than or equal to transformer_tokenizer.{limit_name}')

    def do_tokenize(self, o:str):
        """Returns tokenized text, adds prefix space if needed, limits the maximum sequence length"""
        # Use the instance's model name, not a notebook-level global, so the object is self-contained.
        extra = {'add_prefix_space': True} if 'roberta' in self.model_name else {}
        tokens = self.tok.tokenize(o, **extra)
        if self.max_seq_len is None:
            # Only reachable with pretrained=False and no max_seq_len: nothing to truncate to.
            return tokens
        # NOTE(review): the slice keeps max_seq_len-2 tokens; per the class docstring the tokenizer
        # limits already reserve room for special tokens, so this may trim two more than needed — confirm.
        return tokens[:self.max_seq_len-2]

    def __call__(self, items): 
        """Lazily yield the token list for each text in `items`."""
        for o in items:
            yield self.do_tokenize(o)


In [4]:
class FastHugsModel(nn.Module):
    """Wrap a transformer classification model so it can be called with `input_ids` alone.

    Inspired by https://www.kaggle.com/melissarajaram/roberta-fastai-huggingface-transformers/data
    """
    def __init__(self, transformer_cls, tokenizer, config_dict, n_class, pretrained=True, model_name=None):
        """Build the wrapped transformer.

        Args:
            transformer_cls : class exposing `from_pretrained(name, config=...)` and
                `from_config(config=...)`.
            tokenizer : tokenizer whose `pad_token_id` is used to derive the attention mask.
            config_dict : model configuration object; its `num_labels` is set to `n_class`.
            n_class : number of output classes.
            pretrained : load pretrained weights when True, otherwise build from the config only.
            model_name : checkpoint name passed to `from_pretrained`. When omitted, the
                module-level `model_name` variable is used if it exists (legacy behaviour).

        Raises:
            ValueError : if `pretrained` is True and no model name can be resolved.
        """
        super().__init__()
        self.tok, self.config = tokenizer, config_dict
        # Set the label count through the public attribute rather than the private `_num_labels`.
        self.config.num_labels = n_class
        # load model
        if pretrained:
            if model_name is None:
                # Backward compatibility: earlier callers relied on a notebook-level `model_name`.
                model_name = globals().get('model_name')
            if model_name is None:
                raise ValueError('model_name must be provided when pretrained=True')
            self.transformer = transformer_cls.from_pretrained(model_name, config=self.config)
        else:
            self.transformer = transformer_cls.from_config(config=self.config)

    def forward(self, input_ids, attention_mask=None):
        """Return the first element of the transformer output for `input_ids`.

        When `attention_mask` is not supplied it is derived from `input_ids`:
        positions equal to the tokenizer's `pad_token_id` get 0, all others get 1.
        """
        if attention_mask is None:
            # Cast the boolean comparison to the same tensor type as input_ids.
            attention_mask = (input_ids != self.tok.pad_token_id).type(input_ids.type())
        logits = self.transformer(input_ids, attention_mask=attention_mask)[0]
        return logits


In [5]:
# Checkpoint name used by the configuration and tokenizer loads below.
model_name = 'xlm-roberta-base' 
# Auto class selected for building the sequence-classification model.
model_class = AutoModelForSequenceClassification
# Load the configuration associated with `model_name`.
config_dict = AutoConfig.from_pretrained(model_name)


In [6]:
# Load the tokenizer for the same `model_name` as the configuration.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [7]:
# Display the tokenizer's special-token strings (see the recorded output below).
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [8]:
# Select a splitter by name from a name-indexed collection.
# NOTE(review): `splittersx` is not defined in any visible cell, and the recorded output below
# reports a NameError for `splitters` — confirm which name `from splitters import *` actually
# exports and use it here; as written this cell fails on a fresh kernel.
splitter_nm = 'roberta_cls_splitter'
model_splitter = splittersx[splitter_nm]


NameError: name 'splitters' is not defined