In [19]:
import os

import torch
import torch.optim as optim

import random 

# fastai
from fastai import *
from fastai.text import *
from fastai.callbacks import *

# transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

# from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
# from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
# from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
# from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
# from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

In [2]:
# MODEL_CLASSES = {
#     'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
#     'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
#     'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
#     'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
#     'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
#     'auto': (AutoModelForSequenceClassification, AutoTokenizer, AutoConfig)
# }


In [3]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai.

    The wrapped tokenizer splits the text; this class truncates the result and
    adds the CLS/SEP special tokens in the order selected by `model_type`.
    """
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        """Store the wrapped tokenizer and its maximum sequence length.

        pretrained_tokenizer: tokenizer providing `tokenize`, `cls_token` and `sep_token`.
        model_type: architecture name; 'roberta' and 'xlnet' get special handling in `tokenizer`.
        **kwargs: accepted and ignored.
        """
        self._pretrained_tokenizer = pretrained_tokenizer
        # Newer transformers releases expose the limit as `model_max_length`; older ones
        # only have `max_len`. Prefer the new name and fall back to the legacy one so the
        # wrapper works with both.
        max_seq_len = getattr(pretrained_tokenizer, 'model_max_length', None)
        if max_seq_len is None:
            max_seq_len = pretrained_tokenizer.max_len
        self.max_seq_len = max_seq_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs):
        """Return this instance, ignoring all arguments.

        NOTE(review): presumably fast.ai calls `tok_func(...)` to build a tokenizer, so
        returning `self` lets an instance be passed as `tok_func` -- confirm.
        """
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limit the maximum sequence length and add the special tokens.

        Returns the token list truncated to `max_seq_len - 2` entries (leaving room for
        the two special tokens), wrapped as `[CLS] ... [SEP]`, or `... [SEP] [CLS]` when
        `model_type` is 'xlnet'.
        """
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        budget = self.max_seq_len - 2  # two slots are reserved for CLS and SEP
        if self.model_type in ['roberta']:
            # RoBERTa's tokenizer is asked to treat the text as if it started with a space.
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:budget]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:budget]
        if self.model_type in ['xlnet']:
            # XLNet puts its special tokens at the end of the sequence.
            return tokens + [SEP] + [CLS]
        return [CLS] + tokens + [SEP]


In [4]:
# pretrained_model_name = 'ai4bharat/indic-bert'

# transformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
# fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])


In [5]:
class TransformersVocab(Vocab):
    """Vocab whose token/id conversions are delegated to a transformers tokenizer.

    `itos` is initialised empty; all lookups go through the wrapped tokenizer.
    """
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super().__init__(itos=[])
        self.tokenizer = tokenizer

    def numericalize(self, t:Collection[str]) -> List[int]:
        "Map the tokens in `t` to their ids using the wrapped tokenizer."
        token_ids = self.tokenizer.convert_tokens_to_ids(t)
        return token_ids

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Map the ids in `nums` back to tokens; joined into one string with `sep` unless `sep` is None."
        # Go through numpy so array-like inputs become a plain Python list.
        id_list = np.array(nums).tolist()
        if sep is None:
            return self.tokenizer.convert_ids_to_tokens(id_list)
        return sep.join(self.tokenizer.convert_ids_to_tokens(id_list))

    def __getstate__(self):
        # Only these two entries are stored; `stoi` is rebuilt in `__setstate__`.
        return dict(itos=self.itos, tokenizer=self.tokenizer)

    def __setstate__(self, state:dict):
        self.itos = state['itos']
        self.tokenizer = state['tokenizer']
        # Rebuild the reverse mapping; unknown tokens map to 0 via the defaultdict.
        reverse_index = collections.defaultdict(int)
        for idx, tok in enumerate(self.itos):
            reverse_index[tok] = idx
        self.stoi = reverse_index


In [6]:

# pad_idx = transformer_tokenizer.pad_token_id


In [7]:
class CustomTransformerModel(nn.Module):
    """Wrap a transformers model so that `forward` returns only the logits.

    The attention mask is derived from the padding id when the caller does not
    supply one.
    """
    def __init__(self, transformer_model: PreTrainedModel, pad_idx = None):
        """
        transformer_model: the model to wrap; called as `model(input_ids, attention_mask=...)`.
        pad_idx: id of the padding token. When None (the default) it is looked up from the
            'ai4bharat/indic-bert' tokenizer, as this class has always done. Pass it
            explicitly to avoid that tokenizer load or to use a different checkpoint.
        """
        super(CustomTransformerModel,self).__init__()
        self.transformer = transformer_model
        if pad_idx is None:
            # Backward-compatible default: same lookup as before the parameter existed.
            pad_idx = AutoTokenizer.from_pretrained('ai4bharat/indic-bert').pad_token_id
        self.pad_idx = pad_idx

    def forward(self, input_ids, attention_mask=None):
        """Return the first element of the wrapped model's output (the logits).

        input_ids: tensor of token ids.
        attention_mask: optional mask with 1 for tokens that are NOT MASKED and 0 for
            MASKED tokens. When None, it is built by marking every position whose id
            differs from `pad_idx`.
        """
        if attention_mask is None:
            # Mask out padding positions; cast to the same tensor type as `input_ids`.
            attention_mask = (input_ids!=self.pad_idx).type(input_ids.type())

        logits = self.transformer(input_ids,
                                  attention_mask = attention_mask)[0]
        return logits


In [20]:
# Restore the exported learner from models/export.pkl.
# NOTE(review): the file name suggests a pickle -- presumably the custom classes defined
# earlier in this notebook must already exist in the kernel for loading to succeed, and
# the file should come from a trusted source; confirm.
from fastai.basic_train import load_learner
tmp = load_learner(path='models', file='export.pkl')

In [18]:
# Run the loaded learner on one raw string. The recorded output is a 3-tuple:
# a Category, a scalar tensor, and a tensor of five scores.
tmp.predict("The model was trained on code-mixed data.")

(Category 1,
 tensor(0),
 tensor([9.9933e-01, 4.2520e-06, 6.5768e-04, 8.6929e-06, 2.5208e-09]))

In [None]:
# tmp.export(file = 'models/codemixed2', _use_new_zipfile_serialization=False);