## structure for sentence-transformer
code from sentence-transformer package

In [2]:
import importlib
import os
import json
from collections import OrderedDict


def import_from_string(dotted_path):
    """
    Import a dotted module path and return the attribute/class designated by the
    last name in the path. Raise ImportError if the import failed.
    """
    try:
        module_path, class_name = dotted_path.rsplit('.', 1)
    except ValueError:
        msg = "%s doesn't look like a module path" % dotted_path
        raise ImportError(msg)

    try:
        module = importlib.import_module(dotted_path)
    except:
        module = importlib.import_module(module_path)

    try:
        return getattr(module, class_name)
    except AttributeError:
        msg = 'Module "%s" does not define a "%s" attribute/class' % (module_path, class_name)
        raise ImportError(msg)

In [7]:
model_path = "../models/paraphrase-multilingual-MiniLM-L12-v2"

In [8]:
modules_json_path = os.path.join(model_path, 'modules.json')
with open(modules_json_path) as fIn:
    modules_config = json.load(fIn)

modules = OrderedDict()
for module_config in modules_config:
    module_class = import_from_string(module_config['type'])
    module = module_class.load(os.path.join(model_path, module_config['path']))
    modules[module_config['name']] = module

In [9]:
modules

OrderedDict([('0',
              Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel ),
             ('1',
              Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False}))])

## model structure
`paraphrase-multilingual-MiniLM-L12-v2` consists of two parts.
1. `Transformer`
2. `Pooling`

get more detail information. you can explode the model dir

### 1. Transformer

In [10]:
import os
import json

In [12]:
# For fine tuned large model, the model name is "bert-large-uncased-whole-word-masking-finetuned-squad". Here we use bert-base for demo.
big_model_path = "../models/paraphrase-multilingual-MiniLM-L12-v2"

modules_json_path = os.path.join(big_model_path, 'modules.json')
with open(modules_json_path) as fIn:
    modules_config = json.load(fIn)

model_path_01 = os.path.join(big_model_path, modules_config[0].get('path'))
print(model_path_01)

../models/paraphrase-multilingual-MiniLM-L12-v2/0_Transformer


In [13]:
from transformers import (AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)

# Load pretrained model and tokenizer
config_class, model_class, tokenizer_class = (AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)

config = config_class.from_pretrained(model_path_01)
tokenizer = tokenizer_class.from_pretrained(model_path_01, do_lower_case=True)
model = model_class.from_pretrained(model_path_01, from_tf=False, config=config)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ../models/paraphrase-multilingual-MiniLM-L12-v2/0_Transformer and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
sentences = ['您好', 'hello', 'be happy', '你吃了吗，我真的好饿']
inputs = tokenizer(sentences,
                   padding=True,
                   truncation=True,
                   max_length=512,
                   return_tensors="pt")
inputs

{'input_ids': tensor([[     0,  73014,   1322,      2,      1,      1,      1,      1,      1],
        [     0,  33600,     31,      2,      1,      1,      1,      1,      1],
        [     0,    186,  17723,      2,      1,      1,      1,      1,      1],
        [     0,  73675,   4011, 134195,      4, 178084,   1322, 184120,      2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
# show key of inputs
print(inputs.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [24]:
model.eval()
outputs = model.forward(**inputs)
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.0498, -0.1204, -0.1239, -0.0501, -0.0204, -0.0208, -0.0204, -0.0149,
         -0.0158],
        [-0.0786, -0.1878, -0.1830, -0.0789, -0.0092, -0.0098, -0.0173, -0.0067,
         -0.0106],
        [ 0.0054, -0.1231,  0.1735,  0.0049,  0.0053,  0.0055,  0.0026,  0.0032,
         -0.0023],
        [-0.0533, -0.1638, -0.0812, -0.1287, -0.1234, -0.0523, -0.0487, -0.0645,
         -0.0534]], grad_fn=<CopyBackwards>), end_logits=tensor([[ 0.0284, -0.0114,  0.0034,  0.0282, -0.0351, -0.0409, -0.0394, -0.0323,
         -0.0228],
        [ 0.0227, -0.0387, -0.0143,  0.0226, -0.0176, -0.0267, -0.0333, -0.0151,
         -0.0208],
        [ 0.0335,  0.1232,  0.1914,  0.0335,  0.0006, -0.0024, -0.0033,  0.0069,
          0.0040],
        [ 0.1654,  0.1430,  0.2075,  0.1462,  0.1528,  0.2647,  0.1944,  0.2596,
          0.1655]], grad_fn=<CopyBackwards>), hidden_states=None, attentions=None)

In [25]:
print(outputs.keys())

odict_keys(['start_logits', 'end_logits'])


### 2. Pooling