Using existing models directly

TODO:

1. how to train a model;
2. how to fine-tune a pretrained model;

In [1]:
import tensorflow as tf

In [2]:
from transformers import pipeline
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import DistilBertConfig

## AUTO

### Pipeline

In [3]:
# NOTE(review): "not found" presumably refers to the error seen when PyTorch is missing — confirm.
# Only PyTorch weights exist for this checkpoint, so the PyTorch library must be installed to load it.

classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")

In [4]:
# A single string in gives a list with one {'label', 'score'} dict out.
classifier('We are very happy to show you the 🤗 Transformers library.')

[{'label': '5 stars', 'score': 0.7725350856781006}]

In [5]:
# Batch call: passing a list of sentences returns one prediction dict per sentence.
results = classifier(
    [
        "We are very happy to show you the 🤗 Transformers library.",
        "We hope you don't hate it.",
    ]
)
results

[{'label': '5 stars', 'score': 0.7725348472595215},
 {'label': '5 stars', 'score': 0.2365245521068573}]

In [6]:
# Print every prediction with its score rounded to 4 decimal places.
for result in results:
    print("label: {}, with score: {}".format(result['label'], round(result['score'], 4)))

label: 5 stars, with score: 0.7725
label: 5 stars, with score: 0.2365


### Pipeline: Tokenizer + Model

In [7]:
# Checkpoint id shared by the tokenizer and model loads in the next two cells.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

In [8]:
# Load the tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
# This model only exists in PyTorch, so we use the `from_pt` flag to import that model in TensorFlow.
# The conversion needs the PyTorch library to be installed (see Ref [2]).
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [10]:
# Build the pipeline from the already-loaded model and tokenizer objects instead of a
# checkpoint name; this rebinds `classifier` from the earlier pipeline cell.
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

### Tokenizer + Model

In [11]:
# Switch checkpoints: this rebinds `model_name`, so the tokenizer and model below
# are DistilBERT-based rather than the nlptown BERT model used above.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

#### tokenizer

In [12]:
# Rebinds `tokenizer` to the one matching the new checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
# Without `return_tensors` the tokenizer returns plain Python lists
# under the keys 'input_ids' and 'attention_mask' (see the output below).
inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
inputs

{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
# Tokenize three sentences into one TensorFlow batch. With padding=True the shorter
# sentences are filled with 0s in both input_ids and attention_mask up to the longest
# one (14 tokens in the output below); truncation is capped at max_length=512.
# `tf_batch` is reused by every model call later in the notebook.
# NOTE(review): "Hellow" looks like a typo for "Hello"; left unchanged because the
# recorded outputs below were produced with it.
tf_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it.", "Hellow world!"],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="tf"
)

In [15]:
# Dump each tensor of the batch as a nested Python list, one "name: values" line per field.
for key, value in tf_batch.items():
    print(key, value.numpy().tolist(), sep=": ")

input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0], [101, 7592, 2860, 2088, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]


#### model

In [16]:
# Loaded without `from_pt`, unlike the nlptown checkpoint above.
# NOTE(review): the warning printed below only lists dropout layers as unused/newly
# initialized — presumably harmless since dropout has no weights; confirm.
tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_57']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### predict activations

In [17]:
# Forward pass without labels: the output carries loss=None and logits of
# shape (3, 2), i.e. one row per sentence and one column per class.
tf_outputs = tf_model(tf_batch)
tf_outputs

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-4.0832963 ,  4.336415  ],
       [ 0.08180973, -0.04178543],
       [-3.6323197 ,  3.91172   ]], dtype=float32)>, hidden_states=None, attentions=None)

In [18]:
# len() is 1 here: only `logits` is populated, while loss, hidden_states
# and attentions are all None in the output above.
print(type(tf_outputs))
len(tf_outputs)

<class 'transformers.modeling_tf_outputs.TFSequenceClassifierOutput'>


1

In [19]:
# logits (raw scores, before softmax), reached by position
# shape: (n_samples, n_class)
# index 0 is the logits only because no loss is present in this output

print(type(tf_outputs[0]))
tf_outputs[0]

<class 'tensorflow.python.framework.ops.EagerTensor'>


<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-4.0832963 ,  4.336415  ],
       [ 0.08180973, -0.04178543],
       [-3.6323197 ,  3.91172   ]], dtype=float32)>

In [20]:
# repr() and str() of the output object render identically (compare the two printed blocks).
print(repr(tf_outputs))
print(tf_outputs)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-4.0832963 ,  4.336415  ],
       [ 0.08180973, -0.04178543],
       [-3.6323197 ,  3.91172   ]], dtype=float32)>, hidden_states=None, attentions=None)
TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-4.0832963 ,  4.336415  ],
       [ 0.08180973, -0.04178543],
       [-3.6323197 ,  3.91172   ]], dtype=float32)>, hidden_states=None, attentions=None)


In [21]:
# Turn the logits into per-class probabilities, one row per input sentence.
# The logits are read by name rather than by position: once `labels` are passed
# to the model, index 0 holds the loss instead, so `tf_outputs[0]` would silently
# softmax the wrong tensor if this cell were re-run after the labelled forward pass.
tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
tf_predictions

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[2.2042972e-04, 9.9977952e-01],
       [5.3085953e-01, 4.6914047e-01],
       [5.2897527e-04, 9.9947101e-01]], dtype=float32)>

#### predict loss + activations

In [22]:
# `labels` are the ground-truth class ids (y_true); they are passed here only to
# show that the model then returns a per-sample loss alongside the logits.
# NOTE(review): the two commented-out attempts were presumably rejected because
# labels need one entry per batch row (3) and ids valid for the 2 classes — confirm.

# tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
# tf_outputs = tf_model(tf_batch, labels = tf.constant([2, 1, 0]))
tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0, 1]))
tf_outputs

TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(3,), dtype=float32, numpy=array([2.2051287e-04, 6.3325787e-01, 5.2914920e-04], dtype=float32)>, logits=<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-4.0832963 ,  4.336415  ],
       [ 0.08180973, -0.04178543],
       [-3.6323197 ,  3.91172   ]], dtype=float32)>, hidden_states=None, attentions=None)

In [23]:
# len() is now 2: both `loss` and `logits` are populated in the output above.
print(type(tf_outputs))
len(tf_outputs)

<class 'transformers.modeling_tf_outputs.TFSequenceClassifierOutput'>


2

In [24]:
# loss, one value per sample; it sits at position 0 because labels were passed
# shape: (n_samples, )

print(type(tf_outputs[0]))
tf_outputs[0]

<class 'tensorflow.python.framework.ops.EagerTensor'>


<tf.Tensor: shape=(3,), dtype=float32, numpy=array([2.2051287e-04, 6.3325787e-01, 5.2914920e-04], dtype=float32)>

In [25]:
# logits (raw scores, before softmax); pushed to position 1 by the loss
# shape: (n_samples, n_class)

print(type(tf_outputs[1]))
tf_outputs[1]

<class 'tensorflow.python.framework.ops.EagerTensor'>


<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-4.0832963 ,  4.336415  ],
       [ 0.08180973, -0.04178543],
       [-3.6323197 ,  3.91172   ]], dtype=float32)>

In [26]:
# The logits have shape (n_samples, n_class). As in numpy, axis=-1 means the last
# axis (axis 1 here), so softmax is applied to each row and every row becomes a
# probability distribution over the classes.
# The logits are read by name: they sit at index 1 only while a loss is present,
# whereas `.logits` is correct with or without labels.

tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
tf_predictions

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[2.2042972e-04, 9.9977952e-01],
       [5.3085953e-01, 4.6914047e-01],
       [5.2897527e-04, 9.9947101e-01]], dtype=float32)>

## Without AUTO

### Tokenizer + Model

In [27]:
# Same checkpoint as in the AUTO section, now loaded through the concrete DistilBert classes.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [28]:
# Explicit tokenizer class instead of AutoTokenizer; rebinds `tokenizer`.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [29]:
# Sanity check: the ids below match those produced by AutoTokenizer earlier.
# `inputs` is only displayed; the model call below consumes `tf_batch` instead.
inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
inputs

{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [30]:
# Explicit model class instead of TFAutoModelForSequenceClassification; rebinds `model`.
# The warning printed below again only mentions dropout layers.
model = TFDistilBertForSequenceClassification.from_pretrained(model_name)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_77']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Reuses `tf_batch`, built with the AutoTokenizer in the AUTO section above, so this
# cell depends on that cell having run. The logits match the AUTO run.
tf_outputs = model(tf_batch)
tf_outputs

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-4.0832963 ,  4.336415  ],
       [ 0.08180973, -0.04178543],
       [-3.6323197 ,  3.91172   ]], dtype=float32)>, hidden_states=None, attentions=None)

### Customize by config: Tokenizer + Model

In [32]:
# Custom architecture: 8 attention heads, model width 512 and feed-forward width
# 4*512. Every setting not listed keeps its DistilBertConfig default.
config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)

In [33]:
# The tokenizer still comes from a pretrained checkpoint; only the model below is built from `config`.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [34]:
# Displayed for inspection only; the model call below consumes `tf_batch`, not `inputs`.
inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
inputs

{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [35]:
# from scratch: the model is constructed from `config` alone, without from_pretrained,
# so no checkpoint weights are loaded and the model is untrained.
model = TFDistilBertForSequenceClassification(config)

In [36]:
# Forward pass through the untrained model, reusing `tf_batch` from the AUTO section.
# The near-zero logits below are not meaningful predictions.
# NOTE(review): no seed is set, so these values presumably change between runs — confirm.
tf_outputs = model(tf_batch)
tf_outputs

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-0.09060648,  0.01755198],
       [-0.09387132,  0.02144592],
       [-0.0976468 ,  0.04282071]], dtype=float32)>, hidden_states=None, attentions=None)

### Customize by param: Tokenizer + Model

TODO: how to judge whether training a model from scratch is needed?

In [37]:
# Base DistilBERT checkpoint; the load warning further down shows that its
# classification head is newly initialized rather than taken from the checkpoint.
model_name = "distilbert-base-uncased"

In [38]:
# Tokenizer for the base checkpoint; rebinds `tokenizer`.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [39]:
# Displayed for inspection only; the model call below consumes `tf_batch`, not `inputs`.
inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
inputs

{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [40]:
# Override one config value through a keyword argument: num_labels=10 requests a
# 10-way classification head. The warning below lists 'classifier' and
# 'pre_classifier' as newly initialized, so the head must be trained before use.
model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_117']
You should probably TRAIN this model on a down-stream task to be able to use 

In [41]:
# Reuses `tf_batch` from the AUTO section. The logits now have shape (3, 10), one
# column per label, and come from the untrained head, so they are not predictions.
tf_outputs = model(tf_batch)
tf_outputs

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(3, 10), dtype=float32, numpy=
array([[-0.06570223, -0.07684831,  0.00491758,  0.01898364, -0.02881462,
        -0.00743424,  0.03098125,  0.00872145,  0.09670264,  0.07677424],
       [-0.03626308, -0.10396606,  0.0107096 , -0.00623686, -0.05597301,
         0.03577782,  0.02595923,  0.03559294,  0.11004325,  0.08273635],
       [ 0.0102365 , -0.0793872 , -0.01624559, -0.00732068, -0.03361801,
         0.01050154,  0.023844  ,  0.03455131,  0.12689522,  0.07237536]],
      dtype=float32)>, hidden_states=None, attentions=None)

## Ref

[1] https://huggingface.co/transformers/quicktour.html

When importing a PyTorch-only model into TensorFlow, the PyTorch library must be installed <br>
[2] https://github.com/huggingface/transformers/issues/7138