<a href="https://colab.research.google.com/github/terence-bigtt/bert/blob/master/keras_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# keras-bert prediction with Cloud TPU

<table class="tfo-notebook-buttons" align="left" >
 <td>
    <a target="_blank" href="https://colab.research.google.com/github/HighCWu/keras-bert-tpu/blob/master/demo/load_model/load_and_predict.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/HighCWu/keras-bert-tpu/blob/master/demo/load_model/load_and_predict.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

In [0]:
# @title Install Dependencies
# Installs keras-bert-tpu (quietly, via -q), bert-tensorflow and tensorflow with pip.
# NOTE(review): no versions are pinned. A later cell calls `tf.contrib.tpu`, so this
# presumably needs a TensorFlow 1.x release — confirm and pin the versions.
! pip install keras-bert-tpu -q
! pip install bert-tensorflow
! pip install tensorflow

In [0]:
# @title Download Pretrained Model
# Downloads and unzips a pretrained BERT archive from
# storage.googleapis.com/bert_models, then records the absolute path of the
# model directory in BERT_PRETRAINED_DIR for the following cells.
import os
# UPLOAD_TIME is the folder and BERT_MODEL the archive name in the URL built below.
UPLOAD_TIME = '2018_11_03' #@param {type:"string"}
BERT_MODEL = 'multilingual_L-12_H-768_A-12' #@param {type:"string"}
download_url = 'https://storage.googleapis.com/bert_models/{}/{}.zip'.format(UPLOAD_TIME,BERT_MODEL)
zip_path = '{}.zip'.format(BERT_MODEL)
# Only download and unzip when no directory named after the model exists yet,
# so re-running the cell does not fetch the archive again.
! test -d $BERT_MODEL || (wget $download_url && unzip $zip_path)
# presumably the archive unpacks into a directory named BERT_MODEL — confirm
BERT_PRETRAINED_DIR = os.path.realpath(BERT_MODEL)
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))

In [0]:

import bert
from bert import tokenization
# Build the tokenizer from the vocabulary file found in the model directory.
vocab_path = os.path.join(BERT_PRETRAINED_DIR, "vocab.txt")
tokenizer = tokenization.FullTokenizer(vocab_path)
#tokenizer.tokenize("Bonjour, ceci est un test, est-ce que les noms propres comme Térence Delsate ou Mickaël Tits sont connus ? ou alors parlons de Daffalgan ou encore de rhumatisme pyramidaloïde, même si ça n'existe pas ?")


In [0]:
import keras
import tensorflow as tf
import sys
import codecs
import numpy as np
from keras_bert import load_trained_model_from_checkpoint

#%%


# NOTE(review): model_name is not referenced in the visible cells; the files
# actually loaded below come from BERT_PRETRAINED_DIR.
model_name = "uncased_L-12_H-768_A-12"

#C:/Users/mti/Documents/python/bert

# Locate the config, checkpoint and vocabulary inside the model directory.
# os.path.join is used (instead of "+ '/...'") to match how the vocabulary
# path is built in the other cells of this notebook.
config_path = os.path.join(BERT_PRETRAINED_DIR, "bert_config.json")
checkpoint_path = os.path.join(BERT_PRETRAINED_DIR, "bert_model.ckpt")
dict_path = os.path.join(BERT_PRETRAINED_DIR, "vocab.txt")

tokenizer = tokenization.FullTokenizer(dict_path)

# Build the Keras model and load the checkpoint weights with '/gpu:0' as the
# requested device.
with tf.device('/gpu:0'):
    model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
#    model.summary(line_length=120)


#%%


#tokens = tokenizer.tokenize("Bonjour, ceci est un test, est-ce que les noms propres comme Térence Delsate ou Mickaël Tits sont connus ? ou alors parlons de Daffalgan ou encore de rhumatisme pyramidaloïde, même si ça n'existe pas ?")
#tokens = tokenizer.tokenize("Hello, this is a test, are proper names like Terence Delsate or Michael Tits known? Or do we talk about Daffalgan or even pyramidal rheumatism, even if it does not exist?")    

# Map every vocabulary token to its line number in vocab.txt; these numbers are
# the ids used to build the model inputs.
# The file is decoded explicitly as UTF-8 (as the later vocabulary read in this
# notebook does): without `encoding`, open() uses the platform's locale
# encoding, which can fail or mis-decode a multilingual vocabulary.
with open(dict_path, 'r', encoding='utf-8') as f:
  token_dict = {t.strip(): i for i, t in enumerate(f)}

def mean(lis):
  """Return the arithmetic mean of the items in ``lis``.

  The items are added with the built-in ``sum`` and the total is divided by
  ``len(lis)``, so ``lis`` must support ``len()`` and an empty sequence
  raises ``ZeroDivisionError``.
  """
  total = sum(lis)
  count = len(lis)
  return total / count


In [0]:

  
# Sentences to compare: the first two describe the same birthday party,
# the third is about something else.
messages = [
    "Clothilde fête son anniversaire avec ses amies de maternelle.",
    "Une petite fille nommée Clothilde a invité ses copines pour son anniversaire",
    "L'ingénieurie est le petit frère attardé de la physique",
]

# Tokenize each sentence and wrap it in the [CLS] ... [SEP] markers.
messagetokens = []
for message in messages:
    messagetokens.append(['[CLS]'] + tokenizer.tokenize(message) + ['[SEP]'])

# One slot per sentence, filled in by the loop below.
token_inputs = [None for _ in messagetokens]
seg_inputs = [None for _ in messagetokens]


# Tokens to features: look up each token's id in vocab.txt, pad with zeros on
# the right and cut to 512 positions. The segment ids are 512 zeros.
for i, tokens in enumerate(messagetokens):
    ids = [token_dict[token] for token in tokens]
    padding = [0] * (512 - len(tokens))
    token_inputs[i] = np.asarray(ids + padding)[0:512]
    seg_inputs[i] = np.asarray([0] * 512)[0:512]
    
# Run the model on the token ids and segment ids of all sentences at once.
with tf.device('/gpu:0'):
    model_inputs = [token_inputs, seg_inputs]
    predictions = model.predict(model_inputs)

# The inputs were padded to 512 positions; cut each prediction back down to
# the length of its own token list.
predictions = [
    prediction[:len(tokens)]
    for prediction, tokens in zip(predictions, messagetokens)
]

# Average the per-token vectors of each sentence into one vector per sentence.
s1, s2, s3 = [mean(list(predictions[k])) for k in range(3)]

# Print the cosine similarity of each pair, in the order
# (s1, s2), (s1, s3), (s2, s3).
for left, right in ((s1, s2), (s1, s3), (s2, s3)):
    print(left.dot(right)/np.sqrt(left.dot(left) * right.dot(right)))

In [0]:
import numpy as np



In [0]:

from tensorflow.python.client import device_lib

# Ask TensorFlow for its list of local devices; as the last expression of the
# cell, the returned value is shown as the cell output.
device_lib.list_local_devices()

In [0]:
# FIXME(review): this cell contained the unfinished statement
#     from keras_bert. import tok
# which is a SyntaxError and aborts "Restart & Run All". It is commented out
# until the intended import is known; nothing in the visible cells uses `tok`.

In [0]:
use_tpu=True # @param {type:"boolean"}
if use_tpu:
  # The TPU address is read from COLAB_TPU_ADDR; stop with a hint when the
  # variable is missing.
  assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; Maybe you should switch hardware accelerator to TPU for TPU support'
  import tensorflow as tf
  tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
  strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
  # Rebind `model` to the object returned by the TPU conversion.
  model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)
# Compiled in both cases, whether or not the TPU conversion ran.
model.compile('adam', 'sparse_categorical_crossentropy')

In [0]:
import sys
import codecs
import numpy as np

bsz = 8 # TPU batch size must be a multiple of 8

dict_path = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')

# Masked-token demo: positions 1 and 2 (right after [CLS]) hold [MASK] and the
# model is asked to predict them.
tokens = ['[CLS]', '[MASK]', '[MASK]'] + list('是利用符号语言研究数量、结构、变化以及空间等概念的一门学科') + ['[SEP]']

# Token -> id, where the id is the token's line number in vocab.txt.
# enumerate() is used rather than len(token_dict) so the ids cannot drift away
# from the line numbers if two lines ever strip() to the same text; this also
# matches how the earlier vocabulary read in this notebook assigns ids.
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for index, line in enumerate(reader):
        token = line.strip()
        token_dict[token] = index
token_dict_rev = {v: k for k, v in token_dict.items()}

# Every row of the batch is the same sentence padded with zeros to 512
# positions, so each row is built once and repeated bsz times.
padding = [0] * (512 - len(tokens))
token_row = [token_dict[token] for token in tokens] + padding
seg_row = [0] * len(tokens) + padding
mask_row = [0, 1, 1] + [0] * (512 - 3)  # 1 marks the two [MASK] positions

token_input = np.asarray([token_row] * bsz)
seg_input = np.asarray([seg_row] * bsz)
mask_input = np.asarray([mask_row] * bsz)


print(token_input[0][:len(tokens)])

# NOTE(review): an earlier cell calls model.predict with two inputs, while this
# one passes three and reads output [0]. Presumably this needs a model loaded
# with its pre-training heads — confirm how `model` must be loaded for this cell.
predicts = model.predict([token_input, seg_input, mask_input])[0]
predicts = np.argmax(predicts, axis=-1)
print(predicts[0][:len(tokens)])
# Translate the predicted ids at the two masked positions back into tokens.
print([token_dict_rev[x] for x in predicts[0][1:3]])


sentence_1 = '数学是利用符号语言研究數量、结构、变化以及空间等概念的一門学科。'

# Next-sentence demo: the same steps run for two candidate second sentences.
# The two copy-pasted blocks are folded into one loop; after the loop every
# module-level name (sentence_2, tokens, token_input, seg_input, mask_input,
# predicts) holds the values of the last candidate, exactly as before.
for sentence_2 in ('从某种角度看屬於形式科學的一種。',
                   '任何一个希尔伯特空间都有一族标准正交基。'):
    tokens = ['[CLS]'] + list(sentence_1) + ['[SEP]'] + list(sentence_2) + ['[SEP]']

    # Token ids padded with zeros to 512 positions, repeated for each batch row.
    token_input = np.asarray([[token_dict[token] for token in tokens] + [0] * (512 - len(tokens)) for i in range(bsz)])
    # Segment 0 covers [CLS] + sentence_1 + [SEP]; segment 1 covers sentence_2 + [SEP].
    seg_input = np.asarray([[0] * (len(sentence_1) + 2) + [1] * (len(sentence_2) + 1) + [0] * (512 - len(tokens)) for i in range(bsz)])
    # No position is marked in the mask input for this task.
    mask_input = np.asarray([[0] * 512 for i in range(bsz)])

    # The second model output is used here; the argmax of its first row is
    # printed as the "is random next" flag.
    predicts = model.predict([token_input, seg_input, mask_input])[1]
    print('%s is random next: ' % sentence_2, bool(np.argmax(predicts, axis=-1)[0]))