<a href="https://colab.research.google.com/github/teticio/aventuras-con-textos/blob/master/Dr%20Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# instalar BERT
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
    sys.path += ['bert_repo']

# import python modules defined by BERT
import tokenization

In [20]:
import os
import random
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
import keras.layers as layers
from keras.preprocessing import sequence
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Dense, Dropout
from keras.engine import Layer
from keras.callbacks import EarlyStopping, ModelCheckpoint

os.environ['TFHUB_CACHE_DIR'] = './tfhub'
checkpoint_dir = 'checkpoints'  #@param {type: "string"}
limite_de_palabras_en_la_secuencia = 512  #@param {type : "number"}

In [21]:
sess = tf.Session()

In [22]:
with open('transcript.txt', 'rt', encoding='utf-8') as file:
    lines = file.readlines()

In [23]:
lines = [_.strip()[3:] for _ in lines if len(_) > 1 and _[1] == ':']

In [24]:
lines

['Okay, hi Phil.',
 'Hi.',
 'And so, given this is our first session I thought perhaps I’d start and say a little bit about the way I work, about how the session is going to be constructed, and then we’ll get on to the issues that are bothering you. Does that sound okay?',
 'Okay. Yes.',
 'So thank you for your letter, your referral letter, erm, which tells me a little bit about why you’ve come, but it’s important that we set a few kind of ground rules, kind of boundaries around the work. So what I want to tell you is that anything in that, all that we talk about is confidential, okay. There are times when I might need to talk to GPs, people like that, but that would only ever happen with your written permission.',
 'Okay.',
 'My notes are kept safe, I write very kind of factual notes, so just a kind of very brief description of what we have talked about and any particular issues that come up, and you have access to those notes through me if you want to see them. Otherwise they’re conf

In [25]:
data = [(lines[i], lines[i+1]) for i in range(len(lines)-1)]
random.seed(12345)
random.shuffle(data)

In [26]:
len(data)

221

In [27]:
data[:3]

[('Okay.',
  'My notes are kept safe, I write very kind of factual notes, so just a kind of very brief description of what we have talked about and any particular issues that come up, and you have access to those notes through me if you want to see them. Otherwise they’re confidential. The only time I would break confidentiality would be if I was seriously concerned about your wellbeing or about the safety and wellbeing of anybody else. Okay?'),
 ('It feels ... it sort of sits somewhere here (hand to chest), I think. It’s hard to, hard to describe that.',
  'Right.'),
 ('And there might not be a connection.', 'Yeah.')]

In [28]:
modelo_de_bert = 'bert_uncased_L-12_H-768_A-12/1'  #@param ["bert_uncased_L-12_H-768_A-12/1", "bert_cased_L-12_H-768_A-12/1", "bert_uncased_L-24_H-1024_A-16/1", "bert_cased_L-24_H-1024_A-16/1", "bert_multi_cased_L-12_H-768_A-12/1"]
bert = hub.Module('https://tfhub.dev/google/' + modelo_de_bert)

# instanciar el tokenizador
tokenization_info = bert(signature='tokenization_info', as_dict=True)
vocab_file, do_lower_case = sess.run([
    tokenization_info['vocab_file'],
    tokenization_info['do_lower_case'],
])
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

In [29]:
# tokenizar e incluir los tokens [CLS] para "CLaSificar" (pooled_output) y [SEP] para separar las frases
tokens_input = ['[CLS]'] + tokenizer.tokenize(
    data[123][0]) + ['[SEP]'] + tokenizer.tokenize(data[123][1])
input_ids = np.expand_dims(np.array(
    tokenizer.convert_tokens_to_ids(tokens_input)),
                           axis=0)

# sin máscara (=1)
input_mask = np.ones(input_ids.shape)

# identificar las frases (segmentos): aquí sólo tenemos una
segment_ids = np.zeros(input_ids.shape)

# instanciar el modelo
bert_model = bert(dict(input_ids=input_ids,
                       input_mask=input_mask,
                       segment_ids=segment_ids),
                  signature="tokens",
                  as_dict=True)
sess.run(tf.global_variables_initializer())

# ponemos sequence_output para obtener los embeddings por palabra y pooled_output para obtener un embedding por frase
result = sess.run(bert_model['pooled_output'])

In [30]:
result

array([[-9.35499132e-01, -5.45429230e-01, -9.81864214e-01,
         9.04086769e-01,  8.35887909e-01, -2.75898397e-01,
         9.49175656e-01,  5.29724479e-01, -9.59179640e-01,
        -9.99997795e-01, -7.35049009e-01,  9.77790475e-01,
         9.78775442e-01,  8.13662887e-01,  9.34850156e-01,
        -8.86409163e-01, -5.88198185e-01, -7.41012216e-01,
         4.55575794e-01, -5.58418751e-01,  7.88361669e-01,
         9.99999464e-01, -2.67009526e-01,  4.61974949e-01,
         6.58629775e-01,  9.98077631e-01, -8.70244563e-01,
         9.42488253e-01,  9.51394081e-01,  7.47598767e-01,
        -8.10774505e-01,  2.94881910e-01, -9.86670852e-01,
        -3.94699097e-01, -9.77425873e-01, -9.94883180e-01,
         5.80901027e-01, -7.54309118e-01, -1.85391188e-01,
        -2.87362281e-02, -8.98570895e-01,  4.84597683e-01,
         9.99997437e-01,  1.06716625e-01,  6.45206451e-01,
        -4.70296979e-01, -1.00000000e+00,  4.58885014e-01,
        -9.11913812e-01,  9.85087812e-01,  9.75576639e-0

In [31]:
class BertEmbeddingLayer(Layer):
    def __init__(
            self,
            output_key='sequence_output',  # 'sequence_output': embedding de las palabras, 'pooled_ouput': embedding de la frase
            n_fine_tune_layers=0,  # número de capas a entrenar (sin contar la de pooling)
            bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",  # modelo de BERT preentrenado
            max_len=512,  # número máximo de tokens en las secuencias
            **kwargs):
        assert output_key == 'sequence_output' or output_key == 'pooled_output'
        super(BertEmbeddingLayer, self).__init__(**kwargs)
        self.output_key = output_key
        self.n_fine_tune_layers = n_fine_tune_layers
        self.bert_path = bert_path
        self.max_len = max_len

    def build(self, input_shape):
        self.bert = hub.Module(self.bert_path,
                               trainable=self.trainable,
                               name="{}_module".format(self.name))
        if self.trainable:
            if self.output_key == 'pooled_output':
                # añadir las variables de la capa de pooling a las que vammos a entrenar
                self.trainable_weights += [
                    var for var in self.bert.variables if 'pooler/' in var.name
                ]
            # añadir las variables de las últimas n capas a las que vamos a entrenar
            top_layer = max([
                int(_[_.find('layer_'):][6:_[_.find('layer_'):].find('/')])
                for _ in
                [var.name for var in bert.variables if 'layer_' in var.name]
            ])
            self.trainable_weights += [
                var for var in self.bert.variables if any([
                    f'layer_{top_layer-i}/' in var.name
                    for i in range(self.n_fine_tune_layers)
                ])
            ]
            self.non_trainable_weights += [
                var for var in self.bert.variables
                if var not in self.trainable_weights
            ]
        super(BertEmbeddingLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype='int32') for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(input_ids=input_ids,
                           input_mask=input_mask,
                           segment_ids=segment_ids)
        result = self.bert(inputs=bert_inputs,
                           signature='tokens',
                           as_dict=True)[self.output_key]
        return result

    def compute_output_shape(self, input_shape):
        if self.output_key == 'pooled_output':
            # embedding de la frase
            return (input_shape[0], self.bert.get_output_info_dict('tokens')[
                self.output_key].get_shape()[1].value)
        else:
            # embedding de las palabras
            return (input_shape[0], self.max_len,
                    self.bert.get_output_info_dict('tokens')[
                        self.output_key].get_shape()[2].value)

In [32]:
def build_bert_sentence_model(
        trainable=True,
        n_fine_tune_layers=10,
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        max_len=limite_de_palabras_en_la_secuencia):
    in_id = Input(shape=(max_len, ), name="input_ids")
    in_mask = Input(shape=(max_len, ), name="input_masks")
    in_segment = Input(shape=(max_len, ), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]
    embedding = BertEmbeddingLayer(trainable=trainable,
                                   output_key='pooled_output',
                                   n_fine_tune_layers=n_fine_tune_layers,
                                   bert_path=bert_path,
                                   max_len=max_len)(bert_inputs)
    dense = Dense(1024, activation='relu')(embedding)
    dropout = Dropout(0.2)(dense)
    pred = Dense(1, activation='sigmoid')(dropout)
    model = Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model

In [33]:
train_test_split = 200


# generar casos positivos y negativos
def get_data(data, max_len):
    examples = []
    mask = []
    segment = []
    label = []
    for i in range(len(data)):
        # consecutivos
        q = tokenizer.tokenize(data[i][0])
        a = tokenizer.tokenize(data[i][1])
        pad = [0] * (max_len - (len(q) + len(a) + 2))
        examples.append(
            tokenizer.convert_tokens_to_ids(['[CLS]'] + q + ['[SEP]'] +
                                            a)[:max_len] + pad)
        mask.append([1] * (len(q) + len(a) + 2) + pad)
        segment.append([0] * (len(q) + 2) + [1] * len(a) + pad)  #######
        label.append(1)  # resultado positivo

        # no consecutivos
        for _ in range(1):
            noti = (random.randrange(len(data) - 3) + i + 2) % len(data)
            assert (noti < i-1 or noti > i+1)
            q = tokenizer.tokenize(data[i][0])
            a = tokenizer.tokenize(data[noti][1])
            pad = [0] * (max_len - (len(q) + len(a) + 2))
            examples.append(
                tokenizer.convert_tokens_to_ids(['[CLS]'] + q + ['[SEP]'] +
                                                a)[:max_len] + pad)
            mask.append([1] * (len(q) + len(a) + 2) + pad)
            segment.append([0] * (len(q) + 2) + [1] * len(a) + pad)  #######
            label.append(0)  # resultado negativo
    return (np.array(examples), np.array(mask), np.array(segment), np.array(label))


train_examples, train_mask, train_segment, train_label = get_data(
    data[:train_test_split], limite_de_palabras_en_la_secuencia)
test_examples, test_mask, test_segment, test_label = get_data(
    data[train_test_split:], limite_de_palabras_en_la_secuencia)

In [34]:
entrenable = False  #@param {type : 'boolean'}
numero_de_capas_a_tunear = 0  #@param {type: 'slider', min : 0, max : 24}
checkpoint_filename = '/DrBertModel.h5'
bert_model = build_bert_sentence_model(
    trainable=entrenable,
    n_fine_tune_layers=numero_de_capas_a_tunear,
    bert_path='https://tfhub.dev/google/' + modelo_de_bert,
    max_len=limite_de_palabras_en_la_secuencia)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          (None, 512)          0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
bert_embedding_layer_2 (BertEmb ((None, 512), 768)   0           input_ids[0][0]                  
                                                                 input_masks[0][0]                
          

In [35]:
#bert_model.load_weights(checkpoint_dir + checkpoint_filename)
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min'),
    ModelCheckpoint(checkpoint_dir + checkpoint_filename,
                    save_best_only=True,
                    monitor='val_loss',
                    mode='min')
]
bert_model.fit(
    [train_examples, train_mask, train_segment],
    train_label,
    validation_data=([test_examples, test_mask, test_segment], test_label),
    epochs=1000,
    batch_size=32  #@param {type : "number"}
    #@markdown La memoría utilizada por el GPU depende del tamaño del batch y el número de palabras en las sequencias
    ,
    callbacks=callbacks)

Train on 400 samples, validate on 42 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000


<keras.callbacks.History at 0x7f1b801a92b0>

In [None]:
bert_model.load_weights(checkpoint_dir + checkpoint_filename)

In [36]:
# check i am using BERT correctly
# write code to choose most likely subsequent setence to a prompt

In [0]:
train_examples[123], train_mask[123], train_segment[123]

(array([  101, 10166,  1012,  1012,  1012,  2157,  1012,   102,  3398,
         1010,  3398,  1012,  1012,  1012,  2065,  2008,  1521,  1055,
         2785,  1997,  4198,  2000,  2023,  2025,  5782,  2000,  4066,
         1997,  1010,  2025,  5782,  2000,  2191, 24644,  2015,  1998,
         2074,  2000,  4066,  1997,  1998,  3568,  2635,  2006,  4933,
         1998,  2025,  3038,  1005,  2053,  1005,  2000,  2111,  1010,
         2059,  1045,  2123,  1521,  1056,  2228,  1012,  1012,  1012,
         2008,  4165,  2066,  2009,  1005,  1055,  2025,  1037,  1012,
         1012,  1012,  1045,  1521,  2310,  2467,  2464,  2009,  2004,
         1037,  7199,  6179,  5656,  2021,  2085,  2672,  2009,  1521,
         1055,  2025,  1010,  2065,  2008,  1521,  1055,  1012,  1012,
         1012,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      