<a href="https://colab.research.google.com/github/teticio/aventuras-con-textos/blob/master/Dr%20Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dr Bert
Vamos a seguir la tradicción de [ELIZA](https://en.wikipedia.org/wiki/ELIZA) y crear un psicoterapeuta con inteligencia artificial. Vamos a aprovechar la capacidad que tiene el modelo de BERT de reconocer frases consecutivas.

### Importar las librerías

In [82]:
# instalar BERT
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
    sys.path += ['bert_repo']

# import python modules defined by BERT
import tokenization

In [96]:
import os
import random
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
import keras.layers as layers
from keras.preprocessing import sequence
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Dense, Dropout
from keras.engine import Layer
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import get_file, to_categorical
from scipy.special import softmax

os.environ['TFHUB_CACHE_DIR'] = './tfhub'
checkpoint_dir = 'checkpoints'  #@param {type: "string"}
limite_de_palabras_en_la_secuencia = 512  #@param {type : "number"}

In [97]:
sess = tf.Session()

### Preparamos los datos

In [98]:
get_file(
    os.getcwd() + '/transcript.txt',
    origin=
    'https://docs.google.com/uc?export=download&id=1_YRPtRHDmA-Osr4UaVudKMNVEcwsaxU5'
)
with open('transcript.txt', 'rt', encoding='utf-8') as file:
    lines = file.readlines()

Downloading data from https://docs.google.com/uc?export=download&id=1_YRPtRHDmA-Osr4UaVudKMNVEcwsaxU5


In [99]:
lines = [_.strip()[3:] for _ in lines if len(_) > 1 and _[1] == ':']

In [100]:
data = [(lines[i], lines[i + 1]) for i in range(len(lines) - 1)]
random.seed(12345)  # para resultados reproducibles
random.shuffle(data)

In [101]:
len(data)

221

In [102]:
data[:3]

[('Okay.',
  'My notes are kept safe, I write very kind of factual notes, so just a kind of very brief description of what we have talked about and any particular issues that come up, and you have access to those notes through me if you want to see them. Otherwise they’re confidential. The only time I would break confidentiality would be if I was seriously concerned about your wellbeing or about the safety and wellbeing of anybody else. Okay?'),
 ('It feels ... it sort of sits somewhere here (hand to chest), I think. It’s hard to, hard to describe that.',
  'Right.'),
 ('And there might not be a connection.', 'Yeah.')]

### Definir el modelo

In [90]:
modelo_de_bert = 'bert_uncased_L-12_H-768_A-12/1'  #@param ["bert_uncased_L-12_H-768_A-12/1", "bert_cased_L-12_H-768_A-12/1", "bert_uncased_L-24_H-1024_A-16/1", "bert_cased_L-24_H-1024_A-16/1", "bert_multi_cased_L-12_H-768_A-12/1"]
bert = hub.Module('https://tfhub.dev/google/' + modelo_de_bert)

# instanciar el tokenizador
tokenization_info = bert(signature='tokenization_info', as_dict=True)
vocab_file, do_lower_case = sess.run([
    tokenization_info['vocab_file'],
    tokenization_info['do_lower_case'],
])
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

In [92]:
class BertEmbeddingLayer(Layer):
    def __init__(
            self,
            output_key='sequence_output',  # 'sequence_output': embedding de las palabras, 'pooled_ouput': embedding de la frase
            n_fine_tune_layers=0,  # número de capas a entrenar (sin contar la de pooling)
            bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",  # modelo de BERT preentrenado
            max_len=512,  # número máximo de tokens en las secuencias
            **kwargs):
        assert output_key == 'sequence_output' or output_key == 'pooled_output'
        super(BertEmbeddingLayer, self).__init__(**kwargs)
        self.output_key = output_key
        self.n_fine_tune_layers = n_fine_tune_layers
        self.bert_path = bert_path
        self.max_len = max_len

    def build(self, input_shape):
        self.bert = hub.Module(self.bert_path,
                               trainable=self.trainable,
                               name="{}_module".format(self.name))
        if self.trainable:
            if self.output_key == 'pooled_output':
                # añadir las variables de la capa de pooling a las que vammos a entrenar
                self.trainable_weights += [
                    var for var in self.bert.variables if 'pooler/' in var.name
                ]
            # añadir las variables de las últimas n capas a las que vamos a entrenar
            top_layer = max([
                int(_[_.find('layer_'):][6:_[_.find('layer_'):].find('/')])
                for _ in
                [var.name for var in bert.variables if 'layer_' in var.name]
            ])
            self.trainable_weights += [
                var for var in self.bert.variables if any([
                    f'layer_{top_layer-i}/' in var.name
                    for i in range(self.n_fine_tune_layers)
                ])
            ]
            self.non_trainable_weights += [
                var for var in self.bert.variables
                if var not in self.trainable_weights
            ]
        super(BertEmbeddingLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype='int32') for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(input_ids=input_ids,
                           input_mask=input_mask,
                           segment_ids=segment_ids)
        result = self.bert(inputs=bert_inputs,
                           signature='tokens',
                           as_dict=True)[self.output_key]
        return result

    def compute_output_shape(self, input_shape):
        if self.output_key == 'pooled_output':
            # embedding de la frase
            return (input_shape[0], self.bert.get_output_info_dict('tokens')[
                self.output_key].get_shape()[1].value)
        else:
            # embedding de las palabras
            return (input_shape[0], self.max_len,
                    self.bert.get_output_info_dict('tokens')[
                        self.output_key].get_shape()[2].value)

In [93]:
def build_bert_classification_model(
        trainable=True,
        n_fine_tune_layers=10,
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        max_len=limite_de_palabras_en_la_secuencia,
        num_classes=2):
    in_id = Input(shape=(max_len, ), name="input_ids")
    in_mask = Input(shape=(max_len, ), name="input_masks")
    in_segment = Input(shape=(max_len, ), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]
    embedding = BertEmbeddingLayer(trainable=trainable,
                                   output_key='pooled_output',
                                   n_fine_tune_layers=n_fine_tune_layers,
                                   bert_path=bert_path,
                                   max_len=max_len)(bert_inputs)
    dropout = Dropout(0.1)(embedding)
    pred = Dense(num_classes, activation='sigmoid')(dropout)
    model = Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model

In [95]:
entrenable = False  #@param {type : 'boolean'}
numero_de_capas_a_tunear = 0  #@param {type: 'slider', min : 0, max : 24}
checkpoint_filename = '/DrBertModel.h5'
bert_model = build_bert_classification_model(
    trainable=entrenable,
    n_fine_tune_layers=numero_de_capas_a_tunear,
    bert_path='https://tfhub.dev/google/' + modelo_de_bert,
    max_len=limite_de_palabras_en_la_secuencia,
    num_classes=2)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          (None, 512)          0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
bert_embedding_layer_9 (BertEmb ((None, 512), 768)   0           input_ids[0][0]                  
                                                                 input_masks[0][0]                
          

### Preparar los datos en el formato que espera BERT
```python
example = [CLS] How are you? [SEP] Fine, thanks [SEP]

mask    =   1    1   1   1     1     1     1      1    0 ... 0

segment =   0    0   0   0     1     1     1      1    0 ... 0
```

In [94]:
train_test_split = 200


# generar casos positivos y negativos
def get_data(data, max_len):
    examples = []
    mask = []
    segment = []
    label = []
    for i in range(len(data)):
        # consecutivos
        q = tokenizer.tokenize(data[i][0])
        a = tokenizer.tokenize(data[i][1])
        pad = [0] * (max_len - (len(q) + len(a) + 3))
        examples.append(
            tokenizer.convert_tokens_to_ids(['[CLS]'] + q + ['[SEP]'] + a +
                                            ['[SEP]'])[:max_len] + pad)
        mask.append([1] * (len(q) + len(a) + 3) + pad)
        segment.append([0] * (len(q) + 2) + [1] * (len(a) + 1) + pad)
        label.append('1')  # resultado positivo

        # no consecutivos
        for _ in range(1):
            noti = (random.randrange(len(data) - 3) + i + 2) % len(data)
            assert (noti < i - 1 or noti > i + 1)
            q = tokenizer.tokenize(data[i][0])
            a = tokenizer.tokenize(data[noti][1])
            pad = [0] * (max_len - (len(q) + len(a) + 3))
            examples.append(
                tokenizer.convert_tokens_to_ids(['[CLS]'] + q + ['[SEP]'] + a +
                                                ['[SEP]'])[:max_len] + pad)
            mask.append([1] * (len(q) + len(a) + 3) + pad)
            segment.append([0] * (len(q) + 2) + [1] * (len(a) + 1) + pad)
            label.append('0')  # resultado negativo
    return (np.array(examples), np.array(mask), np.array(segment),
            to_categorical(label, 2))


train_examples, train_mask, train_segment, train_label = get_data(
    data[:train_test_split], limite_de_palabras_en_la_secuencia)
test_examples, test_mask, test_segment, test_label = get_data(
    data[train_test_split:], limite_de_palabras_en_la_secuencia)

### Entrenar el modelo

In [55]:
#bert_model.load_weights(checkpoint_dir + checkpoint_filename)
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min'),
    ModelCheckpoint(checkpoint_dir + checkpoint_filename,
                    save_best_only=True,
                    monitor='val_loss',
                    mode='min')
]
bert_model.fit(
    [train_examples, train_mask, train_segment],
    train_label,
    validation_data=([test_examples, test_mask, test_segment], test_label),
    epochs=1000,
    batch_size=32  #@param {type : "number"}
    #@markdown La memoría utilizada por el GPU depende del tamaño del batch y el número de palabras en las sequencias
    ,
    callbacks=callbacks)

Train on 400 samples, validate on 42 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000


<keras.callbacks.History at 0x7f5559da2f98>

In [36]:
bert_model.load_weights(checkpoint_dir + checkpoint_filename)

### Ahora podemos empezar la sesión de psicoterapia...

In [81]:
max_len = limite_de_palabras_en_la_secuencia
try:
    while True:
        texto = input('You: ')
        examples = []
        mask = []
        segment = []
        label = []
        for i in range(len(data)):
            # consecutivos
            q = tokenizer.tokenize(texto)
            a = tokenizer.tokenize(data[i][1])
            pad = [0] * (max_len - (len(q) + len(a) + 3))
            examples.append(
                tokenizer.convert_tokens_to_ids(['[CLS]'] + q + ['[SEP]'] + a +
                                                ['[SEP]'])[:max_len] + pad)
            mask.append([1] * (len(q) + len(a) + 3) + pad)
            segment.append([0] * (len(q) + 2) + [1] * (len(a) + 1) + pad)
        result = bert_model.predict([examples, mask, segment])
        print('Dr Bert: ' + data[np.argmax(softmax(result, axis=1)[:,1])][1])
except:
    print('Dr Bert: Bye!')

You: Hi there.
Dr Bert: Hi.
You: How can you help me with my problem?
Dr Bert: Okay, so shall we begin with the issues that you put down on your referral and the issues that you are bringing today. So what’s going on for you?
You: Well, I am having panic attacks at work.
Dr Bert: I probably ... probably a couple, maybe six weeks ago ... but I see that as quite recent, when I would actually ... I’d get symptoms of, not exactly panic ... I don’t have a panic attack, but I can feel a rising sort of panic here (holds chest)…
You: So you suffer from the same thing as me??
Dr Bert: Whatever ... does that bring anything up for you, that idea of letting someone down or saying no?
You: Um, not really.
Dr Bert: Okay, that feels a bit uncomfortable or ...
You: So, how do these sessions work exactly?
Dr Bert: Okay. The way I work may or may not be different to that so if I just tell you a little bit about the kind of things that I’m interested in and how I imagine the sessions to go. So one of the