<a href="https://colab.research.google.com/github/teticio/aventuras-con-textos/blob/master/Bertle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img width="220px" src="https://github.com/teticio/aventuras-con-textos/blob/master/Bertle.gif?raw=1"></img>
Vamos a hacer un motor de búsqueda semántica con los datos de stack **overflow**.

<img width="220px" src="https://github.com/teticio/aventuras-con-textos/blob/master/Bertle.gif?raw=1"></img>
We are going to make a semantic search engine using data from stack **overflow**.

### Instalar librerías

### Install libraries

In [None]:
# instalar BERT
# install BERT
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
    sys.path += ['bert_repo']

# importar módulos de python de BERT
# import python modules defined by BERT
import tokenization

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
from IPython.core.display import display, HTML
from keras.utils import get_file
from fuzzywuzzy import fuzz
from copy import deepcopy

# Point TF-Hub at a local ./tfhub directory through the TFHUB_CACHE_DIR variable
# (presumably so downloaded modules are cached there between runs — confirm
# against the TF-Hub documentation).
os.environ['TFHUB_CACHE_DIR'] = './tfhub'

Using TensorFlow backend.


In [3]:
# Single TF1-style session; the following cells evaluate graph tensors through
# `sess.run`.
sess = tf.Session()

In [4]:
# Colab form parameters (the `#@param` annotations must stay on these lines).
# batch_size: number of texts fed to the models per run (also the fixed first
#             dimension of the placeholders).
# max_len:    length every BERT input sequence is padded / truncated to.
batch_size = 32  #@param {type : 'number'}
max_len = 512  #@param {type : 'number'}


def prepare_inputs_for_bert(texts, max_len, bert_tokenizer=None):
    """Convert raw texts into fixed-length BERT input arrays.

    Every text is tokenized, wrapped as ``[CLS] tokens [SEP]`` and right-padded
    with zeros to exactly ``max_len`` positions. Texts that are too long are
    truncated *before* the special tokens are added, so a truncated sequence
    still ends with ``[SEP]`` (slicing the finished sequence to ``max_len``
    would cut the ``[SEP]`` off).

    Args:
        texts: iterable of strings.
        max_len: total sequence length including ``[CLS]`` and ``[SEP]``;
            must be at least 2.
        bert_tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` (the latter returning a list).
            When omitted, the notebook-level ``tokenizer`` is used.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids)`` of numpy arrays with one
        row of length ``max_len`` per text. ``input_mask`` is 1 on real tokens
        and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: if ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        raise ValueError('max_len must be at least 2 to fit [CLS] and [SEP]')
    # Fall back to the global tokenizer only when none is passed explicitly.
    tok = tokenizer if bert_tokenizer is None else bert_tokenizer

    examples = []
    mask = []
    segment = []
    for text in texts:
        # Reserve two positions for the special tokens before truncating.
        tokens = ['[CLS]'] + tok.tokenize(text)[:max_len - 2] + ['[SEP]']
        pad = [0] * (max_len - len(tokens))
        examples.append(tok.convert_tokens_to_ids(tokens) + pad)
        mask.append([1] * len(tokens) + pad)
        segment.append([0] * max_len)
    return (np.array(examples), np.array(mask), np.array(segment))

### Descargar el modelo de BERT

### Download the BERT model

In [5]:
# Hub handle of the BERT variant to load; the other available variants are
# listed in the Colab dropdown on the line below.
modelo_de_bert = 'bert_multi_cased_L-12_H-768_A-12/1'  #@param ["bert_uncased_L-12_H-768_A-12/1", "bert_cased_L-12_H-768_A-12/1", "bert_uncased_L-24_H-1024_A-16/1", "bert_cased_L-24_H-1024_A-16/1", "bert_multi_cased_L-12_H-768_A-12/1"]
bert = hub.Module('https://tfhub.dev/google/' + modelo_de_bert)

# create instance of tokenizer: the module's 'tokenization_info' signature
# provides the vocabulary file and the lower-casing flag it expects
tokenization_info = bert(signature='tokenization_info', as_dict=True)
vocab_file, do_lower_case = sess.run(
    [tokenization_info[key] for key in ('vocab_file', 'do_lower_case')])
tokenizer = tokenization.FullTokenizer(do_lower_case=do_lower_case,
                                       vocab_file=vocab_file)

W0901 15:02:01.942592 140282311845696 deprecation_wrapper.py:119] From bert_repo/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [6]:
# One int32 placeholder of shape (batch_size, max_len) per BERT input, created
# in the order input_ids, input_mask, segment_ids.
input_ids, input_mask, segment_ids = (
    tf.placeholder(name=placeholder_name,
                   shape=(batch_size, max_len),
                   dtype='int32')
    for placeholder_name in ('input_ids', 'input_mask', 'segment_ids'))

In [7]:
# create instance of model: wire the three placeholders into the hub module's
# "tokens" signature and keep the full dict of outputs
bert_model = bert(
    {
        'input_ids': input_ids,
        'input_mask': input_mask,
        'segment_ids': segment_ids,
    },
    signature="tokens",
    as_dict=True,
)
# run the initializers of the global variables currently in the graph
sess.run(tf.global_variables_initializer())

### Descargar los datos de stack overflow

### Download stack overflow data

In [8]:
# Download the pickled Stack Overflow data next to the notebook. get_file
# returns the local path of the file, so load from that path instead of
# re-deriving it from the current working directory.
stackoverflow_path = get_file(
    os.path.join(os.getcwd(), 'stackoverflow.p'),
    origin=
    'https://docs.google.com/uc?export=download&id=1BPosRNTemuPD5XifQLOxQWe2wLwTIHt3'
)
# NOTE(security): unpickling can execute arbitrary code embedded in the file;
# only run this cell if the download source is trusted.
# The context manager closes the file handle once the data is loaded.
with open(stackoverflow_path, 'rb') as pickle_file:
    data = pickle.load(pickle_file)

### Calcular los embeddings con BERT para las preguntas de stack overflow

### Calculate embeddings for stack overflow questions with BERT

In [9]:
# Embed the question titles batch by batch; every run of the graph yields one
# array of pooled outputs for a full, fixed-size batch.
bert_embeddings = []
for start in tqdm_notebook(range(0, len(data), batch_size)):
    stop = min(start + batch_size, len(data))
    texts = [data['title'][row] for row in range(start, stop)]
    # add some dummies to complete the final batch (the placeholders have a
    # fixed batch dimension)
    texts.extend([''] * (batch_size - len(texts)))
    examples, mask, segment = prepare_inputs_for_bert(texts, max_len)
    batch_feed = {
        input_ids: examples,
        input_mask: mask,
        segment_ids: segment,
    }
    bert_embeddings.append(
        sess.run(bert_model['pooled_output'], feed_dict=batch_feed))
# stack the per-batch arrays and drop the rows produced by the dummy texts
bert_embeddings = np.vstack(bert_embeddings)[:len(data)]

HBox(children=(IntProgress(value=0, max=153), HTML(value='')))




### Calcular los embeddings con ELMo para las preguntas de stack overflow

### Calculate embeddings for stack overflow questions with ELMo

In [10]:
# Load the ELMo v2 module from TF-Hub.
# NOTE(review): trainable=True is passed, but no training step is visible in
# this notebook — confirm whether it is needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
# Fixed-size batch of raw strings; a shorter final batch has to be padded by
# whoever feeds this placeholder.
sentences = tf.placeholder(name='sentences',
                           shape=(batch_size, ),
                           dtype='string')
# Apply the module's 'default' signature and keep only the 'default' entry of
# its output dict.
elmo_model = elmo(sentences, signature='default', as_dict=True)['default']
# Runs the initializers of all global variables in the graph, not only ELMo's
# (presumably this also re-initializes the BERT variables created earlier —
# confirm this is harmless).
sess.run(tf.global_variables_initializer())

In [11]:
# Embed the question titles with ELMo, one fixed-size batch of strings per run.
elmo_embeddings = []
for start in tqdm_notebook(range(0, len(data), batch_size)):
    stop = min(start + batch_size, len(data))
    texts = [data['title'][row] for row in range(start, stop)]
    # add some dummies to complete the final batch (the placeholder has a
    # fixed batch dimension)
    texts.extend([''] * (batch_size - len(texts)))
    batch_output = sess.run(elmo_model, feed_dict={sentences: texts})
    elmo_embeddings.append(batch_output)
# stack the per-batch arrays and drop the rows produced by the dummy texts
elmo_embeddings = np.vstack(elmo_embeddings)[:len(data)]

HBox(children=(IntProgress(value=0, max=153), HTML(value='')))




### Probar el motor de búsqueda semántica

### Test the semantic search engine

In [38]:
# search parameters (Colab form fields; keep the `#@param` annotations)
# busqueda:  free-text query matched against the question titles
# top_n:     number of best fuzzy matches to display
# n_similar: number of semantically similar titles listed under each match
busqueda = 'invertir matriz'  #@param {type: 'string'}
top_n = 10  #@param {type: 'integer'}
n_similar = 3  #@param {type: 'integer'}

### Hacemos una búsqueda "fuzzy" en los títulos

### We do a "fuzzy" search in the titles

In [39]:
# Score every title against the query with fuzz.partial_ratio (both sides
# lower-cased) and sort best-first. Entries are (score, row) pairs, so equal
# scores are ordered by descending row number.
fuzzy = sorted(
    ((fuzz.partial_ratio(busqueda.lower(), data['title'][row].lower()), row)
     for row in range(len(data))),
    reverse=True)

### Tenemos en cuenta la proximidad semántica

### We consider semantic proximity

In [40]:
# Take the ELMo embeddings of the top fuzzy matches and compare each of them
# with the embedding of every title (one row of similarities per match).
# To rank with BERT instead, use bert_embeddings in both statements below.
target_embeddings = [elmo_embeddings[row] for _score, row in fuzzy[:top_n]]
cosine_similarities = cosine_similarity(target_embeddings, elmo_embeddings)

In [41]:
def _highlight(text, terms):
    """Return `text` with every whitespace-separated word whose lower-cased
    form is in `terms` wrapped in <b> tags; words are re-joined with single
    spaces."""
    return ' '.join('<b>' + word + '</b>' if word.lower() in terms else word
                    for word in text.split())


display(
    HTML(
        '<img width="220px" src="https://cdn.sstatic.net/Sites/stackoverflow/company/img/logos/so/so-logo.svg?v=a010291124bf"></img>'
    ))
display(HTML('<h2>Resultados de la búsqueda <i>' + busqueda + '</i></h2>'))

# Split the query once instead of once per displayed word.
search_terms = set(busqueda.lower().split())
for rank, (_score, i) in enumerate(fuzzy[:top_n]):
    # Highlight the title and the snippet on their own, before they are put
    # into the markup, so that HTML tags glued to a word cannot hide a match.
    title = _highlight(str(data["title"][i]), search_terms)
    snippet = _highlight(str(data["text"][i][0:500]), search_terms)
    output = f'<p><h2><a href="{data["url"][i]}"> {title} </a></h2><br>{snippet} ...<br>'
    # Row `rank` of cosine_similarities belongs to the rank-th fuzzy match;
    # order all titles from most to least similar.
    ranked = np.flip(np.argsort(cosine_similarities[rank]))
    # Drop the match itself explicitly rather than assuming it is ranked
    # first (ties can reorder it); the slice also stays in bounds when fewer
    # than n_similar other titles exist.
    neighbours = ranked[ranked != i][:n_similar]
    for similar in neighbours:
        output += f'<br><a href ="{data["url"][similar]}">{data["title"][similar]}</a>'
    output += '</p>'
    display(HTML(output))