<p><img width="220px" src="Bertle.gif"></p>
Vamos a hacer un motor de búsqueda semántica con los datos de Wikipedia en español.

### Instalar librerías

In [1]:
# instalar BERT
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
    sys.path += ['bert_repo']

# import python modules defined by BERT
import tokenization

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import os
import pickle
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
from IPython.core.display import display, HTML

os.environ['TFHUB_CACHE_DIR'] = './tfhub'

In [3]:
sess = tf.Session()

### Código para leer el fichero de [Wikipedia](https://es.wikipedia.org/wiki/Wikipedia:Descargas) (en formato ZIM)

In [4]:
# 'robado' de https://github.com/kimbauters/ZIMply/blob/master/zimply/zimply.py

import io
import lzma
import logging
from functools import partial, lru_cache
from collections import namedtuple
from struct import Struct, pack, unpack

ZERO = pack("B", 0)  # defined for zero terminated fields
Field = namedtuple("Field", ["format", "field_name"])  # a tuple
Article = namedtuple("Article", ["data", "namespace", "mimetype"])  # a triple

iso639_3to1 = {"ara": "ar", "dan": "da", "nld": "nl", "eng": "en",
               "fin": "fi", "fra": "fr", "deu": "de", "hun": "hu",
               "ita": "it", "nor": "no", "por": "pt", "ron": "ro",
               "rus": "ru", "spa": "es", "swe": "sv", "tur": "tr"}


def read_zero_terminated(file, encoding):
    """
    Retrieve a ZERO terminated string by reading byte by byte until the ending
    ZERO terminated field is encountered.
    :param file: the file to read from
    :param encoding: the encoding used for the file
    :return: the decoded string, up to but not including the ZERO termination
    """
    # read until we find the ZERO termination
    buffer = iter(partial(file.read, 1), ZERO)
    # join all the bytes together
    field = b"".join(buffer)
    # transform the bytes into a string and return the string
    return field.decode(encoding=encoding, errors="ignore")


def convert_size(size):
    """
    Convert a given size in bytes to a human-readable string of the file size.
    :param size: the size in bytes
    :return: a human-readable string of the size
    """
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    power = int(floor(log(size, 1024)))
    base = pow(1024, power)
    size = round(size/base, 2)
    return '%s %s' % (size, size_name[power])


HEADER = [  # define the HEADER structure of a ZIM file
    Field("I", "magicNumber"),
    Field("I", "version"),
    Field("Q", "uuid_low"),
    Field("Q", "uuid_high"),
    Field("I", "articleCount"),
    Field("I", "clusterCount"),
    Field("Q", "urlPtrPos"),
    Field("Q", "titlePtrPos"),
    Field("Q", "clusterPtrPos"),
    Field("Q", "mimeListPos"),
    Field("I", "mainPage"),
    Field("I", "layoutPage"),
    Field("Q", "checksumPos")
]

ARTICLE_ENTRY = [  # define the ARTICLE ENTRY structure of a ZIM file
    Field("H", "mimetype"),
    Field("B", "parameterLen"),
    Field("c", "namespace"),
    Field("I", "revision"),
    Field("I", "clusterNumber"),
    Field("I", "blobNumber")
    # zero terminated url of variable length; not a Field
    # zero terminated title of variable length; not a Field
    # variable length parameter data as per parameterLen; not a Field
]

REDIRECT_ENTRY = [  # define the REDIRECT ENTRY structure of a ZIM file
    Field("H", "mimetype"),
    Field("B", "parameterLen"),
    Field("c", "namespace"),
    Field("I", "revision"),
    Field("I", "redirectIndex")
    # zero terminated url of variable length; not a Field
    # zero terminated title of variable length; not a Field
    # variable length parameter data as per parameterLen; not a Field
]

CLUSTER = [  # define the CLUSTER structure of a ZIM file
    Field("B", "compressionType")
]


class Block:
    def __init__(self, structure, encoding):
        self._structure = structure
        self._encoding = encoding
        # Create a new Struct object to correctly read the binary data in this
        # block in particular, pass it along that it is a little endian (<),
        # along with all expected fields.
        self._compiled = Struct("<" + "".join(
            [field.format for field in self._structure]))
        self.size = self._compiled.size

    def unpack(self, buffer, offset=0):
        # Use the Struct to read the binary data in the buffer
        # where this block appears at the given offset.
        values = self._compiled.unpack_from(buffer, offset)
        # Match up each value with the corresponding field in the block
        # and put it in a dictionary for easy reference.
        return {field.field_name: value for value, field in
                zip(values, self._structure)}

    def _unpack_from_file(self, file, offset=None):
        if offset is not None:
            # move the pointer in the file to the specified offset;
            # this is not index 0
            file.seek(offset)
        # read in the amount of data corresponding to the block size
        buffer = file.read(self.size)
        # return the values of the fields after unpacking them
        return self.unpack(buffer)

    def unpack_from_file(self, file, seek=None):
        # When more advanced behaviour is needed,
        # this method can be overridden by subclassing.
        return self._unpack_from_file(file, seek)


class HeaderBlock(Block):
    def __init__(self, encoding):
        super().__init__(HEADER, encoding)


class MimeTypeListBlock(Block):
    def __init__(self, encoding):
        super().__init__("", encoding)

    def unpack_from_file(self, file, offset=None):
        # move the pointer in the file to the specified offset as
        # this is not index 0 when an offset is specified
        if offset is not None:
            file.seek(offset)
        mimetypes = []  # prepare an empty list to store the mimetypes
        while True:
            # get the next zero terminated field
            s = read_zero_terminated(file, self._encoding)
            mimetypes.append(s)  # add the newly found mimetype to the list
            if s == "":  # the last entry must be an empty string
                mimetypes.pop()  # pop the last entry
                return mimetypes  # return the list of mimetypes we found


class ClusterBlock(Block):
    def __init__(self, encoding):
        super().__init__(CLUSTER, encoding)


@lru_cache(maxsize=32)  # provide an LRU cache for this object
class ClusterData(object):
    def __init__(self, file, offset, encoding):
        self.file = file  # store the file
        self.offset = offset  # store the offset
        cluster_info = ClusterBlock(encoding).unpack_from_file(
            self.file, self.offset)  # Get the cluster fields.
        # Verify whether the cluster has LZMA2 compression
        self.compressed = cluster_info['compressionType'] == 4
        # at the moment, we don't have any uncompressed data
        self.uncompressed = None
        self._decompress()  # decompress the contents as needed
        # Prepare storage to keep track of the offsets
        # of the blobs in the cluster.
        self._offsets = []
        # proceed to actually read the offsets of the blobs in this cluster
        self._read_offsets()

    def _decompress(self, chunk_size=32768):
        if self.compressed:
            # create a bytes stream to store the uncompressed cluster data
            self.buffer = io.BytesIO()
            decompressor = lzma.LZMADecompressor()  # prepare the decompressor
            # move the file pointer to the start of the blobs as long as we
            # don't reach the end of the stream.
            self.file.seek(self.offset + 1)

            while not decompressor.eof:
                chunk = self.file.read(chunk_size)  # read in a chunk
                data = decompressor.decompress(chunk)  # decompress the chunk
                self.buffer.write(data)  # and store it in the buffer area

    def _source_buffer(self):
        # get the file buffer or the decompressed buffer
        buffer = self.buffer if self.compressed else self.file
        # move the buffer to the starting position
        buffer.seek(0 if self.compressed else self.offset + 1)
        return buffer

    def _read_offsets(self):
        # get the buffer for this cluster
        buffer = self._source_buffer()
        # read the offset for the first blob
        offset0 = unpack("<I", buffer.read(4))[0]
        # store this one in the list of offsets
        self._offsets.append(offset0)
        # calculate the number of blobs by dividing the first blob by 4
        number_of_blobs = int(offset0 / 4)
        for idx in range(number_of_blobs - 1):
            # store the offsets to all other blobs
            self._offsets.append(unpack("<I", buffer.read(4))[0])

    def read_blob(self, blob_index):
        # check if the blob falls within the range
        if blob_index >= len(self._offsets) - 1:
            raise IOError("Blob index exceeds number of blobs available: %s" %
                          blob_index)
        buffer = self._source_buffer()  # get the buffer for this cluster
        # calculate the size of the blob
        blob_size = self._offsets[blob_index+1] - self._offsets[blob_index]
        # move to the position of the blob relative to current position
        buffer.seek(self._offsets[blob_index], 1)
        return buffer.read(blob_size)


class DirectoryBlock(Block):
    def __init__(self, structure, encoding):
        super().__init__(structure, encoding)

    def unpack_from_file(self, file, seek=None):
        # read the first fields as defined in the ARTICLE_ENTRY structure
        field_values = super()._unpack_from_file(file, seek)
        # then read in the url, which is a zero terminated field
        field_values["url"] = read_zero_terminated(file, self._encoding)
        # followed by the title, which is again a zero terminated field
        field_values["title"] = read_zero_terminated(file, self._encoding)
        field_values["namespace"] = field_values["namespace"].decode(
            encoding=self._encoding, errors="ignore")
        return field_values


class ArticleEntryBlock(DirectoryBlock):
    def __init__(self, encoding):
        super().__init__(ARTICLE_ENTRY, encoding)


class RedirectEntryBlock(DirectoryBlock):
    def __init__(self, encoding):
        super().__init__(REDIRECT_ENTRY, encoding)        


class ZIMFile:
    """
    The main class to access a ZIM file.
    Two important public methods are:
        get_article_by_url(...)
      is used to retrieve an article given its namespace and url.

        get_main_page()
      is used to retrieve the main page article for the given ZIM file.
    """
    def __init__(self, filename, encoding):
        self._enc = encoding
        # open the file as a binary file
        self.file = open(filename, "rb")
        # retrieve the header fields
        self.header_fields = HeaderBlock(self._enc).unpack_from_file(self.file)
        self.mimetype_list = MimeTypeListBlock(self._enc).unpack_from_file(
            self.file, self.header_fields["mimeListPos"])
        # create the object once for easy access
        self.redirectEntryBlock = RedirectEntryBlock(self._enc)

        self.articleEntryBlock = ArticleEntryBlock(self._enc)
        self.clusterFormat = ClusterBlock(self._enc)

    def _read_offset(self, index, field_name, field_format, length):
        # move to the desired position in the file
        if index != 0xffffffff:
            self.file.seek(self.header_fields[field_name] + int(length*index))

            # and read and return the particular format
            read = self.file.read(length)
            # return unpack("<" + field_format, self.file.read(length))[0]
            return unpack("<" + field_format, read)[0]
        return None

    def _read_url_offset(self, index):
        return self._read_offset(index, "urlPtrPos", "Q", 8)

    def _read_title_offset(self, index):
        return self._read_offset(index, "titlePtrPos", "L", 4)

    def _read_cluster_offset(self, index):
        return self._read_offset(index, "clusterPtrPos", "Q", 8)

    def _read_directory_entry(self, offset):
        """
        Read a directory entry using an offset.
        :return: a DirectoryBlock - either as Article Entry or Redirect Entry
        """
        logging.debug("reading entry with offset " + str(offset))

        self.file.seek(offset)  # move to the desired offset

        # retrieve the mimetype to determine the type of block
        fields = unpack("<H", self.file.read(2))

        # get block class
        if fields[0] == 0xffff:
            directory_block = self.redirectEntryBlock
        else:
            directory_block = self.articleEntryBlock
        # unpack and return the desired Directory Block
        return directory_block.unpack_from_file(self.file, offset)

    def read_directory_entry_by_index(self, index):
        """
        Read a directory entry using an index.
        :return: a DirectoryBlock - either as Article Entry or Redirect Entry
        """
        # find the offset for the given index
        offset = self._read_url_offset(index)
        if offset is not None:
            # read the entry at that offset
            directory_values = self._read_directory_entry(offset)
            # set the index in the list of values
            directory_values["index"] = index
            return directory_values  # and return all these directory values

    def _read_blob(self, cluster_index, blob_index):
        # get the cluster offset
        offset = self._read_cluster_offset(cluster_index)
        # get the actual cluster data
        cluster_data = ClusterData(self.file, offset, self._enc)
        # return the data read from the cluster at the given blob index
        return cluster_data.read_blob(blob_index)

    def _get_article_by_index(self, index, follow_redirect=True):
        # get the info from the DirectoryBlock at the given index
        entry = self.read_directory_entry_by_index(index)
        if entry is not None:
            # check if we have a Redirect Entry
            if 'redirectIndex' in entry.keys():
                # if we follow up on redirects, return the article it is
                # pointing to
                if follow_redirect:
                    logging.debug("redirect to " + str(entry['redirectIndex']))
                    return self._get_article_by_index(entry['redirectIndex'],
                                                      follow_redirect)
                # otherwise, simply return no data
                # and provide the redirect index as the metadata.
                else:
                    return Article(None, entry['namespace'],
                                   entry['redirectIndex'])
            else:  # otherwise, we have an Article Entry
                # get the data and return the Article
                data = self._read_blob(entry['clusterNumber'],
                                       entry['blobNumber'])
                return Article(data, entry['namespace'],
                               self.mimetype_list[entry['mimetype']])
        else:
            return None

    def _get_entry_by_url(self, namespace, url, linear=False):
        if linear:  # if we are performing a linear search ...
            # ... simply iterate over all articles
            for idx in range(self.header_fields['articleCount']):
                # get the info from the DirectoryBlock at that index
                entry = self.read_directory_entry_by_index(idx)
                # if we found the article ...
                if entry['url'] == url and entry['namespace'] == namespace:
                    # return the DirectoryBlock entry and index of the entry
                    return entry, idx
            # return None, None if we could not find the entry
            return None, None
        else:
            front = middle = 0
            end = len(self)
            title = full_url(namespace, url)
            logging.debug("performing binary search with boundaries " +
                          str(front) + " - " + str(end))
            found = False
            # continue as long as the boundaries don't cross and
            # we haven't found it
            while front <= end and not found:
                middle = floor((front + end) / 2)  # determine the middle index
                entry = self.read_directory_entry_by_index(middle)
                logging.debug("checking " + entry['url'])
                found_title = full_url(entry['namespace'], entry['url'])
                if found_title == title:
                    found = True  # flag it if the item is found
                else:
                    if found_title < title:  # if the middle is too early ...
                        # move the front index to middle
                        # (+ 1 to ensure boundaries can be crossed)
                        front = middle + 1
                    else:  # if the middle falls too late ...
                        # move the end index to middle
                        # (- 1 to ensure boundaries can be crossed)
                        end = middle - 1
            if found:
                # return the tuple with directory entry and index
                # (note the comma before the second argument)
                return self.read_directory_entry_by_index(middle), middle
            return None, None

    def get_article_by_url(self, namespace, url, follow_redirect=True):
        entry, idx = self._get_entry_by_url(namespace, url)  # get the entry
        if idx:  # we found an index and return the article at that index
            return self._get_article_by_index(
                idx, follow_redirect=follow_redirect)

    def get_main_page(self):
        """
        Get the main page of the ZIM file.
        """
        main_page = self._get_article_by_index(self.header_fields['mainPage'])
        if main_page is not None:
            return main_page

    def metadata(self):
        """
        Retrieve the metadata attached to the ZIM file.
        :return: a dict with the entry url as key and the metadata as value
        """
        metadata = {}
        # iterate backwards over the entries
        for i in range(self.header_fields['articleCount'] - 1, -1, -1):
            entry = self.read_directory_entry_by_index(i)  # get the entry
            if entry['namespace'] == 'M':  # check that it is still metadata
                # turn the key to lowercase as per Kiwix standards
                m_name = entry['url'].lower()
                # get the data, which is encoded as an article
                metadata[m_name] = self._get_article_by_index(i)[0]
            else:  # stop as soon as we are no longer looking at metadata
                break
        return metadata

    def __len__(self):  # retrieve the number of articles in the ZIM file
        return self.header_fields['articleCount']

    def __iter__(self):
        """
        Create an iterator generator to retrieve all articles in the ZIM file.
        :return: a yielded entry of an article, containing its full URL,
                  its title, and the index of the article
        """
        for idx in range(self.header_fields['articleCount']):
            # get the Directory Entry
            entry = self.read_directory_entry_by_index(idx)
            if entry['namespace'] == "A":
                # add the full url to the entry
                entry['fullUrl'] = full_url(entry['namespace'], entry['url'])
                yield entry['fullUrl'], entry['title'], idx

    def close(self):
        self.file.close()

    def __exit__(self, *_):
        """
        Ensure the ZIM file is properly closed when the object is destroyed.
        """
        self.close()

### Descargar el modelo de BERT

In [5]:
modelo_de_bert = 'bert_multi_cased_L-12_H-768_A-12/1'  #@param ["bert_uncased_L-12_H-768_A-12/1", "bert_cased_L-12_H-768_A-12/1", "bert_uncased_L-24_H-1024_A-16/1", "bert_cased_L-24_H-1024_A-16/1", "bert_multi_cased_L-12_H-768_A-12/1"]
bert = hub.Module('https://tfhub.dev/google/' + modelo_de_bert)

# instanciar el tokenizador
tokenization_info = bert(signature='tokenization_info', as_dict=True)
vocab_file, do_lower_case = sess.run([
    tokenization_info['vocab_file'],
    tokenization_info['do_lower_case'],
])
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

W0818 23:10:25.301802 139839697622848 deprecation_wrapper.py:119] From bert_repo/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [6]:
batch_size = 32
max_len = 512

input_ids = tf.placeholder(name='input_ids',
                           shape=(batch_size, max_len),
                           dtype='int32')
input_mask = tf.placeholder(name='input_mask',
                            shape=(batch_size, max_len),
                            dtype='int32')
segment_ids = tf.placeholder(name='segment_ids',
                             shape=(batch_size, max_len),
                             dtype='int32')

In [7]:
# instanciar el modelo
bert_model = bert(dict(input_ids=input_ids,
                       input_mask=input_mask,
                       segment_ids=segment_ids),
                  signature="tokens",
                  as_dict=True)
sess.run(tf.global_variables_initializer())

### Calcular los embedding para todos los artículos de Wikipedia

In [8]:
def prepare_inputs_for_bert(texts):
    examples = []
    mask = []
    segment = []
    label = []
    for text in texts:
        q = tokenizer.tokenize(text)
        pad = [0] * (max_len - (len(q) + 2))
        examples.append(
            tokenizer.convert_tokens_to_ids(['[CLS]'] + q + ['[SEP]'])[:max_len] + pad)
        mask.append([1] * min(len(q) + 2, max_len) + pad)
        segment.append([0] * max_len)
    return (np.array(examples), np.array(mask), np.array(segment))

In [9]:
zim_file = ZIMFile('wikipedia_es_all_nopic_2019-06.zim', 'utf-8')

In [None]:
save_every = 10
#embeddings = []
embeddings = pickle.load(open('embeddings.p', 'rb'))
print(f'Read {len(embeddings * batch_size)} embeddings')

for _, i in enumerate(
        tqdm_notebook(
            range(len(embeddings * batch_size),
                  zim_file.header_fields['articleCount'], batch_size))):
    texts = []
    for j in range(batch_size):
        soup = BeautifulSoup(
            zim_file._get_article_by_index(i + j).data, "lxml")

        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.decompose()  # rip it out

        # get text
        text = soup.get_text()

        # break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split("  "))
        # drop blank lines
        texts.append('\n'.join(chunk for chunk in chunks if chunk))

    examples, mask, segment = prepare_inputs_for_bert(texts)
    embeddings.append(
        sess.run(bert_model['pooled_output'],
                 feed_dict={
                     input_ids: examples,
                     input_mask: mask,
                     segment_ids: segment
                 }))
    if (_ + 1) % save_every == 0:
        pickle.dump(embeddings, open('embeddings.p', 'wb'))

Read 7680 embeddings


HBox(children=(IntProgress(value=0, max=106301), HTML(value='')))

In [97]:
pickle.dump(embeddings, open('embeddings', 'wb'))
zim_file.close()

### Probar el motor de búsqueda semántica

In [114]:
zim_file = ZIMFile('wikipedia_es_all_nopic_2019-06.zim', 'utf-8')
embeddings = pickle.load(open('embeddings.p', 'rb'))
embeddings = np.vstack(embeddings)

In [130]:
busqueda = "paréntisis en matématicas"  #@param {type: 'integer'}
top_n = 3  #@param {type: 'integer'}

input_ids, input_mask, segment_ids = prepare_inputs_for_bert([busqueda])
bert_model = bert(dict(input_ids=input_ids,
                       input_mask=input_mask,
                       segment_ids=segment_ids),
                  signature="tokens",
                  as_dict=True)
sess.run(tf.global_variables_initializer())
target_embedding = sess.run(bert_model['pooled_output'])

In [139]:
cosine_similarities = pd.Series(
    cosine_similarity(target_embedding, embeddings).flatten())
for _, (i, similarity) in enumerate(
        cosine_similarities.nlargest(top_n).iteritems()):
    print(
        f'{_+1}. [{similarity:.2f}] {zim_file.read_directory_entry_by_index(i)["title"]}'
    )
print()
for i, similarity in cosine_similarities.nlargest(top_n).iteritems():
    with open('article.html', 'wt', encoding='utf-8') as file:
        file.write(
            zim_file._get_article_by_index(i).data.decode(
                'utf-8', errors='backslashreplace'))
    display(HTML(filename=os.path.realpath('article.html')))

1. [0.97] 
2. [0.97] (15810) 1994 JR1
3. [0.97] "Chinche" Lafuente



← ẓāʾ ʿayn ġayn →,← ẓāʾ ʿayn ġayn →,← ẓāʾ ʿayn ġayn →,← ẓāʾ ʿayn ġayn →
Aislada,Final,Media,Inicial
ﻉ,ـﻊ,ـﻌ,ﻋ


Chinche Lafuente,Chinche Lafuente,Chinche Lafuente
Datos personales,Datos personales.1,Datos personales.2
Nombre completo,Francisco Javier Lafuente Torralba,Francisco Javier Lafuente Torralba
Nacimiento,Madrid España 23 de marzo de 1961,Madrid España 23 de marzo de 1961
Nacionalidad(es),Española,Española
Altura,1.86 m,1.86 m
Carrera,Carrera,Carrera
Deporte,Baloncesto,Baloncesto
Club,Retirado,Retirado
Liga,ACB,ACB
Posición,Base,Base
,,

0,1
Control de autoridades,Proyectos Wikimedia  Datos: Q5766560
