# Instalación de librerías e imports

In [68]:
!pip install spacy



In [69]:
!python -m spacy download es_dep_news_trf

Collecting es-dep-news-trf==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_dep_news_trf-3.0.0/es_dep_news_trf-3.0.0-py3-none-any.whl (409.9 MB)
[+] Download and installation successful
You can now load the package via spacy.load('es_dep_news_trf')


In [70]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl (459.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_trf')


In [1]:
import json
import numpy as np
import os
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer
from joblib import Parallel, delayed
from gensim import corpora
from gensim import models
import pandas as pd
from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity, Similarity
from operator import itemgetter
import glob
from matplotlib import pyplot as plt
import re
from typing import List
import spacy
import pickle

In [2]:
ENGLISH_DICT = r""
SPANISH_DICT = r""
STOPWORDS_SPANISH = r"../resources/stopwords.txt"

# Preprocesado

## Eliminación de símbolos

In [48]:
class SymbolRemover():
    
    def __init__(self):
        self.replace_no_space = re.compile("(\&)|(\%)|(\$)|(\€)|(\.)|(\;)|(\:)|(\!)|(\')|(\¿)|(\¡)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)|(\⁰)|(\•)|(\\')")
        self.replace_space = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|(\t)|(  )|(\n)")
    
    def execute(self, input_text) -> str:
        """
        Recibe un texto en crudo y elimina los caracteres que no aportan significado.
        Devuelve el texto en formato string en minúsculas.
        """
        returned = self.replace_no_space.sub("", input_text.lower())
        returned = self.replace_space.sub(" ", returned)       
        return returned

In [32]:
symbolRemover = SymbolRemover()

In [33]:
symbolRemover.execute("esto ?¿ es una € prub&a")

'esto es una pruba'

## Tokenización

In [34]:
class Tokenizer():
    
    def __init__(self, component):
        self.component = component
    
    def execute(self, input_text) -> List[str]:
        """
        Recibe un texto y devuelve una lista de tokens.
        """
        output = self.component.execute(input_text)
        return output.split(" ")

In [35]:
tokenizer = Tokenizer(SymbolRemover())

In [36]:
tokenizer.execute("esto ?¿ es una € prub&a")

['esto', 'es', 'una', 'pruba']

## Eliminación de palabras vacías

In [39]:
class StopwordsRemover():
    
    def __init__(self, path, component):
        self.component = component
        self.stopwords = [line.strip() for line in open(path, "r", encoding = "utf-8").readlines()]
        
    def execute(self, input_text) -> List[str]:
        """
        Filtra los tokens eliminando aquellos que son palabras vacías.
        """
        tokens = self.component.execute(input_text)
        return [token for token in tokens if token not in self.stopwords]

In [40]:
spanishStopwordsRemover = StopwordsRemover(STOPWORDS_SPANISH, Tokenizer(SymbolRemover()))

In [41]:
spanishStopwordsRemover.execute("esto ?¿ es una € prub&a")

['pruba']

In [42]:
englishStopwordsRemover = StopwordsRemover("../resources/english_stopwords.txt", Tokenizer(SymbolRemover()))

In [43]:
englishStopwordsRemover.execute("this is a test")

['test']

## Lematización

In [44]:
class LemmatizerTagger():
    
    conversions = {
        "ADJ": "a",
        "ADV": "adv",
        "NOUN": "n",
        "NUM": "n",
        "PRON": "n",
        "PROPN": "n",
        "VERB": "v",
        "ADP": "prep",
        "JJ": "a",
        "JJR": "a",
        "JJS": "a",
        "NN": "n",
        "NNP": "n",
        "NNPS": "n",
        "NNS": "n",
        "RB": "adv",
        "RBR": "adv",
        "RBS": "adv",
        "RP": "adv",
        "VB": "v",
        "VBD": "v",
        "VBG": "v",
        "VBN": "v",
        "VBP": "v",
        "VBZ": "v",
    }
    
    def __init__(self, model, component):
        self.nlp = spacy.load(model)
        self.component = component
        
    def execute(self, input_text) -> List[str]:
        """
        Recibe una lista de tokens y devuelve sus lemas.
        """
        tokens = self.component.execute(input_text)
        returned = []
        for token in self.nlp(" ".join(tokens)):
            try:
                returned.append({
                    "lemma": token.lemma_,
                    "pos": self.conversions[token.tag_]
                })
            except:
                print(f"ERROR: {token} - {token.tag_}")
        return returned

In [45]:
spanishLemmatizer = LemmatizerTagger("es_dep_news_trf", StopwordsRemover(STOPWORDS_SPANISH, Tokenizer(SymbolRemover())))
englishLemmatizer = LemmatizerTagger("en_core_web_trf", StopwordsRemover("../resources/english_stopwords.txt", Tokenizer(SymbolRemover())))

In [46]:
spanishLemmatizer.execute("esto ?¿ es una € prueb&a")

[{'lemma': 'prueba', 'pos': 'n'}]

In [47]:
englishLemmatizer.execute("this is a test, beautiful girls")

[{'lemma': 'test', 'pos': 'v'},
 {'lemma': 'beautiful', 'pos': 'a'},
 {'lemma': 'girl', 'pos': 'n'}]

# Traducción

## Creación de los diccionarios

### Diccionario Inglés-UNL

In [88]:
eng_unl = pd.read_csv("../resources/eng-unl.txt", sep = "\t", names = ["lemma", "a", "uw", "ex1", "ex2", "pos", "_"])

In [89]:
eng_unl = eng_unl[["lemma", "uw", "pos"]]

In [90]:
eng_unl["lemma"] = eng_unl["lemma"].apply(lambda x: x.strip("\[\]"))

In [91]:
eng_unl["pos"] = eng_unl["pos"].apply(lambda x: x.strip("{}"))

In [92]:
eng_unl.head()

Unnamed: 0,lemma,uw,pos
0,1 chronicles,1_chronicles(iof>sacred_text>information),n
1,1 kings,1_kings(iof>sacred_text>information),n
2,1 samuel,1_samuel(iof>sacred_text>information),n
3,24 hours,"24_hours(icl>period>time,icl>unit)",n
4,24-karat gold,24-karat_gold(icl>gold>thing),n


In [93]:
dic = {}
for _,row in eng_unl.iterrows():
    if row["lemma"] in dic:
        lemma = row["lemma"]
        uw = row["uw"]
        pos = row["pos"]
        entry = {
            "uw": uw,
            "pos": pos
        }
        dic[lemma].append(entry)
    else:
        lemma = row["lemma"]
        uw = row["uw"]
        pos = row["pos"]
        entry = {
            "uw": uw,
            "pos": pos
        }
        dic[lemma] = [entry]

In [96]:
with open("../resources/eng2unl.pickle", "wb") as f:
    pickle.dump(dic, f)

### Diccionario UNL-Español

In [105]:
esp_unl = pd.read_csv("../resources/esp-unl.txt", sep = "\t", names = ["lemma", "a", "uw", "ex1", "ex2", "pos", "_"])

In [106]:
esp_unl = esp_unl[["lemma", "uw", "pos"]]

In [107]:
esp_unl["lemma"] = esp_unl["lemma"].apply(lambda x: x.strip("\[\]"))

In [108]:
esp_unl["pos"] = esp_unl["pos"].apply(lambda x: x.strip("{}"))

In [110]:
esp_unl.head()

Unnamed: 0,lemma,uw,pos
0,abalanzarse,"brood(icl>hang>occur,obj>thing)",v
1,abalanzarse,"hover(icl>hang>occur,cob>thing,obj>thing)",v
2,abalanzarse,"loom(icl>hang>occur,equ>brood,obj>thing)",v
3,abandonar,"abandon(icl>leave>do,agt>person,obj>person)",v
4,abandonar,"abandon(icl>leave>do,equ>vacate,agt>thing,obj>...",v


In [112]:
dic = {}
for _,row in esp_unl.iterrows():
    if row["uw"] in dic:
        lemma = row["lemma"]
        uw = row["uw"]
        pos = row["pos"]
        entry = {
            "lemma": lemma,
            "pos": pos
        }
        dic[uw].append(entry)
    else:
        lemma = row["lemma"]
        uw = row["uw"]
        pos = row["pos"]
        entry = {
            "lemma": lemma,
            "pos": pos
        }
        dic[uw] = [entry]

In [113]:
dic

{'brood(icl>hang>occur,obj>thing)': [{'lemma': 'abalanzarse', 'pos': 'v'},
  {'lemma': 'aparecer', 'pos': 'v'}],
 'hover(icl>hang>occur,cob>thing,obj>thing)': [{'lemma': 'abalanzarse',
   'pos': 'v'},
  {'lemma': 'aparecer', 'pos': 'v'}],
 'loom(icl>hang>occur,equ>brood,obj>thing)': [{'lemma': 'abalanzarse',
   'pos': 'v'},
  {'lemma': 'aparecer', 'pos': 'v'}],
 'abandon(icl>leave>do,agt>person,obj>person)': [{'lemma': 'abandonar',
   'pos': 'v'}],
 'abandon(icl>leave>do,equ>vacate,agt>thing,obj>thing)': [{'lemma': 'abandonar',
   'pos': 'v'},
  {'lemma': 'desalojar', 'pos': 'v'}],
 'bequeath(icl>give>do,agt>thing,obj>thing,gol>thing)': [{'lemma': 'abandonar',
   'pos': 'v'},
  {'lemma': 'dejar', 'pos': 'v'},
  {'lemma': 'entregar', 'pos': 'v'}],
 'break_away(icl>separate>occur,equ>break,obj>thing)': [{'lemma': 'abandonar',
   'pos': 'v'}],
 'break(icl>interrupt>do,plt>thing,agt>person,obj>abstract_thing)': [{'lemma': 'abandonar',
   'pos': 'v'}],
 'defect(icl>flee>do,plf>thing,agt>vol

In [114]:
with open("../resources/unl2esp.pickle", "wb") as f:
    pickle.dump(dic, f)

## Traductor Eng2UNL

In [118]:
class Eng2UNLTranslator():
    
    def __init__(self, file = "../resources/eng2unl.pickle"):
        with open(file, "rb") as f:
            self.dic = pickle.load(f)
            
    def get_combinations(self, tokens):
        """
        Calcula todas las combinaciones de los tokens dados.
        """
        result = [" ".join(tokens[i: j]) for i in range(len(tokens)) for j in range(i + 1, len(tokens) + 1) if j-i > 1]
        result.sort(reverse = True, key = len)
        return result
            
    def translate(self, tokens):
        """
        Recibe una lista de tokens en inglés lematizados (salida de lematizados) y devuelve una lista de uws.
        """
        print(f"\tQuery original = {tokens}")
        lemmas = [token["lemma"] for token in tokens]
        combinations = self.get_combinations(lemmas)
        
        translated = []
        
        for combination in combinations: # primero buscamos la uw de las expresiones en inglés
            if combination in self.dic:
                print(f"\t\tCombinación traducida {combination}")
                uws = [translation["uw"] for translation in self.dic[combination]]
                translated += uws
                
                # borramos las expresiones que tienen un lema ya traducido y los lemas traducidos
                combinations.remove(combination)
                for token in combination.split(" "):
                    for c in combinations: # combinaciones que quedan
                        if token in c: # la combinación tiene un lemma a traducido
                            combinations.remove(c)
                    lemmas.remove(token)

        # ahora traducimos los lemmas restantes
        for token in tokens:
            if token["lemma"] in lemmas: # no se ha traducido
                lemma = token["lemma"]
                if lemma in self.dic:
                    uws = [translation["uw"] for translation in self.dic[lemma] if translation["pos"] == token["pos"]]
                    translated += uws
                else:
                    print(f"\t\tNo se ha podido encontrar {lemma} en el diccionario Eng2UNL")
                
        print(f"\tUWs encontradas = {translated}")
        return translated

In [21]:
eng2unl = Eng2UNLTranslator()

In [22]:
eng2unl.translate([
    {'lemma': 'girl', 'pos': 'n'},
    {'lemma': 'beautiful', 'pos': 'a'},
])

Query original = [{'lemma': 'girl', 'pos': 'n'}, {'lemma': 'beautiful', 'pos': 'a'}]
UWs encontradas = ['girl(icl>child>person,ant>boy)', 'girl(icl>child>person,equ>daughter,ant>boy,pos>person)', 'girl(icl>person,equ>girlfriend,pos>man)', 'girl(icl>person,equ>woman)', 'girl(icl>virgin>person)', 'girl(icl>woman>thing)', 'beautiful(icl>adj,ant>ugly)', 'beautiful(icl>adj)']


['girl(icl>child>person,ant>boy)',
 'girl(icl>child>person,equ>daughter,ant>boy,pos>person)',
 'girl(icl>person,equ>girlfriend,pos>man)',
 'girl(icl>person,equ>woman)',
 'girl(icl>virgin>person)',
 'girl(icl>woman>thing)',
 'beautiful(icl>adj,ant>ugly)',
 'beautiful(icl>adj)']

## Traductor UNL2Esp

In [117]:
class UNL2EspTranslator():
    
    def __init__(self, file = "../resources/unl2esp.pickle"):
        with open(file, "rb") as f:
            self.dic = pickle.load(f)
            
    def get_combinations(self, tokens):
        """
        Calcula todas las combinaciones de los tokens dados.
        """
        result = [" ".join(tokens[i: j]) for i in range(len(tokens)) for j in range(i + 1, len(tokens) + 1) if j-i > 1]
        result.sort(reverse = True, key = len)
        return result
            
    def translate(self, uws):
        """
        Recibe una lista de uws y devuelve una lista de tokens en español.
        """
        print(f"\tUWs = {uws}")
        
        translated = []
        
        for uw in uws:
            if uw in self.dic:
                translations = [translation["lemma"] for translation in self.dic[uw]]
                translated += translations
            else:
                print(f"\t\tUW no encontrada en el diccionario de español: {uw}")
                
        print(f"\tTraducción al español = {translated}")
        return list(set(translated)) # para eliminar duplicados

In [24]:
unl2esp = UNL2EspTranslator()
unl2esp.translate(['girl(icl>child>person,ant>boy)',
 'girl(icl>child>person,equ>daughter,ant>boy,pos>person)',
 'girl(icl>person,equ>girlfriend,pos>man)',
 'girl(icl>person,equ>woman)',
 'girl(icl>virgin>person)',
 'girl(icl>woman>thing)',
 'beautiful(icl>adj,ant>ugly)',
 'beautiful(icl>adj)'])

UWs = ['girl(icl>child>person,ant>boy)', 'girl(icl>child>person,equ>daughter,ant>boy,pos>person)', 'girl(icl>person,equ>girlfriend,pos>man)', 'girl(icl>person,equ>woman)', 'girl(icl>virgin>person)', 'girl(icl>woman>thing)', 'beautiful(icl>adj,ant>ugly)', 'beautiful(icl>adj)']
	UW no encontrada en el diccionario de español: girl(icl>child>person,equ>daughter,ant>boy,pos>person)
	UW no encontrada en el diccionario de español: girl(icl>person,equ>woman)
	UW no encontrada en el diccionario de español: girl(icl>virgin>person)
	UW no encontrada en el diccionario de español: girl(icl>woman>thing)
Traducción al español = ['chica', 'muchacha', 'chica', 'bello', 'bonito', 'hermoso', 'bello', 'bonito']


['chica', 'hermoso', 'bonito', 'bello', 'muchacha']

## Traductor

In [169]:
class Translator():
    
    def __init__(self):
        self.eng2unl = Eng2UNLTranslator()
        self.unl2esp = UNL2EspTranslator()
        
    def translate(self, salida_preprocesado):
        print("Traduciendo a UNL...")
        uws = self.eng2unl.translate(salida_preprocesado)
        print("Traduciendo a ESP...")
        translations = self.unl2esp.translate(uws)
        return list(set(translations))

# Indexador

In [100]:
class MyCorpus:
        
    def __init__(self, docs, dictionary):
        self.docs = docs
        self.dictionary = dictionary

    def __iter__(self):
        for doc in self.docs:
            yield self.dictionary.doc2bow(doc)

In [113]:
class Indexer():       
    
    def __init__(self, path_docs = "../documents/*/*.txt", dictionary = None, bow = None):
        self.pipeline = LemmatizerTagger("es_dep_news_trf", StopwordsRemover(STOPWORDS_SPANISH, Tokenizer(SymbolRemover())))
        self.repository = [open(doc, "r", encoding = "utf-8").read() for doc in glob.glob(path_docs)]
        if not (dictionary and bow):
            self.documents = Parallel(n_jobs = 12, verbose = 50)(delayed(self._preprocess_doc)(doc) for doc in glob.glob(path_docs))
        if dictionary:
            self.dictionary = corpora.Dictionary.load(dictionary)
        if bow:
            self.bow = corpora.MmCorpus(bow)
        
    def index(self):
        """
        Crea el diccionario de la colección y la bolsa de palabras.
        """
        self.dictionary = self._create_dictionary()
        self.dictionary.save("../resources/dictionary.dict")
        self.bow = MyCorpus(self.documents, self.dictionary)
        corpora.MmCorpus.serialize("../resources/bow.mm", self.bow, metadata=True)
        
    def _create_dictionary(self):
        """
        Función específica encargada de crear y guardar el diccionario.
        """
        dictionary = corpora.Dictionary(doc for doc in self.documents)
        return dictionary
        
    def _preprocess_doc(self, doc):
        with open(doc, "r", encoding = "utf-8") as f:
            content = f.read()
        return [token["lemma"] for token in self.pipeline.execute(content)]

In [98]:
indexer = Indexer()

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:    5.8s
[Parallel(n_jobs=12)]: Done   2 tasks      | elapsed:    7.0s
[Parallel(n_jobs=12)]: Done   3 tasks      | elapsed:    8.1s
[Parallel(n_jobs=12)]: Done   4 tasks      | elapsed:   10.2s
[Parallel(n_jobs=12)]: Done   5 tasks      | elapsed:   13.0s
[Parallel(n_jobs=12)]: Done   6 tasks      | elapsed:   13.1s
[Parallel(n_jobs=12)]: Done   7 tasks      | elapsed:   13.1s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:   14.2s
[Parallel(n_jobs=12)]: Done   9 tasks      | elapsed:   16.7s
[Parallel(n_jobs=12)]: Done  10 tasks      | elapsed:   17.6s
[Parallel(n_jobs=12)]: Done  11 tasks      | elapsed:   19.1s
[Parallel(n_jobs=12)]: Done  12 tasks      | elapsed:   20.1s
[Parallel(n_jobs=12)]: Done  13 tasks      | elapsed:   20.4s
[Parallel(n_jobs=12)]: Done  14 tasks      | elapsed:   22.7s
[Parallel(n_jobs=12)]: Done  15 tasks      | elapsed:  

[Parallel(n_jobs=12)]: Done 132 tasks      | elapsed:  2.8min
[Parallel(n_jobs=12)]: Done 133 tasks      | elapsed:  2.8min
[Parallel(n_jobs=12)]: Done 134 tasks      | elapsed:  2.8min
[Parallel(n_jobs=12)]: Done 135 tasks      | elapsed:  2.9min
[Parallel(n_jobs=12)]: Done 136 tasks      | elapsed:  2.9min
[Parallel(n_jobs=12)]: Done 137 tasks      | elapsed:  2.9min
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:  3.0min
[Parallel(n_jobs=12)]: Done 139 tasks      | elapsed:  3.0min
[Parallel(n_jobs=12)]: Done 140 tasks      | elapsed:  3.0min
[Parallel(n_jobs=12)]: Done 141 tasks      | elapsed:  3.0min
[Parallel(n_jobs=12)]: Done 142 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done 143 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done 144 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done 145 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done 146 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done 147 tasks      | elapsed:  3.1min
[Paralle

In [101]:
indexer.index()

In [102]:
indexer.documents[:5]

[['aceptémoslo',
  'perder',
  'peso',
  'rápido',
  'indoloro',
  'sencillo',
  'recomendación',
  'básico',
  'claro',
  'restringir',
  'comida',
  'favorito',
  'apostar',
  'alimento',
  'bajo',
  'grasa',
  'carbohidrato',
  'azúcar',
  'sumar él',
  'rutina',
  'ejercicio',
  'regular',
  'fuerza',
  'voluntad',
  'soler',
  'frecuente',
  'recaída',
  'oportunidad',
  'desperdiciada',
  'llegado',
  'punto',
  'conveniente',
  'apostar',
  'pérdida',
  'peso',
  'duradero',
  'prolongado',
  'fácil',
  'incómodo',
  'duro',
  'tarea',
  'adelgazar',
  'realidad',
  'truco',
  'sorprendentemente',
  'fácil',
  'forma',
  'deshacer yo',
  'kilo',
  'sobrar',
  'revista',
  'eat',
  'this',
  'not',
  'that',
  'elaborado',
  'lista',
  'ver',
  'continuación',
  'cepilir él tú',
  'diente',
  'parecer',
  'tontería',
  'día',
  'noche',
  'quedar',
  'hambre',
  'optado',
  'comida',
  'ligero',
  'forma',
  'eficiente',
  'resistir',
  'tentación',
  'abrir',
  'nevera',
  'pasa

In [103]:
print(indexer.dictionary)

Dictionary(12362 unique tokens: ['abrir', 'aceptémoslo', 'actitud', 'acuerdo', 'adelgazamiento']...)


In [108]:
print(indexer.bow)
for doc in indexer.bow:
    print(doc)
    break

<__main__.MyCorpus object at 0x00000282537652B0>
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 2), (6, 1), (7, 1), (8, 1), (9, 3), (10, 1), (11, 3), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 2), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 3), (35, 9), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 2), (59, 3), (60, 1), (61, 1), (62, 2), (63, 1), (64, 1), (65, 2), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 2), (73, 3), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 3), (81, 1), (82, 1), (83, 2), (84, 1), (85, 1), (86, 4), (87, 1), (88, 1), (89, 1), (90, 2), (91, 2), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 2), (101, 1), (102, 3), (103, 1), (104, 1), (105, 1), 

# Cálculo de similitudes

In [193]:
class Searcher():
    
    def __init__(self, indexer):
        self.bow = indexer.bow
        self.dictionary = indexer.dictionary
        self.model = models.TfidfModel(self.bow)
        self.index = Similarity(None, corpus = indexer.bow, num_features = len(indexer.dictionary))
        self.pipeline_eng = LemmatizerTagger("es_dep_news_trf", Tokenizer(SymbolRemover()))
        self.pipeline_esp = LemmatizerTagger("en_core_web_trf", Tokenizer(SymbolRemover()))
        self.traductor = Translator()
        self.documents = indexer.repository
    
    def search(self, lang, query, k = 100, verbose = True):
        """
        En función del lenguaje, ejecuta una pipeline u otra y realiza la búsqueda.
        """
        if lang == "eng":
            pq = self.traductor.translate(self.pipeline_eng.execute(query))
        elif lang == "esp":
            pq = [token["lemma"] for token in self.pipeline_esp.execute(query)]
                        
        vq = self.dictionary.doc2bow(pq)
        qtfidf = self.model[vq]
        sim = self.index[qtfidf]
        ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
        if verbose:
            print(f"Query ==> {pq}")
            for doc, score in ranking[:5]:
                print("[ Score = " + "%.3f" % round(score,3) + " ] " + self.documents[doc][:k])

In [194]:
searcher = Searcher(indexer)

In [195]:
searcher.search("eng", "group")

Traduciendo a UNL...
	Query original = [{'lemma': 'group', 'pos': 'n'}]
	UWs encontradas = ['group(icl>abstraction>thing)', 'group(icl>set>thing)', 'group(icl>unit>thing)']
Traduciendo a ESP...
	UWs = ['group(icl>abstraction>thing)', 'group(icl>set>thing)', 'group(icl>unit>thing)']
		UW no encontrada en el diccionario de español: group(icl>set>thing)
		UW no encontrada en el diccionario de español: group(icl>unit>thing)
	Traducción al español = ['agrupación', 'colectivo']
Query ==> ['colectivo', 'agrupación']
[ Score = 0.113 ] Ala Comisión del 8-M ‘se la suda’ el impacto sanitario de las manifestaciones feministas.

A pesar d
[ Score = 0.046 ] El Real Madrid visita este jueves al CSKA Moscú en el primer partido sin Facundo Campazzo, que se ha
[ Score = 0.031 ] Unidas Podemos urge a los territorios que utilicen sus competencias para ampliar la regulación de la
[ Score = 0.027 ] UP, ERC y Bildu buscan mantener viva su enmienda contra los desahucios pese al rechazo del PSOE
La C
[ Score =