# Multilingual Search Engine

### Imports

In [1]:
# Data Manipulation
import pandas as pd

# Deep Learning Libraries
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text

# NLP Libraries
from flair.embeddings import BertEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence

from abc import ABCMeta, abstractmethod

import faiss

from tqdm import tqdm

### Data Load

Data must be downloaded from Kaggle competition: <br>
https://www.kaggle.com/c/quora-question-pairs/data

In [2]:
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# We remove missing values
data.dropna(inplace=True)

In [5]:
# Base Encoder
class TFEncoder(metaclass=ABCMeta):
    def __init__(self, model_path:str):
        self.model = hub.load(model_path)
    
    # Encodes text
    # The text should be a list of strings to encode
    #@abstractmethod
    #def encode(self, text:list):


In [6]:
# Universal sentence encoder
class USE(TFEncoder):
    def __init__(self, model_path):
        super().__init__(model_path)
        
    def encode(self, text):
        return self.model(text).numpy()

In [7]:
# Universal sentence encoder trained on Question Answer pairs
class USEQA(TFEncoder):
    def __init__(self, model_path):
        super().__init__(model_path)
        
    def encode(self, text):
        return self.model.signatures['question_encoder'](tf.constant(s))['outputs'].numpy()

In [8]:
# BERT models
class BERT():
    def __init__(self, model_name, layers="-2", pooling_operation="mean"):
        self.embeddings = BertEmbeddings(model_name, 
                                         layers=layers,
                                         pooling_operation=pooling_operation)

        self.document_embeddings = DocumentPoolEmbeddings([self.embeddings], fine_tune_mode='nonlinear')
        
    def encode(self, text):
        sentence = Sentence(text)
        self.document_embeddings.embed(sentence)
        return sentence.embedding.detach().numpy().reshape(1, -1)

In [9]:
# model_path = 'https://tfhub.dev/google/universal-sentence-encoder-qa/3'
# model_path = '../../models/universal-sentence-encoder-qa3/'

# https://arxiv.org/pdf/1803.11175.pdf
# model_path = '../../models/universal-sentence-encoder-large5/' #best for english

model_path = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
# model_path = '../../models/universal-sentence-encoder-multilingual-large3/'

# encoder = BERT('bert-base-uncased')
encoder = USE(model_path)

INFO:absl:Using /var/folders/0s/skg4xy3d4_z6br1c9rlxqpg00000gn/T/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3: 90.03MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3: 190.03MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3: 290.03MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3, Total size: 334.32MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'.


In [10]:
encoder.encode(['hello']).shape

(1, 512)

In [11]:
d = encoder.encode(['hello']).shape[-1]
d

512

#### Faiss Class

In [12]:
class FAISS:
    def __init__(self, dimensions:int):
        self.dimensions = dimensions
        self.index = faiss.IndexFlatL2(dimensions)
        self.vectors = {}
        self.counter = 0
    
    def add(self, text:str, v:list):
        self.index.add(v)
        self.vectors[self.counter] = (text, v)
        self.counter += 1
        
    def search(self, v:list, k:int=10):
        distance, item_index = self.index.search(v, k)
        for dist, i in zip(distance[0], item_index[0]):
            if i==-1:
                break
            else:
                print(f'{self.vectors[i][0]}, %.2f'%dist)

#### Vector Search Test

In [13]:
index = FAISS(d)

# index word
t1 = 'hello'
v1 = encoder.encode([t1])
index.add(t1, v1)

# index word
t1 = 'bye'
v1 = encoder.encode([t1])
index.add(t1, v1)

# search similar word
t1 = 'hi'
v1 = encoder.encode([t1])
print('word,  distance')
index.search(v1)

word,  distance
hello, 0.07
bye, 0.83


#### Generate Embeddings and Index all questions

In [15]:
index = FAISS(d)

for q in tqdm(data):
    emb = encoder.encode([q])
    index.add(q, emb)

  0%|          | 6/404287 [00:00<5:48:09, 19.35it/s]


In [16]:
def search(s, k=10):
    emb = encoder.encode([s])
    index.search(emb, k)

#### Search Examples

In [17]:
search('how to lose weight?')

is_duplicate, 1.76
id, 1.80
question2, 1.82
question1, 1.83
qid2, 1.93
qid1, 1.95


In [18]:
print('German')
search('wie man Gewicht verliert?')

German
is_duplicate, 1.84
id, 1.89
question2, 1.90
question1, 1.90
qid2, 1.96
qid1, 1.97


In [19]:
print('Chinese')
search('如何减肥?')

Chinese
id, 1.76
is_duplicate, 1.76
question2, 1.83
question1, 1.85
qid2, 1.92
qid1, 1.92


In [20]:
print('Japanese')
search('体重を減らす方法は？')

Japanese
is_duplicate, 1.78
id, 1.78
question1, 1.89
question2, 1.89
qid1, 1.92
qid2, 1.92
