## SentenceBERT

Install with `pip`

In [None]:
!pip install sentence_transformers

## Data

This notebook uses data from [Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs).

### Read data
The data is stored in Google Drive:

#### Connect to google drive

In [None]:
from google.colab import drive
# Mount the user's Google Drive at /gdrive; the cells below read the dataset from there.
drive.mount('/gdrive')

#### List the files

In [None]:
# List the training archive at the Drive path that the loading cell reads from.
%ls /gdrive/MyDrive/Colab\ Notebooks/data/quora/input/train.csv.zip

In [None]:
import numpy as np
import pandas as pd
import os

# Print the full path of every file found under the data directory on the mounted drive.
all_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/gdrive/MyDrive/Colab Notebooks/data')
    for name in names
)
for file_path in all_file_paths:
    print(file_path)

In [None]:
# Read the zipped, comma-separated training set straight from the mounted drive
# and preview the first rows.
df = pd.read_csv(
    '/gdrive/MyDrive/Colab Notebooks/data/quora/input/train.csv.zip',
    sep=',',
    compression='zip',
)
df.head()

In [None]:
# Distinct raw values of the `question1` column, in order of first appearance.
question1_column = df['question1']
question1 = question1_column.unique()
question1

### Clean data

- Lowercase original sentences
- Remove stop words, punctuation and non-ASCII characters
- Replace common paraphrases with a single canonical phrase (e.g. "how can we" becomes "how do i")

In [None]:
import re

# Tokens dropped by `cleantext`. Entries containing an apostrophe can never match,
# because apostrophes are turned into spaces before tokenisation.
stopwords = set([
    'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are',
    'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both',
    'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn',
    "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had',
    'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here',
    'hers', 'herself', 'him', 'himself', 'his', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", "it's",
    'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn',
    "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on',
    'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same',
    'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such',
    't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there',
    'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
    'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'which', 'while', 'will', 'with', 'won', "won't",
    'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
    'yourselves',
])

# Matches any single character outside the 7-bit ASCII range.
_NON_ASCII = re.compile(r'[^\x00-\x7f]')

# Ordered (phrase, replacement) pairs. Order matters: "how can we"/"how can you"
# are first rewritten to "how can i", which a later entry turns into "how do i".
_PARAPHRASES = (
    ("how do you", "how do i"),
    ("how do we", "how do i"),
    ("how can we", "how can i"),
    ("how can you", "how can i"),
    ("how can i", "how do i"),
    ("really true", "true"),
    ("what are the importance", "what is the importance"),
    ("what was", "what is"),
    ("so many", "many"),
    ("would it take", "will it take"),
)

# Punctuation characters that are replaced by a space.
_PUNCTUATION = [",", "!", ".", "?", "'", '"', ":", ";", "[", "]", "{", "}", "<", ">"]


def cleantext(sent):
    """Normalise a question before it is embedded.

    Steps, in order: replace every non-ASCII character with a space, lowercase,
    rewrite a few common paraphrases to one canonical phrase, replace punctuation
    with spaces, and drop stop words.

    Args:
        sent: The raw sentence. Non-string values are converted with ``str()``
            first (so a missing value such as NaN is cleaned as the text "nan").

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    # str.replace() only matches literal substrings, so the character class has
    # to go through the regex engine to actually strip non-ASCII characters.
    sent = _NON_ASCII.sub(' ', str(sent))

    # Replace some common paraphrases
    sent_norm = sent.lower()
    for phrase, replacement in _PARAPHRASES:
        sent_norm = sent_norm.replace(phrase, replacement)

    # Remove any punctuation characters
    for c in _PUNCTUATION:
        sent_norm = sent_norm.replace(c, " ")

    # Remove stop words
    return " ".join(token for token in sent_norm.split() if token not in stopwords)

cleantext('What is the approx annual cost of living while studying in UIC Chicago, for an Indian student?')

Then replace the raw data with the cleaned data: each `question` becomes `cleantext(question)`.

In [None]:
# Clean every distinct `question1` string; the result is a NumPy array of cleaned sentences.
question1 = np.array([cleantext(raw_question) for raw_question in df['question1'].unique()])
question1

In [None]:
# Clean every distinct `question2` string; the result is a NumPy array of cleaned sentences.
question2 = np.array([cleantext(raw_question) for raw_question in df['question2'].unique()])
question2

## Models
### Create the embeddings

In [None]:
from sentence_transformers import SentenceTransformer, util
from time import perf_counter


# Bi-encoder used to embed both question sets (and, later, the search queries).
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Encode question1 first, then question2, and time both calls together.
startTime = perf_counter()
embeddings1, embeddings2 = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (question1, question2)
)
endTime = perf_counter()

elapsed_seconds = endTime - startTime
print("Computed sentence embeddings in {:.4f} seconds".format(elapsed_seconds))

#### Experiments
Create a simple query and search for the top 5 results

In [None]:
from time import perf_counter
import torch

queries = ['What is the approx annual cost of living while studying in UIC Chicago, for an Indian student?'] # example from question1

# Never ask for more hits than there are candidate embeddings.
top_5 = min(5, len(embeddings2))

time_t1 = perf_counter()
for query in queries:
    # Embed the cleaned query and score it against every question2 embedding.
    query_embedding = model.encode(cleantext(query), convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
    top_results = torch.topk(cos_scores, k=top_5)
    best_scores, best_indices = top_results[0], top_results[1]

    print("### Query:", query)
    print("Top 5 most similar queries:")
    for rank in range(len(best_scores)):
        print("({:.4f})".format(best_scores[rank]), question2[best_indices[rank]])

time_t2 = perf_counter()
elapsed_text = "{:.4f}".format(time_t2 - time_t1)
print("Compute consine-similarity in", elapsed_text, "seconds")

Take the top 100 results from the bi-encoder and re-rank them with a cross-encoder

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
from time import perf_counter
import torch

query = 'What is the approx annual cost of living while studying in UIC Chicago, for an Indian student?' # example from question1

# Never ask for more candidates than there are embeddings.
top_100 = min(100, len(embeddings2))

# Build the re-ranker before any timer starts, so that constructing (and possibly
# downloading) the model is not counted as cross-encoder scoring time.
cross_encoder = CrossEncoder('cross-encoder/stsb-distilroberta-base')

# Stage 1: bi-encoder retrieval of the top-100 candidates from question2.
time_t1 = perf_counter()
query_embedding = model.encode(cleantext(query), convert_to_tensor=True)
cos_scores = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
top_results = torch.topk(cos_scores, k=top_100) # select top 100

# Convert the index tensor to plain Python ints before indexing the sentence array.
top_sentences = [question2[idx] for idx in top_results[1].tolist()] # extract top 100 sentences

# Stage 2: score each (query, candidate) pair with the cross-encoder.
# NOTE(review): the query is passed raw while the candidates are cleaned text — confirm this is intended.
time_t2 = perf_counter()
sentence_combinations = [[query, sentence] for sentence in top_sentences]
similarity_scores = cross_encoder.predict(sentence_combinations)
sim_scores = np.argsort(similarity_scores)[::-1] # candidate positions, best score first

print("### Query:", query)
print("Top 5 most similar queries:")
for idx in sim_scores[:5]:
    print("({:.4f}) {}".format(similarity_scores[idx], top_sentences[idx]))

time_t3 = perf_counter()
print("Compute bi-encoder in","{:.4f}".format(time_t2 - time_t1),"seconds")
print("Compute cross-encoder from top 100 in","{:.4f}".format(time_t3 - time_t2),"seconds")
print("Total time: ", "{:.4f}".format(time_t3 - time_t1), "seconds")

#### Note and TODO
We cannot compute the similarity for all sentence pairs across both sets at once (there is not enough memory for 230 TB =)), so:
- we can apply the model to the pairs one by one
- use a sigmoid function: a threshold on the similarity scores to mark a pair of questions as similar or not
    - linear regression to select the proper threshold
- calculate the accuracy

### Export and import the model

Export the model data (the sentences and their embeddings) to files. The files can be used to restore it later.

In [None]:
import pickle

# Store sentences & embeddings on disc, one pickle file per question set.
export_targets = (
    ('question1.pkl', question1, embeddings1),
    ('question2.pkl', question2, embeddings2),
)
for pickle_path, sentences, embeddings in export_targets:
    with open(pickle_path, "wb") as fOut:
        payload = {'sentences': sentences, 'embeddings': embeddings}
        pickle.dump(payload, fOut, protocol=pickle.HIGHEST_PROTOCOL)

Import the model from file. In our case, Kaggle generates the model, and we then use the pre-trained model to create the search engine.

In [None]:
# Load sentences & embeddings from disc.
# Only unpickle files written by this notebook: pickle.load can execute arbitrary code.
with open('question1.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
question1, embeddings1 = stored_data['sentences'], stored_data['embeddings']

with open('question2.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
question2, embeddings2 = stored_data['sentences'], stored_data['embeddings']