## Import model

In [1]:
import pickle
import torch
import io
import xgboost as xgb


# Stock pickle gives no hook for remapping torch storages while loading, so
# this unpickler intercepts the torch storage loader and forces the CPU.
class CpuUnpickler(pickle.Unpickler):
    """Unpickler that loads pickled torch storages with ``map_location='cpu'``."""

    def find_class(self, module, name):
        """Resolve ``module.name`` for the unpickler.

        Every lookup is delegated to ``pickle.Unpickler.find_class`` except
        ``torch.storage._load_from_bytes``, which is replaced by a callable
        that feeds the raw bytes to ``torch.load(..., map_location='cpu')``.
        """
        if (module, name) != ('torch.storage', '_load_from_bytes'):
            return super().find_class(module, name)

        def load_on_cpu(raw):
            # Wrap the bytes in a file-like object, as torch.load expects.
            return torch.load(io.BytesIO(raw), map_location='cpu')

        return load_on_cpu

# Read the pickled bundle through CpuUnpickler.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open('xgboost_pair.pkl', "rb") as fIn:
    stored_data = CpuUnpickler(fIn).load()

# The bundle is indexed by key: the trained booster and its parameters.
bst = stored_data['bst']
params = stored_data['params']
    





## Initialize functions

In [5]:
from collections import Counter
from nltk.corpus import stopwords
import pandas as pd
import numpy  as np
# NLTK's English stop-word list, held as a set for membership tests.
stops = set(stopwords.words("english"))

# Load the training CSV; the path is relative to the working directory.
df_train = pd.read_csv('../input/train.csv.zip')
# The return value of head() is not used.
df_train.head()
# All question1 values followed by all question2 values, cast to str.
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)

def get_weight(count, eps=10000, min_count=2):
    """Return a smoothed inverse-count weight for a word.

    Args:
        count: Number of occurrences of the word.
        eps: Smoothing constant added to ``count`` in the denominator.
        min_count: Counts strictly below this value get a weight of 0.

    Returns:
        ``0`` when ``count < min_count``, otherwise ``1 / (count + eps)``.
    """
    return 0 if count < min_count else 1 / (count + eps)

    
# NOTE(review): `eps` is assigned here but never passed to get_weight(), so the
# weights below are computed with get_weight's default eps=10000 - confirm
# which value is intended before changing either one.
eps = 5000 
# Lower-cased, whitespace-split tokens of every training question.
words = (" ".join(train_qs)).lower().split()
# Occurrences of each token.
counts = Counter(words)
# Token -> weight lookup read by tfidf_word_match_share().
weights = {word: get_weight(count) for word, count in counts.items()}

    
def word_match_share(q1, q2):
    """Return the share of non-stop-word tokens that two questions have in common.

    Both inputs are converted with ``str()``, lower-cased and split on
    whitespace; tokens found in ``stops`` are ignored and duplicates count
    once. The result is twice the number of common tokens divided by the
    combined number of distinct tokens.

    Returns:
        ``0`` if either question has no tokens left after stop-word removal,
        otherwise a float between 0 and 1.
    """
    tokens1 = {tok for tok in str(q1).lower().split() if tok not in stops}
    tokens2 = {tok for tok in str(q2).lower().split() if tok not in stops}
    if not tokens1 or not tokens2:
        return 0
    # Each common token is counted once per question, hence the doubling.
    overlap = len(tokens1 & tokens2)
    return (overlap + overlap) / (len(tokens1) + len(tokens2))


def tfidf_word_match_share(q1, q2):
    """Return the weighted share of non-stop-word tokens common to two questions.

    Tokens are produced as in ``word_match_share`` (``str()``, lower-case,
    whitespace split, stop words dropped, duplicates collapsed). Each token
    contributes ``weights.get(token, 0)``; the result is the summed weight of
    the common tokens (counted once per question) divided by the summed weight
    of all tokens of both questions.

    Returns:
        ``0`` if either question has no tokens left, otherwise the ratio
        computed with ``np.sum``.
    """
    # Insertion-ordered dicts keep the summation order stable.
    first = dict.fromkeys(tok for tok in str(q1).lower().split() if tok not in stops)
    second = dict.fromkeys(tok for tok in str(q2).lower().split() if tok not in stops)
    if not first or not second:
        return 0

    matched = [weights.get(tok, 0) for tok in first if tok in second]
    matched += [weights.get(tok, 0) for tok in second if tok in first]

    everything = [weights.get(tok, 0) for tok in first]
    everything += [weights.get(tok, 0) for tok in second]

    # NOTE(review): when every token has weight 0 this is a 0/0 NumPy division
    # (expected to yield NaN) - confirm that downstream handling is intended.
    return np.sum(matched) / np.sum(everything)


from sentence_transformers import SentenceTransformer, util
# Sentence encoder used by cosine_sim(); created once at module level.
sbert_model = SentenceTransformer('paraphrase-distilroberta-base-v1')


def cosine_sim(q1, q2):
    """Return the cosine similarity of the two questions' sentence embeddings.

    Each question is passed through ``clean_text`` and encoded on its own
    with ``sbert_model``; the single entry of the resulting score matrix is
    returned as a Python number.
    """
    first_vec = sbert_model.encode([clean_text(q1)], convert_to_tensor=True)
    second_vec = sbert_model.encode([clean_text(q2)], convert_to_tensor=True)
    score_matrix = util.pytorch_cos_sim(first_vec, second_vec)
    # One sentence on each side, so the only score is at [0][0].
    return score_matrix[0][0].item()


from sentence_transformers import CrossEncoder
# Cross-encoder used by cross_sim(); created once at module level.
cross_encoder = CrossEncoder('cross-encoder/quora-distilroberta-base')


def cross_sim(q1, q2):
    """Return the cross-encoder score for a pair of questions.

    Both questions are passed through ``clean_text`` and scored together as
    one pair; the single prediction is returned as a Python number.
    """
    pair = [clean_text(q1), clean_text(q2)]
    # predict() receives a list of pairs; there is only this one.
    return cross_encoder.predict([pair])[0].item()


from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# Rebuilds the same English stop-word set already assigned to `stops` earlier
# in this file.
stops = set(stopwords.words("english"))
# Lemmatizer instance used by clean_text().
wnl = WordNetLemmatizer()

def clean_text(text):
    """Normalise a question string before it is handed to the encoders.

    The input is converted with ``str()`` and lower-cased. Every non-ASCII
    character and every character in the punctuation list is replaced by a
    space. The text is then split on single spaces, each token is lemmatized
    with ``wnl``, tokens found in ``stops`` are dropped, and the remainder is
    joined with single spaces.

    Args:
        text: Any object; it is converted with ``str()``.

    Returns:
        The cleaned string.
    """
    punctuation = set(',!.?":;[]{}<>')
    # The previous code called str.replace() with the regex-looking pattern
    # '[^\x00-\x7f]'. str.replace() matches literal substrings only, so
    # non-ASCII characters were never removed. They are now blanked out here.
    # NOTE(review): this changes the cleaned text for non-ASCII input - confirm
    # against whatever preprocessing the downstream model was fitted with.
    text = "".join(
        " " if ord(ch) > 0x7f or ch in punctuation else ch
        for ch in str(text).lower()
    )
    # Splitting on a single space (not arbitrary whitespace) is kept as-is, so
    # runs of spaces yield empty tokens that survive into the joined result.
    tokens = [wnl.lemmatize(word) for word in text.split(" ")]
    return " ".join(word for word in tokens if word not in stops)

## Example query

In [10]:
from time import perf_counter

# Time the whole request: feature extraction plus booster prediction.
t0 = perf_counter()

q1 = 'How are falling prices of food articles balanced by an equitable increase in the farmer\'s income in an economy?'
q2 = 'How is the falling of prices of food articles balanced by an equitable increase in the farmer\'s income in an economy?'

# One-row feature frame; keys are listed in the order the columns must appear.
# NOTE(review): presumably this order has to match the features the booster
# was trained with - confirm before reordering.
x_test = pd.DataFrame({
    'word_match': [word_match_share(q1, q2)],
    'tfidf_word_match': [tfidf_word_match_share(q1, q2)],
    'cross_sim': [cross_sim(q1, q2)],
    'cosine_sim': [cosine_sim(q1, q2)],
})
d_test = xgb.DMatrix(x_test)

y_est = bst.predict(d_test)
t1 = perf_counter()

print(y_est)
print("Took {:.4f} seconds".format(t1-t0))

[0.84312904]
Took 0.1903 seconds
