In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# import unidecode
# import contractions
import re
# from word2number import w2n
import numpy as np
import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Keras package
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Activation, LSTM, Lambda, Bidirectional
from keras.layers import concatenate
from keras.models import Model
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import GlobalAveragePooling1D
import keras.backend as K

In [2]:
## initialize global variables

MAX_SEQUENCE_LENGTH = 60  
MAX_NUM_WORDS = 200000  # There are about 201000 unique words in training dataset, 200000 is enough for tokenization
EMBEDDING_DIM = 300  # word-embedded-vector dimension(300 is for 'glove.42B.300d')
N_HIDDEN = 512
N_DENSE = 256

DROPOUT_RATE_LSTM = 0.10 # drop-out possibility, random set to avoid outfitting
DROPOUT_RATE_DENSE = 0.15

ACTIVE_FUNC = 'relu'
VERSION = 'bilstm3'

PATH_TO_GLOVE_FILE = './data/glove.42B.300d.txt'
# PATH_TO_GLOVE_FILE = './data/glove.840B.300d.txt'

model = None

In [3]:
print(f'Nodes Hidden: {N_HIDDEN}')
print(f'Nodes Dense: {N_DENSE}')

print(f'Dropout Rate LSTM: {DROPOUT_RATE_LSTM}')
print(f'Dropout Rate Dense: {DROPOUT_RATE_DENSE}')


Nodes Hidden: 512
Nodes Dense: 256
Dropout Rate LSTM: 0.1
Dropout Rate Dense: 0.15


In [4]:
# Create word embedding dictionary from 'glove.840B.300d.txt', {key:value} is {word: glove vector(300,)}
print('Create word embedding dictionary')

embeddings_index = {}
with open(PATH_TO_GLOVE_FILE) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
        
print("Found %s word vectors." % len(embeddings_index))

Create word embedding dictionary
Found 1917494 word vectors.


In [5]:
df = pd.read_csv('./data/questions.csv.zip')
df.head(10)
pair = df.iloc[11]
print(pair['question1'])
print(pair['question2'])
print(pair['is_duplicate'])
df.tail()

How do I read and find my YouTube comments?
How can I see all my Youtube comments?
1


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
404346,404346,789792,789793,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404347,404347,789794,789795,Do you believe there is life after death?,Is it true that there is life after death?,1
404348,404348,789796,789797,What is one coin?,What's this coin?,0
404349,404349,789798,789799,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0
404350,404350,789800,789801,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0


In [6]:
print(df[df['is_duplicate'] == 1].head())
print(f'\ncount: \n{df.count()}')
print(f"\nsum is_duplicate: {df['is_duplicate'].sum()}")

    id  qid1  qid2                                          question1  \
5    5    11    12  Astrology: I am a Capricorn Sun Cap moon and c...   
7    7    15    16                     How can I be a good geologist?   
11  11    23    24        How do I read and find my YouTube comments?   
12  12    25    26               What can make Physics easy to learn?   
13  13    27    28        What was your first sexual experience like?   

                                            question2  is_duplicate  
5   I'm a triple Capricorn (Sun, Moon and ascendan...             1  
7           What should I do to be a great geologist?             1  
11             How can I see all my Youtube comments?             1  
12            How can you make physics easy to learn?             1  
13             What was your first sexual experience?             1  

count: 
id              404351
qid1            404351
qid2            404351
question1       404350
question2       404349
is_duplicate    4

In [7]:
from sklearn.model_selection import train_test_split
y = df['is_duplicate']
# dft = df.drop('is_duplicate', axis=1)
dft = df
X_train, X_test, y_train, y_test = train_test_split(dft, y, test_size=0.1, random_state=42)



In [8]:
print(f"\nsum is_duplicate: {X_train['is_duplicate'].sum()}")
X_train.count()


sum is_duplicate: 134382


id              363915
qid1            363915
qid2            363915
question1       363914
question2       363913
is_duplicate    363915
dtype: int64

In [9]:
X_test.count()

id              40436
qid1            40436
qid2            40436
question1       40436
question2       40436
is_duplicate    40436
dtype: int64

In [10]:

train_q1 = X_train['question1'].values
train_q2 = X_train['question2'].values
train_labels = X_train['is_duplicate'].values

sample = X_test.sample(n=10000, random_state=42)
test_q1 = X_test['question1'].values
test_q2 = X_test['question2'].values
test_labels = X_test['is_duplicate'].values
test_ids = X_test['id'].values  # id..

In [11]:
# Preprocess text in dataset
print('Processing text dataset')

def text_to_wordlist(text):
    
    # split words
    text = str(text).split()
    
    text = " ".join(text)

    # Use re to clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text, re.IGNORECASE)
    text = re.sub(r"what's", "what is ", text, re.IGNORECASE)
    text = re.sub(r"\’s", " ", text, re.IGNORECASE)
    text = re.sub(r"\'s", " ", text, re.IGNORECASE)
    text = re.sub(r"\'ve", " have ", text, re.IGNORECASE)
    text = re.sub(r"can't", "cannot ", text, re.IGNORECASE)
    text = re.sub(r"n't", " not ", text, re.IGNORECASE)
    text = re.sub(r"i'm", "i am ", text, re.IGNORECASE)
    text = re.sub(r"\'re", " are ", text, re.IGNORECASE)
    text = re.sub(r"\'d", " would ", text, re.IGNORECASE)
    text = re.sub(r"\'ll", " will ", text, re.IGNORECASE)
    text = re.sub(r"\‘", " ", text, re.IGNORECASE)
    text = re.sub(r"\’", " ", text, re.IGNORECASE)
    text = re.sub(r"\"", " ", text, re.IGNORECASE)
    text = re.sub(r"\“", " ", text, re.IGNORECASE)
    text = re.sub(r"\”", " ", text, re.IGNORECASE)
    text = re.sub(r",", " ", text, re.IGNORECASE)
    text = re.sub(r"\.", " ", text, re.IGNORECASE)
    text = re.sub(r"!", " ! ", text, re.IGNORECASE)
    text = re.sub(r"\/", " ", text, re.IGNORECASE)
    text = re.sub(r"\^", " ^ ", text, re.IGNORECASE)
    text = re.sub(r"\+", " + ", text, re.IGNORECASE)
    text = re.sub(r"\-", " - ", text, re.IGNORECASE)
    text = re.sub(r"\=", " = ", text, re.IGNORECASE)
    text = re.sub(r"'", " ", text, re.IGNORECASE)
    text = re.sub(r":", " : ", text, re.IGNORECASE)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text, re.IGNORECASE)
    text = re.sub(r" e g ", " eg ", text, re.IGNORECASE)
    text = re.sub(r" b g ", " bg ", text, re.IGNORECASE)
    text = re.sub(r" u s ", " american ", text, re.IGNORECASE)
    text = re.sub(r" 9 11 ", "911", text, re.IGNORECASE)
    text = re.sub(r"e - mail", "email", text, re.IGNORECASE)
    text = re.sub(r"j k", "jk", text, re.IGNORECASE)
    text = re.sub(r"\s{2,}", " ", text, re.IGNORECASE)
    text = re.sub(r"\？", " ", text, re.IGNORECASE)
    
    # Return a list of words
    return(text)

train_text_q1 = [] # preprocessed text of q1
train_text_q2 = [] # preprocessed text of q2

text_set = set() # complete set of words for building embeddings

for text in train_q1:
    tt = text_to_wordlist(text)
    text_set.add(tt)
    train_text_q1.append(tt)
for text in train_q2:
    tt = text_to_wordlist(text)
    text_set.add(tt)
    train_text_q2.append(tt)

test_text_q1 = [] # preprocessed text of q1
test_text_q2 = [] # preprocessed text of q2

for text in test_q1:
    tt = text_to_wordlist(text)
    text_set.add(tt)
    test_text_q1.append(tt)
for text in test_q2:
    tt = text_to_wordlist(text)
    text_set.add(tt)
    test_text_q2.append(tt)

train_test_text = list(text_set)

# vectorizer = TextVectorization(max_tokens=MAX_NUM_WORDS, output_sequence_length=EMBEDDING_DIM)
# text_ds = tf.data.Dataset.from_tensor_slices(train_test_text).batch(128)
# vectorizer.adapt(text_ds)
# voc = vectorizer.get_vocabulary()
# word_index = dict(zip(voc, range(len(voc))))
# print(f'word_index len: {len(word_index)}')

Processing text dataset


In [12]:
# Keras.Tokenize for all text:
# First construct a Tokenizer()
# Then use tokenizer_on_texts() method to learn the dictionary of the corpus(all texts(sentences)). 
#    We can use .word_index to map between the each word (distinct) with the corresponding number.
# Then use text_to_sequence() method to transfer every text(sentence) in texts into sequences of word_indexes.
# Then add the same length by padding method: padding_sequences().
# Finally use the embedding layer in keras to carry out a vectorization, and input it into LSTM.

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_text_q1 + train_text_q2 + test_text_q1 + test_text_q2)  # generate a token dictionary, 

train_sequences_1 = tokenizer.texts_to_sequences(train_text_q1)  # sequence of q1
train_sequences_2 = tokenizer.texts_to_sequences(train_text_q2)  # sequence of q2
test_sequences_1 = tokenizer.texts_to_sequences(test_text_q1)  # sequence of q1_test
test_sequences_2 = tokenizer.texts_to_sequences(test_text_q2)  # sequence of q2_test

word_index = tokenizer.word_index
print('{} unique tokens are found'.format(len(word_index)))

# Pad all train with Max_Sequence_Length: 60
train_data_1 = pad_sequences(train_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)  # padded_sequence of q1 as train_data
train_data_2 = pad_sequences(train_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)  # padded_sequence of q2 as train_data
print('Shape of train data tensor:', train_data_1.shape)
print('Shape of train labels tensor:', train_labels.shape)

# Pad all test with Max_Sequence_Length
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)  # padded_sequence of q1_test as test_data
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)  # padded_sequence of q2_test as test_data
print('Shape of test data vtensor:', test_data_2.shape)
print('Shape of test ids tensor:', test_ids.shape)
      

86061 unique tokens are found
Shape of train data tensor: (363915, 60)
Shape of train labels tensor: (363915,)
Shape of test data vtensor: (40436, 60)
Shape of test ids tensor: (40436,)


In [13]:
from sklearn.metrics import roc_auc_score
from tensorflow.keras.layers import Dot

num_tokens = len(word_index) + 2
hits = 0
misses = 0
misses_txt = []

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # print(embedding_vector.shape)
    # if embedding_vector.shape[0] == 0:
    #    print(f'word: {word}')
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
        misses_txt.append(word)
print("Converted %d words (%d misses)" % (hits, misses))
# print(f'misses: {misses_txt}')

embedding_layer = Embedding(
    num_tokens,
    EMBEDDING_DIM,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

# BiLSTM layer
from tensorflow.keras.layers import Bidirectional, LSTM
lstm_layer = Bidirectional(LSTM(N_HIDDEN, dropout=DROPOUT_RATE_LSTM, recurrent_dropout=DROPOUT_RATE_LSTM))


# Define inputs
seq1 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
seq2 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Run inputs through embedding
emb1 = embedding_layer(seq1)
emb2 = embedding_layer(seq2)

# Run through LSTM layers
lstm_a = lstm_layer(emb1)
lstm_b = lstm_layer(emb2)

# cosin_sim_func = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([lstm_a, lstm_b])
dotted = Dot(axes=-1, normalize=True)([lstm_a, lstm_b])

l1_norm = lambda x: 1 - K.abs(x[0] - x[1])
l1_dist = Lambda(function=l1_norm, output_shape=lambda x: x[0], name='L1_distance')([lstm_a, lstm_b])

merged = concatenate([lstm_a, lstm_b, l1_dist, dotted])
merged = BatchNormalization()(merged)
merged = Dropout(DROPOUT_RATE_DENSE)(merged)

merged = Dense(N_DENSE, activation=ACTIVE_FUNC)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(DROPOUT_RATE_DENSE)(merged)

merged = Dense(N_DENSE, activation=ACTIVE_FUNC)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(DROPOUT_RATE_DENSE)(merged)

merged = Dense(N_DENSE, activation=ACTIVE_FUNC)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(DROPOUT_RATE_DENSE)(merged)

merged = Dense(N_DENSE, activation=ACTIVE_FUNC)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(DROPOUT_RATE_DENSE)(merged)

preds = Dense(1, activation='sigmoid')(merged)

Converted 73278 words (12783 misses)


2022-02-05 22:54:22.931221: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-05 22:54:22.942441: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-05 22:54:22.943032: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-05 22:54:22.944017: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [14]:
# Train the model
from sklearn.metrics import roc_auc_score

# def auroc(y_true, y_pred):
#     return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

bst_model_path = VERSION + '.h5' 

model = Model(inputs=[seq1, seq2], outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# Summerization of model
model.summary()



Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 60, 300)      25818900    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 1024)         3330048     embedding[0][0]              

In [15]:
# print(f'embedding misses: {misses_txt}')

print('Starting the model training')
# Set early stopping (large patience should be useful)
early_stopping =EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([train_data_1, train_data_2], train_labels, \
        validation_split=.1, \
        epochs=25, batch_size=128, shuffle=True, \
        callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path) # sotre model parameters in .h5 file
bst_val_score = min(hist.history['val_loss'])

Starting the model training
Epoch 1/25


2022-02-05 22:54:24.300012: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25


In [16]:

# save the model
model.save(VERSION)
tokenizer_json = tokenizer.to_json()
print(f'size of tokenizer json: {len(tokenizer_json)}')
with open(f'tokenizer.{VERSION}.json', 'w') as token_json:
    token_json.write(tokenizer_json)

2022-02-06 02:26:45.499811: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: bilstm3/assets
size of tokenizer json: 7969149


In [17]:
del model
model = tf.keras.models.load_model(VERSION) 
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 60, 300)      25818900    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 1024)         3330048     embedding[0][0]              

In [18]:


def tokenize_text(q1: str, q2: str, model_name: str):
    # tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    # tokenizer.fit_on_texts([q1, q2])  # generate a token dictionary
    
    with open(f'tokenizer.{model_name}.json', 'r') as f:
        token_json = f.read()
    tokenizer = keras.preprocessing.text.tokenizer_from_json(token_json)
    
    seq_q1 = tokenizer.texts_to_sequences([q1])
    seq_q2 = tokenizer.texts_to_sequences([q2])

    q1_data = pad_sequences(seq_q1, maxlen=MAX_SEQUENCE_LENGTH)
    q2_data = pad_sequences(seq_q2, maxlen=MAX_SEQUENCE_LENGTH)

    word_index = tokenizer.word_index

    return (q1_data, q2_data, word_index)

def clean_text(text):
    text = str(text).split()
    text = " ".join(text)

    # Use re to clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text, re.IGNORECASE)
    text = re.sub(r"what's", "what is ", text, re.IGNORECASE)
    text = re.sub(r"\’s", " ", text, re.IGNORECASE)
    text = re.sub(r"\'s", " ", text, re.IGNORECASE)
    text = re.sub(r"\'ve", " have ", text, re.IGNORECASE)
    text = re.sub(r"can't", "cannot ", text, re.IGNORECASE)
    text = re.sub(r"n't", " not ", text, re.IGNORECASE)
    text = re.sub(r"i'm", "i am ", text, re.IGNORECASE)
    text = re.sub(r"\'re", " are ", text, re.IGNORECASE)
    text = re.sub(r"\'d", " would ", text, re.IGNORECASE)
    text = re.sub(r"\'ll", " will ", text, re.IGNORECASE)
    text = re.sub(r"\‘", " ", text, re.IGNORECASE)
    text = re.sub(r"\’", " ", text, re.IGNORECASE)
    text = re.sub(r"\"", " ", text, re.IGNORECASE)
    text = re.sub(r"\“", " ", text, re.IGNORECASE)
    text = re.sub(r"\”", " ", text, re.IGNORECASE)
    text = re.sub(r",", " ", text, re.IGNORECASE)
    text = re.sub(r"\.", " ", text, re.IGNORECASE)
    text = re.sub(r"!", " ! ", text, re.IGNORECASE)
    text = re.sub(r"\/", " ", text, re.IGNORECASE)
    text = re.sub(r"\^", " ^ ", text, re.IGNORECASE)
    text = re.sub(r"\+", " + ", text, re.IGNORECASE)
    text = re.sub(r"\-", " - ", text, re.IGNORECASE)
    text = re.sub(r"\=", " = ", text, re.IGNORECASE)
    text = re.sub(r"'", " ", text, re.IGNORECASE)
    text = re.sub(r":", " : ", text, re.IGNORECASE)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text, re.IGNORECASE)
    text = re.sub(r" e g ", " eg ", text, re.IGNORECASE)
    text = re.sub(r" b g ", " bg ", text, re.IGNORECASE)
    text = re.sub(r" u s ", " american ", text, re.IGNORECASE)
    text = re.sub(r" 9 11 ", "911", text, re.IGNORECASE)
    text = re.sub(r"e - mail", "email", text, re.IGNORECASE)
    text = re.sub(r"j k", "jk", text, re.IGNORECASE)
    text = re.sub(r"\s{2,}", " ", text, re.IGNORECASE)
    text = re.sub(r"\？", " ", text, re.IGNORECASE)

    return text

def get_model(model_name: str):
    global model
    if model is None:
        model = tf.keras.models.load_model(model_name) 
    
    return model

def predict(q1: str, q2: str, model_name: str):
    q1 = clean_text(q1)
    q2 = clean_text(q2)

    q1_data, q2_data, word_index = tokenize_text(q1, q2, model_name)
    
    model = get_model(model_name)
    predictions = model([q1_data, q2_data])
    print(f'q1: {q1}')
    print(f'q2: {q2}')
    print(f'same probabilty: {predictions}\n' + '-' * 20)
    


In [19]:
model_name = 'bilstm5'
model = tf.keras.models.load_model(model_name) 

q1 = 'What can make Physics easy to learn?'
q2 = 'How can you make physics easy to learn?'
predict(q1, q2, model_name)
# embed = get_word_embeddings()

q1 = 'What should I do to be a great geologist?'
q2 = 'How can I be a good geologist?'
predict(q1, q2, model_name)

# Do you believe there is life after death? 	Is it true that there is life after death? 	1
q1 = 'Do you believe there is life after death?'
q2 = 'Is it true that there is life after death?'
predict(q1, q2, model_name)

# How do I read and find my YouTube comments? How can I see all my Youtube comments? 1
q1 = 'How do I read and find my YouTube comments?'
q2 = 'How can I see all my Youtube comments?'
predict(q1, q2, model_name)

q1 = 'How do I see the color blue?'
q2 = 'How do I see the color blue?'
predict(q1, q2, model_name)

 # What is one coin? 	What's this coin? 	0
q1 = 'What is one coin?'
q2 = 'What\'s this coin?'
predict(q1, q2, model_name)

# Why do girls want to be friends with the guy they reject? How do guys feel after rejecting a girl? 0
q1 = 'Why do girls want to be friends with the guy they reject?'
q2 = 'How do guys feel after rejecting a girl?'
predict(q1, q2, model_name)

q1: What can make Physics easy to learn?
q2: How can you make physics easy to learn?
same probabilty: [[0.8847427]]
--------------------
q1: What should I do to be a great geologist?
q2: How can I be a good geologist?
same probabilty: [[0.76968783]]
--------------------
q1: Do you believe there is life after death?
q2: Is it true that there is life after death?
same probabilty: [[0.9720581]]
--------------------
q1: How do I read and find my YouTube comments?
q2: How can I see all my Youtube comments?
same probabilty: [[0.8024525]]
--------------------
q1: How do I see the color blue?
q2: How do I see the color blue?
same probabilty: [[0.9968671]]
--------------------
q1: What is one coin?
q2: What this coin?
same probabilty: [[0.16731362]]
--------------------
q1: Why do girls want to be friends with the guy they reject?
q2: How do guys feel after rejecting a girl?
same probabilty: [[7.3990814e-05]]
--------------------


In [20]:
model_name = 'bilstm3'
model = tf.keras.models.load_model(model_name) 

q1 = 'What can make Physics easy to learn?'
q2 = 'How can you make physics easy to learn?'
predict(q1, q2, model_name)
# embed = get_word_embeddings()

q1 = 'What should I do to be a great geologist?'
q2 = 'How can I be a good geologist?'
predict(q1, q2, model_name)

# Do you believe there is life after death? 	Is it true that there is life after death? 	1
q1 = 'Do you believe there is life after death?'
q2 = 'Is it true that there is life after death?'
predict(q1, q2, model_name)

# How do I read and find my YouTube comments? How can I see all my Youtube comments? 1
q1 = 'How do I read and find my YouTube comments?'
q2 = 'How can I see all my Youtube comments?'
predict(q1, q2, model_name)

q1 = 'How do I see the color blue?'
q2 = 'How do I see the color blue?'
predict(q1, q2, model_name)

 # What is one coin? 	What's this coin? 	0
q1 = 'What is one coin?'
q2 = 'What\'s this coin?'
predict(q1, q2, model_name)

# Why do girls want to be friends with the guy they reject? How do guys feel after rejecting a girl? 0
q1 = 'Why do girls want to be friends with the guy they reject?'
q2 = 'How do guys feel after rejecting a girl?'
predict(q1, q2, model_name)

q1: What can make Physics easy to learn?
q2: How can you make physics easy to learn?
same probabilty: [[0.843715]]
--------------------
q1: What should I do to be a great geologist?
q2: How can I be a good geologist?
same probabilty: [[0.84447056]]
--------------------
q1: Do you believe there is life after death?
q2: Is it true that there is life after death?
same probabilty: [[0.9288534]]
--------------------
q1: How do I read and find my YouTube comments?
q2: How can I see all my Youtube comments?
same probabilty: [[0.88565695]]
--------------------
q1: How do I see the color blue?
q2: How do I see the color blue?
same probabilty: [[0.92360485]]
--------------------
q1: What is one coin?
q2: What this coin?
same probabilty: [[0.24006581]]
--------------------
q1: Why do girls want to be friends with the guy they reject?
q2: How do guys feel after rejecting a girl?
same probabilty: [[0.00982438]]
--------------------


In [21]:
model_name = 'bilstm10'
model = tf.keras.models.load_model(model_name) 

q1 = 'What can make Physics easy to learn?'
q2 = 'How can you make physics easy to learn?'
predict(q1, q2, model_name)
# embed = get_word_embeddings()

q1 = 'What should I do to be a great geologist?'
q2 = 'How can I be a good geologist?'
predict(q1, q2, model_name)

# Do you believe there is life after death? 	Is it true that there is life after death? 	1
q1 = 'Do you believe there is life after death?'
q2 = 'Is it true that there is life after death?'
predict(q1, q2, model_name)

# How do I read and find my YouTube comments? How can I see all my Youtube comments? 1
q1 = 'How do I read and find my YouTube comments?'
q2 = 'How can I see all my Youtube comments?'
predict(q1, q2, model_name)

q1 = 'How do I see the color blue?'
q2 = 'How do I see the color blue?'
predict(q1, q2, model_name)

 # What is one coin? 	What's this coin? 	0
q1 = 'What is one coin?'
q2 = 'What\'s this coin?'
predict(q1, q2, model_name)

# Why do girls want to be friends with the guy they reject? How do guys feel after rejecting a girl? 0
q1 = 'Why do girls want to be friends with the guy they reject?'
q2 = 'How do guys feel after rejecting a girl?'
predict(q1, q2, model_name)

q1: What can make Physics easy to learn?
q2: How can you make physics easy to learn?
same probabilty: [[0.9345015]]
--------------------
q1: What should I do to be a great geologist?
q2: How can I be a good geologist?
same probabilty: [[0.8976296]]
--------------------
q1: Do you believe there is life after death?
q2: Is it true that there is life after death?
same probabilty: [[0.9052738]]
--------------------
q1: How do I read and find my YouTube comments?
q2: How can I see all my Youtube comments?
same probabilty: [[0.9159307]]
--------------------
q1: How do I see the color blue?
q2: How do I see the color blue?
same probabilty: [[0.9665545]]
--------------------
q1: What is one coin?
q2: What this coin?
same probabilty: [[0.07237303]]
--------------------
q1: Why do girls want to be friends with the guy they reject?
q2: How do guys feel after rejecting a girl?
same probabilty: [[0.00618028]]
--------------------
