In [1]:
from Models import *
from Utils import *
from FastModels import *
import warnings
from livelossplot import PlotLossesKeras
# warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
from gensim.models import KeyedVectors
import sentencepiece as spm

max_len = 20
enablePadding = False

sp = spm.SentencePieceProcessor()
sp.Load('/work/data/bpe/en.wiki.bpe.op50000.model')
bpe = KeyedVectors.load_word2vec_format("/work/data/bpe/en.wiki.bpe.op50000.d200.w2v.bin", binary=True)
bpe.index2word = [''] + bpe.index2word + ['<sos>']  # add empty string 
nb_words = len(bpe.index2word)
# word2index
bpe_dict = {bpe.index2word[i]: i for i in range(len(bpe.index2word))}
# construct embedding_matrix
embedding_matrix = np.concatenate([np.zeros((1, bpe.vector_size)), bpe.vectors, np.zeros((1, bpe.vector_size))]) # add zero vector for empty string (i.e. used for padding)

embedding_layer = Embedding(nb_words,
                    embedding_matrix.shape[-1],
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=True)

In [3]:
enablePadding = True
df_may, qrel_may = get_test_data("MayFlower", "/work/")
df_june, qrel_june = get_test_data("JuneFlower", "/work/")
df_july, qrel_july = get_test_data("JulyFlower", "/work/")

q_may = parse_texts_bpe(df_may.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_may = parse_texts_bpe(df_may.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_june = parse_texts_bpe(df_june.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_june = parse_texts_bpe(df_june.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_july = parse_texts_bpe(df_july.q.tolist(), sp, bpe_dict, max_len, enablePadding)
d_july = parse_texts_bpe(df_july.d.tolist(), sp, bpe_dict, max_len, enablePadding)

q_july_ = parse_texts_bpe(df_july.q.tolist(), sp, bpe_dict, max_len, enablePadding, "post")
d_july_ = parse_texts_bpe(df_july.d.tolist(), sp, bpe_dict, max_len, enablePadding, "post")

test_set = [[q_may, d_may, qrel_may, df_may, "MayFlower"], [q_june, d_june, qrel_june, df_june, "JuneFlower"], [q_july, d_july, qrel_july, df_july, "JulyFlower"]]


b'Skipping line 19898: expected 6 fields, saw 8\nSkipping line 20620: expected 6 fields, saw 8\nSkipping line 38039: expected 6 fields, saw 8\n'


In [4]:
def toBOW(x, nb_words):
    x_ = np.zeros((len(x), nb_words))
    for idx, i in enumerate(x):
        x_[idx][i] = 1
    return x_

In [5]:
q_bow_may = toBOW(q_may, nb_words)
d_bow_may = toBOW(d_may, nb_words)

q_bow_june = toBOW(q_june, nb_words)
d_bow_june = toBOW(d_june, nb_words)

q_bow_july = toBOW(q_july, nb_words)
d_bow_july = toBOW(d_july, nb_words)



In [6]:
from scipy.stats import wasserstein_distance
def eval_july(run, q_july, d_july, mode="cos"):
    q_ = run.predict(q_july)
    d_ = run.predict(d_july)
    
    if mode == "cos":
        cosine = CosineSim(q_.shape[-1])
        pred = cosine.model.predict([q_, d_])        
    else:
        pred = []
        for i in range(len(q_)):
            pred.append(wasserstein_distance(q_[i], d_[i]))
        pred = np.array(pred)

    print(auc(qrel_july, pred.flatten()))

In [7]:
class BOW_VAE(object):
    

    def __init__(self, input_size, max_len, embedding_matrix, dim, comp_topk=None, ctype=None, epsilon_std=1.0, optimizer=Adadelta(lr=2.), kl_weight=0, PoolMode="max"):
        self.input_size = input_size
        self.dim = dim
        self.embedding_matrix = embedding_matrix
        self.comp_topk = comp_topk
        self.ctype = ctype
        self.epsilon_std = epsilon_std
        self.max_len = max_len
        self.nb_words = input_size
        self.optimizer = optimizer
        self.kl_weight = kl_weight
        self.enableKL = True if kl_weight == 0 else False
        self.PoolMode = PoolMode
        self.build()

    def build(self):
        act = 'tanh'
        input_layer = Input(shape=(self.max_len,))
        emb_layer = Embedding(self.nb_words,
                            self.embedding_matrix.shape[-1],
                            weights=[self.embedding_matrix],
                            input_length=self.max_len,
                            trainable=True)

        hidden_layer1 = Dense(self.dim[0], kernel_initializer='glorot_normal', activation=act)
        
        Pooling = GlobalMaxPooling1D() if self.PoolMode == "max" else GlobalAveragePooling1D()

        h1 = Pooling(emb_layer(input_layer))
        
#         h1 = hidden_layer1(input_layer)

        self.z_mean = Dense(self.dim[1], kernel_initializer='glorot_normal')(h1)
        self.z_log_var = Dense(self.dim[1], kernel_initializer='glorot_normal')(h1)

        if self.comp_topk != None:
            self.z_mean_k = KCompetitive(self.comp_topk, self.ctype)(self.z_mean)
            encoded = Lambda(self.sampling, output_shape=(self.dim[1],))([self.z_mean_k, self.z_log_var])
        else:
            encoded = Lambda(self.sampling, output_shape=(self.dim[1],))([self.z_mean, self.z_log_var])


        decoder_h = Dense(self.dim[0], kernel_initializer='glorot_normal', activation=act)

        hidden2vocab = Dense(self.nb_words, activation='sigmoid', name="rec")
        h_decoded = decoder_h(encoded)
        h_decoded = hidden2vocab(h_decoded)
     
        self.model = Model(input_layer, [h_decoded, h_decoded])
        self.model.compile(optimizer=self.optimizer, loss=["binary_crossentropy", self.kl_loss])

        self.encoder = Model(outputs=self.z_mean, inputs=input_layer)


    def name(self):
        n = "vae_%s" % self.PoolMode if self.comp_topk == None else "kate_%s" % self.PoolMode
        n =  n + "_kl" if self.enableKL else n
        return n
    def sampling(self, args):
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.dim[1]), mean=0.,\
                                  stddev=self.epsilon_std)

        return z_mean + K.exp(z_log_var / 2) * epsilon


    def kl_loss(self, x, x_):
        if self.comp_topk != None:
            kl_loss = - 0.5 * K.sum(1 + self.z_log_var - K.square(self.z_mean_k) - K.exp(self.z_log_var), axis=-1)
        else:
            kl_loss = - 0.5 * K.sum(1 + self.z_log_var - K.square(self.z_mean) - K.exp(self.z_log_var), axis=-1)
        return self.kl_weight * kl_loss



In [12]:
K.clear_session()
run = BOW_VAE(nb_words, max_len, embedding_matrix, [200, 200], optimizer=Adam(), kl_weight=0.0001)

In [13]:
x_train = np.concatenate([q_july, d_july])
y_train = [np.concatenate([q_bow_july, d_bow_july]), np.concatenate([q_bow_july, d_bow_july])]
# y_train = [y_train, y_train]

In [14]:
for i in range(50):
    run.model.fit(x_train, y_train, verbose=2, batch_size=8)
#     eval_july(run.encoder, q_july, d_july, "wes")
    
#     kl_rate = 0.01
#     run.kl_weight = min(run.kl_weight + kl_rate, 1)
#     run.model.compile(optimizer=run.optimizer, loss=["binary_crossentropy", run.kl_loss])

    
    print(evaluate(run.encoder, test_set))

Epoch 1/1
 - 3s - loss: 0.1236 - rec_loss_1: 0.0925 - rec_loss_2: 0.0310
(0.5069370523131211, 0.8383219857882652, 0.565622984722809, 0, 0, 0)
Epoch 1/1
 - 3s - loss: 0.0211 - rec_loss_1: 0.0048 - rec_loss_2: 0.0163
(0.508622162796155, 0.838533040906044, 0.5702372745669239, 0, 0, 0)
Epoch 1/1
 - 2s - loss: 0.0206 - rec_loss_1: 0.0038 - rec_loss_2: 0.0168
(0.5078873864194655, 0.838329313674712, 0.5556939224799307, 0, 0, 0)
Epoch 1/1
 - 3s - loss: 0.0293 - rec_loss_1: 0.0041 - rec_loss_2: 0.0252
(0.5068571407521472, 0.8384083860643323, 0.5724943850207921, 0, 0, 0)
Epoch 1/1
 - 3s - loss: 0.0314 - rec_loss_1: 0.0053 - rec_loss_2: 0.0261
(0.5052393923513481, 0.8385065615124149, 0.5728168293713447, 0, 0, 0)
Epoch 1/1
 - 3s - loss: 0.0152 - rec_loss_1: 0.0020 - rec_loss_2: 0.0132
(0.5070293705376486, 0.8386823433920142, 0.5595743734572706, 0, 0, 0)
Epoch 1/1
 - 2s - loss: 0.0149 - rec_loss_1: 0.0020 - rec_loss_2: 0.0130
(0.5071541787472903, 0.8387767856751971, 0.5591185038582134, 0, 0, 0)
Epo

In [273]:
w2v = Sequential([Embedding(nb_words,
                    embedding_matrix.shape[-1],
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=True), GlobalAveragePooling1D()])


In [254]:
w2v.predict(q_july).shape

(426, 20, 200)

In [236]:
df_sts_train = pd.read_csv("/work/data/train_data/sts-b/train.tsv", sep="\t",encoding='utf-8',quoting=3,  header=None, error_bad_lines=False, usecols=[5,6,4], names=["label","q","d"])
df_sts_dev = pd.read_csv("/work/data/train_data/sts-b/dev.tsv", sep="\t",encoding='utf-8',quoting=3,  header=None, error_bad_lines=False, usecols=[5,6,4], names=["label","q","d"])
df_sts_test = pd.read_csv("/work/data/train_data/sts-b/test.tsv", sep="\t",encoding='utf-8',quoting=3,  header=None, error_bad_lines=False, usecols=[5,6,4], names=["label","q","d"])

for df in [df_sts_train, df_sts_dev, df_sts_test]:
    df.q = df.q.astype(str).str.lower()
    df.d = df.d.astype(str).str.lower()

In [235]:
df_sts_test

Unnamed: 0,label,q,d
0,2.500,A girl is styling her hair.,A girl is brushing her hair.
1,3.600,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.
2,5.000,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.
3,4.200,A man is cutting up a cucumber.,A man is slicing a cucumber.
4,1.500,A man is playing a harp.,A man is playing a keyboard.
5,1.800,A woman is cutting onions.,A woman is cutting tofu.
6,3.500,A man is riding an electric bicycle.,A man is riding a bicycle.
7,2.200,A man is playing the drums.,A man is playing the guitar.
8,2.200,A man is playing guitar.,A lady is playing the guitar.
9,1.714,A man is playing a guitar.,A man is playing a trumpet.


In [171]:
# No labels on test set for Quora
df_quora_train = pd.read_csv("/work/data/train_data/quora/train.tsv", sep="\t",encoding='utf-8',quoting=3,  header=None, error_bad_lines=False, usecols=[3,4,5], names=["q","d","label"])
df_quora_dev = pd.read_csv("/work/data/train_data/quora/dev.tsv", sep="\t",encoding='utf-8',quoting=3,  header=None, error_bad_lines=False, usecols=[3,4,5], names=["q","d","label"])

for df in [df_quora_train, df_quora_val]:
    df.q = df.q.astype(str).str.lower()
    df.d = df.d.astype(str).str.lower()

In [215]:
# No labels on test set for Quora
df_para_train = pd.read_csv("/work/data/train_data/para/ParaphraseIdeal.txt", sep="\t",encoding='utf-8',quoting=3,  header=None, error_bad_lines=False, usecols=[0,1,2], names=["q","d","label"])

for df in [df_para_train]:
    df.q = df.q.astype(str).str.lower()
    df.d = df.d.astype(str).str.lower()

In [267]:
from scipy import stats
from sklearn.metrics import accuracy_score

def eval(run, df, mode="auc"):
    
    enablePadding = True
    q = parse_texts_bpe(df.q.tolist(), sp, bpe_dict, max_len, enablePadding)
    d = parse_texts_bpe(df.d.tolist(), sp, bpe_dict, max_len, enablePadding)
    
    q_ = run.predict(q)
    d_ = run.predict(d)
    cosine = CosineSim(q_.shape[-1])

    pred = cosine.model.predict([q_, d_]).flatten()
    
    if mode == "auc":
        print(auc(df.label.values, pred))
    else:
        print(scipy.stats.pearsonr(pred, df.label.values)[0])
#     print(scipy.stats.spearmanr(pred, label)[0])

In [229]:
eval(w2v, df_para_train)

0.9388301628808409


In [230]:
eval(w2v, df_quora_dev)

0.704084157177363


In [237]:
eval(w2v, df_sts_dev, "pcc")
eval(w2v, df_sts_test, "pcc")
# Minh's result = 0.75, max = 0.65

0.5432863604684736
0.43498635136840813


In [12]:
length = []
for i in [q_sts2015_df, d_sts2015_df]:
    for j in i:
        l = len(j)
        length.append(l)
print(np.max(length), np.mean(length))

659 16.516648853079012


In [301]:
run = SeqVAE(nb_words, max_len, embedding_matrix, [200,200], optimizer=Adam(), kl_weight=1)
