In [1]:
%load_ext autoreload
%autoreload 2 

In [2]:
import os
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import OrderedDict

from nltk import word_tokenize

from embeddings import load_embeddings, word_index

Using TensorFlow backend.


## 0. Model parameters

In [3]:
# Run configuration, stored in insertion order.
parameters = OrderedDict([
    ("embeds", "glove"),     # which pretrained vectors to use ("glove" or anything else)
    ("w_embed_size", 300),   # width of each word vector
    ("load_embeds", True),   # whether pretrained vectors are read from disk
    ("freeze", True),        # freeze flag; forced off later when nothing is loaded
])

## 1. Data Processing

### Load pretrained embeddings

In [4]:
# Choose the vector file: the GloVe path is a .txt file read with binary=False,
# the fallback is the GoogleNews .bin file read with binary=True.
use_glove = parameters["embeds"] == "glove"
binary = not use_glove
embeddings_path = (
    "word_embeddings/glove.6B/glove.6B.%sd_w2vformat.txt" % parameters["w_embed_size"]
    if use_glove
    else "word_embeddings/google/GoogleNews-vectors-negative300.bin"
)

if not parameters["load_embeds"]:
    # Nothing is loaded in this case, so the freeze flag is switched off.
    parameters["freeze"] = False
else:
    loaded_embeddings, (w2idx, idx2w) = load_embeddings(embeddings_path, binary=binary)

Loading from saved word_embeddings
Loading vocab


### Load Quora question pairs

In [5]:
# Train/test CSVs must first be downloaded from
# https://www.kaggle.com/c/quora-question-pairs/data
data_path = "data/quora/"

df_train, df_test = (
    pd.read_csv(data_path + csv_name) for csv_name in ("train.csv", "test.csv")
)

In [6]:
# Preview the first ten training rows.
df_train.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [7]:
# Copy the id, text and label columns of the training frame into numpy arrays.
qid1, qid2, q1, q2, is_duplicate = (
    np.array(df_train[column])
    for column in ("qid1", "qid2", "question1", "question2", "is_duplicate")
)

# Question id -> question text; starts empty and is filled from the arrays above.
q_dict = dict()

In [8]:
# Register every question id once. The first-question column is walked before
# the second one, and the first text seen for an id is the one that is kept.
for id_column, text_column in ((qid1, q1), (qid2, q2)):
    for qid, text in zip(id_column, text_column):
        q_dict.setdefault(qid, text)

In [9]:
# Spot-check: look up the text stored for a single question id.
q_dict[222]

'Where can I watch sarrainodu with subtitles?'

In [10]:
# List every entry whose value is not exactly a str (a missing question shows up as nan).
for qid, text in q_dict.items():
    if type(text) is str:
        continue
    print(qid, text)

174364 nan


In [11]:
# The scan above reported this qid with a nan value; replace it with the literal
# string "nan" (presumably so tokenization receives a str), then show the rows using it.
missing_qid = 174364
q_dict.update({missing_qid: "nan"})
df_train.loc[df_train["qid2"] == missing_qid]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [12]:
def lower_list(l):
    """Return a new list containing the lowercase form of each element of ``l``."""
    lowered = []
    for item in l:
        lowered.append(item.lower())
    return lowered


def _load_stemmer(stemmer):
    """Return a stemmer object for ``stemmer``, or ``None`` when no stemming applies.

    Only the value ``"english"`` is recognised; any other value disables stemming.
    """
    if stemmer != "english":
        return None
    # Imported locally: the notebook's import cell only brings in word_tokenize,
    # so SnowballStemmer was previously an undefined name on this path.
    from nltk.stem import SnowballStemmer
    return SnowballStemmer("english")


def _tokenize_sentence(sent, lower, stemmer_obj):
    """Split one sentence into tokens, then optionally stem and lowercase them."""
    tokens = word_tokenize(sent)
    if stemmer_obj is not None:
        tokens = [stemmer_obj.stem(token) for token in tokens]
    if lower:
        # Lowercasing happens after stemming, as in the original expressions.
        tokens = [token.lower() for token in tokens]
    return tokens


def tokenize(sents, lower=True, stemmer=None):
    """Tokenize every sentence in ``sents``.

    Args:
        sents: iterable of sentence strings.
        lower: when true, each token is lowercased.
        stemmer: ``"english"`` to stem tokens with the Snowball English stemmer;
            any other value leaves tokens unstemmed.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    snowball = _load_stemmer(stemmer)
    return [_tokenize_sentence(sent, lower, snowball) for sent in sents]


def tokenize_dict(q_dict, lower=True, stemmer=None):
    """Tokenize every value of ``q_dict`` while keeping its keys.

    Args:
        q_dict: mapping from a key to a sentence string.
        lower: when true, each token is lowercased.
        stemmer: ``"english"`` to stem tokens with the Snowball English stemmer;
            any other value leaves tokens unstemmed.

    Returns:
        A dict mapping each key of ``q_dict`` to the token list of its sentence.
    """
    snowball = _load_stemmer(stemmer)
    return {k: _tokenize_sentence(sent, lower, snowball) for k, sent in q_dict.items()}
    
def sent2ids(sent, w2idx):
    """Map each token of ``sent`` to its entry in ``w2idx`` and return the results as a list.

    A token absent from ``w2idx`` propagates the lookup error (``KeyError`` for a dict).
    """
    ids = []
    for token in sent:
        ids.append(w2idx[token])
    return ids

def ids2sent(ids, idx2w):
    """Map each index in ``ids`` to its entry in ``idx2w`` and return the results as a list."""
    words = []
    for index in ids:
        words.append(idx2w[index])
    return words

In [13]:
# Tokenize every question with the defaults (lowercased, no stemming); the
# result keeps the same keys as q_dict.
tokenized = tokenize_dict(q_dict)

In [14]:
# Build the vocabulary from all tokenized questions. word_index comes from the
# local embeddings module; its result is unpacked into the two mappings used
# below for word -> id and id -> word lookups.
w2idx_train, idx2w_train = word_index(tokenized.values())
# Number of entries in the word -> id mapping.
print(len(w2idx_train))

113351


In [15]:
# Round-trip one question through the vocabulary: tokens -> ids -> tokens.
s = tokenized[222]
ids = sent2ids(s, w2idx_train)
s2 = ids2sent(ids, idx2w_train)

# One value per line: original tokens, their ids, and the tokens decoded back.
print(s, ids, s2, sep="\n")

['where', 'can', 'i', 'watch', 'sarrainodu', 'with', 'subtitles', '?']
[385, 22, 23, 587, 22436, 136, 589, 13]
['where', 'can', 'i', 'watch', 'sarrainodu', 'with', 'subtitles', '?']


### Fit embeddings to vocabulary

In [16]:
# Shape of the pretrained embedding matrix.
# NOTE(review): loaded_embeddings is only assigned when parameters["load_embeds"]
# is true, so this cell raises NameError on a fresh kernel otherwise.
loaded_embeddings.shape

(400000, 300)

In [17]:
# Start every vocabulary row from small Gaussian noise (standard deviation 0.001).
vocab_size = len(w2idx_train)
embed_dim = parameters["w_embed_size"]
embeddings = np.random.normal(scale=0.001, size=(vocab_size, embed_dim))

if parameters["load_embeds"]:
    # Overwrite the rows of words that also exist in the pretrained vocabulary;
    # words without a pretrained vector keep their random initialisation.
    for word, row in w2idx_train.items():
        pretrained_row = w2idx.get(word)
        if pretrained_row is None:
            continue
        embeddings[row] = loaded_embeddings[pretrained_row][:embed_dim]

In [18]:
# Display the fitted embedding matrix (the repr abbreviates large arrays with "...").
embeddings

array([[ -4.15930079e-04,  -9.98584309e-04,  -7.57852110e-04, ...,
         -2.99900648e-04,  -9.97346122e-04,   9.79419447e-04],
       [ -2.00169995e-01,   1.43020004e-01,   5.20550013e-02, ...,
          3.49389985e-02,  -1.25990003e-01,   2.18630001e-01],
       [ -1.74899995e-01,   2.29560003e-01,   2.49239996e-01, ...,
         -2.41310000e-01,  -4.04020011e-01,   5.47440015e-02],
       ..., 
       [ -2.06540003e-02,   5.19459993e-02,  -1.97559997e-01, ...,
         -1.90200001e-01,   2.75139987e-01,   4.51590002e-01],
       [  3.10790002e-01,   5.72499990e-01,   1.07009999e-01, ...,
          1.45359993e-01,   5.73599994e-01,   5.94009995e-01],
       [ -4.35460001e-01,  -1.40729994e-01,  -2.65529990e-01, ...,
          4.26380008e-01,  -3.74700017e-02,   2.60300010e-01]])