In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from nltk.corpus import stopwords
import numpy as np
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from nltk import word_tokenize
import subprocess

Using TensorFlow backend.


### Data read

In [2]:
df = pd.read_csv('data/quora-train.csv')
df = df.fillna('a')

### FastText + Gensim sex

In [3]:
FASTTEXT_EXECUTABLE = 'fasttext'
PRETRAINED_MODEL_FILE = 'model/fasttext/wiki.en.bin'
VOCAB_FILE = 'model/fasttext/quora.vocab'
OUTPUT_FILE = 'model/fasttext/quora.vec'
EMBEDDING_DIM = 300

In [4]:
all_words = set(word_tokenize('\n'.join(list(df.question1) + list(df.question2)).lower()))
all_words.add("<pad>")

In [5]:
with open(VOCAB_FILE, 'w') as f:
    for word in all_words:
        f.write(word + "\n")

In [6]:
!rm model/fasttext/quora.vec

In [7]:
with open(VOCAB_FILE) as f_vocab:
    with open(OUTPUT_FILE, 'a') as f_output:
        print(f'{len(all_words)} {EMBEDDING_DIM}', file=f_output)
        subprocess.run(
            [FASTTEXT_EXECUTABLE, 'print-word-vectors', PRETRAINED_MODEL_FILE],
            stdin=f_vocab,
            stdout=f_output,
        )

In [8]:
with open(OUTPUT_FILE, 'r+') as f:
    content = f.read()
    f.seek(0, 0)
    print(f'{len(all_words)} {EMBEDDING_DIM}\n{content}', file=f)

### Model

In [9]:
model = FastTextKeyedVectors.load_word2vec_format('model/fasttext/quora.vec')

In [21]:
model.vocab

{'/it': <gensim.models.keyedvectors.Vocab at 0x7f727c1ad320>,
 'kroes': <gensim.models.keyedvectors.Vocab at 0x7f727c1767b8>,
 'x^2−x−4': <gensim.models.keyedvectors.Vocab at 0x7f727c1ae080>,
 'olivetti': <gensim.models.keyedvectors.Vocab at 0x7f727c172358>,
 'scala+akka': <gensim.models.keyedvectors.Vocab at 0x7f727c1ae0f0>,
 'dragging': <gensim.models.keyedvectors.Vocab at 0x7f727c1725f8>,
 'needing': <gensim.models.keyedvectors.Vocab at 0x7f727c1ae128>,
 'squeezed': <gensim.models.keyedvectors.Vocab at 0x7f727c172780>,
 'fiiitjee': <gensim.models.keyedvectors.Vocab at 0x7f727c1ae160>,
 'downoad': <gensim.models.keyedvectors.Vocab at 0x7f727c1727f0>,
 'salish': <gensim.models.keyedvectors.Vocab at 0x7f727c1ae278>,
 'stargardt': <gensim.models.keyedvectors.Vocab at 0x7f727c172860>,
 'by…': <gensim.models.keyedvectors.Vocab at 0x7f727c1ae2b0>,
 'hernia.july': <gensim.models.keyedvectors.Vocab at 0x7f727c172908>,
 '650-155': <gensim.models.keyedvectors.Vocab at 0x7f727c1ae2e8>,
 'positi

In [15]:
sw = stopwords.words('english')
def preprocess(q):
    q = q.lower()
    tokens = list(filter(lambda t: t in model.vocab and t not in sw, word_tokenize(q)))
    return tokens

In [16]:
model.wmdistance(preprocess(df.loc[0]['question1']), preprocess(df.loc[0]['question2']))

0.6495802873639844

In [17]:
wm_distance = np.zeros(len(df))
for i in range(len(df)):
    wm_distance[i] = model.wmdistance(preprocess(df.loc[i]['question1']), preprocess(df.loc[i]['question2']))

In [18]:
wm_distance

array([ 0.64958029,  2.16065929,  2.32230936, ...,  1.99782613,
        4.38424129,  0.        ])

In [19]:
df['wmd'] = wm_distance

In [20]:
df.ix[df['wmd'] == np.inf, 'wmd'] = 10000
df.to_csv('data/quora-train.csv', index=False)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.
