<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

In [44]:
import gc
import re
import sys
import random
import pickle
from collections import Counter

import numpy as np

from sklearn.datasets import fetch_20newsgroups

from keras.models import Model
from keras.layers import Embedding, Dense, Input, Dot, Multiply, Lambda
from keras.layers import dot
# from keras

In [2]:
# Tokeniser: matches maximal runs of lowercase ASCII letters, so digits,
# punctuation, whitespace and uppercase characters all act as separators.
WORD_RE = re.compile(r'[a-z]+')

In [3]:
# Load the 20 Newsgroups corpus through scikit-learn, using the loader's
# default arguments (NOTE(review): which subset the defaults select is not
# visible here -- confirm against the scikit-learn documentation).
data = fetch_20newsgroups()

In [4]:
# Show which fields the loaded dataset object exposes.
data.keys()

dict_keys(['target_names', 'data', 'filenames', 'description', 'DESCR', 'target'])

In [5]:
# Lowercase every raw post; the tokeniser only recognises lowercase letters.
texts = [document.lower() for document in data['data']]

In [26]:
# Count how often each token occurs across the whole corpus.
# The counter is fed one document at a time, so the complete token stream is
# never held in memory as a single list, and the name VOCABULARY is no longer
# borrowed for something that is not the vocabulary.
counter = Counter()
for text in texts:
    counter.update(WORD_RE.findall(text))

In [33]:
# Vocabulary = the 10000 most frequent tokens in alphabetical order, with the
# out-of-vocabulary placeholder 'UNKNOWN' occupying index 0.
VOCABULARY = ['UNKNOWN', *sorted(word for word, _ in counter.most_common(10000))]
# Reverse lookup: token -> its position in VOCABULARY.
WORD2IND = dict(zip(VOCABULARY, range(len(VOCABULARY))))

In [34]:
# Every valid vocabulary index, 0 .. len(VOCABULARY) - 1, as a plain list
# (used as the population for random negative sampling).
INDEXES = [*range(len(VOCABULARY))]

In [128]:
# Two-tower network scoring a (word, context) index pair.
# Each tower takes a single vocabulary index and looks it up in its own
# 300-dimensional embedding table (the two tables are not shared).
word_input = Input(shape=(1, ))
word_embedding = Embedding(
    output_dim=300,
    input_dim=len(VOCABULARY),
)(word_input)

context_input = Input(shape=(1, ))
context_embedding = Embedding(
    output_dim=300,
    input_dim=len(VOCABULARY),
)(context_input)

# Dot product of the two embeddings along the embedding axis (context first).
merge = dot([context_embedding, word_embedding], axes=2)
# Drop the length-1 sequence axis before the final classifier.
t_slice = Lambda(lambda x: x[:, 0, :])(merge)
# Single sigmoid unit: probability that the pair is a genuine co-occurrence.
target = Dense(units=1, activation='sigmoid')(t_slice)

model = Model(outputs=target, inputs=[word_input, context_input])

In [129]:
# Train as a binary classifier over (word, context) pairs.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [37]:
WINDOW_SIZE = 5        # context words considered on EACH side of the centre word
NEGATIVE_SAMPLES = 5   # random (label 0) pairs generated per positive pair
RANDOM_SEED = 42       # fixed so that re-running this cell rebuilds the same samples

random.seed(RANDOM_SEED)

# Build the training set as three parallel lists:
#   inputs[k]   -> [centre word index]
#   contexts[k] -> [candidate context word index]
#   targets[k]  -> 1 for a pair taken from the text, 0 for a random pair
inputs = []
contexts = []
targets = []
for text in texts:
    # Sentences are approximated by splitting on '.', so a context window
    # never crosses one of those boundaries.
    for sentence in text.split('.'):
        words = WORD_RE.findall(sentence)
        # Out-of-vocabulary tokens map to index 0 ('UNKNOWN').
        words_indexes = [WORD2IND.get(word, 0) for word in words]
        for i, center in enumerate(words_indexes):
            # Slice ends are exclusive, so "+ 1" is required for the window to
            # reach WINDOW_SIZE words to the right as well as to the left
            # (without it the right side is one word short).
            window = words_indexes[max(0, i - WINDOW_SIZE): i + WINDOW_SIZE + 1]
            # Remove the centre word itself (and any repeats of it).
            words_sample = [ind for ind in window if ind != center]

            if not words_sample:
                continue

            # add positive: one context word drawn at random from the window
            inputs.append([center])
            contexts.append([random.choice(words_sample)])
            targets.append(1)

            # add negatives: contexts drawn uniformly from the whole vocabulary
            for j in range(NEGATIVE_SAMPLES):
                inputs.append([center])
                contexts.append([random.choice(INDEXES)])
                targets.append(0)

In [38]:
# Rough memory check on the sample list before converting it to NumPy.
# NOTE(review): sys.getsizeof presumably reports only the outer list object,
# not the nested one-element lists it refers to -- the real footprint is
# likely much larger; confirm against the Python docs.
sys.getsizeof(inputs)

185941288

In [39]:
# Replace the Python list of [index] rows with a compact 16-bit integer array.
# int16 is sufficient only while every vocabulary index stays below 32768.
inputs = np.asarray(inputs, dtype=np.int16)

In [40]:
# Force a garbage-collection pass now that the list version of `inputs` has
# been replaced; the displayed value is whatever gc.collect() returns.
gc.collect()

0

In [41]:
# Replace the Python list of [index] rows with a compact 16-bit integer array.
# int16 is sufficient only while every vocabulary index stays below 32768.
contexts = np.asarray(contexts, dtype=np.int16)

In [42]:
# Force a garbage-collection pass now that the list version of `contexts` has
# been replaced; the displayed value is whatever gc.collect() returns.
gc.collect()

0

In [130]:
# Train on the (word, context) -> label samples.
# The labels are converted to an ndarray at the call site: a plain Python list
# passed as `y` is interpreted by Keras as "one array per model output", and
# no visible cell converts `targets` beforehand. np.asarray is a no-op copy-wise
# if `targets` already is a float32 array, so this is safe either way.
model.fit([inputs, contexts], np.asarray(targets, dtype=np.float32), batch_size=512)

Epoch 1/1


<keras.callbacks.History at 0x7ff2f06e93c8>

# get and store embeddings

In [131]:
# Pick the embedding table of the *word* tower explicitly.
# A positional lookup such as model.layers[2] depends on Keras' internal layer
# ordering; with two Embedding layers at the same depth it is not obvious which
# one that index refers to. Selecting the layer by the tensor it produced
# (`word_embedding`, created when the model was defined) removes the ambiguity.
embedding_layer = next(
    layer for layer in model.layers if layer.output is word_embedding
)

In [132]:
# get_weights() returns a list of arrays; the first entry is taken as the
# embedding matrix (presumably one 300-dimensional row per vocabulary index --
# confirm against the layer definition).
embedding_weights = embedding_layer.get_weights()[0]

In [133]:
# Bundle everything needed to use the embeddings later: the matrix itself,
# the index -> word list and the word -> index mapping.
word2vec = dict(
    weights=embedding_weights,
    vocabulary=VOCABULARY,
    word2index=WORD2IND,
)

In [134]:
# Persist the bundle to disk, serialising straight into the open file handle.
with open('../data/word2vec.pkl', 'wb') as f:
    pickle.dump(word2vec, f)

In [135]:
# Read the bundle back (round-trip check of the file written above).
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open('../data/word2vec.pkl', 'rb') as f:
    word2vec = pickle.load(f)

# get similar words

In [164]:
# Row of the query word in the embedding matrix (list.index raises ValueError
# if the word is not in the vocabulary).
index = VOCABULARY.index('wrong')

In [165]:
# Score every vocabulary word against the query word.
# This is a raw (unnormalised) dot product between embedding rows, not a
# cosine similarity.
similarity = embedding_weights.dot(embedding_weights[index])

In [166]:
# Indices of the 10 highest scores; argsort is ascending, so the best match
# comes last.
similar_indexes = np.argsort(similarity)[-10:]

In [167]:
# Show the selected words, one per line, in ascending order of score.
print('\n'.join(VOCABULARY[i] for i in similar_indexes))

blindly
raider
complaining
wrong
retract
heal
wondering
warned
mistaken
hoped


In [168]:
# Print the first post that contains the substring 'utsa', then stop.
for text in texts:
    if 'utsa' not in text:
        continue
    print(text)
    break

from: aa888@freenet.carleton.ca (mark baker)
subject: re: the arrogance of christians
reply-to: aa888@freenet.carleton.ca (mark baker)
organization: the national capital freenet
lines: 22

in a previous article, mhsu@lonestar.utsa.edu (melinda . hsu) says:

>
>well the argument usually stops right there.  in the end,
>aren't we all just kids, groping for the truth?  if so, do we have
>the authority to declare all other beliefs besides our own as
>false?
>

if i don't think my belief is right and everyone else's belief is wrong,
then i don't have a belief. this is simply what belief means. where does
the authority for a belief come from? nowhere, for a belief is itself
authoratative. if i produce authority for a belief, where will i find
authority for my belief in the legitimacy of the authority. in short, 
the mind has to start somewhere. (by the way, the majority of christians,
i.e. catholics, believe in the authority of the church, and derive the
authority of the bible from its accep

In [154]:
# Take the first (lowercased) post of the corpus as a sample document.
text = texts[0]

In [155]:
# Display the sample post as plain text, preserving its line breaks.
print(text)

from: lerxst@wam.umd.edu (where's my thing)
subject: what car is this!?
nntp-posting-host: rac3.wam.umd.edu
organization: university of maryland, college park
lines: 15

 i was wondering if anyone out there could enlighten me on this car i saw
the other day. it was a 2-door sports car, looked to be from the late 60s/
early 70s. it was called a bricklin. the doors were really small. in addition,
the front bumper was separate from the rest of the body. this is 
all i know. if anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

thanks,
- il
   ---- brought to you by your neighborhood lerxst ----





