# word2vec

## imports

In [4]:
import collections
import math
import os
import errno
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

## data

In [7]:
# Local directory where the corpus archive is stored (created on demand by fetch_words_data).
data_dir = "../../FULL-TENSORFLOW-NOTES-AND-DATA/Tensorflow-Bootcamp-master/04-Recurrent-Neural-Networks/word2vec_data/words/"
# URL the corpus zip is downloaded from when it is not already on disk.
data_url = "http://mattmahoney.net/dc/text8.zip"

### fetch data

In [8]:
def fetch_words_data(url=data_url, words_data=data_dir):
    """Return the corpus as a list of word tokens, downloading it if needed.

    Args:
        url: location of the zip archive to download when no local copy exists.
        words_data: directory in which the archive is stored as "words.zip";
            created if it does not exist.

    Returns:
        A list of str: the first member of the archive, decoded as ASCII and
        split on whitespace.

    Raises:
        UnicodeDecodeError: if the archive member contains non-ASCII bytes.
        zipfile.BadZipFile: if the local "words.zip" is not a valid archive.
    """
    # make dir if doesn't exist
    os.makedirs(words_data, exist_ok=True)

    # path to zip file
    zip_path = os.path.join(words_data, "words.zip")

    # if zip file isn't present, download
    if not os.path.exists(zip_path):
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated "words.zip" behind
        # that later runs would mistake for a complete archive.
        partial_path = zip_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, zip_path)
        finally:
            # Only still present when the download or rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # zip file is there now. grab data from its first member
    with zipfile.ZipFile(zip_path) as archive:
        raw = archive.read(archive.namelist()[0])

    # return all words from data source
    return raw.decode("ascii").split()

In [9]:
# Load the corpus as a list of word tokens (downloads the archive if it is not on disk yet).
words = fetch_words_data()

In [10]:
# Total number of tokens in the corpus.
len(words)

17005207

### viewing a small sample

In [11]:
# First ten tokens of the corpus.
words[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [14]:
# Show a 40-token excerpt of the corpus joined back into readable text.
print(" ".join(words[9000:9040]))

feelings and the auditory system of a person without autism often cannot sense the fluctuations what seems to non autistic people like a high pitched sing song or flat robot like voice is common in autistic children some autistic children


### counting frequencies

In [16]:
from collections import Counter

In [17]:
# Toy example: Counter tallies how many times each item occurs in a list.
my_list = ["one", "two", "three", "two"]
Counter(my_list)

Counter({'one': 1, 'three': 1, 'two': 2})

In [34]:
# enumerate pairs each item with its position in the list.
list(enumerate(my_list))

[(0, 'one'), (1, 'two'), (2, 'three'), (3, 'two')]

In [28]:
# most_common(2) returns the two most frequent items as (item, count) pairs,
# most frequent first.
Counter(my_list).most_common(2)

[('two', 2), ('one', 1)]

In [36]:
def create_counts(vocab_size=50000, corpus=None):
    """Encode a corpus of word tokens as integer codes ranked by frequency.

    Args:
        vocab_size: number of most frequent distinct words to keep.
        corpus: sequence of word tokens to encode. It is iterated twice, so it
            must not be a one-shot iterator. Defaults to the module-level
            ``words`` list when omitted.

    Returns:
        A ``(data, vocab)`` tuple of NumPy arrays:
        - ``vocab``: the kept words, ordered from most to least frequent.
        - ``data``: one integer per token of the corpus, giving the index of
          that token in ``vocab``.

    Note:
        Tokens that are not in ``vocab`` are encoded as 0, which is also the
        code of the most frequent word, so the two cannot be told apart in
        ``data``.
    """
    if corpus is None:
        # Backward-compatible default: use the corpus loaded into the notebook.
        corpus = words

    most_frequent = Counter(corpus).most_common(vocab_size)
    vocab = np.array([word for word, _ in most_frequent])

    # Map each kept word to its frequency rank (0 = most frequent).
    dictionary = {word: code for code, word in enumerate(vocab)}

    # Out-of-vocabulary tokens fall back to code 0 (see Note in the docstring).
    data = np.array([dictionary.get(word, 0) for word in corpus])
    return data, vocab

In [37]:
# Encode the corpus: data holds one integer code per token, vocab maps code -> word.
data, vocab = create_counts()

In [38]:
# One code per corpus token.
data.shape

(17005207,)

In [39]:
# One entry per kept vocabulary word.
vocab.shape

(50000,)

### relating words, data, and vocab
* **words** is the list of all text. this is the imported data
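* **data** is the integer-encoded version of **words**: `data[i]` is the index in **vocab** of `words[i]`, i.e. that word's popularity rank among the *vocab_size* most popular words (0 = most popular). Words that are not among the *vocab_size* most popular are also encoded as 0, so a value of 0 means either the single most popular word or an out-of-vocabulary word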
* **vocab** contains the list of all of the *vocab_size* most popular words, from most popular to least popular in order


In [40]:
# The token at position 100 of the corpus...
words[100]

'interpretations'

In [41]:
# ...its integer code (index into vocab)...
data[100]

4193

In [42]:
# ...and looking that code up in vocab gives the original word back.
vocab[4193]

'interpretations'

## batching function
The tutorial's instructor copied and pasted this function from the code of an online TensorFlow tutorial. I'm fairly sure he doesn't really understand what it does at a low level.

In [44]:
def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the encoded corpus.

    A window of ``2 * skip_window + 1`` consecutive codes slides over the
    module-level ``data`` array. For each window position the center code is
    emitted ``num_skips`` times as an input, each time paired with a different
    randomly chosen code from the same window as its label.

    Reads the module-level ``data`` and reads/updates the module-level
    ``data_index`` cursor; both must be defined before the first call
    (NOTE(review): the initialisation of ``data_index`` is not visible in this
    section of the notebook - confirm it is set to 0 before use).

    Args:
        batch_size: number of (input, label) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: how many labels to draw for each center word; at most
            ``2 * skip_window``.
        skip_window: how many words to consider to the left and right of the
            center word.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding the center codes and their context codes.

    Raises:
        AssertionError: if ``batch_size`` is not a multiple of ``num_skips`` or
            ``num_skips`` exceeds ``2 * skip_window``.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    # Restart from the beginning when a full window no longer fits.
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Draw a window position that is neither the center nor already used.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one word forward for the next center word. This
        # must happen inside the loop; otherwise every pair in the batch
        # would share the same center word.
        if data_index == len(data):
            # Wrapped past the end: refill the window from the start. A deque
            # does not support slice assignment; extend() on a full
            # maxlen-bounded deque replaces its entire contents.
            buffer.extend(data[:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [46]:
# Training constants
# Number of (input, label) pairs per batch; also fixes the first dimension of train_labels.
batch_size = 128
# Number of columns of the embedding matrix, i.e. the length of each word vector.
embedding_size = 150
# How many words to consider to the left/right of the center word
skip_window = 1
# How many times to reuse an input (center word) to generate a label
num_skips = 2

# Validation set: valid_size distinct integers drawn at random from
# range(valid_window).
# NOTE(review): presumably these are used as word codes; since codes are
# frequency ranks, that would make them some of the most common words - confirm.
# NOTE(review): no random seed is set, so this sample differs on every run.
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
valid_examples

array([41, 24, 65, 45,  1, 73, 18, 97, 60, 68, 59, 92, 38, 43, 88, 98])

In [47]:
# NOTE(review): presumably the number of negative classes sampled per batch by
# the loss function; its use is not visible in this section - confirm.
num_sampled = 64
# NOTE(review): presumably the optimizer step size; its use is not visible in this section - confirm.
learning_rate = 0.01
# Number of rows of the embedding matrix; equals the default vocab_size of create_counts.
vocab_size = 50000

## placeholders and constants

In [48]:
# Word codes fed in as inputs: a 1-D int32 tensor of any length.
train_inputs = tf.placeholder(tf.int32, shape=[None])
# Context-word codes: int32 column of fixed shape (batch_size, 1), the same
# shape as the labels array returned by generate_batch.
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# The randomly chosen validation ids as a constant int32 tensor.
valid_dataset = tf.constant(valid_examples, tf.int32)

## variables

In [49]:
# Initial embeddings: one row of embedding_size values per vocabulary entry,
# drawn uniformly between minval=-1 and maxval=1.
init_embeds = tf.random_uniform([vocab_size, embedding_size], minval=-1, maxval=1)
# Variable holding the embedding matrix, initialised from init_embeds.
embeddings = tf.Variable(init_embeds)
# Rows of the embedding matrix selected by the word codes in train_inputs.
embed = tf.nn.embedding_lookup(embeddings, train_inputs)