# Continuous Bag-Of-Word Models (CBOW)

## A. ONE-WORD CONTEXT

In [652]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [653]:
import nltk
from nltk.corpus import brown
from nltk.stem import PorterStemmer
from sklearn.cross_validation import train_test_split

In [654]:
brown_words_raw = brown.words(categories='news')

In [655]:
# CLEAN DATA
def word_normalize(word):
    return str(PorterStemmer().stem(word.lower()))

In [656]:
brown_words = [word_normalize(word) for word in brown_words_raw] 

### MODEL 1: 1-CONTEXT + NUMPY

In [694]:
class Word2Vec1WordNumpy:
    
    def __init__(self, X, N=10, lr=.001, verbose=True): 
        # Vocab, Dimensions, Learning Rate.
        self.vocab = list(set(X)) # vocab.
        self.V = len(self.vocab) # vocab size.
        self.N = N # word-vec's size.
        self.lr = lr # learning rate.
        self.verbose = verbose
        # Weights random initialization.
        self.W1 = np.random.rand(self.V,self.N) # weights between one-hot and word-vecs.
        self.h = np.random.rand(self.N) # word-vec, begotten by x^T * W1.
        self.W2 = np.random.rand(self.N,self.V) # weights between word-vecs and output layer.
    
    def softmax(self, v):
        Z = sum(np.exp(v_i) for v_i in v) # normalizing constant.
        return [np.exp(v_i)/Z for v_i in v]
    
    def one_hot_transform(self, word): 
        one_hot = np.zeros(len(self.vocab))
        if word in self.vocab:
            one_hot[self.vocab.index(word)] = 1.
        return one_hot
    
    def word2vec(self, word):
        return np.dot(self.one_hot_transform(word), self.W1)
    
    def forward_propagation(self, word):
        x = self.one_hot_transform(word)
        self.h = np.dot(x, self.W1)
        output = self.softmax(np.dot(self.h, self.W2))
        return output
    
    def backward_propagation(self, word, next_word):
        x, next_word = self.one_hot_transform(word), self.one_hot_transform(next_word)
        next_word_hat = self.forward_propagation(word) # prediction.
        # update W2 (note: word-vec -- W2 -- output).
        for i in range(self.N):
            for j in range(self.V):
                self.W2[i][j] -= self.lr * (next_word_hat[j]-next_word[j]) * self.h[i]
        # update W1 (note: one-hot -- W1 -- word-vec).
        EH = [ sum( (next_word_hat[j]-next_word[j])*self.W2[i][j] for j in range(self.V) ) for i in range(self.N)  ]
        for j in range(self.V):
            self.W1[j] -= self.lr * np.array(EH)
    
    def train(self, X_train, Y_train, repeat=1): # X: a list of words; Y: a list of next-words of X.
        counter = 0
        for _ in range(repeat):
            for word,next_word in zip(X_train,Y_train):
                counter +=1
                self.backward_propagation(word, next_word)
                if self.verbose and counter%100==0: print "Trained %d words." % counter
    
    def predict(self, word, verbose=True):
        next_word_hat = self.vocab[np.argmax(self.forward_propagation(word))]
        print "The predicted next-word for '%s' is '%s'" % (word,next_word_hat)
        return next_word_hat
        
    def evaluate(self, X_test, Y_test, verbose=True):
        counter = 0
        correct = 0
        for word,next_word in zip(X_test,Y_test):
            next_word_hat = self.predict(word, verbose)
            counter += 1
            print next_word, next_word_hat
            correct += (next_word==next_word_hat)
            if verbose: print "Correct" if next_word==next_word_hat else "Wrong"
        print correct                                
    
# NB: ADDING MAY IMPROVE PERFORMANCE.
# NB: DIFFERENT BIAS TERMS FOR W1 & W2.

#### a. Toy Data

In [672]:
X = ['this','is','extremely','good']
Y = ['is', 'extremely', 'good']
N = 3
mdl = Word2Vec1WordNumpy(X, N)

In [673]:
print "Vocab: ", mdl.vocab
for i in range(len(mdl.vocab)-1):
    print "Correct Next Word for '%s' is '%s'." % (mdl.vocab[i], mdl.vocab[i+1])

Vocab:  ['this', 'is', 'good', 'extremely']
Correct Next Word for 'this' is 'is'.
Correct Next Word for 'is' is 'good'.
Correct Next Word for 'good' is 'extremely'.


In [674]:
# BEFORE-TRAINING: RANDOM WEIGHTS
print mdl.predict('this')

The predicted next-word for 'this' is 'this'
this


In [675]:
%%time
for _ in range(10000):
    for w,nxt in zip(X,next_words):
        mdl.backward_propagation(w,nxt)

CPU times: user 1.89 s, sys: 8.15 ms, total: 1.9 s
Wall time: 1.89 s


In [676]:
# PREDICTING NEXT WORDS
#  NB: the model is not expected to know the next word of 'good', as it's not trained for that.
mdl.predict('this')
mdl.predict('is')
mdl.predict('extremely')

The predicted next-word for 'this' is 'is'
The predicted next-word for 'is' is 'extremely'
The predicted next-word for 'extremely' is 'good'


'good'

In [677]:
# LEARN WORD VECTORS
#  NB: this will only be useful when the training is large, 
#   we will then have semantically comparable to inspect.
print mdl.word2vec('this')
print mdl.word2vec('is')
print mdl.word2vec('extremely')
print mdl.word2vec('good')

[ 0.98455355  1.36145238  1.09804509]
[ 1.03038062  0.69486647  1.75002165]
[ 0.47375052  1.45647869  1.28320007]
[ 1.1462976   0.99098018  1.15359568]


#### b. Brown News

In [695]:
# CREAT TRAINING & TESTING
X_train, Y_train = brown_words[:1000], brown_words[1:1001]
print X_train[:10]
print Y_train[:10]
X_test, Y_test = brown_words[2001:2101], brown_words[2002:2102]

['the', 'fulton', 'counti', 'grand', 'juri', 'said', 'friday', 'an', 'investig', 'of']
['fulton', 'counti', 'grand', 'juri', 'said', 'friday', 'an', 'investig', 'of', "atlanta'"]


In [696]:
%%time
# TRAINING
mdl = Word2Vec1WordNumpy(X_train,5) # 5D word vectors.
mdl.train(X_train, Y_train, 5)

Trained 100 words.
Trained 200 words.
Trained 300 words.
Trained 400 words.
Trained 500 words.
Trained 600 words.
Trained 700 words.
Trained 800 words.
Trained 900 words.
Trained 1000 words.
Trained 1100 words.
Trained 1200 words.
Trained 1300 words.
Trained 1400 words.
Trained 1500 words.
Trained 1600 words.
Trained 1700 words.
Trained 1800 words.
Trained 1900 words.
Trained 2000 words.
Trained 2100 words.
Trained 2200 words.
Trained 2300 words.
Trained 2400 words.
Trained 2500 words.
Trained 2600 words.
Trained 2700 words.
Trained 2800 words.
Trained 2900 words.
Trained 3000 words.
Trained 3100 words.
Trained 3200 words.
Trained 3300 words.
Trained 3400 words.
Trained 3500 words.
Trained 3600 words.
Trained 3700 words.
Trained 3800 words.
Trained 3900 words.
Trained 4000 words.
Trained 4100 words.
Trained 4200 words.
Trained 4300 words.
Trained 4400 words.
Trained 4500 words.
Trained 4600 words.
Trained 4700 words.
Trained 4800 words.
Trained 4900 words.
Trained 5000 words.
CPU times

In [697]:
this_vec = mdl.word2vec('thi')
that_vec = mdl.word2vec('that')
doctor_vec = mdl.word2vec('doctor')
fulton_vec = mdl.word2vec('fulton')

In [698]:
import scipy
print scipy.spatial.distance.cosine(this_vec, that_vec)
print scipy.spatial.distance.cosine(this_vec, doctor_vec)
print scipy.spatial.distance.cosine(this_vec, fulton_vec)
    # NB: this is intuitive: 
    #  - 'this' and 'that' frequently appear in similar contexts.
    #  - 'this' frequently modifies 'doctor' (thus appear together).
    #  - 'this' almost never appear with 'fulton'.

0.898138339613
0.286958743578
0.111600231987
