* Calculate the word distribution
* Word embedding model
    * draw negative samples for each training sample
    * calculate cost function
    * calculate derivatives of the cost function w.r.t. input-to-hidden weights and hidden-to-output weights respectively.
* Calculate word analogies

In [6]:
# Scratch check: a range whose start equals its stop contains no elements.
d = range(3,3)
d

range(3, 3)

In [1]:
def get_word_dist(X, v_sz):
    """Return the smoothed unigram weights used for negative sampling.

    Every word index ``j >= 2`` receives the weight
    ``(occurrences_of_j / total_words) ** 0.75``. The result is not
    normalised to sum to one; callers that need probabilities must divide
    by the sum themselves.

    Args:
        X: iterable of sentences, each a sequence of integer word indices.
        v_sz: vocabulary size, i.e. the length of the returned array.

    Returns:
        1-D float ``numpy`` array of length ``v_sz``. Words that never occur
        get weight 0, and an empty corpus yields an all-zero array.
    """
    freq = {}
    count = 0
    # Single pass: tally per-word occurrences and the total word count.
    for x in X:
        for xi in x:
            freq[xi] = freq.get(xi, 0) + 1
            count += 1

    wd = np.zeros(v_sz)
    if count == 0:
        # Nothing to count; avoid dividing by zero below.
        return wd

    # Indices 0 and 1 are intentionally left at weight 0 so they are never
    # sampled -- presumably reserved start/end tokens; TODO confirm.
    for j in range(2, v_sz):
        wd[j] = (freq.get(j, 0) / float(count)) ** 0.75
    return wd


In [2]:
def get_neg_samples(word_dist, v_sz, context, num_neg_samples):
    """Draw distinct negative-sample word indices, excluding the context.

    The sampling weights are taken from ``word_dist`` with the entries of
    every context word forced to zero, then normalised to sum to one.
    ``word_dist`` itself is never modified: the masking is done on a private
    copy, so duplicate context indices or an exception raised while sampling
    cannot corrupt the caller's array.

    Args:
        word_dist: 1-D array of non-negative sampling weights, one per word.
        v_sz: vocabulary size; samples are drawn from ``0 .. v_sz - 1``.
        context: iterable of word indices that must not be sampled.
        num_neg_samples: number of distinct indices to draw.

    Returns:
        1-D integer array of ``num_neg_samples`` distinct word indices.

    Raises:
        ValueError: propagated from ``np.random.choice`` when fewer than
            ``num_neg_samples`` words have a non-zero weight after masking.
    """
    probs = np.array(word_dist, dtype=float)  # np.array copies by default
    masked = np.asarray(list(context), dtype=np.intp)
    probs[masked] = 0

    # An int first argument samples from np.arange(v_sz).
    return np.random.choice(
        v_sz,
        size=num_neg_samples,
        replace=False,
        p=probs / probs.sum(),
    )
    

In [4]:
def init_weight(M1, M2):
    """Return an (M1, M2) matrix of standard-normal draws scaled by 1/sqrt(M1 + M2)."""
    scale = np.sqrt(M1 + M2)
    gaussian = np.random.randn(M1, M2)
    return gaussian / scale

In [3]:
def sigmoid(A):
    """Apply the logistic function 1 / (1 + exp(-A)) element-wise to A."""
    decay = np.exp(-A)
    return 1 / (1 + decay)

In [7]:
class Word2Vec(object):
    """Skip-gram word embedding model trained with negative sampling.

    Attributes:
        vocab_size: number of words in the vocabulary.
        D: embedding dimension.
        W: input-to-hidden weights, shape (vocab_size, D). Set by ``fit``.
        V: equals the vocabulary size until ``fit`` runs; ``fit`` then
            replaces it with the hidden-to-output weights of shape
            (D, vocab_size), which is what ``save`` writes out.
    """

    def __init__(self, V, D):
        """Store the vocabulary size ``V`` and the embedding dimension ``D``."""
        # Kept separately because ``self.V`` is overwritten with a weight
        # matrix by fit(); without this a second fit() could not recover
        # the vocabulary size.
        self.vocab_size = V
        self.V = V
        self.D = D

    def fit(self, X, context_sz, num_neg_samples=10, learning_rate=1e-4, mu=0.99, reg=0.1, epochs=100):
        """Train the embeddings with per-word momentum SGD and plot the costs.

        Args:
            X: indexable collection of sentences, each a sequence of integer
                word indices smaller than the vocabulary size.
            context_sz: number of words taken on each side of the centre
                word. Sentences shorter than ``2 * context_sz + 1`` words
                are skipped.
            num_neg_samples: negative samples drawn per centre word.
            learning_rate: SGD step size.
            mu: momentum coefficient.
            reg: L2 regularisation strength.
            epochs: number of passes over ``X``.

        Returns:
            None. The trained weights are stored in ``self.W`` and
            ``self.V``; two cost curves are shown with matplotlib.
        """
        N = len(X)
        v_sz = self.vocab_size
        word_dist = get_word_dist(X, v_sz)

        self.W = init_weight(v_sz, self.D)
        self.V = init_weight(self.D, v_sz)

        # momentum buffers, one per weight matrix
        dW = np.zeros(self.W.shape)
        dV = np.zeros(self.V.shape)

        # mean cost of every trained sentence, across all epochs
        costs = []

        # mean cost of each epoch
        cost_per_epoch = []

        for ep in range(epochs):
            t0 = datetime.now()

            # mean cost of each sentence trained on in this epoch
            cost_per_epoch_i = []

            # visit the sentences in a fresh random order every epoch
            for j in np.random.permutation(N):
                x = X[j]
                n = len(x)

                # too short
                if n < 2 * context_sz + 1:
                    continue

                # cost of each word in sentence x
                cj = []
                for jj in range(n):
                    # x[jj] is the index of the word at position jj in x
                    center = x[jj]
                    Z = self.W[center, :]

                    start = max(0, jj - context_sz)
                    end = min(n, jj + 1 + context_sz)

                    # unique neighbouring words, excluding position jj itself
                    neighbours = list(x[start:jj]) + list(x[jj + 1:end])
                    if not neighbours:
                        # only possible when context_sz == 0; nothing to learn
                        continue
                    ctxt = np.array(sorted(set(neighbours)), dtype=np.int32)
                    neg = get_neg_samples(word_dist, v_sz, ctxt, num_neg_samples)

                    posD = Z.dot(self.V[:, ctxt])
                    posS = sigmoid(posD)

                    negD = Z.dot(self.V[:, neg])
                    negS = sigmoid(-negD)

                    cost = -np.log(posS).sum() - np.log(negS).sum()
                    cj.append(cost / (num_neg_samples + len(ctxt)))

                    # d(cost)/d(posD) = sigmoid(posD) - 1
                    pos_err = posS - 1
                    gV_pos = np.outer(Z, pos_err)
                    dV[:, ctxt] = mu*dV[:, ctxt] - learning_rate*(gV_pos + reg * self.V[:, ctxt])

                    # d(cost)/d(negD) = sigmoid(negD) = 1 - sigmoid(-negD)
                    neg_err = 1 - negS
                    gV_neg = np.outer(Z, neg_err)
                    dV[:, neg] = mu*dV[:, neg] - learning_rate*(gV_neg + reg * self.V[:, neg])

                    self.V[:, ctxt] += dV[:, ctxt]
                    self.V[:, neg] += dV[:, neg]

                    # The input-side gradient is taken against the output
                    # weights that were just updated, keeping the original
                    # statement order.
                    gW = pos_err.dot(self.V[:, ctxt].T) + neg_err.dot(self.V[:, neg].T)
                    dW[center, :] = mu*dW[center, :] - learning_rate*(gW + reg * self.W[center, :])
                    self.W[center, :] += dW[center, :]

                if cj:
                    sentence_cost = np.mean(cj)
                    cost_per_epoch_i.append(sentence_cost)
                    costs.append(sentence_cost)

            # NaN marks an epoch in which every sentence was skipped
            epoch_cost = np.mean(cost_per_epoch_i) if cost_per_epoch_i else float("nan")
            cost_per_epoch.append(epoch_cost)
            print("time to complete epoch %d:" % ep, (datetime.now() - t0), "cost:", epoch_cost)

        plt.plot(costs)
        plt.title("Numpy costs")
        plt.show()

        plt.plot(cost_per_epoch)
        plt.title("Numpy cost at each epoch")
        plt.show()

    def save(self, fn):
        """Write ``self.W`` and ``self.V`` to ``fn`` with ``np.savez``, in that order."""
        arrays = [self.W, self.V]
        np.savez(fn, *arrays)


In [None]:
# Load the corpus, persist the word -> index mapping, then train and save
# the embedding model.
# NOTE(review): get_wikipedia_date is not defined in the visible code; the
# name may be a typo for a "..._data" loader -- confirm against its module.
sentences, word2idx = get_wikipedia_date(n_files=50, n_vocab=2000)
with open('w2v_word2idx.json', 'w') as f:
    json.dump(word2idx, f)

V = len(word2idx)
# Word2Vec takes (vocabulary size, embedding dimension), in that order.
model = Word2Vec(V, 80)
model.fit(sentences, 10, learning_rate=10e-4, mu=0, epochs=7)
model.save('w2v_model.npz')

