In [2]:
import numpy as np

# Two small 1-D arrays used to demonstrate NumPy's concatenation helpers.
x = np.array([1, 2])
y = np.array([4, 5])
# np.c_ places the 1-D inputs side by side as columns, so the displayed
# result is a 2x2 array whose rows pair up x[i] with y[i].
np.c_[x, y]

array([[1, 4],
       [2, 5]])

In [3]:
# np.r_ joins the 1-D arrays end to end, giving a single 1-D array
# (x and y are the arrays created in the earlier cell).
np.r_[x, y]

array([1, 2, 4, 5])

In [8]:
import collections

def cross_entropy_error(y, t):
    """Return the batch-averaged cross-entropy of predictions against targets.

    Args:
        y: Predicted probabilities of shape ``(N, C)``; a 1-D array of
            length ``C`` is treated as a single sample.
        t: Targets, given either as an array with as many elements as ``y``
            (one-hot / per-class weights) or as ``N`` integer class indices.

    Returns:
        Scalar loss: ``-sum(t * log(y + 1e-7)) / N`` for the one-hot form, or
        the equivalent ``-sum(log(y[n, t[n]] + 1e-7)) / N`` for index targets.
    """
    y = np.asarray(y)
    t = np.asarray(t)
    if y.ndim == 1:
        y = y.reshape(1, -1)
    batch_size = y.shape[0]

    # The small constant keeps log() finite when a probability is exactly 0.
    log_y = np.log(y + 1e-7)
    if t.size == y.size:
        total = np.sum(t.reshape(y.shape) * log_y)
    else:
        # Index targets: pick the log-probability of the labelled class in
        # each row instead of broadcasting t against the (N, C) matrix.
        total = np.sum(log_y[np.arange(batch_size), t.reshape(batch_size)])
    return -total / batch_size


class SigmoidwithLoss:
    """Sigmoid activation fused with a binary cross-entropy loss."""

    def __init__(self):
        # The layer has no trainable parameters; the empty lists keep the
        # params/grads attributes that the other layers in this file expose.
        self.params = []
        self.grads = []
        self.loss = None
        self.y = None  # sigmoid output saved by forward()
        self.t = None  # targets saved by forward()

    def forward(self, x, t):
        """Compute the mean binary cross-entropy of ``sigmoid(x)`` against ``t``.

        Args:
            x: Raw scores, one per sample.
            t: Integer labels (0 or 1), one per sample.

        Returns:
            The scalar loss, also stored in ``self.loss``.
        """
        self.t = t
        self.y = 1 / (1 + np.exp(-x))

        # Columns are [P(label = 0), P(label = 1)], so the integer label
        # selects the probability assigned to the correct class.
        self.loss = cross_entropy_error(np.c_[1 - self.y, self.y], self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the mean loss with respect to ``x``.

        Args:
            dout: Upstream gradient (defaults to 1).
        """
        batch_size = self.t.shape[0]

        dx = (self.y - self.t) * dout / batch_size
        return dx
    
class UnigramSampler:
    """Sample negative word ids from a power-smoothed unigram distribution.

    Word frequencies are counted from ``corpus``, raised to ``power`` and
    normalised into the probability vector ``word_p``.  Word ids are assumed
    to be the contiguous integers ``0 .. vocab_size - 1``: ``vocab_size`` is
    the number of distinct ids seen and ``word_p`` is indexed by id.
    """

    def __init__(self, corpus, power, sample_size):
        """
        Args:
            corpus: Iterable of integer word ids.
            power: Exponent applied to the raw counts before normalising.
            sample_size: Number of negative ids drawn per target.
        """
        self.sample_size = sample_size

        counts = collections.Counter()
        for word_id in corpus:
            counts[word_id] += 1

        self.vocab_size = len(counts)

        # Counter returns 0 for an id that never occurred, so gaps in the id
        # range simply get zero probability.
        frequencies = np.array(
            [counts[i] for i in range(self.vocab_size)], dtype=float
        )
        self.word_p = np.power(frequencies, power)
        self.word_p /= np.sum(self.word_p)

    def get_negative_sampler(self, target):
        """Draw ``sample_size`` negative word ids for every entry of ``target``.

        Args:
            target: 1-D array of positive word ids, one per batch row.

        Returns:
            Integer array of shape ``(batch_size, sample_size)``.

        On the default path each row is drawn without replacement from the
        distribution with that row's target id zeroed out, so a row never
        contains its own target.  If a module-level ``GPU`` flag exists and is
        truthy, a single vectorised draw with replacement is made instead;
        that faster path may occasionally return the target id itself.
        """
        batch_size = target.shape[0]

        # GPU is an optional module-level switch; when it has not been
        # defined the CPU path is used instead of raising NameError.
        use_gpu = globals().get("GPU", False)

        if not use_gpu:
            negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)

            for i in range(batch_size):
                p = self.word_p.copy()
                target_idx = target[i]
                p[target_idx] = 0  # exclude the positive example
                p /= p.sum()
                negative_sample[i, :] = np.random.choice(
                    self.vocab_size, size=self.sample_size, replace=False, p=p
                )
        else:
            negative_sample = np.random.choice(
                self.vocab_size,
                size=(batch_size, self.sample_size),
                replace=True,
                p=self.word_p,
            )

        return negative_sample

In [7]:
# A Counter treats a missing key as 0, so `+=` works without initialising
# the entry first.
counts = collections.Counter()
counts['apple'] += 1
counts['orange'] += 2
# The printed form lists entries from the highest count to the lowest.
print(counts)

Counter({'orange': 2, 'apple': 1})


In [11]:
words = ['you', 'say', 'goodbye', 'I', 'hello', '.']
# Draw five words uniformly at random. The result is not the cell's last
# expression, so it is not displayed.
np.random.choice(words, size=5)
# Same draw, weighted by one probability per word (the values sum to 1).
p = [0.5, 0.1, 0.05, 0.2, 0.05, 0.1]
np.random.choice(words, size=5, p=p)

# Raising probabilities to the 0.75 power and renormalising raises the share
# of the rarest entry (0.01 -> ~0.027) and lowers that of the most frequent
# one (0.7 -> ~0.64). Note that `p` is rebound here from a list to an ndarray.
p = np.array([0.7, 0.29, 0.01])
new_p = np.power(p, 0.75)
new_p /= np.sum(new_p)
new_p

array([0.64196878, 0.33150408, 0.02652714])

In [3]:
class EmbeddingDot:
    """Embedding lookup followed by a row-wise dot product with hidden vectors.

    Relies on an ``Embedding`` layer class defined outside this cell; it is
    presumably a row lookup into ``W`` exposing ``params``, ``grads``,
    ``forward(idx)`` and ``backward(dout)`` -- confirm against its definition.
    """

    def __init__(self, W):
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None  # (h, target_W) saved by forward() for backward()

    def forward(self, h, idx=None):
        """Score each sample as the dot product of ``h[n]`` with row ``idx[n]``.

        Args:
            h: Hidden vectors, one row per sample.  May be ``None``, in which
                case every looked-up row is weighted by ones.
            idx: Word ids selecting one embedding row per sample.

        Returns:
            1-D array with one score per sample.

        For backward compatibility the layer may still be called with a
        single argument holding the word ids; the result is then the plain
        sum of each looked-up row, exactly as before.
        """
        if idx is None:
            # Legacy single-argument call: the sole argument carries the ids.
            idx, h = h, None

        target_W = self.embed.forward(idx)
        if h is None:
            h = np.ones_like(target_W)
        out = np.sum(target_W * h, axis=1)

        self.cache = (h, target_W)
        return out

    def backward(self, dout):
        """Propagate ``dout`` to the embedding rows and to the hidden vectors.

        Args:
            dout: Upstream gradient, one value per sample.

        Returns:
            Gradient with respect to ``h`` (``dout`` times the looked-up
            rows), so that callers can accumulate it across layers.
        """
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)

        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        dh = dout * target_W
        return dh


class NegativeSamplingLoss:
    """Negative-sampling loss: one positive and ``sample_size`` negative scores.

    Layer 0 of ``embed_dot_layers`` / ``loss_layers`` handles the positive
    (target) word with label 1; layers ``1 .. sample_size`` handle the sampled
    negative words with label 0.
    """

    def __init__(self, W, corpus, power=0.75, sample_size=5):
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        self.loss_layers = [SigmoidwithLoss() for _ in range(sample_size + 1)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]

        self.params, self.grads = [], []
        for layer in self.embed_dot_layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, x, h=None):
        """Return the summed loss over the positive and negative examples.

        Args:
            x: Integer array of target (positive) word ids, one per sample.
            h: Hidden vectors, one row per sample, scored against the
                embedding rows.  When omitted, rows are scored by their plain
                sum, which is what the original single-argument call did.
        """
        batch_size = x.shape[0]
        negative_sample = self.sampler.get_negative_sampler(x)

        # Positive example: the true target word, labelled 1.
        score = self.embed_dot_layers[0].forward(h, x)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        # Negative examples: column i of the sample goes to layer 1 + i.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[1 + i].forward(h, negative_target)
            loss += self.loss_layers[1 + i].forward(score, negative_label)

        return loss

    def backward(self, dout=1):
        """Back-propagate through every layer pair; return the summed ``dh``."""
        dh = 0
        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):
            dscore = l0.backward(dout)
            dh += l1.backward(dscore)

        return dh
    
    
# Build a toy vocabulary: every distinct word receives the next unused
# integer id, in order of first appearance.
text = ['you', 'say', 'goodbye', 'I', 'hello', '.']
word_to_id = {}
id_to_word = {}
for word in text:
    # setdefault only assigns a fresh id the first time a word is seen.
    new_id = word_to_id.setdefault(word, len(word_to_id))
    id_to_word[new_id] = word

# The corpus is the word sequence re-expressed as ids.
corpus = np.array([word_to_id[token] for token in text])




In [4]:
import collections
import numpy as np
from common.util import preprocess

# Tokenise a sample sentence and tally how often each word id occurs.
text = "The sky is vert blue and the sun is bright."
corpus, word_to_id, id_to_word = preprocess(text)
counts = collections.Counter(corpus)
print(counts)

vocab_size = len(word_to_id)
print(vocab_size)

# Per-id frequencies, squared; a Counter yields 0 for ids it never saw.
word_p = np.array([counts[word_id] for word_id in range(vocab_size)], dtype=float)
word_p = word_p ** 2
print(word_p)

# Normalise the squared counts into a probability distribution.
word_p = word_p / word_p.sum()
print(word_p)


Counter({0: 2, 2: 2, 1: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1})
9
[4. 1. 4. 1. 1. 1. 1. 1. 1.]
[0.26666667 0.06666667 0.26666667 0.06666667 0.06666667 0.06666667
 0.06666667 0.06666667 0.06666667]
