# word2vec

## ニューラルネットワークにおける単語の処理方法

In [3]:
import numpy as np

In [6]:
c = np.array([[1, 0, 0, 0, 0, 0, 0]]) # 入力(単語ベクトル)
W = np.random.randn(7, 3) # 重み
h = np.dot(c, W) # 中間ノード 
print(h)

[[-0.93452934 -1.14687647  0.1351455 ]]


In [20]:
def preprocess(text):
    text = text.lower()
    tect = text.replace('.',' .')
    words = text.split(' ')
    
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
            
    corpus = np.array([word_to_id[w] for w in words])
    return corpus, word_to_id, id_to_word, 

## ニューラルネットワークの学習

In [12]:
class MatMul:
    def __init__ (self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)] 
        self.x = None
        
    def forward(self, x):
        W, = self.params 
        out = np.dot(x, W) 
        self.x = x
        return out
    
    def backward(self, dout):
        W, = self.params
        dx = np.dot(dout, W.T)
        dW = np.dot(self.x.T, dout) 
        self.grads[0][...] = dW 
        return dx

## CBOW モデルの推論処理

In [8]:
# サンプルデータ
c0 = np.array([1, 0, 0, 0, 0, 0, 0])
c1 = np.array([0, 0, 1, 0, 0, 0, 0])

In [10]:
# 重みベクトルの初期化
w_in = np.random.randn(7, 3)
w_out = np.random.randn(3, 7)

In [13]:
# レイヤの生成
in_layer0 = MatMul(w_in)
in_layer1 = MatMul(w_in)
out_layer = MatMul(w_out)

In [14]:
# 順伝播
h0 = in_layer0.forward(c0)
h1 = in_layer1.forward(c1)
h = (h0 + h1) * 0.5
s = out_layer.forward(h)

In [15]:
print(s)

[-1.09982798  2.25227201 -0.28279901 -0.26455171 -1.87148863  1.31783572
 -0.56190043]


## コーパスからターゲットとコンテキストを抜き取る処理

In [18]:
text = "You say goodbey and I say hello."

In [21]:
corpus, word_to_id, id_to_word = preprocess(text)

In [32]:
print(corpus)
print(corpus[1:-1])

[0 1 2 3 4 1 5]
[1 2 3 4 1]


In [23]:
id_to_word

{0: 'you', 1: 'say', 2: 'goodbey', 3: 'and', 4: 'i', 5: 'hello.'}

In [36]:
def create_contexts_target(corpus, window_size=1):
    target = corpus[window_size: -window_size]
    contexts = []
    for idx in range(window_size, len(corpus)-window_size):
        cs = []
        for t in range(-window_size, window_size + 1):
            if t == 0:
                continue
            cs.append(corpus[idx+t])
        contexts.append(cs)
    return np.array(contexts), np.array(target)

In [37]:
contexts, target = create_contexts_target(corpus, window_size=1)

0
2
[0, 2]
1
3
[1, 3]
2
4
[2, 4]
3
1
[3, 1]
4
5
[4, 5]
