## 3.3　学習データの準備

In [1]:
import sys
import numpy as np

In [2]:
# Put the book's source tree on the import path so that `common.util`
# (imported in the next cell) can be found.
# NOTE(review): the relative '../input/...' path looks specific to the
# environment this notebook was run in -- adjust it when running elsewhere.
sys.path.append('../input/deeplearningfromscratch2master/deep-learning-from-scratch-2-master')

In [3]:
from common.util import preprocess

### 3.3.1　コンテキストとターゲット

In [4]:
# Run the sample sentence through preprocess(); it hands back the corpus of
# word IDs together with the word_to_id / id_to_word lookups.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

# Each call prints the label on one line and the value on the next.
print('corpus', corpus, sep='\n')
print('id_to_word', id_to_word, sep='\n')

corpus
[0 1 2 3 4 1 5 6]
id_to_word
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


`corpus` は、単語 ID の配列である。

In [5]:
def create_contexts_target(corpus, window_size=1):
    """Split a corpus of word IDs into context windows and their center targets.

    Every position that has ``window_size`` neighbours on both sides becomes a
    target; its context is the ``window_size`` IDs before it followed by the
    ``window_size`` IDs after it.

    Args:
        corpus: 1-D sequence (list or NumPy array) of word IDs.
        window_size: Number of neighbours taken on each side of a target.
            Must be >= 0.

    Returns:
        A ``(contexts, target)`` tuple of NumPy arrays. For a corpus of
        length ``n > 2 * window_size``, ``contexts`` has shape
        ``(n - 2 * window_size, 2 * window_size)`` and ``target`` has shape
        ``(n - 2 * window_size,)``. Both are empty when the corpus is too
        short to hold a full window.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError('window_size must be non-negative, got %r' % (window_size,))

    # Slice with an explicit stop index: `corpus[window_size:-window_size]`
    # collapses to an empty slice when window_size == 0 (since -0 == 0), which
    # left `target` empty while `contexts` still had one row per word.
    # Clamping to window_size keeps the stop from going negative (and wrapping
    # around) when the corpus is shorter than the window.
    stop = max(len(corpus) - window_size, window_size)
    target = corpus[window_size:stop]

    # Relative positions of the context words; offset 0 is the target itself.
    offsets = [t for t in range(-window_size, window_size + 1) if t != 0]
    contexts = [[corpus[idx + t] for t in offsets]
                for idx in range(window_size, stop)]

    return np.array(contexts), np.array(target)

In [6]:
# Build the (contexts, target) pairs from the corpus, using one neighbouring
# word on each side of every target.
contexts, target = create_contexts_target(corpus, window_size=1)

In [7]:
# Show the corpus next to the arrays derived from it; each call prints the
# label on one line and the value on the next.
print('corpus', corpus, sep='\n')
print('contexts', contexts, sep='\n')
print('target', target, sep='\n')

corpus
[0 1 2 3 4 1 5 6]
contexts
[[0 2]
 [1 3]
 [2 4]
 [3 1]
 [4 5]
 [1 6]]
target
[1 2 3 4 1 5]


`contexts` の各行は、`target` の同じ位置にある単語の前後 `window_size` 個分の単語 ID（いずれも `corpus` の要素）を並べたものである。

### 3.3.2　one-hot 表現への変換

In [8]:
from  common.util import preprocess, create_contexts_target, convert_one_hot

In [9]:
# Start this section from the raw sentence again: preprocess() hands back the
# corpus of word IDs together with the word_to_id / id_to_word lookups.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

# Each call prints the label on one line and the value on the next.
print('corpus', corpus, sep='\n')
print('id_to_word', id_to_word, sep='\n')

corpus
[0 1 2 3 4 1 5 6]
id_to_word
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [10]:
# Derive the (contexts, target) word-ID arrays with a window of one word per side.
contexts, target = create_contexts_target(corpus, window_size=1)

# Each call prints the label on one line and the array on the next.
print('contexts', contexts, sep='\n')
print('target', target, sep='\n')

contexts
[[0 2]
 [1 3]
 [2 4]
 [3 1]
 [4 5]
 [1 6]]
target
[1 2 3 4 1 5]


In [11]:
# The vocabulary size is the number of distinct words known to word_to_id.
vocab_size = len(word_to_id)

# NOTE: `target` and `contexts` are rebound to their converted versions, so
# re-running only this cell would feed the already-converted arrays back into
# convert_one_hot(); re-run the cell that builds them first.
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)

# Each call prints the label on one line and the value on the next.
print('vocab_size', vocab_size, sep='\n')
print('target', target, sep='\n')
print('contexts', contexts, sep='\n')

vocab_size
7
target
[[0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 1 0 0]
 [0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0]]
contexts
[[[1 0 0 0 0 0 0]
  [0 0 1 0 0 0 0]]

 [[0 1 0 0 0 0 0]
  [0 0 0 1 0 0 0]]

 [[0 0 1 0 0 0 0]
  [0 0 0 0 1 0 0]]

 [[0 0 0 1 0 0 0]
  [0 1 0 0 0 0 0]]

 [[0 0 0 0 1 0 0]
  [0 0 0 0 0 1 0]]

 [[0 1 0 0 0 0 0]
  [0 0 0 0 0 0 1]]]
