## 2.3　カウントベースの手法

In [1]:
import os
import sys
import numpy as np
import pandas as pd

In [2]:
os.getcwd()

'/kaggle/working'

In [3]:
sys.path.append('../input/deeplearningfromscratch2master/deep-learning-from-scratch-2-master')

In [4]:
from common.util import preprocess, create_co_matrix, cos_similarity, most_similar

### 2.3.1　Python によるコーパスの下準備

In [5]:
text = 'You say goodbye and I say hello.'

In [6]:
text = text.lower()
text = text.replace('.', ' .')
text

'you say goodbye and i say hello .'

`split` でスペースを区切り文字に変換する。

In [7]:
words = text.split(' ')
words

['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']

In [8]:
word_to_id = {}
id_to_word = {}

for word in words:
    if word not in word_to_id:
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word

In [9]:
id_to_word

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}

In [10]:
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}

In [11]:
print(id_to_word[1])
print(word_to_id['hello'])

say
5


In [12]:
corpus = [word_to_id[w] for w in words]
corpus = np.array(corpus)
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

In [13]:
def preprocess(text):
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')
    
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
            
    corpus = np.array([word_to_id[w] for w in words])
    
    return corpus, word_to_id, id_to_word

In [14]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

In [15]:
print(corpus)
print(word_to_id)
print(id_to_word)

[0 1 2 3 4 1 5 6]
{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


### 2.3.4　共起行列

In [16]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

print(corpus)
print(id_to_word)

[0 1 2 3 4 1 5 6]
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [17]:
C = np.array([
    [0, 1, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 1, 1, 0],
    [0, 1, 0, 1, 0, 0, 0],
    [0, 0, 1, 0, 1, 0, 0],
    [0, 1, 0, 1, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 1, 0]
], dtype=np.int32)

In [18]:
print(C[0])
print(C[4])
print(C[word_to_id['goodbye']])

[0 1 0 0 0 0 0]
[0 1 0 1 0 0 0]
[0 1 0 1 0 0 0]


`create_co_matrix` は、`corpus` （単語 ID リスト）から共起ベクトルを作成する。

In [19]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    
    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i
            
            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1
                
            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1
                
    return co_matrix

### 2.3.5　ベクトル間の類似度

In [20]:
def cos_similarity(x, y):
    nx = x / np.sqrt(np.sum(x**2))
    ny = y / np.sqrt(np.sum(y**2))
    return np.dot(nx, ny)

In [21]:
def cos_similarity(x, y, eps=1e-8):
    nx = x / (np.sqrt(np.sum(x**2)) + eps)
    ny = y / (np.sqrt(np.sum(y**2)) + eps)
    return np.dot(nx, ny)

In [22]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

c0 = C[word_to_id['you']]
c1 = C[word_to_id['i']]
print(cos_similarity(c0, c1))

0.7071067691154799


### 2.3.6　類似単語のランキング表記

In [23]:
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    if query not in word_to_id:
        print('%s is not found' % query)
        
    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]
    
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)
        
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity))
        
        count += 1
        if count >= top:
            return

In [24]:
x = np.array([100, -20, 2])
print(x.argsort())
print((-x).argsort())

[1 2 0]
[0 2 1]


In [25]:
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

In [26]:
print(text)
print(corpus)
print(word_to_id)
print(id_to_word)
print(C)

You say goodbye and I say hello.
[0 1 2 3 4 1 5 6]
{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}
[[0 1 0 0 0 0 0]
 [1 0 1 0 1 1 0]
 [0 1 0 1 0 0 0]
 [0 0 1 0 1 0 0]
 [0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0]]


`corpus` は単語 ID を並べたものである。<br>

In [27]:
print(corpus)
print(id_to_word)

[0 1 2 3 4 1 5 6]
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


また、`C` は共起ベクトルで、となりにある単語をベクトル化している。

In [28]:
text_list = ['You', 'say', 'goodbye', 'and', 'I', 'say', 'hello']
pd.DataFrame(C, index=text_list, columns=text_list)

Unnamed: 0,You,say,goodbye,and,I,say.1,hello
You,0,1,0,0,0,0,0
say,1,0,1,0,1,1,0
goodbye,0,1,0,1,0,0,0
and,0,0,1,0,1,0,0
I,0,1,0,1,0,0,0
say,0,1,0,0,0,0,1
hello,0,0,0,0,0,1,0


In [29]:
most_similar('you', word_to_id, id_to_word, C, top=5)


[query] you
 goodbye: [0.99999998 0.         0.70710677 0.         0.70710677 0.70710677
 0.        ]
 i: [0.99999998 0.         0.70710677 0.         0.70710677 0.70710677
 0.        ]
 hello: [0.99999998 0.         0.70710677 0.         0.70710677 0.70710677
 0.        ]
 say: [0.99999998 0.         0.70710677 0.         0.70710677 0.70710677
 0.        ]
 and: [0.99999998 0.         0.70710677 0.         0.70710677 0.70710677
 0.        ]


`corpus` と `vocab_size` のサイズの違いに気を付ける。

In [30]:
print(corpus)
print(len(corpus))
print(id_to_word)
print(vocab_size)

[0 1 2 3 4 1 5 6]
8
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}
7
