In [27]:
text = 'You say goodbye and I say hello.'

In [12]:
text = text.lower()
text = text.replace('.',' .')
text

'you say goodbye and i say hello .'

In [13]:
words = text.split(' ')
words

['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']

In [14]:
word_to_id = {}
id_to_word = {}

for word in words :
    if word not in word_to_id :
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word

In [15]:
id_to_word

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}

In [16]:
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}

In [17]:
id_to_word[1]

'say'

In [18]:
import numpy as np

In [21]:
corpus = [word_to_id[w] for w in words]
corpus = np.array(corpus)
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

In [25]:
def preprocess(text) :
    text = text.lower()
    text = text.replace('.',' .')
    text = text.split(' ')
    word_to_id = {}
    id_to_word = {}
    for word in words :
        if word not in word_to_id :
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    corpus = np.array([word_to_id[w] for w in words])
    
    return corpus, word_to_id, id_to_word

# corpus -> 단어 ID목록, word_to_id -> 단어에서 단어 ID로 , id_to_word -> 단어 ID에서 단어로

In [28]:
import sys
text
corpus, word_to_id, id_to_word = preprocess(text)

In [29]:
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

In [30]:
id_to_word

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}

In [49]:
def create_co_matrix(corpus, vocab_size, window_size =1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    
    for idx, word_id in enumerate(corpus) :
        for i in range(1, window_size+1) :
            left_idx = idx-i
            right_idx = idx+i
            
            if left_idx >= 0 :
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] +=1
            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] +=1
    return co_matrix

In [40]:
def cos_similarity(x,y, eps = 1e-8) :
    nx = x / np.sqrt(np.sum(x**2) + eps) # x의 정규화
    ny = y / np.sqrt(np.sum(y**2) + eps) # y의 정규화
    return np.dot(nx, ny)

In [47]:
vocab_size = len(word_to_id)

In [50]:
C = create_co_matrix(corpus, vocab_size)

In [52]:
C0 = C[word_to_id['you']]
C1 = C[word_to_id['i']]
print(cos_similarity(C0,C1))

0.7071067758832467


In [84]:
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    # 검색어를 꺼낸다
    if query not in word_to_id :
        print("%s(을)를 찾을 수 없습니다." % query)
        return
    
    print('\n[query]' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]
    
    # 코사인 유사도 계산
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size) :
        similarity[i] = cos_similarity(word_matrix[i],query_vec)
    
    # 코사인 유사도를 기준으로 내림차순으로 출력
    count = 0
    for i in (-1 * similarity).argsort() : # argsort() -> 넘파이 원소를 오름차순으로 정렬 후 인덱스 반환, -1 곱하면 내림차순
        if id_to_word[i] == query:
            continue
        print('%s : %s' %(id_to_word[i], similarity[i].round(2)))
        count +=1
        if count >= top :
            return

In [85]:
most_similar('you', word_to_id, id_to_word, C, top=5)


[query]you
goodbye : 0.71
i : 0.71
hello : 0.71
say : 0.0
and : 0.0
