<a href="https://colab.research.google.com/github/ttogle918/AI_practice/blob/main/book/%EB%B0%91%EB%B0%94%EB%8B%A5%EB%B6%80%ED%84%B0_%EC%8B%9C%EC%9E%91%ED%95%98%EB%8A%94_%EB%94%A5%EB%9F%AC%EB%8B%9D_2/02_%EB%8B%A8%EC%96%B4%EC%9D%98%EB%B6%84%EC%82%B0%ED%91%9C%ED%98%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
def preprocess(text):
    """Tokenize ``text`` and map every token to an integer word ID.

    The text is lower-cased, every period is split off into its own token
    (``'hello.'`` becomes ``'hello'`` and ``'.'``), and the result is split
    on whitespace. IDs are handed out in order of first appearance,
    starting at 0.

    :param text: input string to tokenize
    :return: tuple ``(corpus, word_to_id, id_to_word)``; ``corpus`` is a 1-D
        NumPy integer array holding the word ID of each token in text order,
        ``word_to_id`` maps token -> ID and ``id_to_word`` maps ID -> token
    """
    text = text.lower()
    text = text.replace('.', ' .')
    # split() without an argument splits on any run of whitespace, so double
    # spaces, newlines, tabs and leading/trailing blanks never produce an
    # empty-string token (split(' ') did, and '' then received a word ID).
    words = text.split()

    word_to_id, id_to_word = {}, {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # dtype=int keeps the corpus integer-typed even when there are no tokens
    # (np.array([]) would otherwise default to float64).
    corpus = np.array([word_to_id[w] for w in words], dtype=int)
    return corpus, word_to_id, id_to_word
# Preprocess the sample sentence and show the corpus together with both
# lookup tables (one item per output line).
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
print(
    f'corpus : {corpus}',
    f'word_to_id : {word_to_id}',
    f'id_to_word : {id_to_word} ',
    sep='\n',
)

corpus : [0 1 2 3 4 1 5 6]
word_to_id : {'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
id_to_word : {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'} 


In [7]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    """Build a word co-occurrence matrix from a corpus of word IDs.

    For every position in ``corpus``, each word up to ``window_size``
    positions to the left and to the right is counted once as context of
    the word at that position.

    :param corpus: indexable sequence of integer word IDs
    :param vocab_size: number of rows and columns of the result; every ID in
        ``corpus`` is used as an index into it
    :param window_size: how many neighbours on each side count as context
    :return: ``(vocab_size, vocab_size)`` int32 array whose entry ``[i, j]``
        is the number of times word ``j`` appeared in the window of word ``i``
    """
    corpus_size = len(corpus)
    # int32 instead of int16: on a realistic corpus a single count easily
    # exceeds 32767, and an int16 accumulator would wrap around to negative
    # values without raising an error.
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            left_idx = idx - offset
            right_idx = idx + offset
            # Neighbours that would fall outside the corpus are skipped.
            if left_idx >= 0:
                co_matrix[word_id, corpus[left_idx]] += 1
            if right_idx < corpus_size:
                co_matrix[word_id, corpus[right_idx]] += 1
    return co_matrix
# Show the vocabulary order (it is the row/column order of the matrices),
# then the co-occurrence matrix for window sizes 1, 2 and 3. A separator
# line follows every matrix except the last one.
print(word_to_id.keys())
for window in (1, 2):
    print(create_co_matrix(corpus, len(word_to_id), window_size=window), '\n----------------')
print(create_co_matrix(corpus, len(word_to_id), window_size=3))

dict_keys(['you', 'say', 'goodbye', 'and', 'i', 'hello', '.'])
[[0 1 0 0 0 0 0]
 [1 0 1 0 1 1 0]
 [0 1 0 1 0 0 0]
 [0 0 1 0 1 0 0]
 [0 1 0 1 0 0 0]
 [0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0]] 
----------------
[[0 1 1 0 0 0 0]
 [1 0 1 2 1 1 1]
 [1 1 0 1 1 0 0]
 [0 2 1 0 1 0 0]
 [0 1 1 1 0 1 0]
 [0 1 0 0 1 0 1]
 [0 1 0 0 0 1 0]] 
----------------
[[0 1 1 1 0 0 0]
 [1 0 2 2 2 1 1]
 [1 2 0 1 1 0 0]
 [1 2 1 0 1 1 0]
 [0 2 1 1 0 1 1]
 [0 1 0 1 1 0 1]
 [0 1 0 0 1 1 0]]


In [8]:
def cos_similarity(x, y, eps=1e-08):
    """Return the cosine similarity of two vectors.

    :param x: first vector (array-like of numbers)
    :param y: second vector, same length as ``x``
    :param eps: small constant added to each squared norm before the square
        root, so that an all-zero vector does not cause a division by zero
    :return: dot product of the two normalized vectors
    """
    # Compute in float64. Squaring is done in the input's dtype, so a row of
    # a narrow integer matrix (such as int16) overflows as soon as one entry
    # exceeds 181, which makes the "squared norm" negative and the result NaN.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    nx = x / np.sqrt(np.sum(x**2) + eps)  # normalize x
    ny = y / np.sqrt(np.sum(y**2) + eps)  # normalize y
    return np.dot(nx, ny)
# Cosine similarity between the co-occurrence vectors (window size 1) of
# the words "you" and "i".
C = create_co_matrix(corpus, len(word_to_id), window_size=1)
c0, c1 = (C[word_to_id[word]] for word in ('you', 'i'))
print(cos_similarity(c0, c1))

0.7071067758832467


In [10]:
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    '''Print the words most similar to ``query`` by cosine similarity.

    :param query: query word (text)
    :param word_to_id: dictionary mapping a word to its word ID
    :param id_to_word: dictionary mapping a word ID to its word
    :param word_matrix: matrix of word vectors; row ``i`` is assumed to hold
        the vector of the word whose ID is ``i``
    :param top: maximum number of results to print; ``top <= 0`` prints only
        the query header and no results
    :return: None. The ranking is written to stdout; if ``query`` is not in
        ``word_to_id`` a "not found" message is printed instead.
    '''
    if query not in word_to_id:
        print('%s(을)를 찾을 수 없습니다.' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # Cosine similarity of every vocabulary word against the query vector.
    vocab_size = len(id_to_word)

    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    # Print in descending order of similarity, skipping the query itself.
    count = 0
    for i in (-1 * similarity).argsort():
        # The limit is checked before printing; checking it only afterwards
        # would always emit one result, even for top=0.
        if count >= top:
            break
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        count += 1
# List the five words whose co-occurrence vectors are closest to "you".
most_similar(
    query='you',
    word_to_id=word_to_id,
    id_to_word=id_to_word,
    word_matrix=C,
    top=5,
)


[query] you
 goodbye: 0.7071067758832467
 i: 0.7071067758832467
 hello: 0.7071067758832467
 say: 0.0
 and: 0.0
