In [2]:
from nltk import FreqDist
import numpy as np
import re
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

sw = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tjdwn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [12]:
def buildDict(docs):          # python list
    doc_tokens = []
    for doc in docs:
        delim = re.compile(r'[\s,.]+')
        tokens = delim.split(doc.lower())
        tokens = [t for t in tokens if t not in sw]
        if tokens[-1] == '.': tokens = tokens[:-1]
        doc_tokens.append(tokens)

    vocab = FreqDist(np.hstack(doc_tokens))
    vocab = vocab.most_common()
    word_to_id = {word[0]: id for id, word in enumerate(vocab)}
    id_to_word = {id: word[0] for id, word in enumerate(vocab)}
    corpus = np.array([word_to_id[word[0]] for word in vocab])
    return doc_tokens, corpus, word_to_id, id_to_word

In [13]:
import pandas as pd

with open('./bts_korean.txt', 'r', encoding='utf-8') as f:
    kor_docs = f.readlines()

with open('./bts.txt', 'r', encoding='utf-8') as f:
    docs = f.readlines()

for id, doc in enumerate(docs):
    print('[{}] : {}...'.format(id, doc[:30]))

doc_tokens, corpus, word_to_id, id_to_word = buildDict(docs)


[0] : BTS, also known as the Bangtan...
[1] : [5] The septet—consisting of m...
[2] : Originally a hip hop group, th...
[3] : Their lyrics, often focused on...
[4] : Their work also often referenc...
[5] : After debuting in 2013 with th...
[6] : The group's second Korean stud...
[7] : By 2017, BTS crossed into the ...
[8] : They became the first Korean g...
[9] : BTS became one of the few grou...
[10] : In 2020, BTS became the first ...
[11] : Their follow-up releases "Sava...
[12] : Having sold over 20 million al...
[13] : They are the first Asian and n...
[14] : Featured on Time's internation...
[15] : The group's numerous accolades...
[16] : Outside of music, they partner...


In [14]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    '''
    동시발생 행렬 생성
    :param corpus: 말뭉치(단어 ID 목록)
    :param vocab_size: 어휘 수
    :param window_size: 윈도우 크기(윈도우 크기가 1이면 타깃 단어 좌우 한 단어씩이 맥락에 포함)
    :return: 동시발생 행렬
    '''
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix


In [15]:
def ppmi(C, verbose=False, eps=1e-8):
    '''
    PPMI (점별 상호정보량) 생성
    :param C: 동시발생 행렬
    :param verbose: 진행 상황을 출력할지 여부
    :return:
    '''
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % (total // 100 + 1) == 0:
                    print('%.1f%% 완료' % (100 * cnt / total))

    return M


In [16]:
window_size = 2

vocab_size = len(word_to_id)
print('동시발생행렬 계산')
C = create_co_matrix(corpus, vocab_size, window_size)
W = ppmi(C)

print(C[0, :10])
print(W[0, :10])


동시발생행렬 계산
[0 1 1 0 0 0 0 0 0 0]
[0.        7.4178524 7.0028152 0.        0.        0.        0.
 0.        0.        0.       ]


In [17]:
from sklearn.utils.extmath import randomized_svd
wordvec_size = 100
U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5, random_state=None)


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    if query not in word_to_id:
        print('{}를 찾을 수 없음.'.format(query))
        return

    word_vector = np.array([word_matrix[word_to_id[query]]])  # 쿼리단어 벡터 추출
    word_vector = word_vector.reshape(1, -1)  # cosine_similarity 위해 벡터 형상 조정
    sim = cosine_similarity(word_vector, word_matrix)
    sim = sim[0]  # 벡터 형상 조정 ([[]] -> [])

    sim = [(id, cos) for id, cos in enumerate(sim)]  # id, 유사도 쌍으로 정리
    sim = sorted(sim, key=lambda x: x[1], reverse=True)  # 유사도 높은 순 정렬

    return sim[1:top+1]  # 자기 자신 제외하고 top개 반환


In [19]:
rank = most_similar('world', word_to_id, id_to_word, U)
for r in rank:
    print(id_to_word[r[0]], r[1])


artist 0.52162355
known 0.5026817
influential 0.41057464
stadium 0.3954757
best-selling 0.22336486
