In [1]:
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Download the dataset XML from the tutorial's GitHub repository into the
# current working directory. The call is the cell's last expression, so its
# (filename, headers) return value is shown as the cell output.
ted_url = "https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/ted_en-20160408.xml"
ted_filename = "ted_en-20160408.xml"
urllib.request.urlretrieve(ted_url, filename=ted_filename)

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x1fab0136610>)

In [3]:
# Parse the downloaded XML. The context manager closes the file handle as soon
# as parsing is done, even if etree.parse raises.
with open('ted_en-20160408.xml', 'r', encoding='UTF8') as targetXML:
    target_text = etree.parse(targetXML)

# Keep only the text nodes of the <content> elements, joined by newlines.
parse_text = '\n'.join(target_text.xpath('//content/text()'))

# Remove parenthesised annotations such as (Audio) or (Laughter).
content_text = re.sub(r'\([^)]*\)', '', parse_text)

# Split the cleaned corpus into sentences with NLTK and show the first one.
sent_text = sent_tokenize(content_text)
sent_text[0]

"Here are two reasons companies fail: they only do more of the same, or they only do what's new."

In [4]:
# Lowercase every sentence and replace each character that is not a lowercase
# ASCII letter or a digit with a space (this strips punctuation).
# The digit range must be 0-9: a range of 0-0 matches only the character '0',
# which would turn the digits 1-9 into spaces as well.
normalized_text = [
    re.sub(r'[^a-z0-9]', ' ', string.lower())
    for string in sent_text
]

In [5]:
# Word-tokenize each normalized sentence with NLTK, producing one token list
# per sentence, then print the tokens of the first sentence as a sanity check.
result = list(map(word_tokenize, normalized_text))
print(result[0])

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']


### GloVe 훈련시키기

In [6]:
from glove import Corpus, Glove

# Create an empty Corpus object; corpus.fit() is called on it further below
# with the tokenized sentences.
corpus = Corpus()

In [7]:
# Build the co-occurrence matrix that GloVe will train on from the tokenized
# sentences, using a context window of 5.
window_size = 5
corpus.fit(result, window=window_size)

In [8]:
# Configure the GloVe model: no_components=100 and learning_rate=0.05.
embedding_dim = 100
initial_learning_rate = 0.05
glove = Glove(no_components=embedding_dim, learning_rate=initial_learning_rate)

In [9]:
# Train on the co-occurrence matrix for 20 epochs using 4 threads, then attach
# the corpus dictionary to the model (the cells below query it by word).
n_epochs = 20
n_threads = 4
glove.fit(corpus.matrix, epochs=n_epochs, no_threads=n_threads, verbose=True)
glove.add_dictionary(corpus.dictionary)

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


### 테스트

In [10]:
# Look up the words whose vectors are closest to "man" and print them.
neighbors_of_man = glove.most_similar("man")
print(neighbors_of_man)

[('woman', 0.9565617376988225), ('guy', 0.889639917762371), ('girl', 0.8569227434157962), ('kid', 0.8355838534152334)]


In [11]:
# Look up the words whose vectors are closest to 'boy' and print them.
neighbors_of_boy = glove.most_similar('boy')
print(neighbors_of_boy)

[('girl', 0.9402652404600705), ('kid', 0.8416383971200846), ('woman', 0.8406037110104396), ('man', 0.8174028308008443)]


In [12]:
# Look up the words whose vectors are closest to 'university' and print them.
neighbors_of_university = glove.most_similar('university')
print(neighbors_of_university)

[('harvard', 0.8966821120517038), ('mit', 0.863381827933563), ('cambridge', 0.8446023319763226), ('stanford', 0.8294566078706815)]
