### 1. GloVe training
* 영어/한국어 Word2Vec 학습에서 사용한 영어 데이터를 재사용
* 이전과 동일한 전처리를 모두 마쳤고, 그 결과가 이전과 같이 result에 저장되어 있다고 가정

In [3]:
## 전처리하고 result 만들기
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

In [4]:
## Download the dataset XML into the working directory
ted_xml_url = "https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/ted_en-20160408.xml"
urllib.request.urlretrieve(ted_xml_url, filename="ted_en-20160408.xml")

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x2b108ccb790>)

In [5]:
# Keep only the text found between <content> and </content> tags.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes instead of staying open for the rest of the session.
with open('ted_en-20160408.xml', 'r', encoding='utf8') as targetXML:
    target_text = etree.parse(targetXML)

# Join the text nodes of every <content> element into one newline-separated string.
parse_text = "\n".join(target_text.xpath('//content/text()'))

In [13]:
# Preview the first 1000 characters of the extracted text
parse_text[:1000]

"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\nConsider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. And what did Facit do when the electronic calculator came along? They continued doing exactly the same. In six months, they went from maximum revenue ... and they were gone. Gone.\nTo me, the irony about the Facit story is hearing about the Facit engineers, who had bought cheap, small electronic calculators in Japan that they used to double-check their calculators.\n(Laughter)\nFacit did too much exploitation. But exploration can go wild, too.\nA few years back, I worked closely alongside a Eu

In [10]:
### Remove parenthesised spans such as "(Laughter)" from the text
paren_pattern = re.compile(r'\([^)]*\)')
content_text = paren_pattern.sub('', parse_text)

In [14]:
# Preview the first 1000 characters after removing parenthesised spans
content_text[:1000]

"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\nConsider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. And what did Facit do when the electronic calculator came along? They continued doing exactly the same. In six months, they went from maximum revenue ... and they were gone. Gone.\nTo me, the irony about the Facit story is hearing about the Facit engineers, who had bought cheap, small electronic calculators in Japan that they used to double-check their calculators.\n\nFacit did too much exploitation. But exploration can go wild, too.\nA few years back, I worked closely alongside a European bio

In [16]:
## Split the cleaned text into sentences

sent_text = sent_tokenize(content_text)
# Show the first 10 sentences as a sanity check
sent_text[:10]

["Here are two reasons companies fail: they only do more of the same, or they only do what's new.",
 'To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation.',
 'Both are necessary, but it can be too much of a good thing.',
 'Consider Facit.',
 "I'm actually old enough to remember them.",
 'Facit was a fantastic company.',
 'They were born deep in the Swedish forest, and they made the best mechanical calculators in the world.',
 'Everybody used them.',
 'And what did Facit do when the electronic calculator came along?',
 'They continued doing exactly the same.']

In [17]:
# Lowercase every sentence, then replace each run of characters other than
# a-z / 0-9 with a single space.
non_alnum_run = re.compile(r'[^a-z0-9]+')
normalized_text = [non_alnum_run.sub(" ", sentence.lower()) for sentence in sent_text]

In [19]:
# Show the first 10 normalized sentences
normalized_text[:10]

['here are two reasons companies fail they only do more of the same or they only do what s new ',
 'to me the real real solution to quality growth is figuring out the balance between two activities exploration and exploitation ',
 'both are necessary but it can be too much of a good thing ',
 'consider facit ',
 'i m actually old enough to remember them ',
 'facit was a fantastic company ',
 'they were born deep in the swedish forest and they made the best mechanical calculators in the world ',
 'everybody used them ',
 'and what did facit do when the electronic calculator came along ',
 'they continued doing exactly the same ']

In [20]:
# Tokenize each normalized sentence into a list of words; `result` is a list of token lists.
result = list(map(word_tokenize, normalized_text))

In [21]:
# Number of tokenized sentences
len(result)

273424

In [1]:
from glove import Corpus, Glove

In [22]:
# Corpus object that will hold the vocabulary dictionary and co-occurrence matrix
corpus = Corpus()

### Build the co-occurrence matrix GloVe will train on from the tokenized sentences
corpus.fit(result, window=5)

In [23]:
# Create the GloVe model with no_components=100 and learning_rate=0.05
# NOTE(review): no_components is presumably the embedding dimensionality — confirm against glove-python docs
glove = Glove(no_components=100, learning_rate=0.05)

In [25]:
# Number of entries in the corpus dictionary
len(corpus.dictionary)

54775

In [26]:
# Train with 4 threads for 20 epochs
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
# Attach the corpus dictionary to the model; done before the word-based most_similar queries below
glove.add_dictionary(corpus.dictionary)

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [27]:
# Print the (word, score) pairs most_similar returns for 'man'
print(glove.most_similar('man'))

[('woman', 0.9670028055891261), ('guy', 0.8956495491232932), ('girl', 0.8585354659980821), ('young', 0.8567492036858457)]


In [28]:
# Print the (word, score) pairs most_similar returns for 'university'
print(glove.most_similar('university'))

[('harvard', 0.8820661474525139), ('mit', 0.8472272505579834), ('stanford', 0.8428039481838339), ('cambridge', 0.8302789079032129)]


In [31]:
# Print the (word, score) pairs most_similar returns for 'artificial'
print(glove.most_similar('artificial'))

[('intelligence', 0.87710114225248), ('electric', 0.8598712500388953), ('electrical', 0.8563744919440548), ('electronic', 0.848367870522238)]


In [32]:
# Print the (word, score) pairs most_similar returns for 'technology'
print(glove.most_similar('technology'))

[('design', 0.8555588217258701), ('information', 0.8319960452586143), ('power', 0.8284188775965895), ('material', 0.8247348768742808)]


In [33]:
# Print the (word, score) pairs most_similar returns for 'war'
print(glove.most_similar('war'))

[('world', 0.8145812368890084), ('ii', 0.8044471639913833), ('modern', 0.7755755214615346), ('western', 0.7518448035624175)]


In [34]:
# Print the (word, score) pairs most_similar returns for 'water'
print(glove.most_similar('water'))

[('clean', 0.8540012870115609), ('air', 0.8415245804059791), ('fresh', 0.8229850034001619), ('food', 0.802786521148359)]


In [35]:
# Print the (word, score) pairs most_similar returns for 'physics'
print(glove.most_similar('physics'))

[('chemistry', 0.8964628608735887), ('economics', 0.8818017866301264), ('mathematics', 0.8655383153588303), ('beauty', 0.8646286286033729)]


In [36]:
# Print the (word, score) pairs most_similar returns for 'clean'
print(glove.most_similar('clean'))

[('fresh', 0.8546075079192427), ('water', 0.8540012870115611), ('heat', 0.8307387859799972), ('wind', 0.8195151336054225)]


In [37]:
# Print the (word, score) pairs most_similar returns for 'fresh'
print(glove.most_similar('fresh'))

[('drinking', 0.8556368367641413), ('clean', 0.8546075079192424), ('water', 0.8229850034001619), ('supply', 0.7683332437350548)]
