# Word Embedding

### (설명 깃 참고)

### Word2Vec
- 2013년 구글에서 개발한 Word Embedding 방법
- 최초의 neural embedding model
- 매우 큰 corpus에서 자동 학습
    - 비지도 지도 학습 (자기 지도학습)이라 할 수 있음
    - 많은 데이터를 기반으로 label 값 유추하고 이를 지도학습에 사용
- ex)
    - **이사금**께 충성을 맹세하였다.
    - **왕**께 충성을 맹세하였다.

**WordVec 훈련 방식에 따른 구분**
1. CBOW : 주변 단어로 중심 단어를 예측
2. Skip-gram : 중심 단어로 주변 단어를 예측

##### CBOW (Continuous Bag of Words)

##### Skip-gram

In [74]:
#!pip install gensim

### 영어 Word Embedding
- 데이터 취득 및 전처리

In [120]:
import gdown

# url = 'https://drive.google.com/uc?id=1DCgLPJsfyLGZ99lB-aF8EvpKIWSZYgp4'
# output = 'ted_en.xml'

# gdown.download(url, output)

In [76]:
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [116]:
# xml 데이터 처리
f = open('ted_en.xml', 'r', encoding='utf-8')
xml = etree.parse(f)

contents = xml.xpath('//content/text()')    # content 태그 하위 텍스트
# print(contents[:5])

corpus = '\n'.join(contents)
# print(corpus)

# 정규식을 이용해 (Laughter), (Applause) 등 키워드 제거
corpus = re.sub(r'\([^)]*\)', '', corpus)
print(len(corpus))

24062319


In [78]:
sentences = sent_tokenize(corpus)

preprocessed_sentences = []
en_stopwords = stopwords.words('english')

for sentence in sentences:
    sentence = sentence.lower() # 소문자 변환
    sentence = re.sub(r'[^a-z0-9]', ' ', sentence)  # 들여쓰기를 써서 빈 문자로 치환해 위치를 확인하기 위함
    tokens = word_tokenize(sentence)    # 문장 토큰화
    tokens = [token for token in tokens if token not in en_stopwords]   # 불용어 제거
    preprocessed_sentences.append(tokens)

preprocessed_sentences[:5]

[['two', 'reasons', 'companies', 'fail', 'new'],
 ['real',
  'real',
  'solution',
  'quality',
  'growth',
  'figuring',
  'balance',
  'two',
  'activities',
  'exploration',
  'exploitation'],
 ['necessary', 'much', 'good', 'thing'],
 ['consider', 'facit'],
 ['actually', 'old', 'enough', 'remember']]

- Embedding 모델 학습

In [79]:
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=preprocessed_sentences,   # corpur
    vector_size=100,                    # 임베딩 백터 차원
    sg=0,                               # 학습 알고리즘 선택 (0=CBOW, 1=Skip-gram)
    window=5,                           # 주변 단어 수 (앞뒤로 n개)
    min_count=5                         # 최소 빈도 (빈도 n개 미만은 제거)
)

# wv = wordVectors
model.wv.vectors.shape

(21462, 100)

In [80]:
import pandas as pd

pd.DataFrame(model.wv.vectors, index=model.wv.index_to_key).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
one,0.027698,-0.040998,0.158571,-0.470428,-0.263714,-1.256787,0.121505,0.704592,-2.48587,-0.654399,...,0.872553,-0.092913,-0.3293,0.157683,0.477647,-2.049717,-0.479907,-0.562799,0.759411,1.016732
people,-1.022522,-0.078259,0.770004,0.171636,-0.395702,-1.017727,-0.611419,0.840051,-1.823142,-1.775905,...,1.883874,0.505658,-1.19118,-0.09475,-0.187881,-0.461384,-0.65526,-0.315043,-2.428902,1.089525
like,-0.916031,-0.575384,-1.049485,-2.131569,-0.609538,-0.50179,0.232423,0.568657,-1.754133,1.305051,...,-0.25222,1.026599,0.267711,-0.096123,0.027727,1.151274,0.502011,0.107587,0.972407,-0.480801
know,-1.23078,-0.982156,-0.22202,-0.431599,0.205042,0.466354,-0.200373,0.861611,-1.018153,-0.525709,...,0.282478,0.334002,-0.164213,-0.101115,-0.011006,0.422575,0.360514,-0.422611,0.77893,0.046559
going,-0.748081,0.365348,-0.992013,-0.395927,1.764353,0.497214,-0.220614,0.749055,-0.976055,-0.922865,...,0.834435,-1.254717,0.344368,1.647566,0.311536,-1.058268,0.609624,-0.012224,-1.09317,0.695677
think,-0.944074,-0.224553,1.127822,-0.70341,-0.045512,-0.785713,0.243878,0.093,-1.295057,-0.72307,...,0.90626,1.368871,-0.748709,0.997247,0.49136,-0.213437,0.266885,-0.905242,0.018078,-0.039231
see,0.202748,0.206692,0.522867,-1.143844,-0.094198,-1.149259,-0.371186,0.633747,-1.962483,0.284893,...,-0.216656,0.994439,0.755237,0.106046,-0.206298,0.682962,1.245177,-0.881771,0.783663,0.230833
would,0.234937,-0.924556,0.900902,-0.925465,1.211878,0.225494,-0.243454,-0.334157,-0.547872,-0.492267,...,0.032763,-1.060649,-0.220926,1.37468,-0.720384,0.955177,-0.118896,-0.247413,-0.985856,-1.053836
really,-1.753632,-0.131246,0.620951,-0.217828,0.426406,0.337676,0.709356,1.437779,-0.833241,-0.438447,...,0.984802,0.119845,-0.140858,0.034552,0.964083,-0.07336,-0.027472,-0.906347,-0.216509,0.453587
get,-1.643041,-1.224293,-0.287421,-1.46847,-0.180019,-0.628351,-0.985874,0.53252,-0.251555,-2.153717,...,-0.692948,-0.093241,-0.466871,0.239595,0.696418,-0.147861,0.415199,0.632127,-1.343451,0.66976


In [81]:
# 학습된 임베딩 모델 저장
model.wv.save_word2vec_format('ted_en_w2v')

In [82]:
from gensim.models import KeyedVectors

load_model = KeyedVectors.load_word2vec_format('ted_en_w2v')

- 유사도 계산

In [83]:
model.wv.most_similar('man')
#model.most_similar('abracadabra')


[('woman', 0.9098701477050781),
 ('daughter', 0.8022341728210449),
 ('girl', 0.802148699760437),
 ('son', 0.7825210094451904),
 ('lady', 0.7789952158927917),
 ('father', 0.7750177979469299),
 ('grandfather', 0.772922694683075),
 ('sister', 0.7586817145347595),
 ('boy', 0.756505012512207),
 ('mother', 0.7526086568832397)]

In [84]:
# model.similarity('man', 'husband')
model.wv['man']

array([ 2.6428944e-01, -5.2279371e-01,  4.5172414e-01,  1.9783969e+00,
       -9.0508059e-02, -2.4471526e-01, -3.6044118e-01,  1.9186311e+00,
       -6.0436565e-01, -4.2431065e-01, -4.1613805e-01,  2.1701942e-01,
       -5.7081386e-02,  9.2797679e-01,  1.2785417e+00, -4.8099005e-01,
        6.6846836e-01,  8.6558700e-02, -8.9863938e-01, -6.8079168e-01,
        5.6410545e-01,  5.8704174e-01,  1.4090481e-01, -3.0355647e-01,
        6.3626438e-01, -4.5757435e-02, -1.1257330e+00, -7.5859845e-01,
        1.4859024e-01,  5.2016544e-01, -1.0515171e+00, -1.2073495e+00,
       -1.6166815e-01, -1.0051460e+00, -5.7580286e-01,  1.6995008e+00,
       -8.1218505e-01, -4.3135694e-01,  9.4421721e-01, -6.7667913e-01,
        1.3073893e+00,  1.4663270e-01, -5.4744165e-02,  2.1431743e-01,
        2.0057597e+00,  2.9436633e-01, -5.6715423e-01,  3.3403787e-01,
        7.4531919e-01,  7.6524942e-04,  8.7607872e-01,  1.6070992e-01,
       -5.1491004e-01, -3.1327149e-01,  3.7839991e-01,  5.3718954e-01,
      

In [118]:
load_model.most_similar('man')

[('woman', 0.6602159142494202),
 ('son', 0.5976096391677856),
 ('father', 0.5930886268615723),
 ('lady', 0.5888569355010986),
 ('daughter', 0.5769045948982239),
 ('di', 0.5628340840339661),
 ('sister', 0.5516031384468079),
 ('girl', 0.5451541543006897),
 ('mother', 0.541283130645752),
 ('boy', 0.5392701625823975)]

- 임베딩 시각화

https://projector.tensorflow.org/

- embedding vector(tensor) 파일 (.tsv)
- metadata 파일 (.tsv)

In [86]:
!python3 -m gensim.scripts.word2vec2tensor --input ted_en_w2v --output ted_en_w2v

/usr/local/bin/python3: Error while finding module specification for 'gensim.scripts.word2vec2tensor' (ModuleNotFoundError: No module named 'gensim')


##### 한국어 Word Embedding
- NSMC (Naver Sentiment Movie Corpus)

In [87]:
import numpy as np
import pandas as pd
import urllib.request
from konlpy.tag import Okt

In [101]:
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt",
    filename='naver_movie_ratings.txt'
)

('naver_movie_ratings.txt', <http.client.HTTPMessage at 0x1436f9840>)

In [102]:
# 데이터 프레임 생성
ratings_df = pd.read_csv('naver_movie_ratings.txt',sep='\t')

In [103]:
# 결측치 확인 및 처리 (제거)
display(ratings_df.isnull().sum())

ratings_df = ratings_df.dropna(how='any')

id          0
document    8
label       0
dtype: int64

In [104]:
# 한글이 아닌 데이터 제거
ratings_df['document'] = ratings_df['document'].replace(r'[^0-9가-힣ㄱ-ㅎㅏ-ㅣ]', regex=True)

In [108]:
from tqdm import tqdm

okt = Okt()
ko_stopwords = ['은', '는', '이', '가', '을', '를', '와', '과', '들', '도', '에','게', '부터', '까지', '나', '너', '그', '걔', '얘']

preprocessed_data = []

for sentence in tqdm(ratings_df['document']):
    tokens = okt.morphs(sentence, stem=True)
    tokens = [token for token in tokens if token not in ko_stopwords]
    preprocessed_data.append(tokens)

100%|██████████| 199992/199992 [14:09<00:00, 235.38it/s]


In [109]:
model = Word2Vec(
    sentences=preprocessed_data,
    vector_size=100,
    window=5,
    min_count=5,
    sg=0
)

model.wv.vectors.shape

(17614, 100)

In [110]:
model.wv.similarity('김혜수','유해진')

0.7897541

In [None]:
# 모델 저장
model.wv.save_word2vec_format('naver_movie_ratings_w2v')

In [112]:
!python3 -m gensim.scripts.word2vec2tensor --input naver_movie_ratings_w2v --output naver_movie_ratings_w2v

/usr/local/bin/python3: Error while finding module specification for 'gensim.scripts.word2vec2tensor' (ModuleNotFoundError: No module named 'gensim')


- 사전 훈련된 임베딩

In [121]:
url = 'https://drive.google.com/uc?id=11MWLNUBLOJWpJePTbOJwCtcgEryPGKGj'
output = 'Googlenews_vecs.bin.gz'

gdown.download(url, output)

Downloading...
From (original): https://drive.google.com/uc?id=11MWLNUBLOJWpJePTbOJwCtcgEryPGKGj
From (redirected): https://drive.google.com/uc?id=11MWLNUBLOJWpJePTbOJwCtcgEryPGKGj&confirm=t&uuid=b84c8aa6-7fce-44e0-a31d-265b1c887846
To: /Users/isejin/Desktop/세진 폴더/SKN_Family_project/nlp/03_word_embedding/Googlenews_vecs.bin.gz
100%|██████████| 1.65G/1.65G [07:36<00:00, 3.61MB/s]


'Googlenews_vecs.bin.gz'

In [None]:
google_news_wv = KeyedVectors.load_word2vec_format('Googlenews_vecs.bin.gz', binary=True)
google_news_wv.vectors.shape

(3000000, 300)

In [126]:
google_news_wv.similarity('king', 'man')

0.22942673

In [129]:
google_news_wv.most_similar('king')

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797567367553711),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422105193138123)]

In [130]:
google_news_wv.n_similarity(['king', 'queen'], ['man', 'woman'])

0.24791393

In [131]:
google_news_wv.similar_by_word('king', topn=5)

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474)]

In [132]:
google_news_wv.has_index_for('ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ')

False