# Word2Vec implementation with keras  

Word2Vec(Mikolov et al. 2013)은 한 문장 안에 함께 출현한 두 단어들 사이에는 의미적 유사도가 있다라는 아이디어를 통해 word의 semantic vector representation을 구할 수 있다는 것을 보임으로써 크게 주목을 받았습니다.  

본 실습에서는 word2vec을 실제로 구현하는 과정을 통해, 자체적인 의미(meaning) 정보를 갖지 않은 심볼을 embedding하는 개념을 익히고자 합니다.

(참고)  본 실습은 아래 튜토리얼을 참고하여 작성되었습니다.
https://byeongkijeong.github.io/Word2vec-from-scratch-using-keras/
https://adventuresinmachinelearning.com/word2vec-keras-tutorial/
https://www.d2l.ai/chapter_natural-language-processing/word2vec-gluon.html
https://towardsdatascience.com/art-of-vector-representation-of-words-5e85c59fee5

In [179]:
import os
from collections import Counter
from time import time

import numpy as np
import pandas as pd
from keras.layers import Dense, Dot, Embedding, Input, Reshape
from keras.models import Model
from keras.preprocessing.sequence import skipgrams
from nltk.corpus import stopwords

np.random.seed(777)
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'

## step 1. define dataset  

본 실습은 abcnews-date-text.csv 라는 52MB짜리 corpus를 기반으로 진행합니다.  
52MB는 의미있는 word vector를 추출하기에는 매우 작은 사이즈입니다. 이 실습의 결과로 얻어진 word vector를 통해 NLP 성능을 기대하기는 어렵습니다. 하지만 구현 과정을 빠르게 살펴보기에는 용이합니다.  

In [180]:
corpus = pd.read_csv("datasets/corpus/abcnews-date-text.csv").iloc[:,1] 
corpus.head(10)

0    aba decides against community broadcasting lic...
1       act fire witnesses must be aware of defamation
2       a g calls for infrastructure protection summit
3             air nz staff in aust strike for pay rise
4        air nz strike to affect australian travellers
5                    ambitious olsson wins triple jump
6           antic delighted with record breaking barca
7    aussie qualifier stosur wastes four memphis match
8         aust addresses un security council over iraq
9           australia is locked into war timetable opp
Name: headline_text, dtype: object

In [181]:
type(corpus)
type(corpus.str)

pandas.core.strings.StringMethods

### preprocessing

In [182]:
# 소문자로 변경
corpus = corpus.str.lower()    
# 숫자/알파벳/공백 을 제외하고 전부 제거 (with pandas.core.strings.StringMethods)
corpus = corpus.str.replace('[^a-z0-9]+', ' ', regex=True)
corpus.head(10)

0    aba decides against community broadcasting lic...
1       act fire witnesses must be aware of defamation
2       a g calls for infrastructure protection summit
3             air nz staff in aust strike for pay rise
4        air nz strike to affect australian travellers
5                    ambitious olsson wins triple jump
6           antic delighted with record breaking barca
7    aussie qualifier stosur wastes four memphis match
8         aust addresses un security council over iraq
9           australia is locked into war timetable opp
Name: headline_text, dtype: object

In [183]:
corpus_list = corpus.values.tolist()
len(corpus_list)

1082168

In [184]:
corpus_list[0:5]

['aba decides against community broadcasting licence',
 'act fire witnesses must be aware of defamation',
 'a g calls for infrastructure protection summit',
 'air nz staff in aust strike for pay rise',
 'air nz strike to affect australian travellers']

### Build Vocabulary Dictionary

In [185]:
# corpus에 있는 모든 line에서 word를 추출해서 하나의 list에 모두 집어넣습니다.
words = np.concatenate(np.core.defchararray.split(corpus_list)).tolist()
words[0:100]

['aba',
 'decides',
 'against',
 'community',
 'broadcasting',
 'licence',
 'act',
 'fire',
 'witnesses',
 'must',
 'be',
 'aware',
 'of',
 'defamation',
 'a',
 'g',
 'calls',
 'for',
 'infrastructure',
 'protection',
 'summit',
 'air',
 'nz',
 'staff',
 'in',
 'aust',
 'strike',
 'for',
 'pay',
 'rise',
 'air',
 'nz',
 'strike',
 'to',
 'affect',
 'australian',
 'travellers',
 'ambitious',
 'olsson',
 'wins',
 'triple',
 'jump',
 'antic',
 'delighted',
 'with',
 'record',
 'breaking',
 'barca',
 'aussie',
 'qualifier',
 'stosur',
 'wastes',
 'four',
 'memphis',
 'match',
 'aust',
 'addresses',
 'un',
 'security',
 'council',
 'over',
 'iraq',
 'australia',
 'is',
 'locked',
 'into',
 'war',
 'timetable',
 'opp',
 'australia',
 'to',
 'contribute',
 '10',
 'million',
 'in',
 'aid',
 'to',
 'iraq',
 'barca',
 'take',
 'record',
 'as',
 'robson',
 'celebrates',
 'birthday',
 'in',
 'bathhouse',
 'plans',
 'move',
 'ahead',
 'big',
 'hopes',
 'for',
 'launceston',
 'cycling',
 'championsh

In [186]:
# 너무 자주 출현하는 word들은 word vector 구성에 방해가 됩니다. 이런 불용어들을 제거해 줍니다. 
stopWords = set(stopwords.words('english'))
stopWords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [187]:
len(words)

6907609

In [188]:
counter = Counter(words)
len(counter)

96035

In [189]:
counter.most_common(100)

[('to', 210213),
 ('in', 132680),
 ('for', 128091),
 ('of', 78259),
 ('on', 71547),
 ('over', 49557),
 ('the', 47359),
 ('police', 35420),
 ('at', 30776),
 ('with', 28817),
 ('after', 28804),
 ('new', 28470),
 ('man', 27627),
 ('a', 23804),
 ('and', 21669),
 ('up', 20688),
 ('as', 19890),
 ('says', 19340),
 ('from', 18493),
 ('us', 17137),
 ('by', 17081),
 ('govt', 16915),
 ('out', 16843),
 ('council', 16225),
 ('court', 16017),
 ('be', 15334),
 ('more', 15000),
 ('interview', 14868),
 ('fire', 13687),
 ('not', 13368),
 ('nsw', 12610),
 ('plan', 12197),
 ('australia', 12137),
 ('water', 11772),
 ('qld', 11632),
 ('wa', 11192),
 ('crash', 11019),
 ('death', 10925),
 ('into', 10643),
 ('off', 10511),
 ('sydney', 10409),
 ('against', 10236),
 ('health', 10199),
 ('charged', 9968),
 ('back', 9931),
 ('australian', 9926),
 ('no', 9853),
 ('report', 9441),
 ('down', 9363),
 ('call', 9165),
 ('murder', 9007),
 ('an', 8909),
 ('sa', 8896),
 ('hospital', 8669),
 ('day', 8595),
 ('car', 8520),
 

In [190]:
'into' in words

True

In [191]:
words = [word for word in words if word not in stopWords]

In [192]:
'into' in words

False

In [193]:
counter = Counter(words)
len(counter)

95889

In [194]:
counter.most_common(100)

[('police', 35420),
 ('new', 28470),
 ('man', 27627),
 ('says', 19340),
 ('us', 17137),
 ('govt', 16915),
 ('council', 16225),
 ('court', 16017),
 ('interview', 14868),
 ('fire', 13687),
 ('nsw', 12610),
 ('plan', 12197),
 ('australia', 12137),
 ('water', 11772),
 ('qld', 11632),
 ('wa', 11192),
 ('crash', 11019),
 ('death', 10925),
 ('sydney', 10409),
 ('health', 10199),
 ('charged', 9968),
 ('back', 9931),
 ('australian', 9926),
 ('report', 9441),
 ('call', 9165),
 ('murder', 9007),
 ('sa', 8896),
 ('hospital', 8669),
 ('day', 8595),
 ('car', 8520),
 ('may', 8395),
 ('calls', 8212),
 ('coast', 8211),
 ('win', 8161),
 ('woman', 8109),
 ('two', 8007),
 ('killed', 7999),
 ('accused', 7955),
 ('world', 7877),
 ('urged', 7808),
 ('found', 7674),
 ('home', 7671),
 ('government', 7651),
 ('south', 7541),
 ('missing', 7462),
 ('rural', 7329),
 ('first', 7307),
 ('set', 7203),
 ('claims', 7147),
 ('cup', 7034),
 ('attack', 6992),
 ('minister', 6970),
 ('election', 6965),
 ('boost', 6958),
 ('

In [195]:
top_n_ratio=0.8

counter = Counter(dict(counter.most_common(int(top_n_ratio * len(counter)))))
vocab = list(counter) + ['UNK']
vocab[-10:]

['bpa',
 'referal',
 'towms',
 'collating',
 'wallareenya',
 'lobbed',
 'samarasinghe',
 'springstown',
 'blinman',
 'UNK']

In [196]:
vocab_size = len(vocab)
vocab_size

76712

In [197]:
word2index = {word:index for index, word in enumerate(vocab)}
index2word = {index:word for word, index in word2index.items()}

In [198]:
word2index['iraq']

96

In [200]:
index2word[96]

'iraq'

### word corpus -> indexed corpus

In [201]:
indexed_corpus_list = []
for doc in corpus_list:
    indexed_corpus_list.append([word2index[word] if word in word2index else word2index['UNK'] for word in doc.split()])

In [202]:
corpus_list[0]

'aba decides against community broadcasting licence'

In [203]:
indexed_corpus_list[0]

[10889, 5930, 76711, 151, 10890, 1252]

In [204]:
word2index['aba']

10889

In [205]:
word2index['licence']

1252

### Make X-y Dataset with negative sampling

In [206]:
# (참고) https://keras.rstudio.com/reference/skipgrams.html

def generating_wordpairs(indexed_corpus, vocab_size, window_size=4):
    X = []
    Y = []
    for row in indexed_corpus:
        x, y = skipgrams(sequence=row, vocabulary_size=vocab_size, window_size=window_size,
                        negative_samples=1.0, shuffle=True, categorical=False, sampling_table=None, seed=None)
        X = X + list(x)
        Y = Y + list(y)
    return X, Y

In [207]:
X, Y = generating_wordpairs(indexed_corpus_list[0:100], vocab_size, window_size=4)

In [209]:
X[0]

[10890, 67427]

In [210]:
index2word[10890]

'broadcasting'

In [211]:
index2word[67427]

'maldinis'

In [212]:
Y[0]

0

### Model Construction

In [213]:
#embedding_dim=300
embedding_dim=30

input_target = Input((1,))
input_context = Input((1,))

embedding_layer = Embedding(vocab_size, embedding_dim, input_length=1)

target_embedding = embedding_layer(input_target)
target_embedding = Reshape((embedding_dim, 1))(target_embedding)
context_embedding = embedding_layer(input_context)
context_embedding = Reshape((embedding_dim, 1))(context_embedding)

hidden_layer = Dot(axes=1)([target_embedding, context_embedding])
hidden_layer = Reshape((1,))(hidden_layer)

output = Dense(16, activation='sigmoid')(hidden_layer)
output = Dense(1, activation='sigmoid')(output)

model = Model(inputs=[input_target, input_context], outputs=output)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='sgd')


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 30)        2301360     input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
reshape_7 (Reshape)             (None, 30, 1)        0           embedding_3[0][0]                
__________

### Training

In [217]:
epochs = 100000
batch_size = 512

for i in range(epochs):
    idx_batch = np.random.choice(len(indexed_corpus_list), batch_size)
    X, Y = generating_wordpairs(np.array(indexed_corpus_list)[idx_batch].tolist(), vocab_size)

    word_target, word_context = zip(*X)
    word_target = np.array(word_target, dtype=np.int32)
    word_context = np.array(word_context, dtype=np.int32)

    target = np.zeros((1,))
    context = np.zeros((1,))
    label = np.zeros((1,))
    idx = np.random.randint(0, len(Y)-1)
    target[0,] = word_target[idx]
    context[0,] = word_context[idx]
    label[0,] = Y[idx]
    loss = model.train_on_batch([target, context], label)
    if i % 10 == 0:
        print("Iteration {}, loss={}".format(i, loss))

Iteration 0, loss=1.0534988641738892
Iteration 10, loss=1.069413423538208
Iteration 20, loss=0.9909926056861877
Iteration 30, loss=0.8904520869255066
Iteration 40, loss=0.5047919750213623
Iteration 50, loss=1.015505313873291
Iteration 60, loss=1.0681167840957642
Iteration 70, loss=1.0520728826522827
Iteration 80, loss=0.4729217290878296
Iteration 90, loss=0.9086063504219055
Iteration 100, loss=0.9128470420837402
Iteration 110, loss=0.5534312725067139
Iteration 120, loss=0.6119768619537354
Iteration 130, loss=0.7974879741668701


KeyboardInterrupt: 

### save trained word2vec into file

In [224]:
word2vec_file_path = 'word2vec.txt'
f = open(word2vec_file_path, 'w')
f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
vectors = model.get_weights()[0]
for word, i in word2index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

### Load trained word2vec

(참고) https://radimrehurek.com/gensim/models/keyedvectors.html

In [225]:
from gensim.models.keyedvectors import Word2VecKeyedVectors

word_vectors = Word2VecKeyedVectors.load_word2vec_format(word2vec_file_path, binary=False)
vector = word_vectors['computer']
vector

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


array([ 0.01219402,  0.03773722,  0.04487152,  0.01146974, -0.04884932,
        0.02861242,  0.01513971, -0.04361826, -0.00498766,  0.03830883,
       -0.00213171, -0.02341004, -0.02808711, -0.03443178,  0.03845153,
        0.00785465, -0.03831436, -0.02630966, -0.02292718, -0.04224662,
       -0.04796321,  0.00908273, -0.03531182,  0.01840568,  0.01029073,
       -0.0419643 ,  0.0182987 ,  0.00931753,  0.00032729, -0.04848105],
      dtype=float32)

In [226]:
word_vectors.similar_by_word("cat")

[('perisher', 0.6857004165649414),
 ('unloved', 0.6776821613311768),
 ('keller', 0.6709513664245605),
 ('phenomenal', 0.6538717150688171),
 ('creation', 0.6509047746658325),
 ('supersystem', 0.6281493306159973),
 ('bagshaw', 0.6279741525650024),
 ('appreciation', 0.6266207695007324),
 ('heathcote', 0.6193178296089172),
 ('0501', 0.6160426139831543)]