# 1. Negative Sampling
Word2Vec은 단어 집합의 크기가 수만 이상이 되면 학습이 매우 무거워짐   
중심/주변 단어와 무관한 수많은 단어의 임베딩 벡터까지 매번 업데이트하는 것은 비효율적   
Negative Sampling은 전체 단어 집합이 아닌 일부 단어 집합에만 집중하여 학습

# 2. Skip-Gram with Negative Sampling
Skip-gram은 중심 단어로 주변 단어 예측
![Skip-gram](https://wikidocs.net/images/page/69141/%EA%B7%B8%EB%A6%BC1-1.PNG "Skip-gram")

SGNS는 중심 단어와 주변 단어를 모두 입력으로 받아, 두 단어가 실제로 이웃 관계일 확률을 예측
![SGNS](https://wikidocs.net/images/page/69141/%EA%B7%B8%EB%A6%BC1-2.PNG "SGNS")
  
중심 단어에 대해 작은 단어 집합(실제 주변 단어 + 무작위로 뽑은 단어)만 사용하여, 마지막 단계의 다중 클래스 분류를 이진 분류로 변환
![SGNS](https://wikidocs.net/images/page/69141/%EA%B7%B8%EB%A6%BC4.PNG "SGNS")

# 3. 20뉴스그룹 데이터 전처리

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Fetch the 20 Newsgroups corpus, dropping headers, footers and quoted replies
# so that only the body text of each post is kept.
removed_parts = ('headers', 'footers', 'quotes')
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=removed_parts,
)
documents = dataset.data
print('총 샘플 수: ', len(documents))

총 샘플 수:  11314


In [3]:
# Wrap the raw posts in a DataFrame and derive a cleaned text column.
news_df = pd.DataFrame({'document': documents})

# Replace every non-letter character with a space. regex=True must be passed
# explicitly: without it, newer pandas treats the pattern as a literal string
# and nothing would be replaced.
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
# Drop words that are three characters long or shorter.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
# Lowercase everything.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [4]:
# Check whether any cell of the frame is null
news_df.isna().to_numpy().any()

False

In [5]:
# Turn empty strings into NaN so they are counted as missing, then re-check
news_df.replace("", np.nan, inplace=True)
news_df.isna().to_numpy().any()

True

In [6]:
# Remove, in place, every row that has a missing value in any column
news_df.dropna(axis=0, how='any', inplace=True)
print('빈 값 제거 후 총 샘플 수: ', news_df.shape[0])

빈 값 제거 후 총 샘플 수:  10995


In [7]:
# Remove English stop words from each whitespace-tokenized document
stop_words = stopwords.words('english')
# A list membership test scans the whole list for every token; a set makes
# each lookup constant-time without changing which tokens are kept.
stop_word_set = set(stop_words)
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])
tokenized_doc = tokenized_doc.to_list()

In [8]:
# Record the positions of documents with at most one token, then drop them
drop_train = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]
# The documents have different lengths, so passing them to np.delete forces a
# ragged list into an array (deprecated, and an error on newer NumPy).
# Filtering in plain Python keeps a list of token lists, which the later
# tokenizer calls consume directly.
drop_positions = set(drop_train)
tokenized_doc = [sentence for index, sentence in enumerate(tokenized_doc) if index not in drop_positions]
print('총 샘플 수: ', len(tokenized_doc))

총 샘플 수:  10940


  return array(a, dtype, copy=False, order=order)


In [9]:
# Build the vocabulary by fitting a tokenizer on the token lists
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

# Keep both lookup directions (word -> index, index -> word) and
# integer-encode every document
word2idx = tokenizer.word_index
idx2word = {index: word for word, index in word2idx.items()}
encoded = tokenizer.texts_to_sequences(tokenized_doc)

In [10]:
# Show the integer-encoded form of the first two documents
print(encoded[:2])

[[9, 59, 603, 207, 3278, 1495, 474, 702, 9470, 13686, 5533, 15227, 702, 442, 702, 70, 1148, 1095, 1036, 20294, 984, 705, 4294, 702, 217, 207, 1979, 15228, 13686, 4865, 4520, 87, 1530, 6, 52, 149, 581, 661, 4406, 4988, 4866, 1920, 755, 10668, 1102, 7837, 442, 957, 10669, 634, 51, 228, 2669, 4989, 178, 66, 222, 4521, 6066, 68, 4295], [1026, 532, 2, 60, 98, 582, 107, 800, 23, 79, 4522, 333, 7838, 864, 421, 3825, 458, 6488, 458, 2700, 4730, 333, 23, 9, 4731, 7262, 186, 310, 146, 170, 642, 1260, 107, 33568, 13, 985, 33569, 33570, 9471, 11491]]


In [11]:
# One more than the number of distinct words
# (presumably because tokenizer indices start at 1, leaving 0 unused -- confirm)
vocab_size = 1 + len(word2idx)
print('단어 집합 크기: ', vocab_size)

단어 집합 크기:  64277


# 4. 데이터셋 구성

In [12]:
from tensorflow.keras.preprocessing.sequence import skipgrams

# Generate (word pair, label) samples for the first ten documents only,
# as a quick preview before processing the whole corpus
skip_grams = []
for sample in encoded[:10]:
    skip_grams.append(skipgrams(sample, vocabulary_size=vocab_size, window_size=10))

In [13]:
# Inspect the first five samples of the first document:
# "(word (index), word (index) -> label)"
pairs, labels = skip_grams[0][:2]
for i in range(5):
    first, second = pairs[i]
    print(f'({idx2word[first]:s} ({first:d}), {idx2word[second]:s} ({second:d}) -> {labels[i]:d})')

(makes (228), acts (1102) -> 1)
(seem (207), reconfig (63295) -> 0)
(reputation (5533), liptrap (36153) -> 0)
(europe (1095), israels (13686) -> 1)
(media (702), sargisian (21387) -> 0)


In [14]:
# Number of documents for which skip-gram samples were generated
print('전체 샘플 수: ', len(skip_grams))

전체 샘플 수:  10


In [15]:
# The pair list and the label list of a document should be equally long
for seq in (pairs, labels):
    print(len(seq))

2220
2220


In [16]:
# Generate (word pair, label) samples for every document in the corpus
skip_grams = [
    skipgrams(doc_ids, vocabulary_size=vocab_size, window_size=10)
    for doc_ids in encoded
]

# 5. SGNS 구현

In [17]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

In [18]:
# Dimensionality of each word embedding vector
embed_size = 100

In [19]:
# Center-word branch: a single word index in, its embedding out
w_inputs = Input(shape=(1,), dtype='int32')
word_embedding_layer = Embedding(vocab_size, embed_size)
word_embedding = word_embedding_layer(w_inputs)

# Context-word branch: a separate embedding table of the same size
c_inputs = Input(shape=(1,), dtype='int32')
context_embedding_layer = Embedding(vocab_size, embed_size)
context_embedding = context_embedding_layer(c_inputs)

In [20]:
# Prediction: dot product of the two embeddings, flattened to one value per
# sample and squashed through a sigmoid
similarity = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1,), input_shape=(1, 1))(similarity)
output = Activation('sigmoid')(dot_product)

In [21]:
# Assemble the two-input model, print its layer summary, compile it for
# binary classification, and render the architecture diagram to model3.png
model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')
plot_model(
    model,
    to_file='model3.png',
    show_shapes=True,
    show_layer_names=True,
    rankdir='TB',
)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 100)       6427700     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 100)       6427700     input_2[0][0]                    
______________________________________________________________________________________________

In [22]:
# Train for five epochs, treating the samples of one document as one batch,
# and report the summed batch loss after every epoch.
for epoch in range(1, 6):
    loss = 0
    for couples, targets in skip_grams:
        # A document that produced no pairs has nothing to train on; indexing
        # into its (empty) pair list would otherwise raise IndexError.
        if not couples:
            continue
        # Convert the pair list once; column 0 goes to the first model input
        # and column 1 to the second.
        pair_array = np.asarray(couples, dtype='int32')
        X = [pair_array[:, 0], pair_array[:, 1]]
        Y = np.asarray(targets, dtype='int32')
        loss += model.train_on_batch(X, Y)
    print('Epoch: ', epoch, 'Loss: ', loss)

Epoch:  1 Loss:  4630.118759036064
Epoch:  2 Loss:  3669.674777057022
Epoch:  3 Loss:  3508.549609189853
Epoch:  4 Loss:  3305.6848546154797
Epoch:  5 Loss:  3080.794129396789


# 6. 결과 확인

In [23]:
import gensim

In [24]:
# Export the learned embeddings as text: a "<word count> <dimension>" header
# line followed by one "<word> <v1> <v2> ..." line per vocabulary entry.
# The first weight array is used (presumably the first embedding table -- confirm).
vectors = model.get_weights()[0]
# The context manager guarantees the file is flushed and closed even if a
# write fails; the encoding is pinned so the output does not depend on the
# platform default.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    f.write('{} {}\n'.format(vocab_size - 1, embed_size))
    for word, i in tokenizer.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i]))))

In [25]:
# Load the exported text-format vectors back through gensim for similarity queries
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [26]:
# Words most similar to 'soldiers' in the learned embedding space
w2v.most_similar(positive=['soldiers'])

[('hamas', 0.8299499750137329),
 ('murdered', 0.8009821176528931),
 ('civilians', 0.7931509017944336),
 ('massacred', 0.7924225330352783),
 ('occupied', 0.7918510437011719),
 ('treaty', 0.7897014021873474),
 ('wounded', 0.7887318134307861),
 ('flee', 0.7853697538375854),
 ('killed', 0.7793642282485962),
 ('civilian', 0.7762709856033325)]

In [27]:
# Words most similar to 'doctor' in the learned embedding space
w2v.most_similar(positive=['doctor'])

[('blood', 0.5624978542327881),
 ('stomach', 0.5467677116394043),
 ('diet', 0.5446897149085999),
 ('pain', 0.5442729592323303),
 ('coated', 0.5351646542549133),
 ('parasites', 0.5261176824569702),
 ('disease', 0.5237654447555542),
 ('hurting', 0.5218662023544312),
 ('medicine', 0.5195770263671875),
 ('nose', 0.5111726522445679)]

In [28]:
# Words most similar to 'police' in the learned embedding space
w2v.most_similar(positive=['police'])

[('incontrovertibly', 0.6089888215065002),
 ('officers', 0.6009635925292969),
 ('agencies', 0.5896414518356323),
 ('authorities', 0.5895517468452454),
 ('tanks', 0.5851243138313293),
 ('homicide', 0.5835860967636108),
 ('constitution', 0.5808992385864258),
 ('ataman', 0.5798817276954651),
 ('federal', 0.5789436101913452),
 ('abroad', 0.5747097730636597)]

In [29]:
# Words most similar to 'knife' in the learned embedding space
w2v.most_similar(positive=['knife'])

[('sinned', 0.685368001461029),
 ('realised', 0.6591967344284058),
 ('fantasies', 0.656895637512207),
 ('discriminatory', 0.642410397529602),
 ('emotional', 0.6338437795639038),
 ('neighbors', 0.6330261826515198),
 ('backs', 0.630382776260376),
 ('fallen', 0.6239956617355347),
 ('amusing', 0.6238863468170166),
 ('buried', 0.6236354112625122)]

In [30]:
# Words most similar to 'engine' in the learned embedding space
w2v.most_similar(positive=['engine'])

[('rebuilt', 0.6259212493896484),
 ('seat', 0.5542675852775574),
 ('mufflers', 0.5221957564353943),
 ('bike', 0.5119962692260742),
 ('helmet', 0.5111349821090698),
 ('tires', 0.508537232875824),
 ('rack', 0.5048271417617798),
 ('metzeler', 0.5013248920440674),
 ('sucker', 0.5009517669677734),
 ('bmws', 0.49873510003089905)]