### 14주차 실습 - Skip-Gram with Negative Sampling(SGNS) ###
출처: 딥 러닝을 이용한 자연어 처리 입문(유원준, 안상준) https://wikidocs.net/69141  

### 1. 20뉴스그룹 데이터 전처리

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer  # 텍스트 코퍼스를 정수 시퀀스 등으로 변환

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the 20 Newsgroups corpus with headers, footers and quoted replies
# stripped, shuffled with a fixed seed.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
# Keep only the first 1000 documents to shorten training time.
documents = dataset.data[:1000]
news_df = pd.DataFrame(documents, columns=['document'])

In [3]:
# Replace every non-alphabetic character with a space.
# regex=True is required: without it newer pandas treats the pattern as a
# literal string (the original call triggered a warning about this) and
# nothing would be cleaned.
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
# Drop words of length 3 or less.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
# Lowercase.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

# Turn empty strings into NaN and drop the affected rows.
news_df.replace("", float("NaN"), inplace=True)
news_df.dropna(inplace=True)

# Remove stop words. A set gives O(1) membership tests instead of scanning
# the stop-word list once per token.
stop_words = set(stopwords.words('english'))
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
tokenized_doc = tokenized_doc.to_list()

# Drop documents left with one word or fewer (no skip-gram pair can be built).
# Filter in plain Python: np.delete would first convert the ragged list of
# lists to an array, which warns on older NumPy and fails on newer releases.
drop_train = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]
drop_indices = set(drop_train)
tokenized_doc = [sentence for index, sentence in enumerate(tokenized_doc) if index not in drop_indices]

print(len(tokenized_doc))

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


971


  arr = asarray(arr)


### 2. 정수 시퀀스 변환

In [4]:
# Fit a Keras Tokenizer on the already-tokenized documents.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

# Encode every document as a sequence of word indices.
encoded = tokenizer.texts_to_sequences(tokenized_doc)

# word -> index mapping and its inverse (index -> word).
word2idx = tokenizer.word_index
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# One more than the number of distinct words.
vocab_size = 1 + len(word2idx)
print('단어 집합의 크기 :', vocab_size)

단어 집합의 크기 : 16039


In [13]:
# Display the word -> index mapping built from the tokenizer.
word2idx

{'would': 1,
 'like': 2,
 'people': 3,
 'know': 4,
 'also': 5,
 'think': 6,
 'time': 7,
 'good': 8,
 'well': 9,
 'even': 10,
 'could': 11,
 'first': 12,
 'much': 13,
 'make': 14,
 'system': 15,
 'many': 16,
 'want': 17,
 'need': 18,
 'please': 19,
 'used': 20,
 'mail': 21,
 'anyone': 22,
 'years': 23,
 'graphics': 24,
 'since': 25,
 'said': 26,
 'government': 27,
 'space': 28,
 'work': 29,
 'right': 30,
 'going': 31,
 'data': 32,
 'really': 33,
 'find': 34,
 'back': 35,
 'last': 36,
 'something': 37,
 'without': 38,
 'year': 39,
 'information': 40,
 'using': 41,
 'might': 42,
 'thing': 43,
 'problem': 44,
 'send': 45,
 'better': 46,
 'another': 47,
 'point': 48,
 'things': 49,
 'support': 50,
 'sure': 51,
 'must': 52,
 'software': 53,
 'thanks': 54,
 'available': 55,
 'file': 56,
 'help': 57,
 'take': 58,
 'made': 59,
 'state': 60,
 'either': 61,
 'around': 62,
 'still': 63,
 'windows': 64,
 'best': 65,
 'believe': 66,
 'little': 67,
 'however': 68,
 'enough': 69,
 'someone': 70,
 'loo

In [14]:
# Display the integer sequence of the first document.
encoded[0]

[9,
 51,
 343,
 117,
 3444,
 1591,
 524,
 408,
 8016,
 5556,
 4227,
 8017,
 408,
 315,
 408,
 125,
 1092,
 1592,
 951,
 5557,
 611,
 815,
 4228,
 408,
 195,
 117,
 1202,
 8018,
 5556,
 2911,
 4229,
 136,
 880,
 6,
 42,
 210,
 881,
 574,
 5558,
 3445,
 4230,
 1593,
 1019,
 5559,
 1020,
 8019,
 315,
 816,
 8020,
 575,
 27,
 153,
 2201,
 5560,
 196,
 71,
 211,
 2511,
 8021,
 80,
 5561]

### 2. Negative sampling 수행

In [6]:
from tensorflow.keras.preprocessing.sequence import skipgrams

# Run skip-gram pair generation with negative sampling on the first 10 samples.
skip_grams = [
    skipgrams(seq, vocabulary_size=vocab_size, window_size=10)
    for seq in encoded[:10]
]

# Inspect the dataset produced for the first sample, skip_grams[0].
print("첫번째 데이터 텍스트: ", tokenized_doc[0])
print("첫번째 정수 시퀀스: ", encoded[0])
print("첫번째 데이터에서 생성된 샘플 수: ", len(skip_grams[0][0]))
print('\n')

# Each entry is a (pairs, labels) tuple; show the first 10 pairs with labels.
pairs, labels = skip_grams[0]
pair_template = "({:s} ({:d}), {:s} ({:d})) -> {:d}"
for i in range(10):
    target, context = pairs[i]
    print(pair_template.format(
        idx2word[target], target,
        idx2word[context], context,
        labels[i]))

# Repeat the sampling for the entire dataset.
skip_grams = [
    skipgrams(seq, vocabulary_size=vocab_size, window_size=10)
    for seq in encoded
]

첫번째 데이터 텍스트:  ['well', 'sure', 'story', 'seem', 'biased', 'disagree', 'statement', 'media', 'ruin', 'israels', 'reputation', 'rediculous', 'media', 'israeli', 'media', 'world', 'lived', 'europe', 'realize', 'incidences', 'described', 'letter', 'occured', 'media', 'whole', 'seem', 'ignore', 'subsidizing', 'israels', 'existance', 'europeans', 'least', 'degree', 'think', 'might', 'reason', 'report', 'clearly', 'atrocities', 'shame', 'austria', 'daily', 'reports', 'inhuman', 'acts', 'commited', 'israeli', 'soldiers', 'blessing', 'received', 'government', 'makes', 'holocaust', 'guilt', 'away', 'look', 'jews', 'treating', 'races', 'power', 'unfortunate']
첫번째 정수 시퀀스:  [9, 51, 343, 117, 3444, 1591, 524, 408, 8016, 5556, 4227, 8017, 408, 315, 408, 125, 1092, 1592, 951, 5557, 611, 815, 4228, 408, 195, 117, 1202, 8018, 5556, 2911, 4229, 136, 880, 6, 42, 210, 881, 574, 5558, 3445, 4230, 1593, 1019, 5559, 1020, 8019, 315, 816, 8020, 575, 27, 153, 2201, 5560, 196, 71, 211, 2511, 8021, 80, 5561]
첫번째 

### 3. Skip-Gram with Negative Sampling(SGNS) 구현

In [7]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

# Build the SGNS model: two separate embedding tables (center word and
# context word), a dot product between the two looked-up vectors, and a
# sigmoid on top. The model is compiled with binary cross-entropy.
# NOTE(review): the order in which the layers are created here presumably
# determines the order of model.get_weights(); verify before reordering.
embedding_dim = 100

# Embedding table for the center word
w_inputs = Input(shape=(1, ), dtype='int32')
word_embedding = Embedding(vocab_size, embedding_dim)(w_inputs)

# Embedding table for the surrounding (context) word
c_inputs = Input(shape=(1, ), dtype='int32')
context_embedding  = Embedding(vocab_size, embedding_dim)(c_inputs)

# Dot product of the two embeddings, reshaped to one value per sample
# and passed through a sigmoid
dot_product = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1,), input_shape=(1, 1))(dot_product)
output = Activation('sigmoid')(dot_product)

# Define the model: inputs are (center word, context word), output is the sigmoid score
model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam')

# Render the model structure and save it to skip_gram.png
plot_model(model, to_file='skip_gram.png', show_shapes=True, show_layer_names=True, rankdir='TB')

# Train the model: each document's skip-gram set is fed as one batch and the
# per-batch losses are summed into a single number per epoch.
for epoch in range(1, 2):  # only 1 epoch to keep the run short; 5+ epochs are needed in practice
    loss = 0
    for pair_list, label_list in skip_grams:
        # A sample that produced no pairs has nothing to train on; the
        # column split below would also fail on an empty list.
        if not pair_list:
            continue
        # Split the [center, context] pairs into two parallel index arrays
        # (transposed once instead of once per column).
        first_elem, second_elem = (
            np.array(column, dtype='int32') for column in zip(*pair_list)
        )
        labels = np.array(label_list, dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)
    print('Epoch :', epoch, 'Loss :', loss)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1, 100)               1603900   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 1, 100)               1603900   ['input_2[0][0]']             
                                                                                              

### 4. 학습된 모델에서 임베딩 확인

In [8]:
import gensim

# Take the first weight array of the trained model as the embedding matrix.
# NOTE(review): index 0 is presumably the center-word embedding table,
# since that layer is created first -- confirm against the model definition.
vectors = model.get_weights()[0]

# Write the embeddings as a word2vec text file: a "<count> <dim>" header
# followed by one "<word> <v1> <v2> ..." line per word.
# The with-block closes the file even if a write fails, and the explicit
# UTF-8 encoding matches what gensim reads by default.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # vocab_size - 1 is the number of words in tokenizer.word_index.
    f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i, :]))))

# Load the saved vectors with gensim's KeyedVectors.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [9]:
# Look up the embedding vector of a single word and print it,
# or report that the word is missing from the loaded vocabulary.
word_to_lookup = "disease"

if word_to_lookup not in w2v:
    print(f"'{word_to_lookup}' not found in the vocabulary.")
else:
    vector_for_word = w2v.get_vector(word_to_lookup)
    print(f"Vector for '{word_to_lookup}': {vector_for_word}")

Vector for 'disease': [ 0.13758466 -0.1047382  -0.08835875 -0.07824744 -0.09222426  0.17779249
  0.13887963  0.11630329  0.14580305  0.12199148  0.14097801  0.14634782
  0.07734285 -0.07410634  0.16500396 -0.1589163  -0.14591235 -0.14849152
  0.0915256  -0.15351368 -0.00898439  0.08875638 -0.15063637  0.05665705
 -0.12960593 -0.02160536  0.16261818 -0.13992222 -0.08809896  0.07924185
 -0.09834685 -0.08715925  0.09079109 -0.15817913  0.08346014 -0.1455203
 -0.09317163 -0.12277817 -0.10904652 -0.07221082  0.0918714   0.14209071
 -0.17584167  0.04279476  0.10669453 -0.09176804  0.10810165  0.06571914
  0.07997713 -0.09418597  0.08555793 -0.09420658  0.07476047 -0.09745151
 -0.13252558  0.1190192  -0.06796204 -0.12191217  0.05802278 -0.08885982
  0.15799458 -0.00755611 -0.03945955 -0.12082712  0.10929142 -0.06992232
 -0.0860977  -0.08010818  0.12451589 -0.13911398 -0.07498546  0.13914913
  0.13147664  0.12656206  0.11344905 -0.07560133 -0.1546459  -0.06058378
 -0.11846206 -0.12416465  0.01

In [10]:
# Query the words most similar to 'disease' in the loaded vectors.
# The training loop in this notebook runs a single epoch and notes that
# 5 or more are needed, so the neighbours are not expected to be meaningful.
w2v.most_similar(positive=['disease'])

[('friend', 0.9357203841209412),
 ('following', 0.930649995803833),
 ('every', 0.9292210340499878),
 ('last', 0.9260523915290833),
 ('local', 0.924132227897644),
 ('small', 0.9224151372909546),
 ('children', 0.9219240546226501),
 ('remember', 0.9200649261474609),
 ('else', 0.9193099737167358),
 ('country', 0.9182180762290955)]

In [11]:
# Query the words most similar to 'soldiers' in the loaded vectors.
w2v.most_similar(positive=['soldiers'])

[('seem', 0.7989414930343628),
 ('else', 0.7966039776802063),
 ('seen', 0.7911429405212402),
 ('apart', 0.7884204983711243),
 ('state', 0.7862942814826965),
 ('useful', 0.7859528660774231),
 ('whole', 0.7828403115272522),
 ('look', 0.7827354669570923),
 ('wish', 0.7813276052474976),
 ('show', 0.7799718976020813)]

In [12]:
# Query the words most similar to 'police' in the loaded vectors.
w2v.most_similar(positive=['police'])

[('since', 0.9289465546607971),
 ('software', 0.9258817434310913),
 ('make', 0.924393892288208),
 ('getting', 0.9240319728851318),
 ('little', 0.9228752851486206),
 ('plus', 0.921281099319458),
 ('much', 0.9212347269058228),
 ('give', 0.9207863211631775),
 ('every', 0.9201672077178955),
 ('know', 0.9184824228286743)]