In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides DeprecationWarning / FutureWarning messages from
# the libraries used below, so API changes will not be announced — consider
# narrowing the filter when re-running on newer library versions.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Shell escape: print the NVIDIA driver's status table for the GPU(s) visible to this runtime.
!nvidia-smi

Sat Mar 27 23:22:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
import pandas as pd
import numpy as np
from IPython.display import SVG
import pydot

from gensim.models import Word2Vec
import multiprocessing

In [4]:
import nltk
from nltk.corpus import stopwords, brown

In [8]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Dot, Activation, Input
from keras.utils import plot_model

## 1. Data Preprocessing
- 하나의 샘플에 단어가 최소 2개 있어야 함
- 그래야 중심단어, 주변단어의 관계가 성립하며 그렇지 않으면 샘플을 구성할 수 없어 에러 발생
- 지속적으로 이를 만족하지 않는 샘플들 제거

In [10]:
# Fetch the 20 Newsgroups corpus in a fixed shuffled order (random_state pins
# the shuffle). Headers, footers and quoted replies are removed from each post.
dataset = fetch_20newsgroups(
    shuffle = True,
    random_state = 2046,
    remove = ('headers', 'footers', 'quotes'),
)

# Raw post bodies; everything below is derived from this.
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [13]:
# Number of raw documents loaded from the dataset.
print("총 샘플수: " , len(documents))

총 샘플수:  11314


### 1) 불필요한 토큰 제거, 정규화(소문자화)

In [14]:
df_news = pd.DataFrame({'document' : documents})

# Build the cleaned text in one pass:
#   1. replace every non-letter character with a space.
#      regex=True is required: pandas >= 2.0 treats the pattern as a literal
#      string by default, which would silently leave the text uncleaned
#      (and warnings are suppressed in this notebook, so nothing would flag it);
#   2. drop words of length 3 or less;
#   3. lowercase everything.
df_news['clean_doc'] = (
    df_news['document']
    .str.replace('[^a-zA-Z]' , ' ' , regex = True)
    .apply(lambda text: ' '.join(word for word in text.split() if len(word) > 3))
    .str.lower()
)

### 2) NULL값이 있는지 확인

In [15]:
# True if any cell of df_news (raw or cleaned column) is null.
df_news.isnull().values.any()

False

In [16]:
# Drop rows containing nulls, mutating df_news in place.
# The null check in the previous cell returned False in the recorded run, so
# no rows were removed and the count stays at 11314.
df_news.dropna(inplace = True)

print('총 샘플 수: ' , len(df_news))

총 샘플 수:  11314


### 3) 불용어 제거
- NLTK에서 정의한 불용어 리스트 사용

In [17]:
# Fetch NLTK's stopword corpus; stopwords.words('english') in the next cell reads it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
# NLTK's English stopword list (kept as a list under its original name).
stop_words = stopwords.words('english')

# Set copy for O(1) membership tests: checking every token of every document
# against the list rescans the whole list each time.
stop_word_set = set(stop_words)

# Split each cleaned document on whitespace and drop stopwords.
# Result: a plain list with one list of tokens per document.
tokenized_doc = [
    [token for token in doc.split() if token not in stop_word_set]
    for doc in df_news['clean_doc']
]

### 4) 샘플 중 단어가 1개 이하인 경우 모두 찾아 제거

In [19]:
# A skip-gram pair needs a center word and a context word, so samples with
# fewer than two tokens cannot be used. drop_train records the positions of
# the samples being removed.
drop_train = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]
kept_docs = [sentence for sentence in tokenized_doc if len(sentence) > 1]

# Build the 1-D object array (one token list per slot) explicitly.
# np.delete() first converts the ragged list of lists with np.asarray, which
# raises ValueError on NumPy >= 1.24; filling an object array element by
# element yields the same kind of result on every NumPy version.
tokenized_doc = np.empty(len(kept_docs), dtype = object)
for position, sentence in enumerate(kept_docs):
    tokenized_doc[position] = sentence

print('총 샘플수: ' , len(tokenized_doc))

총 샘플수:  10940


In [21]:
# Preview only the first two token lists; displaying the whole array dumps
# every one of the ~11k documents into the notebook output.
tokenized_doc[:2]

array([list(['kratz', 'comments', 'show', 'otherwise', 'bingo', 'question', 'glock', 'qualify', 'evidence', 'kratz', 'uses', 'first', 'aside', 'proves', 'nothing', 'moreover', 'comments', 'based', 'kratz', 'writes', 'free', 'argue', 'babbles', 'text', 'actually', 'knows', 'something', 'line', 'harder', 'note', 'almost', 'revolvers', 'work', 'harder', 'revolvers', 'sure', 'moreover', 'know', 'kratz', 'sample', 'unrepresentative', 'look', 'reasoning', 'basically', 'glocks', 'dangerous', 'like', 'third', 'generation', 'part', 'true', 'since', 'people', 'claim', 'revolvers', 'share', 'relevant', 'property', 'dangerous', 'argument', 'fails', 'would', 'care', 'looking', 'bogus', 'reasoning', 'make', 'hundreds', 'simple', 'statements', 'without', 'anyone', 'getting', 'right', 'merely', 'make', 'accurate', 'simple', 'statements', 'attaboy', 'mail', 'publically', 'accuracy', 'severe', 'burden', 'manage', 'andy']),
       list(['learn', 'know', 'million', 'dollars', 'money', 'know', 'rickey', 'h

### 5) 단어 집합을 생성, 정수인코딩

In [22]:
# Fit the Keras tokenizer on the token lists to build the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

# Each document as a list of integer word ids.
encoded = tokenizer.texts_to_sequences(tokenized_doc)

# Lookup tables in both directions: word -> id and id -> word.
word2idx = tokenizer.word_index
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [23]:
# First two documents after integer encoding.
print(encoded[:2])

[[7514, 533, 264, 548, 8530, 68, 6733, 4988, 236, 7514, 330, 12, 1674, 4406, 135, 4036, 533, 103, 7514, 1541, 143, 1514, 24704, 354, 98, 581, 25, 85, 2700, 145, 282, 7837, 24, 2700, 7837, 59, 4036, 4, 7514, 1473, 24705, 66, 1895, 864, 10668, 1228, 3, 318, 1515, 63, 83, 27, 2, 312, 7837, 1138, 1483, 1095, 1228, 328, 2555, 1, 304, 151, 6066, 1895, 15, 1787, 329, 1420, 56, 22, 182, 18, 1369, 15, 1504, 329, 1420, 24706, 55, 12465, 4121, 2737, 4407, 3703, 2957], [800, 4, 515, 1430, 216, 4, 8994, 10669, 1769, 901, 4, 901, 297, 461, 216, 295, 20294, 2632, 2243, 653, 2071, 1430, 1076, 102, 1188, 1213, 585, 210, 9470, 36, 56, 10028, 14, 10670, 135, 11491, 879, 135, 182, 1076, 13, 107, 252, 722, 5533, 102]]


- 단어집합 크기 확인

In [24]:
# One more than the number of distinct words.
# NOTE(review): presumably because Tokenizer ids start at 1, so id 0 needs its
# own slot in the embedding tables — confirm against the Keras Tokenizer docs.
vocab_size = len(word2idx) + 1
print('단어집합의 크기: ' , vocab_size)

단어집합의 크기:  64277


In [25]:
# Train a gensim Word2Vec model on the tokenized documents (presumably as a
# baseline for the hand-built Keras SGNS model further down).
# Settings: 100-dimensional vectors, window of 5, negative=15, 10 passes over
# the corpus, one worker per CPU core.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim >= 4.0
# calls them `vector_size` and `epochs` — check the installed version before re-running.
# NOTE(review): no seed is passed and several workers are used, so the
# similarity scores in the next cell will likely vary between runs.
w2v = Word2Vec(tokenized_doc,
               size = 100,
               window = 5,
               negative = 15,
               iter = 10,
               workers = multiprocessing.cpu_count())

In [28]:
# Sanity check of the gensim model: words whose vectors are closest to 'saturday'.
word_vectors = w2v.wv
result = word_vectors.similar_by_word('saturday')
print(result)

[('sunday', 0.9509350657463074), ('friday', 0.9248849153518677), ('afternoon', 0.9200295805931091), ('evening', 0.9158885478973389), ('wednesday', 0.9103204607963562), ('monday', 0.902908205986023), ('tuesday', 0.8927125930786133), ('tonight', 0.8821916580200195), ('fame', 0.8697643280029297), ('weekend', 0.8613446950912476)]


## 2. 네거티브 샘플링을 통한 데이터셋 구성


In [29]:
# Skip-gram pairs with negative sampling, for the first 10 encoded samples only
# (a quick trial before processing the whole corpus).
# Each entry of skip_grams is indexed below as [0] = pairs and [1] = labels.
# NOTE(review): no seed is passed, so the sampled pairs presumably differ between runs.
skip_grams = [skipgrams(sample,
                        vocabulary_size = vocab_size,
                        window_size = 10) for sample in encoded[:10]]

In [31]:
# Raw (pairs, labels) result for the second sample.
print(skip_grams[1])

([[216, 12017], [295, 10669], [297, 1430], [722, 1076], [9470, 1076], [13, 49331], [901, 295], [1076, 7176], [4, 60967], [56, 45664], [2243, 1123], [4, 33490], [2071, 61632], [1769, 58963], [56, 40817], [13, 6078], [585, 35145], [10670, 51643], [135, 9470], [1430, 23943], [135, 11491], [901, 8994], [461, 12340], [1076, 40203], [20294, 1430], [56, 135], [216, 1769], [1769, 10669], [4, 515], [901, 16255], [135, 19849], [2632, 49434], [800, 37187], [800, 41775], [210, 28323], [13, 135], [1188, 47717], [102, 9470], [20294, 57873], [135, 9429], [8994, 56946], [135, 57594], [4, 17782], [14, 9470], [11491, 722], [653, 36], [1769, 53663], [107, 52552], [585, 10028], [56, 14712], [585, 20294], [722, 5533], [10670, 12644], [36, 9470], [585, 653], [5533, 9725], [585, 33185], [1076, 461], [210, 1076], [461, 51953], [10028, 14], [1430, 34458], [653, 102], [585, 58331], [461, 2071], [295, 63185], [102, 16425], [9470, 18380], [102, 62480], [1430, 25511], [36, 32235], [102, 4711], [1076, 56], [5533, 1

In [24]:
# Number of (word, word) pairs generated for the first sample.
len(skip_grams[0][0])

# 3260 pairs in the recorded run — this counts pairs, not words.

3260

### 1) 첫번째 뉴스그룹 샘플 확인
- 중심단어, 주변단어 관계를 가지는 경우 레이블 1
- 중심단어, 주변단어 관계를 가지지 않는 경우 레이블 0

In [32]:
# skip_grams[0] holds the result for the first newsgroup sample:
# element 0 is the list of [word, word] id pairs, element 1 the 0/1 labels.
pairs, labels = skip_grams[0]

# Show the first five pairs as "(word (id), word (id)) -> label".
template = '({:s} ({:d}), {:s} ({:d})) -> {:d}'
for row in range(5) :
  target_id, context_id = pairs[row]
  print(template.format(idx2word[target_id] , target_id,
                        idx2word[context_id] , context_id,
                        labels[row]))

(glocks (10668), kratz (7514)) -> 1
(sample (1473), eludes (47090)) -> 0
(glock (6733), bailey (16301)) -> 0
(glock (6733), nothing (135)) -> 1
(relevant (1483), generation (1515)) -> 1


In [33]:
print('전체 샘플 수: ' , len(skip_grams))
# 10, because skipgrams was run on only the first 10 entries of `encoded`.

전체 샘플 수:  10


- 첫번째 뉴스그룹 샘플이 가지고 있는 pairs와 labels의 개수 출력

In [27]:
# Pairs and labels of the first sample should have the same length (one label per pair).
len(pairs) , len(labels)

(3260, 3260)

### 2) 모든 뉴스그룹 샘플에 대해서 수행

In [34]:
%%time

skip_grams = [skipgrams(sample,
                        vocabulary_size = vocab_size,
                        window_size = 10) for sample in encoded]

CPU times: user 2min 37s, sys: 2.18 s, total: 2min 40s
Wall time: 2min 39s


## 3. Skip-Gram with Negative Sampling(SGNS) 구현

In [70]:
embed_size = 100 # embedding vector dimension (hyperparameter)

### 1) 모델 설계

In [71]:
# Embedding table for the center word: the input is one int32 word id per sample.
w_inputs = Input(shape = (1, ), dtype = 'int32')
word_embedding_layer = Embedding(input_dim = vocab_size, output_dim = embed_size)
word_embedding = word_embedding_layer(w_inputs)

# Separate embedding table for the context word.
c_inputs = Input(shape = (1, ), dtype = 'int32')
context_embedding_layer = Embedding(input_dim = vocab_size, output_dim = embed_size)
context_embedding = context_embedding_layer(c_inputs)

- 각 단어는 임베딩 테이블을 거쳐 내적을 수행
- 내적의 결과는 시그모이드를 거쳐 레이블(1 또는 0)을 예측하는 최종 출력이 됨

In [72]:
# Dot product of the two embeddings along the embedding axis.
similarity = Dot(axes = 2)([word_embedding, context_embedding])

# Reshape to one value per sample, then squash to (0, 1) with a sigmoid so it
# can be compared with the 0/1 pair label.
dot_product = Reshape((1,) , input_shape = (1, 1))(similarity)
output = Activation('sigmoid')(dot_product)

In [73]:
# Two-input model: (center word id, context word id) -> sigmoid score.
model = Model(inputs = [w_inputs, c_inputs] ,
              outputs = output)

In [74]:
# Binary cross-entropy matches the 0/1 labels attached to each word pair.
model.compile(loss = 'binary_crossentropy' ,
              optimizer = 'adam')

In [75]:
# Layer-by-layer overview with parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 100)       6427700     input_4[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 100)       6427700     input_5[0][0]                    
____________________________________________________________________________________________

### 2) 모델 학습: 5 에포크

In [None]:
%%time

for epoch in range(1, 6) :
  loss = 0
  for _, elem in enumerate(skip_grams) :
    first_elem = np.array(list(zip(*elem[0]))[0], dtype = 'int32')
    second_elem = np.array(list(zip(*elem[0]))[1] , dtype = 'int32')
    labels = np.array(elem[1] , dtype = 'int32')

    X = [first_elem, second_elem]
    y = labels
    loss += model.train_on_batch(X, y)

  print('Epoch: ' , epoch , 'Loss: ' , loss)

## 4. 결과 확인
- 학습된 임베딩 벡터들을 txt로 저장

In [None]:
# First weight matrix of the model.
# NOTE(review): presumably the center-word embedding table, since that layer is
# created first — confirm against model.summary().
vectors = model.get_weights()[0]

# Write the vectors as plain text: a "<word count> <dimension>" header line,
# then one "<word> <v1> <v2> ..." line per vocabulary word.
# The count is vocab_size - 1 because only the words in word_index are written.
# `with` guarantees the file is flushed and closed even if a write fails.
with open('004_SGNS.txt' , 'w' , encoding = 'utf-8') as f :
  f.write('{} {}\n' .format(vocab_size - 1 , embed_size))
  for word, i in tokenizer.word_index.items() :
    f.write('{} {}\n' .format(word, ' '.join(map(str, vectors[i, :]))))

In [None]:
# Download the saved vectors to the local machine.
# The helper module is `google.colab`; the previous `google.colabs` spelling
# fails at import. It is presumably only available inside a Colab runtime.
from google.colab import files

files.download('004_SGNS.txt')

In [None]:
# Load the vectors written to 004_SGNS.txt (text format, hence binary = False).
# KeyedVectors is imported here because the notebook only ever imports
# Word2Vec from gensim.models; the bare name `gensim` is never bound, so
# `gensim.models.KeyedVectors` raised NameError.
# NOTE: this rebinds `w2v`, replacing the gensim Word2Vec model trained earlier.
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('./004_SGNS.txt',
                                        binary = False)

In [None]:
# Nearest neighbours of 'soldiers' in the vectors loaded from 004_SGNS.txt
# (`w2v` was rebound to those vectors in the previous cell).
w2v.most_similar(positive = ['soldiers'])