<a href="https://colab.research.google.com/github/ykkim77/nlp_6th/blob/main/nlp_6th.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**케라스 임베딩 층을 이용한 워드 임베딩** <br>  출처 https://wikidocs.net/33793

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Toy sentiment corpus: one short phrase per sample.
sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
# Binary label for each phrase above, in the same order.
y_train = [1, 0, 0, 1, 1, 0, 1]

In [None]:
# Fit a tokenizer on the corpus to build the word -> integer index mapping.
t = Tokenizer()
t.fit_on_texts(sentences)

# Word indices start at 1 and 0 is used as the padding value further down,
# so the embedding table needs one more row than there are words.
vocab_size = 1 + len(t.word_index)

In [None]:
# Convert every sentence into a list of integer word indices using the fitted tokenizer.
X_encoded = t.texts_to_sequences(sentences)
print(X_encoded)

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [None]:
# Length of the longest encoded sentence; used as the padding target below.
max_len = max(map(len, X_encoded))
print(max_len)

4


In [None]:
# Right-pad ('post') every encoded sentence with zeros up to max_len so the
# samples form a rectangular array.
X_train = pad_sequences(
    X_encoded,
    padding='post',
    maxlen=max_len,
)
# Labels as a NumPy array for model.fit.
y_train = np.array(y_train)
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Small classifier: trainable 4-dimensional word embeddings, flattened and fed
# into a single sigmoid unit for binary classification.
model = Sequential([
    Embedding(vocab_size, 4, input_length=max_len),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 4, 4)              64        
_________________________________________________________________
flatten_12 (Flatten)         (None, 16)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 17        
Total params: 81
Trainable params: 81
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Binary cross-entropy matches the single sigmoid output; 'acc' reports accuracy per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
# verbose=2 prints one summary line per epoch.
model.fit(X_train, y_train, epochs=20, verbose=2)

Epoch 1/20
1/1 - 0s - loss: 0.6911 - acc: 0.4286
Epoch 2/20
1/1 - 0s - loss: 0.6898 - acc: 0.5714
Epoch 3/20
1/1 - 0s - loss: 0.6885 - acc: 0.5714
Epoch 4/20
1/1 - 0s - loss: 0.6872 - acc: 0.7143
Epoch 5/20
1/1 - 0s - loss: 0.6859 - acc: 0.8571
Epoch 6/20
1/1 - 0s - loss: 0.6846 - acc: 0.8571
Epoch 7/20
1/1 - 0s - loss: 0.6833 - acc: 0.8571
Epoch 8/20
1/1 - 0s - loss: 0.6820 - acc: 0.8571
Epoch 9/20
1/1 - 0s - loss: 0.6807 - acc: 1.0000
Epoch 10/20
1/1 - 0s - loss: 0.6794 - acc: 1.0000
Epoch 11/20
1/1 - 0s - loss: 0.6780 - acc: 1.0000
Epoch 12/20
1/1 - 0s - loss: 0.6767 - acc: 1.0000
Epoch 13/20
1/1 - 0s - loss: 0.6754 - acc: 1.0000
Epoch 14/20
1/1 - 0s - loss: 0.6740 - acc: 1.0000
Epoch 15/20
1/1 - 0s - loss: 0.6727 - acc: 1.0000
Epoch 16/20
1/1 - 0s - loss: 0.6713 - acc: 1.0000
Epoch 17/20
1/1 - 0s - loss: 0.6700 - acc: 1.0000
Epoch 18/20
1/1 - 0s - loss: 0.6686 - acc: 1.0000
Epoch 19/20
1/1 - 0s - loss: 0.6672 - acc: 1.0000
Epoch 20/20
1/1 - 0s - loss: 0.6658 - acc: 1.0000


<tensorflow.python.keras.callbacks.History at 0x7f391bf74290>

In [None]:
import numpy as np
import gensim

In [None]:
!wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-03-30 18:01:24--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.84.230
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.84.230|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz.3’


2021-03-30 18:02:00 (43.9 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz.3’ saved [1647046227/1647046227]



In [None]:
# List the current working directory to confirm the downloaded archive is there.
!ls

GoogleNews-vectors-negative300.bin.gz
GoogleNews-vectors-negative300.bin.gz.1
GoogleNews-vectors-negative300.bin.gz.2
GoogleNews-vectors-negative300.bin.gz.3
sample_data


In [None]:
# Load the pretrained vectors straight from the gzipped archive;
# binary=True because the file is in the binary word2vec format.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [None]:
# Show the shape of the loaded vector array: (number of vectors, vector dimension).
print(word2vec_model.vectors.shape)

(3000000, 300)


In [None]:
# One row per vocabulary index. Rows start at zero, so any word without a
# pretrained vector (and the padding index 0) keeps an all-zero embedding.
# The width comes from the loaded model instead of a hard-coded 300, so the
# matrix stays correct if a different set of pretrained vectors is loaded.
embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
embedding_matrix.shape

(16, 300)

In [None]:
def get_vector(word):
    """Look up the pretrained embedding for ``word``.

    Returns ``word2vec_model[word]`` when ``word`` is contained in
    ``word2vec_model``; otherwise returns ``None``.
    """
    if word not in word2vec_model:
        return None
    return word2vec_model[word]

In [None]:
# Copy each vocabulary word's pretrained vector into the row matching its
# tokenizer index; words without a pretrained vector keep their existing row.
for word, index in t.word_index.items():
    print(word, index)
    vector = get_vector(word)
    if vector is None:
        continue
    embedding_matrix[index] = vector


nice 1
great 2
best 3
amazing 4
stop 5
lies 6
pitiful 7
nerd 8
excellent 9
work 10
supreme 11
quality 12
bad 13
highly 14
respectable 15


In [None]:
# Sanity check: 'nice' has tokenizer index 1 (see the word/index listing printed
# by the fill loop), so row 1 of the matrix should match the pretrained vector for 'nice'.
print(embedding_matrix[1])
print(word2vec_model['nice'])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Same classifier as before, but the embedding layer is initialised from the
# pretrained matrix and frozen (trainable=False), so only the Dense layer learns.
model = Sequential()
# The output width is read from embedding_matrix rather than hard-coded to 300,
# so the layer always agrees with the weights it is given.
e = Embedding(
    vocab_size,
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False,
)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)
# Predicted probabilities for the training sentences (last expression, so it is displayed).
model.predict(X_train)

Epoch 1/100
1/1 - 0s - loss: 0.7058 - acc: 0.5714
Epoch 2/100
1/1 - 0s - loss: 0.6867 - acc: 0.5714
Epoch 3/100
1/1 - 0s - loss: 0.6681 - acc: 0.8571
Epoch 4/100
1/1 - 0s - loss: 0.6500 - acc: 0.8571
Epoch 5/100
1/1 - 0s - loss: 0.6326 - acc: 0.8571
Epoch 6/100
1/1 - 0s - loss: 0.6157 - acc: 0.8571
Epoch 7/100
1/1 - 0s - loss: 0.5993 - acc: 1.0000
Epoch 8/100
1/1 - 0s - loss: 0.5836 - acc: 1.0000
Epoch 9/100
1/1 - 0s - loss: 0.5683 - acc: 1.0000
Epoch 10/100
1/1 - 0s - loss: 0.5536 - acc: 1.0000
Epoch 11/100
1/1 - 0s - loss: 0.5394 - acc: 1.0000
Epoch 12/100
1/1 - 0s - loss: 0.5257 - acc: 1.0000
Epoch 13/100
1/1 - 0s - loss: 0.5125 - acc: 1.0000
Epoch 14/100
1/1 - 0s - loss: 0.4997 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.4874 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.4755 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.4641 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.4530 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.4423 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.4320 - a

array([[0.97527725],
       [0.09885675],
       [0.08259019],
       [0.8561011 ],
       [0.8928354 ],
       [0.18677226],
       [0.9167881 ]], dtype=float32)