<a href="https://colab.research.google.com/github/ykkim77/nlp_11th/blob/main/nlp_11th.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import pandas as pd
 

**Dataset 준비하기**<br>
실제 실무에서와 같이 데이터 전처리 하는데에 시간이 꽤나 오래 걸립니다. 
그러나 모델 작성은 그리 어렵지 않으니, 실습에 너무 큰 부담을 가지 마시길 바랍니다.

In [3]:
!!curl -O http://www.manythings.org/anki/fra-eng.zip

['  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current',
 '                                 Dload  Upload   Total   Spent    Left  Speed',
 '',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 '100 6134k  100 6134k    0     0  11.1M      0 --:--:-- --:--:-- --:--:-- 11.1M']

In [4]:
!unzip fra-eng.zip

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [5]:
f = open('fra.txt','r')
lines = f.readlines()
print(lines[0])
print(lines[4])

eng_sents, fra_sents = [], []

# 한 문장을 영어와 불어를 분리해 따로 저장

for line in lines:
    eng, fra, _ = line.split('\t')
    eng_sents.append(eng)
    fra_sents.append(fra)

Go.	Va !	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)

Hi.	Salut.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)



In [6]:
eng_sents[0], fra_sents[0]

('Go.', 'Va !')

In [7]:
print('데이터의 총 개 수', len(lines))
eng_sent_lengths = [len(eng_sent) for eng_sent in eng_sents]

데이터의 총 개 수 185583


In [8]:
#EDA eng

eng_sents_series = pd.Series(eng_sents)
eng_sents_series.apply(len).describe()


count    185583.000000
mean         30.323753
std          12.548177
min           3.000000
25%          22.000000
50%          28.000000
75%          36.000000
max         262.000000
dtype: float64

In [9]:
#EDA fra

fra_sents_series = pd.Series(fra_sents)
fra_sents_series.apply(len).describe()

count    185583.000000
mean         35.835120
std          15.215418
min           4.000000
25%          25.000000
50%          33.000000
75%          43.000000
max         325.000000
dtype: float64

In [10]:
for ind, eng_sent in enumerate(eng_sents):
    eng_sents[ind] = eng_sent + ' '

for ind, fra_sent in enumerate(fra_sents):
    fra_sents[ind] = fra_sent + ' '

In [11]:
# change char to number

eng_chars = set("".join(eng_sents))
print('# English chars : ',len(eng_chars))

# generate English char level dictionary
eng_char_dict = {}
for ind, eng_char in enumerate(eng_chars):
    eng_char_dict[eng_char] = ind


fra_chars = set("".join(fra_sents))
print('# English chars : ',len(fra_chars))

# generate Franch char level dictionary
fra_char_dict = {}
for ind, fra_char in enumerate(fra_chars):
    fra_char_dict[fra_char] = ind

# English chars :  90
# English chars :  113


In [12]:
#change all english to number

eng_vectors = []

for eng_sent in eng_sents:
    eng_vectors.append([eng_char_dict[eng_char] for eng_char in eng_sent])

#change all franch to number

fra_vectors = []

for fra_sent in fra_sents:
    fra_vectors.append([fra_char_dict[fra_char] for fra_char in fra_sent])

In [13]:
print(eng_sents[0], eng_vectors[0])
eng_char_dict['G'], eng_char_dict['o'], eng_char_dict['o'], eng_char_dict[' ']

print(fra_sents[0], fra_vectors[0])
fra_char_dict['V'], fra_char_dict['a'], fra_char_dict[' '], fra_char_dict['!']

Go.  [25, 30, 18, 13]
Va !  [44, 63, 16, 53, 16]


(44, 63, 16, 53)

In [14]:
#change english vector to onehot vector
eng_input_onehots = [to_categorical(eng_vector, len(eng_chars)) for eng_vector in eng_vectors]

In [15]:
fra_input_vectors = [fra_vector[:-1] for fra_vector in fra_vectors]
fra_output_vectors = [fra_vector[1:] for fra_vector in fra_vectors]

#change franch vector to onehot vector
fra_input_onehots = [to_categorical(fra_vector, len(fra_chars)) 
                    for fra_vector in fra_input_vectors]

fra_output_onehots = [to_categorical(fra_vector, len(fra_chars))
                       for fra_vector in fra_output_vectors]

In [16]:
fra_vectors[0], fra_input_vectors[0], fra_output_vectors[0]

([44, 63, 16, 53, 16], [44, 63, 16, 53], [63, 16, 53, 16])

In [17]:
train_xs = [eng_input_onehots, fra_input_onehots]
train_ys = fra_output_onehots 

**Build Encoder Model**

In [18]:
K.clear_session()
#encoder model
encoder_inputs = Input(shape=(None,len(eng_chars)))
latent_dim = 256
encoder = LSTM(latent_dim,return_state=True)
encoder_ouputs, encoder_h, encoder_c = encoder(encoder_inputs)
encoder_ouputs

<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm')>

**Build Decoder Model**

In [19]:
decoder_inputs = Input(shape=(None, len(fra_chars)))

latent_dim = 256
decoder = LSTM(latent_dim,return_state=True, return_sequences=True)
decoder_ouputs, _, _ = decoder(decoder_inputs, initial_state=[encoder_h, encoder_c])
decoder_ouputs = Dense(units=len(fra_chars), activation='softmax')(decoder_ouputs)

**Model**

In [20]:
model = Model([encoder_inputs, decoder_inputs], decoder_ouputs)

In [21]:
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])

In [22]:
from tqdm import tqdm

epochs = 1

for epoch in range(epochs):
    epoch_acc = []
    epoch_loss = []
    for ind in tqdm(range(len(train_xs[0]))):
        xs_0 = np.array(train_xs[0][ind: ind+1])
        xs_1 = np.array(train_xs[1][ind: ind+1])
        ys = np.array(train_ys[ind: ind+1])

        hist = model.fit([xs_0,xs_1],ys, batch_size=1, epochs=1, verbose = False)
        epoch_acc.append(hist.history['accuracy'])
        epoch_loss.append(hist.history['loss'])
    print('acc', np.mean(epoch_acc))
    print('loss', np.mean(epoch_loss))

  1%|          | 1658/185583 [02:21<5:11:18,  9.85it/s]

KeyboardInterrupt: ignored