In [1]:
from tensorflow import keras
# Echo the installed Keras version (the notebook displays the value of a bare expression).
keras.__version__

'2.4.0'

# LSTM으로 텍스트 생성하기

이 노트북은 [케라스 창시자에게 배우는 딥러닝](https://tensorflow.blog/deep-learning-with-python/) 책의 8장 1절의 코드 예제입니다. 책에는 더 많은 내용과 그림이 있습니다. 이 노트북에는 소스 코드에 관련된 설명만 포함합니다. 이 노트북의 설명은 케라스 버전 2.2.2에 맞추어져 있습니다. 케라스 최신 버전이 릴리스되면 노트북을 다시 테스트하기 때문에 설명과 코드의 결과가 조금 다를 수 있습니다.

----

[...]

## 글자 수준의 LSTM 텍스트 생성 모델 구현

이런 아이디어를 케라스로 구현해 보죠. 먼저 언어 모델을 학습하기 위해 많은 텍스트 데이터가 필요합니다. 위키피디아나 반지의 제왕처럼 아주 큰 텍스트 파일이나 텍스트 파일의 묶음을 사용할 수 있습니다. 이 예에서는 19세기 후반 독일의 철학자 니체의 글을 사용하겠습니다(영어로 번역된 글입니다). 학습할 언어 모델은 일반적인 영어 모델이 아니라 니체의 문체와 특정 주제를 따르는 모델일 것입니다.

## 데이터 전처리

먼저 말뭉치를 다운로드하고 소문자로 바꿉니다:

In [2]:
from tensorflow import keras
import numpy as np

# Fetch the Nietzsche corpus; `get_file` returns the local path of the file.
path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

# Read through a context manager so the file handle is closed promptly, and
# pin the encoding so the result does not depend on the platform default.
# Lower-casing reduces the number of distinct characters the model must learn.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('말뭉치 크기:', len(text))

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
말뭉치 크기: 600893


In [3]:
# Inspect the type of the loaded corpus (the notebook displays the value of a bare expression).
type(text)

str

그 다음 `maxlen` 길이를 가진 시퀀스를 중복하여 추출합니다. 추출된 시퀀스를 원-핫 인코딩으로 변환하고 크기가 `(sequences, maxlen, unique_characters)`인 3D 넘파이 배열 `x`로 합칩니다. 동시에 훈련 샘플에 상응하는 타깃을 담은 배열 `y`를 준비합니다. 타깃은 추출된 시퀀스 다음에 오는 원-핫 인코딩된 글자입니다.

In [4]:
# Length, in characters, of each extracted sequence.
maxlen = 60

# Stride between the start positions of consecutive sequences.
step = 3

# Slide a window over the corpus: every `maxlen`-character sequence is paired
# with the single character that immediately follows it (its target).
window_starts = range(0, len(text) - maxlen, step)

# Extracted input sequences.
sentences = [text[start: start + maxlen] for start in window_starts]

# Targets: the character following each sequence.
next_chars = [text[start + maxlen] for start in window_starts]
print('시퀀스 개수:', len(sentences))

# Sorted list of the unique characters in the corpus.
chars = sorted(set(text))
print('고유한 글자:', len(chars))
# Map each character to its index in `chars`. Using enumerate avoids a
# linear `chars.index()` scan for every character.
char_indices = {char: index for index, char in enumerate(chars)}

# One-hot encode the characters into binary arrays:
#   x has shape (sequences, maxlen, unique_characters)
#   y has shape (sequences, unique_characters)
print('벡터화...')
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

시퀀스 개수: 200278
고유한 글자: 57
벡터화...


## 네트워크 구성

이 네트워크는 하나의 `LSTM` 층과 그 뒤에 오는 `Dense` 분류기로 구성됩니다. 분류기는 가능한 모든 글자에 대한 소프트맥스 출력을 만듭니다. 순환 신경망이 시퀀스 데이터를 생성하는 유일한 방법은 아닙니다. 최근에는 1D 컨브넷도 이런 작업에 아주 잘 들어맞는다는 것이 밝혀졌습니다.

In [5]:
from tensorflow.keras import layers

# One LSTM layer reads the one-hot encoded character sequence; a softmax
# Dense layer then outputs a probability for every character in `chars`.
vocab_size = len(chars)
model = keras.models.Sequential([
    layers.LSTM(128, input_shape=(maxlen, vocab_size)),
    layers.Dense(vocab_size, activation='softmax'),
])

타깃이 원-핫 인코딩되어 있기 때문에 모델을 훈련하기 위해 `categorical_crossentropy` 손실을 사용합니다:

In [6]:
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
# The targets `y` are one-hot vectors, so categorical cross-entropy is the matching loss.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## 언어 모델 훈련과 샘플링

훈련된 모델과 시드로 쓰일 간단한 텍스트가 주어지면 다음과 같이 반복하여 새로운 텍스트를 생성할 수 있습니다.

1.	지금까지 생성된 텍스트를 주입하여 모델에서 다음 글자에 대한 확률 분포를 뽑습니다.
2.	특정 온도로 이 확률 분포의 가중치를 조정합니다.
3.	가중치가 조정된 분포에서 무작위로 새로운 글자를 샘플링합니다.
4.	새로운 글자를 생성된 텍스트의 끝에 추가합니다.

다음 코드는 모델에서 나온 원본 확률 분포의 가중치를 조정하고 새로운 글자의 인덱스를 추출합니다(샘플링 함수입니다):

In [7]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability distribution reweighted by temperature.

    The distribution is rescaled as ``p_i ** (1 / temperature)`` and
    renormalized, then a single index is drawn from it at random.

    Args:
        preds: 1-D array-like of probabilities (for example a softmax output).
        temperature: Positive values below 1 sharpen the distribution and
            values above 1 flatten it. ``0`` selects the most probable index
            deterministically (the limit of the rescaling as temperature
            approaches 0).

    Returns:
        The sampled index into ``preds`` as a NumPy integer.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Greedy decoding; this also avoids the division by zero below.
        return np.argmax(preds)

    # A zero probability maps to -inf, which is the intended value
    # (exp(-inf) == 0), so the divide-by-zero warning from log is silenced.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Subtract the maximum before exponentiating. Without this, a very small
    # temperature underflows every term to 0 and the normalization below
    # becomes 0 / 0 (NaN). The shift cancels out in the ratio.
    exp_preds = np.exp(scaled - np.max(scaled))
    preds = exp_preds / np.sum(exp_preds)

    # Draw one sample; the result is a one-hot row whose argmax is the index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

마지막으로 다음 반복문은 반복적으로 훈련하고 텍스트를 생성합니다. 에포크마다 학습이 끝난 후 여러가지 온도를 사용해 텍스트를 생성합니다. 이렇게 하면 모델이 수렴하면서 생성된 텍스트가 어떻게 진화하는지 볼 수 있습니다. 온도가 샘플링 전략에 미치는 영향도 보여 줍니다.

In [8]:
import random
import sys

random.seed(42)
# Choose the seed position once. Every epoch then starts generation from the
# same text, which makes the samples comparable across epochs.
start_index = random.randint(0, len(text) - maxlen - 1)
seed_text = text[start_index: start_index + maxlen]

# Train for 60 epochs, numbered 1 to 60 inclusive.
num_epochs = 60
for epoch in range(1, num_epochs + 1):
    print('에포크', epoch)
    # One pass over the training data per outer iteration.
    model.fit(x, y, batch_size=128, epochs=1)

    print('--- 시드 텍스트: "' + seed_text + '"')

    # Try several sampling temperatures.
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ 온도:', temperature)
        generated_text = seed_text
        sys.stdout.write(generated_text)

        # Generate 400 characters, starting from the seed text.
        for i in range(400):
            # One-hot encode the current `maxlen`-character window.
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            # Predict the next-character distribution and sample from it.
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            # Slide the window: append the new character and drop the oldest
            # one, so the model input stays exactly `maxlen` characters long.
            generated_text = generated_text[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

에포크 1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the world the will maniver and the will the will the spiritually the more the manion of the will the spiritually the spiritually of the and strong the maniver and the spiritually and the spiritual of the one of the spiritually the waster and such a more and strong the spiritually the spiritually the spiritual and a strong the strong of the disculless of the world the in the strong the spirituall
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the scientes of least and the histor. and the reason the inter of the instance, after every and the waster manity, and which a a possible interlignt shance spiritual of the discistional exprecially with a master there is the maning, and the aroust as it on the will such the haster of men desirter and the latter spiritually the desire the process, in the fing the s

en"'s be the atter orwaptic,-storeans? is in which peirla(ter discoveire of theh measity of
scard, the righ it some. he kn=hed dir
에포크 5
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the conscience of such a such a more and superstion and a moral man it is a distrust and a more and states of the same a strong of the same and superstion the most conscience of the sense of the way and the delight and conscience of the states of the same to a souls of the still and same and states and souls and sense of the sense of the most conscience of the state of such a strange and still a
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for at said even that the conditioned of the most are conditions, instinct of the sense it is a such a far as a still philosophers of all such a the hupace of such a souls of the world and a most never belong of the most supertime to it an

dream, fas were the enstervalt, non, nos with
not
light) of more odeupire to longu stronghed, as-now so presentrauntation.


pienally in fairible that only to
list, all believe
에포크 9
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through form of the structure of the spirit and the sense of the present and strong with the strive and interpretation of the such an all the contempt of the same the reality of the structure of the soul as a succording to the same and the same the most would something and souls and the soul, and the same to the most world of the moral and succession of the structure of the consideration of the structure of
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the soul, it is made school, the uningroush the superse of the second to precistotion. the feeling in the more thing which so that of problem of the mind and with the
sense of the strength 

idness, petarianshomes, capbiralizies medited.l. howtedy where how amquire.

is modern onven
juss assumes to
which he attaine contindible, to our wi rescely upon suspiciousness, they a displeanwical--which embicy way
that the listional tagray, by his politicicationous abgred car
에포크 13
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the spirit of the consider the spirit of the strength of the spirit of the more interpretation of the same and some and the spirit of the same and the profound of the more and more things of the spirit of the same and some so that in the contrary and the soul in the science of the world of the act of the spirit of the world of the spirit of the same and things of the contrary to the more into th
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the principal are men are a thing that is the other sacrifice of the body and has sou

peemination of will be atternaks aricby, and the each recondedd sicjes-firman, as wisved he akesh" to methine; and
the toy compandisiving man
who under stind it is constant.
no longer
germanis.
there is beothe one has terted enjoyabere, "it? has attempt composed ofteg,-plitnessless, and
reprives, projalt, "the willd" has
a firmly.

8


1a
에포크 17
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the sense of the sense of the sense of the sense of the same the sense of the strong of the propers of the conscience of the sense of the same the senses of the same to the sense of the same to the sense of the same with the estimation of the same to the strong of the moral and souls and the sense of the strength of the same profoundly and all the sense of the sense of the spirit of the sense of
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the same mean of himself

through for a understanding tho
will seal still
evens no dillucion holy,
a philosophers!, from whiched lire art strong which believed
him-act, it is dote gulrcof thretrage yet man himself of things" forded gregaring herd naturous.--platopicion ofrhwome, hosestater which -philosophy amongts by advance,
act give fhen the gegt and painfed, and of a hished" that late, 
 fear of thirst out mimitated haked
with
p
에포크 21
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through former to the contrary and more the sense of the sense of the sense of the most delicate and present strong the most man is the most spirit and paters and the most religion of the sense of the stupidity of the presence of the world of the spirit and precisely and more the most spirit and the sense of the sense of the sense of the presence of the sense of the sense of the standard of the action of th
------ 온도: 0.5
the slowly ascen

  This is separate from the ipykernel package so we can avoid doing imports until


e most spirit of the sense of christian the gonesting of the personal formess of
man, to the most entire to many and the superstition of all more endured, dece
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through foller readl
heil the "sangtument recure hby because in the were the trage of a con acwarde.--cherst inside to prezating of unforiest: the event and
gultivally with -it were one thoulal,
n amount unstainter of philosophers, the centurime
than what preys of presence among men betimisity,
emoripated useful. to (precaus, here that beduthjent of relation of woman, the his stronger hope eirned himself "t
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through forkelies.

eraftification; to framed o!
quatical capaced more follientike play
how predication-fire,
how dwhellust and vihended when they honest.          xyou distusp. there is
no paragerfogoism as in the pecipio tanil your need onerie for any one self-callo,
when we
stage, the good
asser

through for the english time is in the reality of the strong the conscience of the conscious propers his dangerous sensual conscious philosophers of a form of the end than the passion of the precise and science of the conditions have lastenth--the animality of the sense of the simplius of the
great instincts of the power--and
sense of the states imporises of his emotions and interest delightful with the sen
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for graspical and disinterestic creation of which acwaed ourstards the contrary of
century, artist oder.


131

=the emotion to a pure enough year his fact
of eeseed a conscient to men.


1itimed by the superstition of emblandal learning of human impostutic are of "degered sacraint rorvo-age," in theirden society of myself original god soul for genuin from the find
of can who could not by arm(restes
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through formful the great culuder to
this
heady 

through for the sense of the somen to him some destiny belongs to mens--himself. the cherest believed by the rest desire view. but i all the sense of the same to charbhed and a such feely, precisely and the destruisity of the come the goint evil destruction of the world be the conduct and in common outwort: the good or as the presence of the acts of the sense of the delight in the stand us it is also the ag
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for purpystw and our vultels
strive foregrance on interlue of vigocicate is, hately dectibility of
every sensure, for hely down inle every strengs and the people should have it would not be daily langualing man
at the good present, grow, were the stand
various natures; and the discording pet conforden without inscience, sympathy
the impostne of races and
who love scruled and plant,--all this propoga
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for very former honesty becomed avour, h

through for the been a disposition of a man is to speak that the general desist of the german things of the science of the bay be learned to possess of supercired
religious conceive that an exuseation. and the instincts of the sense of the same as the christian and stronger that which the present super-imperative and every strength and about the succlusion without privilege it is the spiritual and more spri
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through forth becomes thing be task, that
men, more
believe, feel so that it is a bendered by laboning dhbush, and it folly--the german whole more empives: they winding the funite of the knowledge enough the
galmomw--yu
knowledge, thens really, therefore, i very
thought.if th. the bradded, but i mean in whil, contain as us'e
dicre
or-day beneigatible femining who habitueowh
nonarss
of habit of especially.


------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through fortitulles..=wome
footes eviden-." they

through for the same time of the unsentiment of the centuries, the sunsability of the proves to ask to be ascertain lived and paradity to the most father and man as all the man the case of man and the laigh of consideration; they have to be something for the subjection some origin of the consideration is the willing, the estimate in a that of the most does not have can be as man more comprehensible to see t
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through forestives him miture will overchined?--made souls, and yet at sevent ourselves an extent subtlety. the poul and eme
follys. the dovef music testimate its guise thing their longed of as
for vould ultimate entired free haind of a thing of
every
kindless, no or powerful of a corrageness as tgere generationce to many opinion only of history. for anarim of a only the nessious and artist in virtually chr
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for their is justify nowadays
peetury an

through for the cates of the thing in considers that we are--is the religion and antivations of the order that we all the still the most man something that the degree of the excess of discialnty, as a same at all they have to be with the stint to the whole dereiged that the reason of the sublimer to one's false of the brings also god": and all the christian as if the rises noble and the estimate of the cons
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for the thing in them bad will as the tanted operions with he makes everything alone there we are opinion
for 
 dones? but there may -toe propety they looblety irred, minred, and mediocry, perceavers always whereger.


17

=the
shor
they another. their samity of facter-affingness into he mapble from philosophy, in the spirit. the generation of
europe all the "things, and reculud
scands--are about th
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for mind appeared to hinted one hyrother

through for a distingushian of the more and pathet and position that it is an and soul at last in the stronger works of the far to
a conceals and the moral of the superficialist of the world of the characteristical god philosophers to let it also have nowaday and therefore, the metaphysics of the natural want not for the religion, and the master of man to be who can be word has not all the superior on the m
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through formully beguled towards
opinion in too granding, propensity advolivily; out of
truth, the
cause and traauests sprend to word will strange that live
something and one may instincts origin, one planoue ancaus plato plave with the cogverers of birst, men that it may shars and lack of men that is a day make us to far a seat to uniners, the may prepare in the rerage of
logic vanity--they are a day fintl
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through footiotutaliers, could we are trolling, 

through for the german should be concerning the subject and higher the consequently the powerful condition of the world of the sense of the comment that a hard. there is not to be still power to the result of the sense--the master of the child and consequently in
the end as a satisfaction of the colour, the demons them for the life of the delight in being on the betinifi(wk that the most consequence and deg
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through fortune them master, doly lomed by a attatice of something has perhaps of moral,
sed towards hume--whrt-to represed the element of usibil, when only ordinary bee with life must revelers and dights of good northerous;
while "something compulsed", by
with the greatest philosopher. the philosopher,
as"founds the laies
of a magnicart
of instance a actedoges impulse of kind, inkind reveredness, together:
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through forward that whether quite vered
sensiti

through for the expression of the experience, and with the freedom of the the great him the desire of the armances of his unifficient was a such a thought or by the freedom of the more plants of the contest amount of the expression of it. in the distinguishsh, by the necessary
onghe the contemonity and self-science of the same the delien and even the expression, and the sense of the desire of our ordsed of 
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for one punitu, to be      , the struggle more future, i did grars that because )as: there was
by the father--alone be seously that everydner, variety ever been centuries, crefuicn.


178
ha
helvouse. not renove of subtle
desire them--lack of discourds prono; bbquilecon it, grew: key as the germanis, where
existeces as the etart shwith
it is those as mast untaste. one shieers, has apars one who neve
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for can seel"
the pleasing, there the
di

through forms of the greatest conscience of possibi-hard in a the presence of fair to talk and strong soul herself to the individual producive man as the origin of morality of any interpretation, and in all the chant such a still to speak it is it we do not believe of fellows the philosopher of the higher and dangerous and man and problem in the religious community, and the thought must be predictorically t
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for mearac
leavely--here, so
imitst and hence, begil good,
evends the highest just as have been deceate, and that they be some worths of by philosopher parmented in all itsting to leavemat, they caruler to the consideration appear because also.--bemmments which is seem.--but the, it is of dines goody must
calledy individual crefure,
o thinkers, who has he himself; and we can guared by those preducat
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for skepticisms!


11

=mis comple ituda

여기서 볼 수 있듯이 낮은 온도는 아주 반복적이고 예측 가능한 텍스트를 만듭니다. 하지만 국부적인 구조는 매우 실제와 같습니다. 특히 모든 단어(단어는 글자의 지역 패턴으로 이루어집니다)가 실제 영어 단어입니다. 높은 온도에서 생성된 텍스트는 아주 흥미롭고 놀라우며 창의적이기도 합니다. 이따금 꽤 그럴싸하게 보이는 완전히 새로운 단어를 창조합니다(‘begarmed’와 ‘isharent’ 같은 단어입니다). 높은 온도에서는 국부적인 구조가 무너지기 시작합니다. 대부분의 단어가 어느 정도 무작위적인 문자열로 보입니다. 이 네트워크에서 텍스트 생성에 가장 좋은 온도는 확실히 0.5입니다. 항상 다양한 샘플링 전략으로 실험해 봐야 합니다! 학습된 구조와 무작위성 사이에 균형을 잘 맞추면 흥미로운 것을 만들 수 있습니다.

더 많은 데이터에서 크고 깊은 모델을 훈련하면 이것보다 훨씬 논리적이고 실제와 같은 텍스트 샘플을 생성할 수 있습니다. 물론 우연히 그렇게 되는 경우 말고는 의미 있는 텍스트가 생성되리라 기대하지 마세요. 글자를 연속해서 나열하기 위한 통계 모델에서 데이터를 샘플링한 것뿐입니다. 언어는 의사소통의 수단입니다. 의사소통이 의미하는 것과 의사소통이 인코딩된 메시지의 통계 구조 사이에는 차이가 있습니다. 이 차이를 검증하기 위해 다음과 같은 사고 실험을 해보죠. 컴퓨터가 대부분의 디지털 통신에서 하는 것처럼 사람의 언어가 의사소통을 압축하는 데 더 뛰어나다면 어떨까요? 언어의 의미가 줄진 않지만 고유한 통계 구조가 사라질 것입니다. 이는 방금과 같은 언어 모델을 학습하는 것을 불가능하게 만듭니다.

## 정리

* 이전의 토큰이 주어지면 다음 토큰(들)을 예측하는 모델을 훈련하여 시퀀스 데이터를 생성할 수 있습니다.
* 텍스트의 경우 이런 모델을 언어 모델이라 부릅니다. 단어 또는 글자 단위 모두 가능합니다.
* 다음 토큰을 샘플링할 때 모델이 만든 출력에 집중하는 것과 무작위성을 주입하는 것 사이에 균형을 맞추어야 합니다.
* 이를 위해 소프트맥스 온도 개념을 사용합니다. 항상 다양한 온도를 실험해서 적절한 값을 찾습니다.