# 20. LSTM with Moby Dick

In [1]:
import torch
import torch.nn as nn
# from torch.autograd import Variable

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import CountVectorizer

import random
import numpy as np

## Preparing Data

In [2]:
nltk.download("gutenberg")
nltk.download("stopwords")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
raw = nltk.corpus.gutenberg.raw("melville-moby_dick.txt")
len(raw)

1242990

In [4]:
print(raw[21945:23000])
raw = raw[21945:200000]  # 20만

CHAPTER 1

Loomings.


Call me Ishmael.  Some years ago--never mind how long
precisely--having little or no money in my purse, and nothing
particular to interest me on shore, I thought I would sail about a
little and see the watery part of the world.  It is a way I have of
driving off the spleen and regulating the circulation.  Whenever I
find myself growing grim about the mouth; whenever it is a damp,
drizzly November in my soul; whenever I find myself involuntarily
pausing before coffin warehouses, and bringing up the rear of every
funeral I meet; and especially whenever my hypos get such an upper
hand of me, that it requires a strong moral principle to prevent me
from deliberately stepping into the street, and methodically knocking
people's hats off--then, I account it high time to get to sea as soon
as I can.  This is my substitute for pistol and ball.  With a
philosophical flourish Cato throws himself upon his sword; I quietly
take to the ship.  There is nothing

In [5]:
raw = raw.replace('--', ' ')

## BoW

In [6]:
tokens = word_tokenize(raw)
print(tokens[:50])

['CHAPTER', '1', 'Loomings', '.', 'Call', 'me', 'Ishmael', '.', 'Some', 'years', 'ago', 'never', 'mind', 'how', 'long', 'precisely', 'having', 'little', 'or', 'no', 'money', 'in', 'my', 'purse', ',', 'and', 'nothing', 'particular', 'to', 'interest', 'me', 'on', 'shore', ',', 'I', 'thought', 'I', 'would', 'sail', 'about', 'a', 'little', 'and', 'see', 'the', 'watery', 'part', 'of', 'the', 'world']


In [7]:
def tokenizer(doc):
    return ["/".join(p) for p in pos_tag(doc)]

tokens = tokenizer(tokens)

print(tokens[:50])

['CHAPTER/NN', '1/CD', 'Loomings/NNP', './.', 'Call/VB', 'me/PRP', 'Ishmael/NNP', './.', 'Some/DT', 'years/NNS', 'ago/RB', 'never/RB', 'mind/VB', 'how/WRB', 'long/JJ', 'precisely/RB', 'having/VBG', 'little/JJ', 'or/CC', 'no/DT', 'money/NN', 'in/IN', 'my/PRP$', 'purse/NN', ',/,', 'and/CC', 'nothing/NN', 'particular/JJ', 'to/TO', 'interest/NN', 'me/PRP', 'on/IN', 'shore/NN', ',/,', 'I/PRP', 'thought/VBD', 'I/PRP', 'would/MD', 'sail/VB', 'about/IN', 'a/DT', 'little/JJ', 'and/CC', 'see/VB', 'the/DT', 'watery/JJ', 'part/NN', 'of/IN', 'the/DT', 'world/NN']


In [8]:
countvec = CountVectorizer(analyzer = 'word',
                           tokenizer=lambda x: x.split(', '),
                           preprocessor = None, 
                           stop_words = None,
                           ngram_range=(1, 1),
                           lowercase=False
                          )

data = countvec.fit_transform(tokens).toarray()

In [9]:
len(data)

36630

In [10]:
countvec.inverse_transform(data)[:30]

[array(['CHAPTER/NN'], dtype='<U26'),
 array(['1/CD'], dtype='<U26'),
 array(['Loomings/NNP'], dtype='<U26'),
 array(['./.'], dtype='<U26'),
 array(['Call/VB'], dtype='<U26'),
 array(['me/PRP'], dtype='<U26'),
 array(['Ishmael/NNP'], dtype='<U26'),
 array(['./.'], dtype='<U26'),
 array(['Some/DT'], dtype='<U26'),
 array(['years/NNS'], dtype='<U26'),
 array(['ago/RB'], dtype='<U26'),
 array(['never/RB'], dtype='<U26'),
 array(['mind/VB'], dtype='<U26'),
 array(['how/WRB'], dtype='<U26'),
 array(['long/JJ'], dtype='<U26'),
 array(['precisely/RB'], dtype='<U26'),
 array(['having/VBG'], dtype='<U26'),
 array(['little/JJ'], dtype='<U26'),
 array(['or/CC'], dtype='<U26'),
 array(['no/DT'], dtype='<U26'),
 array(['money/NN'], dtype='<U26'),
 array(['in/IN'], dtype='<U26'),
 array(['my/PRP$'], dtype='<U26'),
 array(['purse/NN'], dtype='<U26'),
 array([',/,'], dtype='<U26'),
 array(['and/CC'], dtype='<U26'),
 array(['nothing/NN'], dtype='<U26'),
 array(['particular/JJ'], dtype='<U26'),
 array([

## Define Model

In [11]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size,hidden_size,num_layers, dropout = 0.5)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, hidden, cell):
        out, (hidden, cell) = self.lstm(input.view(1,1,-1),(hidden,cell))
        out = self.fc(out.view(1,-1))
        return out,hidden, cell

    def init_hidden_cell(self):
        hidden = torch.zeros(self.num_layers, 1, self.hidden_size).cuda()
        cell = torch.zeros(self.num_layers, 1, self.hidden_size).cuda()
        return hidden, cell

    
model = LSTM(len(data[0]), 1000, len(data[0]), 2).cuda()

## Training

In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

In [13]:
step = 10
num_epochs = 10

In [14]:
for epoch in range(num_epochs):
    
    sp = list(range(0, len(data) - 2 * step, step))
    sp = np.add(sp, random.randint(0, step))
    random.shuffle(sp)
    
    for i in range(len(sp)) :
    
        (hidden, cell) = model.init_hidden_cell()

        cost = 0

        for pos in range(sp[i], sp[i] + step):
            X = torch.from_numpy(data[pos]).type(torch.FloatTensor).cuda()
            y = torch.from_numpy(data[pos+1]).cuda()
            _, y = y.max(dim=0)
            y = y.unsqueeze(0)

            pred, hidden, cell = model(X,hidden,cell)
            cost += loss(pred, y.cuda())

        cost.backward()
        optimizer.step()

        if (i + 1) % 1000 == 0 :
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'%(epoch+1, num_epochs, i + 1, len(sp), cost.item()))

Epoch [1/10], Iter [1000/3661] Loss: 161.2860
Epoch [1/10], Iter [2000/3661] Loss: 220.3274
Epoch [1/10], Iter [3000/3661] Loss: 199.1042
Epoch [2/10], Iter [1000/3661] Loss: 576.1527
Epoch [2/10], Iter [2000/3661] Loss: 218.2385
Epoch [2/10], Iter [3000/3661] Loss: 410.0923
Epoch [3/10], Iter [1000/3661] Loss: 127.4666
Epoch [3/10], Iter [2000/3661] Loss: 261.7136
Epoch [3/10], Iter [3000/3661] Loss: 641.1652
Epoch [4/10], Iter [1000/3661] Loss: 335.1092
Epoch [4/10], Iter [2000/3661] Loss: 351.0817
Epoch [4/10], Iter [3000/3661] Loss: 289.4716
Epoch [5/10], Iter [1000/3661] Loss: 314.1636
Epoch [5/10], Iter [2000/3661] Loss: 410.8134
Epoch [5/10], Iter [3000/3661] Loss: 540.3054
Epoch [6/10], Iter [1000/3661] Loss: 346.7859
Epoch [6/10], Iter [2000/3661] Loss: 296.6013
Epoch [6/10], Iter [3000/3661] Loss: 282.1308
Epoch [7/10], Iter [1000/3661] Loss: 323.8258
Epoch [7/10], Iter [2000/3661] Loss: 334.5731
Epoch [7/10], Iter [3000/3661] Loss: 166.2658
Epoch [8/10], Iter [1000/3661] Los

## Test

Multinomial :
Returns a tensor where each row contains num_samples indices sampled from the multinomial probability distribution located in the corresponding row of tensor input.

In [15]:
start_num = 5
text = countvec.inverse_transform(data[start_num])[0][0].split('/')[0]

model.eval()
hidden, cell = model.init_hidden_cell()

X_test = torch.from_numpy(data[start_num]).type(torch.FloatTensor).cuda()

for pos in range(100) :
    
    pred, hidden, cell = model(X_test, hidden, cell)
    
    m = torch.nn.Softmax(dim = pred.shape[0])
    
    pred = m(pred)
    
    pred = torch.multinomial(pred, 1).data[0][0]
    
    temp = np.zeros(len(data[0]))
    
    temp[pred] = 1
    
    text += " " + countvec.inverse_transform(temp)[0][0].split('/')[0]
    
    X_test = torch.from_numpy(temp).type(torch.FloatTensor).cuda()
    
print("* Generated Text : \n", text)

* Generated Text : 
 me trying off . will income other other will little ; bought quietly an ; but ensued world-wide other the inches lighten morning head ; does Dost have ; ; . ; ; than run ; head ; fact ; ; an will other Seven the he room leaving enough Dost honour than had . than way conscience than but Alone few to driving but but the morning ; does sitting eyes darkened . . ; inches sitting sich relief light ; candelabra-wise towards an other few ; wild ; the than sadly redeemed morning , but has ; have the
