In [1]:
import os
import io
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
import esanpy
import pandas as pd
import tqdm

In [2]:
args = {
    'data': 'data',
    'model': 'LSTM',
    'emsize': 200,
    'nhid': 200,
    'nlayers': 2,
    'lr': 20,
    'clip': 0.25,
    'epochs': 100,
    'batch_size': 20,
    'bptt': 35,
    'dropout': 0.2,
    'tied': True,
    'seed': 42,
    'cuda': True,
    'log_interval': 1000,
    'save': 'model.pth',
    'temperature': 1.0,
    'outf': 'out.txt',
    'words': 1000
}

In [3]:
esanpy.start_server()

In [6]:
filepath = '/home/wararaki/workspace/git_repos/soseki/natsume.ndjson'

In [7]:
text_df = pd.read_json(filepath, orient='records', lines=True)

In [8]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [14]:
class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(path)
        self.valid = self.tokenize(path)
        self.test = self.tokenize(path)

    def tokenize(self, path):
        """Tokenizes a text file."""
        # Add words to the dictionary
        df = pd.read_json(path, orient='records', lines=True)
        tokens = 0
        for _, row in tqdm.tqdm(df.iterrows()):
            text = io.StringIO(row.get('text'))
            for line in text:
                words = esanpy.analyzer(line, analyzer='kuromoji_neologd')
                if len(words) == 0:
                    pass
                words += ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        ids = torch.LongTensor(tokens)
        token = 0
        for _, row in tqdm.tqdm(df.iterrows()):
            text = io.StringIO(row.get('text'))
            for line in text:
                words = esanpy.analyzer(line, analyzer='kuromoji_neologd') + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

In [10]:
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

In [11]:
torch.manual_seed(args['seed'])
if torch.cuda.is_available():
    if not args['cuda']:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args['seed'])

In [15]:
corpus = Corpus(filepath)

118it [00:22,  5.28it/s]
118it [00:21,  5.39it/s]
118it [00:20,  5.87it/s]
118it [00:21,  5.36it/s]
118it [00:20,  5.70it/s]
118it [00:22,  5.21it/s]


In [16]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    if args['cuda']:
        data = data.cuda()
    return data

In [17]:
eval_batch_size = 10
train_data = batchify(corpus.train, args['batch_size'])
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [18]:
###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = RNNModel(args['model'], ntokens, args['emsize'], args['nhid'], args['nlayers'], args['dropout'], args['tied'])
if args['cuda']:
    model.cuda()

criterion = nn.CrossEntropyLoss()

In [19]:
###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    #if type(h) == Variable:
    
    if isinstance(h, Variable):
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)

In [20]:
def get_batch(source, i, evaluation=False):
    seq_len = min(args['bptt'], len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target

In [21]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, args['bptt']):
        data, targets = get_batch(data_source, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

In [22]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args['batch_size'])
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args['bptt'])):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args['clip'])
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args['log_interval'] == 0 and batch > 0:
            #print(total_loss)
            #print(args['log_interval'])
            cur_loss = total_loss / args['log_interval']
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args['bptt'], lr,
                elapsed * 1000 / args['log_interval'], cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

# Loop over epochs.
lr = args['lr']
best_val_loss = None

In [23]:
# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, args['epochs']+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(args['save'], 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

  This is separate from the ipykernel package so we can avoid doing imports until


-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  9.85s | valid loss  7.74 | valid ppl  2302.20
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "


-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  8.89s | valid loss  7.06 | valid ppl  1163.56
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  8.94s | valid loss  6.74 | valid ppl   849.01
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   4 | time:  9.25s | valid loss  6.48 | valid ppl   651.24
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   5 | time:  9.05s | valid loss  6.27 | valid ppl   529.95
--------------------------------------------------------------------------

In [24]:
# Load the best saved model.
with open(args['save'], 'rb') as f:
    model = torch.load(f)

In [25]:
# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

  This is separate from the ipykernel package so we can avoid doing imports until


| End of training | test loss  4.04 | test ppl    56.81


## Generate Text

In [26]:
with open(args['save'], 'rb') as f:
    model = torch.load(f)
model.eval()

if args['cuda']:
    model.cuda()
else:
    model.cpu()

In [28]:
corpus = Corpus(filepath)
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
input = Variable(torch.rand(1, 1).mul(ntokens).long(), volatile=True)
if args['cuda']:
    input.data = input.data.cuda()

118it [00:21,  5.56it/s]
118it [00:22,  5.18it/s]
118it [00:21,  5.57it/s]
118it [00:21,  5.41it/s]
118it [00:20,  5.62it/s]
118it [00:22,  5.36it/s]
  after removing the cwd from sys.path.


In [29]:
text = ''
with open(args['outf'], 'wb') as outf:
    line = ''
    for i in range(args['words']):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().data.div(args['temperature']).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.data.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]
        #if word == '<eos>':
        #    word = '、'

        #print(word)
        if word == '<eos>':
            word = '\n'
            line += word
            if line == '\n':
                pass
            else:
                text += line
                print(line)
            line = ''
        else:
            word += ''
            line += word

#        if i % args['log_interval'] == 0:
#            print('| Generated {}/{} words'.format(i, args['words']))

あがる商売えらい早い買う来る

これから三五日前同じ事聞く上等事笑われる

老人道具祭日得学校通う来る後少し

下向く直す云うあまり軽い御母さんつけ加える

何事違う驚くそこ下唐津人指し指全快参上思う

あまりく一人儲くもん

そんな御困る方妻帯まま

そうわたし何御約束今日出来る事頼む

ええいや

浜田たん

下女若い女婦人会会もらう気がつく様子

時時継

那古井棄真紅とまる煽る白雲平生取り落す

見るこうとうとう碧飲む時少しく無愛想相手思う

あの女頼む

私和尚さんかい

まだ御誂えるいくら奥さん生れ相当分る甲野さん笑う

若い女気出来

せん細君簡単思う事覚える

そんな画考しょう

御尤も

有る事云う害奴有馬線

どうして困るこの間きまるかね

そんなに元来おれ今何そんな事云う何加減知る

正直どうしても困る見える早速明仕方がない

さも気楽仕方がない待った

まるでこまやか心返す

ちる

こう言うおかしい今思う

えなかりる思う

そんな事そう

同じ無い

なに似る面白い

随分面白い良い呑気剣呑

全体動何とも

あの女怒るそうさな第二単語広告だいぶthe近いハハハハ

近々探偵

山越君どう

何

知る君そんなあう

何高木遊ぶ行く聞すいや

単なる書く様子分る

あなた本当に逢う

半熟そう理

誰か

発句

達磨知る

随分御役に立つ申す難題

遠慮行うとうてい仕合

いっそ方あのわずかたしか気多事

そう事いずれ来る

御前真面真面目目見様云う訳なっちまう

どっちもうどこ

来る毫思う何買う

今何関係出来るいける云う

ええ寒月君知る私困る

近頃困る何だか分るそう余計事図案千里そうとにかくええみんな早い甲野君察す好い

いいえ聞くええ駄目

分る少々変

なに体鼻云う妙話繰り返す余腹立つ一人叶うおるなんやっぱり相手非常妙こう

いやきめるおわるくるだいぶ真面目すべて自信もつにくい証拠一人外交外交官官試験落第たしか

奇麗今頃目何度色々音置く来る

一体そんな考いけるあなたこちら御口見る

二三日長い間立待ち乗り越える

まず世君と僕送り込む

そう

そう天下正直世の中他人知るこう云ういろいろ大切思うああ云う考云う句聞くおかしい考えるわるい君話思い出す

そうそりゃ分るこの間取り上げる帰る行く

何訳持つ貰えるどうせあの人同