In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

  from numpy.core.umath_tests import inner1d


In [2]:
PATH = 'C:/input/Fast_AI/nietzsche/'

In [3]:
?get_data

In [4]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", '{}nietzsche.txt'.format(PATH))
text = open('{}nietzsche.txt'.format(PATH)).read()
print('corpus length: ', len(text))

corpus length:  600901


In [5]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [6]:
chars = sorted(list(set(text))) # 字符集，
vocab_size = len(chars) + 1
print("total chars: ", vocab_size)

total chars:  86


In [7]:
chars.insert(0,"\0") # 插入零值有时是有用的，e.g. for padding

In [8]:
''.join(chars[:-6]) #每个字符联合起来

'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

In [9]:
char_indices = {c:i for i,c in enumerate(chars)}
indices_char = {i:c for i,c in enumerate(chars)}

In [10]:
idx = [char_indices[c] for c in text] # 对字符编码
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [11]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

Three char model

In [12]:
# create inputs
# 创建一个每3个字符的列表，
cs = 3
c1_data = [idx[i] for i in range(0,len(idx)-cs,cs)]
c2_data = [idx[i+1] for i in range(0,len(idx)-cs,cs)]
c3_data = [idx[i+2] for i in range(0,len(idx)-cs,cs)]
c4_data = [idx[i+3] for i in range(0,len(idx)-cs,cs)] # 需要预测的字符

In [13]:
x1 = np.stack(c1_data) # stack增加维度,从标量变成一维向量
x2 = np.stack(c2_data)
x3 = np.stack(c3_data)

In [14]:
len(x1.shape)

1

In [15]:
# output
y = np.stack(c4_data)

In [16]:
x1[:4],x2[:4],x3[:4],y[:4]

(array([40, 30, 29,  1]),
 array([42, 25,  1, 43]),
 array([29, 27,  1, 45]),
 array([30, 29,  1, 40]))

In [17]:
x1.shape,y.shape

((200300,), (200300,))

# create and train model

In [18]:
# size for hidden state
n_hidden = 256
# latent factors to create(i.e. the size of embedding matrix)
n_fac = 42

In [19]:
class Char3Model(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        
        # the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden) # linear transformation to the incoming data: :math:`y = xA^T + b`
        # the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        # the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
    
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1))) # F(input) Functional interface
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        #h = V(torch.zeros(in1.size()).cuda()) # tensors
        h = V(torch.zeros(in1.size())) # no gpu
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))
        

In [20]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3],axis=1), y, bs=512) #当前路径，-1表示val_idxs验证集，()

In [21]:
#m = Char3Model(vocab_size,n_fac).cuda()
m = Char3Model(vocab_size,n_fac)

In [22]:
it = iter(md.trn_dl) # 
*xs,yt = next(it) # *xs是3个张量组成列表，每个张量是512个长度
t = m(*V(xs)) # *表示多个变量，V创建pytorch tensor的列表，模型输入3个元素



In [23]:
len(next(it)),type(next(it)[0]),len(next(it)[0]),type(t)

(4, torch.Tensor, 512, torch.Tensor)

In [24]:
opt = optim.Adam(m.parameters(), 1e-2)

In [25]:
fit(m, md, 1, opt, F.nll_loss) # 1个epoch

  0%|                                                  | 0/392 [00:00<?, ?it/s]



epoch      trn_loss   val_loss   
    0      2.116108   3.735598  



[3.735597610473633]

In [26]:
len(xs),len(xs[0]),xs[0][:10],xs[1][:10],xs[2][:10]

(3,
 512,
 tensor([67, 68, 76, 61, 71, 73, 58, 69, 55, 72]),
 tensor([57, 74, 61, 58, 60, 62,  2, 62, 58, 62]),
 tensor([ 2, 60, 58, 71, 54, 54, 67, 56,  2, 66]))

In [27]:
len(V(xs)),len(V(xs)[0]),type(V(xs)),type(xs)

(3, 512, list, list)

Test model

In [28]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp])) # numpy array to pytorch tensor
    p = m(*VV(idxs)) # 模型输入pytorch tensors，返回预测值
    i = np.argmax(to_np(p)) # 预测值最大数对应的序号
    return chars[i] # 词库序号对应的字符

In [29]:
get_next('and') # (该模型只能输入三个字符)每输入3个字符，预测下一个字符



' '

In [30]:
get_next('lov') # 输入and,得到空格，输入lov得到e



'e'

my first RNN

In [31]:
# create inputs
# the size of unrolled RNN
cs=8

In [32]:
# For each of 0 through 7, create a list of every 8th character with that starting point. 
# These will be the 8 inputs to our model.
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)] #每8个字符一组，每次滑动一个字符

In [33]:
len(c_in_dat),len(c_in_dat[0]),c_in_dat[:3]

(600893,
 8,
 [[40, 42, 29, 30, 25, 27, 29, 1],
  [42, 29, 30, 25, 27, 29, 1, 1],
  [29, 30, 25, 27, 29, 1, 1, 1]])

In [34]:
# 下一个字符，labels for model
c_out_dat = [idx[i+cs] for i in range(len(idx)-cs)]

In [35]:
len(c_out_dat),c_out_dat[:3]

(600893, [1, 1, 43])

In [36]:
type(c_in_dat),type(c_in_dat[0])
xs = np.stack(c_in_dat,axis=0) # numpy.ndarray(600893, 8),沿着0方向重叠

In [37]:
xs.shape,type(xs),

((600893, 8), numpy.ndarray)

In [38]:
y = np.stack(c_out_dat)

In [39]:
xs[:3],type(c_out_dat),type(y),y[:cs]

(array([[40, 42, 29, 30, 25, 27, 29,  1],
        [42, 29, 30, 25, 27, 29,  1,  1],
        [29, 30, 25, 27, 29,  1,  1,  1]]),
 list,
 numpy.ndarray,
 array([ 1,  1, 43, 45, 40, 40, 39, 43]))

# create and train model

In [40]:
val_idx = get_cv_idxs(len(idx)-cs-1) # 默认抽取20%用于校验

In [41]:
md = ColumnarModelData.from_arrays('.',val_idx,xs,y,bs=512)

In [42]:
type(val_idx),len(val_idx),val_idx[:10]

(numpy.ndarray,
 120178,
 array([ 39594, 351807,  39637, 484823, 248398, 537236, 144432, 313337, 495007,  20866]))

In [43]:
class CharLoopModel(nn.Module):
    def __init__(self,vocab_size,n_fac): # 创建模型
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.l_in = nn.Linear(n_fac,n_hidden)
        self.l_hidden = nn.Linear(n_hidden,n_hidden)
        self.l_out = nn.Linear(n_hidden,vocab_size) # 输出每一个词的概率
    
    # 前一个模型输入3个字符来预测第四个
    def forward(self,*cs): # 连续n个字符
        bs = cs[0].size(0) # 512
        #h = V(torch.zeros(bs, n_hidden).cuda())
        h = V(torch.zeros(bs, n_hidden)) # (512,256) 占位，
        for c in cs:
            inp = F.relu(self.l_in(self.e(c))) #输入
            h = F.tanh(self.l_hidden(h+inp)) # 隐层
        return F.log_softmax(self.l_out(h), dim=-1)
            

In [44]:
#m = CharLoopModel(vocab_size,n_fac).cuda()
m = CharLoopModel(vocab_size,n_fac)
opt = optim.Adam(m.parameters(),1e-2)

In [45]:
fit(m,md,1,opt,F.nll_loss)

  0%|                                                  | 0/939 [00:00<?, ?it/s]



epoch      trn_loss   val_loss   
    0      2.019531   2.007482  



[2.0074820353494456]

In [46]:
set_lrs(opt,0.001) # 减少学习率

In [47]:
fit(m, md, 1, opt, F.nll_loss)

  0%|                                                  | 0/939 [00:00<?, ?it/s]



epoch      trn_loss   val_loss   
    0      1.751342   1.745868  



[1.7458681814359769]

In [48]:
class CharLoopConcatModel(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden) # 输入是因子大小与隐层大小之和(42+256,256)->(298,256)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
    
    def forward(self,*cs):
        bs = cs[0].size(0) # 第一个元素大小 512
        #bs = cs[0].size # dimension specified as 0 but tensor has no dimensions
        #h = V(torch.zeros(bs,n_hidden).cuda()) # 占位张量 512 × n_hidden
        h = V(torch.zeros(bs, n_hidden)) #(512,256) (批量大小, 隐层大小)
        for c in cs:
            inp = torch.cat((h,self.e(c)), 1) # concatenante在给定的维度(512,256)+(512,42) ->(512,298)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
        return F.log_softmax(self.l_out(h), dim=-1)
        

In [49]:
#m = CharLoopConcatModel(vocab_size, n_fac).cuda()
m = CharLoopConcatModel(vocab_size, n_fac)
opt = optim.Adam(m.parameters(), 1e-3)

In [50]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))



In [51]:
len(xs),type(xs),len(xs[0]),type(xs[0]),xs[0].shape,len(xs[0].shape)

(8, list, 512, torch.Tensor, torch.Size([512]), 1)

In [52]:
n_hidden

256

In [53]:
fit(m,md,1,opt,F.nll_loss)

  0%|                                                  | 0/939 [00:00<?, ?it/s]



epoch      trn_loss   val_loss   
    0      1.844303   1.827775  



[1.827775081789645]

In [54]:
set_lrs(opt,1e-4)

In [55]:
fit(m,md,1,opt,F.nll_loss)

  0%|                                                  | 0/939 [00:00<?, ?it/s]



epoch      trn_loss   val_loss   
    0      1.749071   1.742517  



[1.7425170810825448]

In [56]:
# Test model

In [60]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [61]:
VV(T(np.array([10,12,13]))).size(0),VV(T(np.array([10,20,30])))[0]

(3, tensor(10))

In [62]:
get_next('I love Y')

RuntimeError: dimension specified as 0 but tensor has no dimensions

In [None]:
# RNN with pytorch

In [88]:
class CharRnn(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac) # 词嵌入(sz,42)
        self.rnn = nn.RNN(n_fac,n_hidden) # 隐层
        self.l_out = nn.Linear(n_hidden,vocab_size) #输出
    
    def forward(self,*cs):
        bs = cs[0].size(0) # 512
        h = V(torch.zeros(1,bs,n_hidden)) # (1,512,256)
        inp = self.e(torch.stack(cs)) # concatenantes新的维度
        outp,h = self.rnn(inp,h)
        
        return F.log_softmax(self.l_out(outp[-1]),dim=-1) # 沿着最后一个维度

In [90]:
#m = CharRnn(vocab_size, n_fac).cuda()
m = CharRnn(vocab_size, n_fac) # 初始化
opt = optim.Adam(m.parameters(),1e-3)

In [91]:
it = iter(md.trn_dl)
*xs,yt=next(it)

In [92]:
type(xs),len(xs),len(xs[1]),xs[0][:10],type(xs[0][0]) # 每一个元素都是张量

(list, 8, 512, tensor([58, 68,  2,  1, 55, 72, 75, 58, 71, 68]), torch.Tensor)

In [93]:
t = m.e(V(torch.stack(xs))) # [8*512]->[8,512]->e嵌入->[8,512,42]
t.size() # [8, 512, 42]

torch.Size([8, 512, 42])

In [94]:
ht = V(torch.zeros(1,512,n_hidden))
outp, hn = m.rnn(t, ht) # t:[8,512,42], ht:[1,512,256] -> [8,512,256],[1,512,256]
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [95]:
t = m(*V(xs)); t.size() # [8*512]输出[512,86],512个数据，预测输出维度86

torch.Size([512, 86])

In [96]:
fit(m,md,4,opt,F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.859307   1.84655   
    1      1.689058   1.670258  
    2      1.58966    1.589731  
    3      1.534597   1.549192  



[1.5491921181217638]

In [98]:
# Test model

In [99]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [100]:
get_next('I love Yo')

RuntimeError: dimension specified as 0 but tensor has no dimensions

In [101]:
def get_next_n(inp,n): #预测以后n个字符
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c # 移动一位
    return res

In [102]:
get_next_n('for thos', 40)

RuntimeError: dimension specified as 0 but tensor has no dimensions

# Multi-output model

In [104]:
# non-verlappint sets of characters
c_in_dat =  [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)] # 每个子列表长度为cs,

In [105]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [109]:
xs = np.stack(c_in_dat) # stack会产生新的维度(75112, 8)

In [110]:
type(c_in_dat),type(xs),len(c_in_dat),len(xs),xs.shape

(list, numpy.ndarray, 75112, 75112, (75112, 8))

In [111]:
ys = np.stack(c_out_dat);ys.shape

(75112, 8)

In [112]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [113]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

In [118]:
val_idx = get_cv_idxs(len(xs)-cs-1);val_idx,len(val_idx) # 随机采样20%,从75112个元素中得到15020个样本

(array([25679, 44279, 25725, ..., 60996, 71743, 16822]), 15020)

In [147]:
md = ColumnarModelData.from_arrays('.',val_idx,xs,ys,bs=512) # xs:(75112, 8)

In [153]:
class CharSeqRnn(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac) # 词嵌入
        self.rnn = nn.RNN(n_fac, n_hidden) # RNN输入
        self.l_out = nn.Linear(n_hidden, vocab_size) #输出
    
    def forward(self,*cs): # 输入一批
        bs = cs[0].size(0) # 第一批的个数，512，
        h = V(torch.zeros(1,bs,n_hidden)) # 1×512×256
        inp = self.e(torch.stack(cs)) # 8*512 * 42
        outp,h = self.rnn(inp, h)
        return F.log_softmax(self.l_out(outp),dim=-1) # 512 * 42最后一个维度上softmax

In [154]:
#m = CharSeqRnn(vocab_size, n_fac).cuda()
m = CharSeqRnn(vocab_size, n_fac)
opt = optim.Adam(m.parameters(), 1e-3)

In [155]:
it = iter(md.trn_dl) # (xs.shape[0]-len(val_dix))/bs = 117.36, 可以迭代118次
*xst,yt = next(it) # 迭代出来的list类型，8*512，维度怎么变化了？ 512 * 8 -> 8*512

In [156]:
def nll_loss_seq(inp,targ):
    sl,bs,nh = inp.size() #
    targ = targ.transpose(0,1).contiguous().view(-1) # view reshape tensor
    return F.nll_loss(inp.view(-1,nh), targ)

In [157]:
fit(m,md,4,opt,nll_loss_seq)

epoch      trn_loss   val_loss   
    0      2.628987   2.429591  
    1      2.303159   2.209454  
    2      2.148196   2.094467  
    3      2.051874   2.015718  



[2.0157178389883232]

In [173]:
set_lrs(opt,1e-4)

In [174]:
fit(m,md,1,opt,nll_loss_seq)

epoch      trn_loss   val_loss   
    0      1.998778   2.001827  



[2.001826798677762]

Identity init!

In [175]:
#m = CharSeqRnn(vocab_size, n_fac).cuda()
m = CharSeqRnn(vocab_size, n_fac)
opt = optim.Adam(m.parameters(), 1e-2)

In [176]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden)) # 256*256

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [178]:
fit(m,md,4,opt,nll_loss_seq)

epoch      trn_loss   val_loss   
    0      2.376414   2.190629  
    1      2.083358   2.017618  
    2      1.972382   1.941661  
    3      1.910616   1.907838  



[1.9078382542225396]

# Stateful model

In [181]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='C:/input/Fast_AI/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = '{}{}'.format(PATH,TRN_PATH)
VAL = '{}{}'.format(PATH,VAL_PATH)
!ls {PATH}

nietzsche.txt


In [211]:
trn_pct = round(len(text) * 0.8)
open(TRN + 'trn.txt','w').write(text[:trn_pct])
open(VAL + 'val.txt','w').write(text[trn_pct:])

trn.txt


In [213]:
!ls {VAL}

val.txt


In [216]:
len(text),len(open(TRN + 'trn.txt','r').read()),len(open(VAL + 'val.txt','r').read())

(600901, 480721, 120180)

In [236]:
TEXT = data.Field(lower=True,tokenize=list)
bs=64;bptt=8;n_fac=42;n_hidden=256

FILES = dict(train=TRN_PATH,validation=VAL_PATH,test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3) # nlp.py中devie=-1

len(md.trn_dl),md.nt,len(md.trn_ds),len(md.trn_ds[0].text)

(922, 55, 1, 472945)

In [242]:
#RNN
class CharSeqStatefulRnn(nn.Module):
    def __init__(self,vocab_size,n_fac,bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs) # 设置h
    
    def forward(self, cs):
        bs = cs[0].size(0) # 512?
        if self.h.size(1) != bs: self.init_hidden(bs) # 如果不为bs,就为hs
        outp,h = self.rnn(self.e(cs), self.h) #cs:512*8**42, h:1*512*256
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size) # view reshape
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1,bs,n_hidden)) # 1*512*256
            

In [243]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512) # md.nt=55
opt = optim.Adam(m.parameters(), 1e-3)

In [244]:
fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.884857   1.872309  
    1      1.709847   1.725692  
    2      1.616218   1.653537  
    3      1.56666    1.611235  



[1.6112346434425027]

In [232]:
md.nt

55

In [238]:
# RNN Loop

In [254]:
# from the pytorch source
def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [261]:
class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self,cs):
        bs = cs[0].size(0) # 512，512个一组，一共8组
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs:
            o = self.rnn(self.e(c), o) # 
            outp.append(o)
        outp = self.l_out(torch.stack(outp)) # 隐层的输出？
        self.h = repackage_var(o) # 重新包装变量
        return F.log_softmax(outp, dim=-1).view(-1,self.vocab_size)
    
    #def init_hidden(self,bs): self.h = V(torch.zeros(1,bs,n_hidden)) # 隐层初始化
    def init_hidden(self,bs): self.h = V(torch.zeros(bs,n_hidden)) # 隐层初始化
        
    

In [262]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512)
opt = optim.Adam(m.parameters(), 1e-3)

In [263]:
fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      2.066676   2.027538  
    1      1.92879    1.908172  
    2      1.873995   1.874719  
    3      1.850525   1.836054  



[1.8360541076422126]

In [None]:
# GRU

In [278]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs) # 1*512*256
        
    def forward(self, cs):
        bs = cs[0].size(0) # 512
        if self.h.size(-1) != bs: self.init_hidden(bs) # h最后一个维度是否是512
        outp,h = self.rnn(self.e(cs), self.h) # 输入+隐层
        self.h = repackage_var(h) # 包装h为新变量
        return F.log_softmax(self.l_out(outp),dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self,bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [279]:
# from the pytorch source code - for reference
def GRUCell(input, hidden, w_ih, b_ih, w_hh, b_hh):
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chun(3,1)
    h_r, h_i, h_n = gh.chun(3,1)
    
    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    return newgate + inputgate * (hidden - newgate)

In [280]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512)

opt = optim.Adam(m.parameters(), 1e-3)

In [281]:
fit(m, md, 6, opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.97656    1.938397  
    1      1.862626   1.814721  
    2      1.808669   1.778084  
    3      1.766002   1.750427  
    4      1.773002   1.7309    
    5      1.729853   1.718262  



[1.7182616094370866]

In [282]:
??repackage_var

# Putting it all together:LSTM

In [285]:
from fastai import sgdr
n_hidden=512

In [293]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl): # nl是？
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5) # 多了一个nl
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
    
    def forward(self,cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp),dim=-1).view(-1, self.vocab_size) # 
    
    def init_hidden(self,bs):
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden))) # 两个tensors的元祖

In [294]:
#m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2) # md.nt=55
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [295]:
os.makedirs('{}models'.format(PATH), exist_ok=True)

In [299]:
fit(m, md, 2, lo.opt, F.nll_loss)

epoch      trn_loss   val_loss   
    0      1.636392   1.60128   
    1      1.614935   1.577701  



[1.5777011029563774]

In [302]:
on_end = lambda sched, cycle: save_model(m, '{}models/cys_{}'.format(PATH, cycle))
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end= on_end)]
fit(m, md, 2**3-1, lo.opt, F.nll_loss, callbacks=cb)

epoch      trn_loss   val_loss   
    0      1.472722   1.449508  
    1      1.530008   1.486972  
    2      1.410719   1.409494  
    3      1.56901    1.527985  
    4      1.490952   1.466602  
    5      1.404913   1.404065  
    6      1.349563   1.373167  
    7      1.554647   1.516745  
    8      1.515578   1.499048  
    9      1.497208   1.474008  
    10     1.457199   1.442812  
    11     1.405281   1.408189  
    12     1.356546   1.372848  
    13     1.316557   1.349846  
    14     1.28401    1.340366  



[1.3403660913157125]

In [304]:
on_end = lambda sched, cycle: save_model(m, '{}models/cys_{}'.format(PATH, cycle))
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**2-1, lo.opt, F.nll_loss, callbacks=cb)

  2%|▋                             | 21/922 [00:07<05:34,  2.70it/s, loss=1.26]
epoch      trn_loss   val_loss   
    0      1.181904   1.323479  
    1      1.184854   1.324047  
    2      1.179561   1.324294  



[1.32429356757416]

In [312]:
# Test
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(),1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [313]:
get_next('I love y')

'o'

In [314]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:] + c
    return res

In [316]:
print(get_next_n('for thos', 600))

for those surmounters, and i? i have--body is its determine, and old greeks? the first in want is the weak, the european speak also; it man! sympathy, andof everythings, will of sympathy--likewise or originate according) at once about supregation with  eachto apparently proud, asual and all coming i metaphysicul, both against to the paths, andunconscious.--in dememoral life does not i find of k unfree-spiritual ideal, taste, peculiarthe facts to its declared readily (intoxical deficial to him; a new"it! imagine); it do novit upon this physiological generation.--for m for he did to time rely, asia or "


In [317]:
print(get_next_n('I love y', 600))

I love you. life oneses themasters in put follow.63. idiony, grows not impropess again. we must can you, with one which) of though her ought in the question ethics (and raisacy, we exhausted in a bad, truthness of their thing imagoryman is germany: "child":--only on a dispresented remains for the onceno bear supposing to pertain them than alltog experience in the way to the best are beautified, it determine than is that the question.= havingly and soul, the choches, willpossible? gratifiesardown--and ignoble and error over-moral kinds, for distrust, inmocratic (species areundering, andinflaceit as the


In [318]:
?Variable

In [319]:
?sched

Object `sched` not found.


In [322]:
??repackage_var