# Setup

In [1]:
import numpy as np
import torch
from torch import nn,optim
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 读取及处理数据

In [2]:
with open('data/anna.txt','r') as f:
    text=f.read()

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [4]:
# tokenization
idx_to_char=list(set(text))
char_to_idx={ch:idx for idx,ch in enumerate(idx_to_char)}
vocab_size=len(idx_to_char)

In [5]:
char_to_idx['i'],idx_to_char[63]

(79, 'a')

In [6]:
# corpus
corpus=[char_to_idx[i] for i in text]

In [7]:
corpus[:10],len(corpus)

([59, 29, 63, 4, 27, 7, 33, 68, 41, 44], 1985223)

In [8]:
# minibatches
def get_batches(corpus,batch_size,num_steps,device):
    batch_len=len(corpus)//batch_size
    batch_num=(batch_len-1)//num_steps
    corpus=np.array(corpus)   
    batch_data=corpus[:batch_size*batch_len].reshape(batch_size,batch_len)
    for i in range(batch_num):
        x=batch_data[:,i*num_steps:(i+1)*num_steps]
        y=batch_data[:,i*num_steps+1:(i+1)*num_steps+1]
        yield torch.tensor(x,dtype=torch.float32,device=device),\
        torch.tensor(y,dtype=torch.float32,device=device)

In [9]:
# minibatches 测试
test_seq=range(30)
test_batches=get_batches(test_seq,3,4,device)
for x,y in test_batches:
    print(x.shape,y.shape)

torch.Size([3, 4]) torch.Size([3, 4])
torch.Size([3, 4]) torch.Size([3, 4])


In [9]:
# one_hot 表示
def one_hot(x,vocab_size):
    # x:(batch_size,num_steps)
    # out:(batch_size,num_steps,vocab_size)
    bs,ns=x.shape
    out=torch.zeros((bs,ns,vocab_size),dtype=torch.float32,device=x.device)
    for i in range(bs):
        for j in range(ns):
            out[i,j,int(x[i,j])]=1.
    return out

In [10]:
# one_hot 测试
x=torch.tensor([[2,4,1],[1,2,3]])
out=one_hot(x,5);out.shape

torch.Size([2, 3, 5])

In [12]:
# 将corpus划分为minibatches
batch_size,num_steps=8,50
mini_batches=get_batches(corpus,batch_size,num_steps,device)

In [13]:
for X,Y in mini_batches:
    print(X.shape,Y.shape)
    break

torch.Size([8, 50]) torch.Size([8, 50])


# 定义网络

In [36]:
class Net(nn.Module):
    def __init__(self,vocab_size,num_hiddens,num_layers,drop_prob):
        super(Net,self).__init__()
        self.vocab_size=vocab_size
        self.num_hiddens=num_hiddens
        self.lstm=nn.LSTM(vocab_size,num_hiddens,num_layers,dropout=drop_prob,batch_first=True)
        self.dropout=nn.Dropout(p=drop_prob)
        self.fc=nn.Linear(num_hiddens,vocab_size)
        self.num_layers=num_layers
    def forward(self,x,hidden):
        # x:(batch_size,num_steps)
        x=one_hot(x,self.vocab_size)
        temp,hidden=self.lstm(x,hidden)
        temp=self.dropout(temp)   #(batch_size,num_steps,num_hiddens)
        temp=temp.contiguous().view(-1,self.num_hiddens)  #(batch_size*num_steps,num_hiddens)
        out=self.fc(temp)
        return out,hidden
    def init_state(self,batch_size):
        weight=next(self.parameters()).data
        hidden=(weight.new(self.num_layers,batch_size,self.num_hiddens).zero_().cuda(),
               weight.new(self.num_layers,batch_size,self.num_hiddens).zero_().cuda())
        return hidden

# 训练

In [37]:
valid_ratio=0.2
L=len(corpus)
valid_num=int(L*valid_ratio)
valid_corpus=corpus[:valid_num]
train_corpus=corpus[valid_num:]
batch_size=128
num_steps=100
# train_iter=get_batches(train_corpus,batch_size,num_steps,device)
# valid_iter=get_batches(valid_corpus,batch_size,num_steps,device)

In [38]:
class Corpus(Dataset):
    def __init__(self,corpus,batch_size,num_steps):
        self.corpus=corpus
        batch_len=len(corpus)//batch_size
        batch_num=(batch_len-1)//num_steps
        corpus=np.array(corpus)   
        batch_data=corpus[:batch_size*batch_len].reshape(batch_size,batch_len)
        self.batch_data=batch_data
        self.batch_num=batch_num
        self.num_steps=num_steps
    def __len__(self):
        return self.batch_num
    def __getitem__(self,idx):
        x=self.batch_data[:,idx*self.num_steps:(idx+1)*self.num_steps]
        y=self.batch_data[:,idx*self.num_steps+1:(idx+1)*self.num_steps+1]
        return x,y        

In [39]:
train_data=Corpus(train_corpus,batch_size,num_steps)
valid_data=Corpus(valid_corpus,batch_size,num_steps)

In [40]:
train_iter=DataLoader(train_data,batch_size=1)
valid_iter=DataLoader(valid_data,batch_size=1)

In [41]:
for X,Y in valid_iter:
    print(X.shape,Y.shape)
    break

torch.Size([1, 128, 100]) torch.Size([1, 128, 100])


In [21]:
# l_train=sum(1 for _ in train_iter)
# l_valid=sum(1 for _ in valid_iter)
# l_train,l_valid

(124, 31)

In [42]:
def train(model,num_epochs,lr,train_iter,valid_iter,clip=5):
    model=model.to(device)
    criterion=nn.CrossEntropyLoss()
    optimizer=optim.Adam(model.parameters(),lr=lr)
    valid_min_loss=np.inf
    train_loss_list,valid_loss_list=[],[]
    for epoch in range(num_epochs):
#         train_iter=get_batches(train_corpus,batch_size,num_steps,device)
#         valid_iter=get_batches(valid_corpus,batch_size,num_steps,device)
        model.train()
        train_loss,valid_loss=0.0,0.0
        hidden=model.init_state(batch_size)
        for X,Y in train_iter:
            # X:(batch_size,num_steps)
            # Y:(batch_size,num_steps)
            X=X.squeeze(0).to(device)
            Y=Y.squeeze(0).to(device)
            y_pred,hidden=model(X,hidden)  # (batch_size*num_steps,vocab_size)
            hidden=(hidden[0].detach(),hidden[1].detach())
            loss=criterion(y_pred,Y.view(-1).long())
            print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(),clip)
            optimizer.step()
            train_loss+=loss.item()*Y.shape[0]*Y.shape[1]
            train_loss_list.append(loss.item())
        model.eval()
        hidden=model.init_state(batch_size)
        with torch.no_grad():
            for X,Y in valid_iter:
                X=X.squeeze(0).to(device)
                Y=Y.squeeze(0).to(device)
                y_pred,hidden=model(X,hidden)
                loss1=criterion(y_pred,Y.view(-1).long())
                valid_loss+=loss1.item()*Y.shape[0]*Y.shape[1]
                valid_loss_list.append(loss1.item())
        train_loss=train_loss/(len(train_data)*batch_size*num_steps)
        valid_loss=valid_loss/(len(valid_data)*batch_size*num_steps)
        if valid_loss<valid_min_loss:
            print('validation decreased {}-->{}.saving model...'.\
                 format(valid_min_loss,valid_loss))
            valid_min_loss=valid_loss
        print('epoch:{},train loss:{},valid loss:{}'.\
             format(epoch+1,train_loss,valid_loss))

In [43]:
num_hiddens=512
num_layers=1
drop_prob=0
num_epochs=1
lr=0.001
model=Net(vocab_size, num_hiddens, num_layers, drop_prob)

In [44]:
train(model,num_epochs,lr,train_iter,valid_iter)  #训练一次花费时间长

4.422250270843506
4.3940935134887695
4.36194372177124
4.314607620239258
4.213876247406006
3.4982666969299316
3.2906057834625244
3.210195541381836
3.161435127258301
3.157169818878174
3.184666633605957
3.173596143722534
3.1485846042633057
3.14168119430542
3.1212921142578125
3.117521047592163
3.1341660022735596
3.1339242458343506
3.131324052810669
3.1198978424072266
3.1225481033325195
3.1074090003967285
3.0931224822998047
3.093663215637207
3.1027867794036865
3.103619337081909
3.0913808345794678
3.096703290939331
3.083831787109375
3.086979627609253
3.1143958568573
3.095179796218872
3.095350980758667
3.077864646911621
3.1012117862701416
3.104846715927124
3.1231496334075928
3.107208251953125
3.1028528213500977
3.0944082736968994
3.0917162895202637
3.083364248275757
3.0834827423095703
3.078120231628418
3.0878357887268066
3.1052374839782715
3.107113838195801
3.1156508922576904
3.115726947784424
3.0858230590820312
3.0871856212615967
3.0833468437194824
3.0884604454040527
3.070981502532959
3.0664

In [51]:
def predict(model,char,hidden=None,top_k=None):
    x=np.array([[char_to_idx[char]]])  # (1,1)
    x=torch.tensor(x,dtype=torch.float32,device=device)  
    hidden = tuple([each.data for each in hidden])
    out,hidden=model(x,hidden)  # out:(1,vocab_size) 
    p=F.softmax(out,dim=1).data   #(1,vocab_size)
    p=p.cpu()
    if top_k is None:
        top_ch=np.arange(vocab_size)
    else:
        p,top_ch=p.topk(top_k)
        top_ch=top_ch.numpy().squeeze()
    p=p.numpy().squeeze()
    char=np.random.choice(top_ch,p=p/p.sum())
    return idx_to_char[char],hidden

def sample(model,size,prime='The',top_k=None):
    model.eval()
    chars=[ch for ch in prime]
    hidden=model.init_state(1)
    for ch in prime:
        char,hidden=predict(model,ch,hidden,top_k=top_k)
    chars.append(char)
    for ii in range(size):
        char,hidden=predict(model,chars[-1],hidden,top_k=top_k)
        chars.append(char)
    return ''.join(chars)

In [53]:
print(sample(model, 1000, top_k=5, prime="And Levin said"))

And Levin saide his  eeshee tat taretho the tannd  e rot ot ei toro titethithathen ein hore thot aed hin ho ser  eo hotther and  ond oo het ardnneteerit hhe hin  eo thes horis eothis the hin hate toee the th teen hi ee sieet ae hen ho tert oe thathatitie aher het aterith  tenthann hit  toteratone  an  ere ann hor taet oasinneteetheen thin  eo tore th  on tha  th sethar an sinn he hhr aon tet horeerte so ae thene  hithethann onn ao the hen onn honneeat oonesthe soter there aer atree toret orrs ton aone  on aone  ie  or ann tor  thar sh  or  aneteete  end th sh neerasien hinn an teasithan  hit he te te  ie  int aonen and oets toee tone sirn heeritine  on hart ooet aor ahis an  oorie to to teethin ae to e toet hishe tanttaottat eoro t teesandishis hirttand hind aen tie hinn oorer ane ah tind we  hethin he hit eee to eirt hen ho to ton tent onndtherertae tot totiee hone ae thirerite  endet endeathensearet ha sir tor sirese he he shens tenthertanethireenashond he ti sh  a t tht ha se th tit