# A Baseline Implementation for SE125 Project 2

We provide a baseline model for conversation modeling using deep learning.


## 1. Libraries
In this section, we import third-party libraries to be used in this project.
You may need to install them using `pip`:
```
    pip install tqdm
    pip install cython
    pip install tables
    pip install tensorboardX
    ...
```

In [1]:
import numpy as np
import time
import os
import math
import sys
import tables
import json
import random
from tqdm import tqdm

import torch 
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import logging
logger = logging.getLogger(__name__)# logger is a log
logging.basicConfig(level=logging.DEBUG, format="%(message)s")#,format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")


## 2. Utilities

In this section we maintain utilities for model construction and training. 
Please put your own utility modules/functions in this section.

In [2]:
PAD_ID, SOS_ID, EOS_ID, UNK_ID = [0, 1, 2, 3] # reserved vocabulary ids: padding, start of sentence, end of sentence, unknown word
# SOS_ID is the first decoder input when sampling; EOS_ID ends a sentence when ids are converted back to text.
# UNK_ID stands in for words missing from the vocabulary; PAD_ID fills fixed-length id arrays.

def asHHMMSS(s):
    """Format a duration given in seconds as an unpadded ``h:m:s`` string."""
    total_minutes = math.floor(s / 60)
    seconds = s - total_minutes * 60
    hours = math.floor(total_minutes / 60)
    minutes = total_minutes - hours * 60
    return '%d:%d:%d' % (hours, minutes, seconds)

def timeSince(since, percent):
    """Return ``'<elapsed><<remaining>'`` for a task started at `since`.

    since: start time as returned by ``time.time()``.
    percent: fraction of the work already done; the remaining time is
        extrapolated as ``elapsed / percent - elapsed``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s<%s' % (asHHMMSS(elapsed), asHHMMSS(remaining))

#######################################################################
import nltk
try: 
    nltk.word_tokenize("hello world")   # probe the tokenizer once; a LookupError is handled below
except LookupError: 
    nltk.download('punkt')  # fetch the 'punkt' tokenizer data when the probe fails
    
def sent2indexes(sentence, vocab, maxlen):
    '''Convert a sentence, or a batch of sentences, into fixed-length word-id arrays.

    sentence: a string, or a list/tuple of strings.
    vocab: dictionary mapping each word to its integer id.
    maxlen: length of every returned id array; shorter sentences are padded
        with PAD_ID and longer ones are truncated.

    return: for a batch, ``(ids, lens)`` where ``ids`` stacks one id array per
        sentence and ``lens`` is a column of the un-padded token counts;
        for a single string, the first row of each of those arrays.
        Words missing from `vocab` are mapped to UNK_ID.
    '''
    def convert_sent(sent, vocab, maxlen):
        # Start from an all-padding array and overwrite the leading positions.
        idxes = np.full(maxlen, PAD_ID, dtype=np.int64)
        tokens = nltk.word_tokenize(sent.strip())  # tokens include punctuation
        idx_len = min(len(tokens), maxlen)
        for i, token in enumerate(tokens[:idx_len]):
            idxes[i] = vocab.get(token, UNK_ID)
        return idxes, idx_len

    # isinstance (rather than an exact type check) also accepts tuples and
    # list subclasses as a batch of sentences.
    if isinstance(sentence, (list, tuple)):
        inds, lens = [], []
        for sent in sentence:
            idxes, idx_len = convert_sent(sent, vocab, maxlen)
            inds.append(idxes)
            lens.append(idx_len)
        # vstack requires every stacked array to have the same width (maxlen).
        return np.vstack(inds), np.vstack(lens)

    inds, lens = sent2indexes([sentence], vocab, maxlen)
    return inds[0], lens[0]

def indexes2sent(indexes, vocab, ignore_tok=PAD_ID):
    '''Convert word ids back to text (the inverse of sent2indexes).

    indexes: numpy array of word ids, 1-D for one sentence or 2-D for a batch.
    vocab: dictionary mapping each word to its integer id.
    ignore_tok: id that is dropped from the output (padding by default).

    return: ``(sentence, length)`` for 1-D input, or ``(sentences, lens)``
        lists for batched input. Decoding of a row stops right after the
        first EOS_ID, which is included in the text and in the length.
    '''
    id_to_word = dict((idx, word) for word, idx in vocab.items())

    def decode_row(row):
        words = []
        for idx in row:
            if idx == ignore_tok:
                continue  # skip ignored ids, e.g. padding
            words.append(id_to_word[idx])
            if idx == EOS_ID:
                break
        return ' '.join(words), len(words)

    if indexes.ndim == 1:
        return decode_row(indexes)

    decoded = [decode_row(row) for row in indexes]
    sentences = [text for text, _ in decoded]
    lens = [count for _, count in decoded]
    return sentences, lens
    
def save_model(model, epoch):
    """Save the model parameters to ``./output/checkpoint_iter{epoch}.pkl``.

    model: module whose ``state_dict()`` is written.
    epoch: value interpolated into the checkpoint file name.
    """
    ckpt_path = f'./output/checkpoint_iter{epoch}.pkl'
    # Create the output directory on first use so saving does not fail on a
    # fresh checkout.
    os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
    torch.save(model.state_dict(), ckpt_path)
        
def load_model(model, epoch):
    """Load model parameters from ``./output/checkpoint_iter{epoch}.pkl``.

    model: module that receives the parameters via ``load_state_dict``.
    epoch: value interpolated into the checkpoint file name.
    """
    ckpt_path = f'./output/checkpoint_iter{epoch}.pkl'
    # Deserialize onto the CPU so that a checkpoint written on a GPU machine
    # can be read on a CPU-only one; load_state_dict then copies the values
    # into the model's own parameters.
    state_dict = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(state_dict)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\feaaaaaa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


## 3. Configuration
In this section, we configure some hyper-parameters for the model.

In [3]:
def get_config():
    """Return the hyper-parameter dictionary used to build and train the model."""
    data_args = {
        'maxlen': 40,   # maximum utterance length
        'diaglen': 10,  # how many utterances are kept in the context window
    }

    model_args = {
        'emb_size': 200,      # size of word embeddings
        'rnn_hid_utt': 512,   # number of rnn hidden units for utterance encoder
        'rnn_hid_ctx': 512,   # number of rnn hidden units for context encoder
        'rnn_hid_dec': 512,   # number of rnn hidden units for decoder
        'n_layers': 1,        # number of layers
        # Dropout applied to layers (0 = no dropout): units are temporarily
        # dropped at random during training.
        'dropout': 0.5,
        # Teacher-forcing ratio for the decoder: probability of feeding the
        # ground-truth previous token rather than the model's own prediction.
        'teach_force': 0.8,
    }

    training_args = {
        'batch_size': 64,
        'epochs': 10,     # maximum number of epochs
        'lr': 2e-4,       # autoencoder learning rate
        'beta1': 0.9,     # beta1 for adam
        'init_w': 0.05,   # initial w
        'clip': 5.0,      # gradient clipping, max norm (guards against exploding gradients)
    }

    conf = {}
    for section in (data_args, model_args, training_args):
        conf.update(section)
    return conf

## 4. Data Loader
A tool to load batches from the binarized (.h5) dataset

In [4]:
class DialogDataset(data.Dataset):
    """Dataset of (context, response) dialog pairs read from a binarized HDF5 file.

    The file must provide two nodes: ``/sentences``, a flat array of token
    ids, and ``/indices``, a table whose rows carry the fields ``pos_utt``,
    ``ctx_len`` and ``res_len``. For each row, the ``ctx_len`` tokens before
    ``pos_utt`` form the context and the ``res_len`` tokens starting at
    ``pos_utt`` form the response.
    """

    def __init__(self, filepath, max_ctx_len=7, max_utt_len=40):
        """Read the whole dataset into memory.

        filepath: path of the HDF5 file.
        max_ctx_len: number of utterances every returned context is padded
            or truncated to.
        max_utt_len: number of tokens every returned utterance is padded or
            truncated to.
        """
        self.max_ctx_len = max_ctx_len
        self.max_utt_len = max_utt_len

        print("loading data...")
        table = tables.open_file(filepath)
        try:
            # `[:]` reads the complete node into a numpy array, so the file
            # can be closed as soon as both nodes are loaded.
            # np.int64 replaces the `np.long` alias, which NumPy removed.
            self.data = table.get_node('/sentences')[:].astype(np.int64)
            self.index = table.get_node('/indices')[:]
        finally:
            table.close()
        self.data_len = self.index.shape[0]
        print("{} entries".format(self.data_len))

    def __getitem__(self, offset):
        """Return one padded training example.

        return: tuple ``(context, context_len, utt_lens, floors, response, res_len)``
            context: [max_ctx_len x max_utt_len] token ids of the context
                utterances, without the leading token of each utterance.
            context_len: number of real (non-padding) utterances.
            utt_lens: [max_ctx_len] length of each utterance, not counting
                its leading token.
            floors: [max_ctx_len] leading token of each utterance
                (presumably a speaker/floor marker -- TODO confirm against
                the data preparation script).
            response: [max_utt_len] response token ids without the leading
                token of the response.
            res_len: number of non-padding response tokens.
        """
        entry = self.index[offset]
        pos_utt, ctx_len, res_len = entry['pos_utt'], entry['ctx_len'], entry['res_len']
        ctx_arr = self.data[pos_utt-ctx_len:pos_utt]
        res_arr = self.data[pos_utt:pos_utt+res_len]

        ## split context array into utterances (each one ends with EOS_ID)
        row_len = self.max_utt_len + 1  # +1 for the leading token stripped below
        context = []
        utt_lens = []
        utt = []
        for tok in ctx_arr:
            utt.append(tok)
            if tok == EOS_ID:
                if len(utt) < row_len:
                    utt_lens.append(len(utt)-1)  # the leading token is not counted
                    utt.extend([PAD_ID]*(row_len-len(utt)))  # pad
                else:
                    utt = utt[:row_len]  # truncate
                    utt[-1] = EOS_ID     # a truncated utterance still ends with EOS
                    utt_lens.append(self.max_utt_len)
                context.append(utt)
                utt = []

        if len(context) > self.max_ctx_len:  # keep only the most recent utterances
            context = context[-self.max_ctx_len:]
            utt_lens = utt_lens[-self.max_ctx_len:]
        context_len = len(context)

        if len(context) < self.max_ctx_len:  # pad short context
            for _ in range(len(context), self.max_ctx_len):
                # [floor, <sos>, <eos>, <pad>, <pad> ...]
                context.append([0, SOS_ID, EOS_ID]+[PAD_ID]*(self.max_utt_len-2))
                utt_lens.append(2)  # <s> and </s>
        context = np.array(context)
        utt_lens = np.array(utt_lens)
        floors = context[:, 0]    # first column: leading token of every utterance
        context = context[:, 1:]  # remaining columns: the utterance tokens

        ## truncate & pad the response
        response = res_arr[1:]
        if len(response) < self.max_utt_len:
            res_len = len(response)
            response = np.append(response, [PAD_ID]*(self.max_utt_len-len(response)))
        else:
            # Copy before writing: the slice is a view of self.data, so an
            # in-place write would overwrite a token of the stored dataset.
            response = response[:self.max_utt_len].copy()
            response[-1] = EOS_ID
            res_len = self.max_utt_len
        return context, context_len, utt_lens, floors, response, res_len

    def __len__(self):
        """Return the number of (context, response) entries."""
        return self.data_len
    

def load_dict(filename):
    """Load the dictionary stored as JSON on the first line of `filename` (e.g. vocab.json)."""
    # The context manager closes the file handle, which was previously leaked.
    with open(filename, "r") as vocab_file:
        return json.loads(vocab_file.readline())

def load_vecs(fin):
    """Read the ``vecs`` node of the HDF5 file `fin` into a new numpy array.

    return: numpy array with the same shape and dtype as the stored node.
    """
    h5f = tables.open_file(fin)
    try:
        h5vecs = h5f.root.vecs  # the node named 'vecs' directly under the file root
        vecs = np.zeros(shape=h5vecs.shape, dtype=h5vecs.dtype)
        vecs[:] = h5vecs[:]     # copy into memory before the file is closed
    finally:
        # Close the file even when the node is missing or reading fails.
        h5f.close()
    return vecs

## 5. Models
Define your model (including its dependent sub-modules) here. 

In [5]:
import torch.nn.init as weight_init
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNNEncoder(nn.Module):
    """GRU encoder that turns a batch of sequences into one vector per sequence.

    embedder: optional embedding module applied to the inputs first; pass
        None when the inputs are already feature vectors.
    input_size: feature size of the GRU input.
    hidden_size: GRU hidden size per direction.
    bidir: whether the GRU is bidirectional (must be a bool).
    n_layers: number of stacked GRU layers.
    dropout: dropout probability applied to the GRU inputs while training.
    """

    def __init__(self, embedder, input_size, hidden_size, bidir, n_layers, dropout=0.5):
        super(RNNEncoder, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.bidir = bidir
        assert isinstance(self.bidir, bool)
        self.dropout = dropout

        self.embedding = embedder  # e.g. nn.Embedding(vocab_size, emb_size)
        self.rnn = nn.GRU(input_size, hidden_size, n_layers, batch_first=True, bidirectional=bidir)

        # Learnable initial hidden state, shared by all batch items.
        self.init_h = nn.Parameter(torch.randn(self.n_layers*(1+self.bidir), 1, self.hidden_size), requires_grad=True)

        self.init_weights()

    def init_weights(self):
        """adopted from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5"""
        for w in self.rnn.parameters():
            if len(w.shape) > 1:
                weight_init.orthogonal_(w.data)  # weight matrices: orthogonal
            else:
                weight_init.normal_(w.data)      # bias vectors: standard normal

    def forward(self, inputs, input_lens=None, init_h=None):
        """Encode a batch of sequences.

        inputs: [batch_sz x seq_len] ids when an embedder is set, otherwise
            [batch_sz x seq_len x input_size] features.
        input_lens: optional [batch_sz] tensor of true sequence lengths; when
            given, the sequences are packed so padding is not encoded.
        init_h: optional initial state [n_layers*n_dir x batch_sz x hid_sz];
            the learnable initial state is used when omitted.

        return: ``(enc, hids)`` where ``enc`` is [batch_sz x (n_dirs*hid_sz)],
            the final hidden state of the last layer with the directions
            concatenated per batch item, and ``hids`` holds the per-step
            outputs [batch_sz x seq x (n_dirs*hid_sz)].
        """
        if self.embedding is not None:
            inputs = self.embedding(inputs)  # [batch_sz x seq_len] -> [batch_sz x seq_len x emb_sz]

        batch_size, seq_len, emb_size = inputs.size()
        inputs = F.dropout(inputs, self.dropout, self.training)

        if input_lens is not None:
            # Sort by decreasing length and pack the padded sequences.
            input_lens_sorted, indices = input_lens.sort(descending=True)
            inputs_sorted = inputs.index_select(0, indices)
            inputs = pack_padded_sequence(inputs_sorted, input_lens_sorted.data.tolist(), batch_first=True)

        if init_h is None:
            # Expand the learnable initial state along the batch dimension.
            init_h = self.init_h.expand(-1, batch_size, -1).contiguous()
        hids, h_n = self.rnn(inputs, init_h)  # hids: [b x seq x (n_dir*hid_sz)]
                                              # h_n: [(n_layers*n_dir) x batch_sz x hid_sz]
        if input_lens is not None:
            # Unpack and restore the original batch order.
            _, inv_indices = indices.sort()
            hids, lens = pad_packed_sequence(hids, batch_first=True)
            hids = hids.index_select(0, inv_indices)
            h_n = h_n.index_select(1, inv_indices)
        h_n = h_n.view(self.n_layers, (1+self.bidir), batch_size, self.hidden_size)  # [n_layers x n_dirs x batch_sz x hid_sz]
        h_n = h_n[-1]  # last layer: [n_dirs x batch_sz x hid_sz]
        # Move the batch dimension first BEFORE flattening; flattening the
        # [n_dirs x batch_sz x hid_sz] tensor directly would mix the states of
        # different batch items whenever the encoder is bidirectional.
        enc = h_n.transpose(0, 1).contiguous().view(batch_size, -1)  # [batch_sz x (n_dirs*hid_sz)]

        return enc, hids
    
class ContextEncoder(nn.Module):
    """Hierarchical encoder: encodes each utterance, then the utterance sequence."""

    def __init__(self, utt_encoder, input_size, hidden_size, n_layers=1, dropout=0.5):
        super().__init__()
        # The utterance-level encoder is supplied by the caller.
        self.utt_encoder = utt_encoder
        # Dialog-level encoder: unidirectional, and without an embedder
        # because it consumes utterance vectors.
        self.ctx_encoder = RNNEncoder(None, input_size, hidden_size, False, n_layers, dropout)

    def forward(self, context, context_lens, utt_lens):
        """Encode a batch of dialog contexts into one vector per dialog.

        context: [batch_sz x diag_len x max_utt_len] token ids.
        context_lens: lengths handed to the dialog-level encoder (one per dialog).
        utt_lens: lengths of the individual utterances.
        """
        n_dialogs, n_utts, n_toks = context.size()
        flat_utts = context.view(-1, n_toks)      # [(batch_size*diag_len) x max_utt_len]
        flat_lens = utt_lens.view(-1)
        utt_vectors = self.utt_encoder(flat_utts, flat_lens)[0]
        dialog_input = utt_vectors.view(n_dialogs, n_utts, -1)
        enc, _ = self.ctx_encoder(dialog_input, context_lens)
        return enc
  

class RNNDecoder(nn.Module):
    """Single-layer GRU decoder with a linear projection onto the vocabulary.

    embedder: optional embedding module applied to the input ids.
    input_size: size of the input to the RNN (e.g., embedding dim).
    hidden_size: RNN hidden size.
    vocab_size: size of the output layer.
    n_layers: stored on the module; the GRU itself is built with one layer.
    dropout: dropout probability applied to the decoder inputs in `forward`.
    """

    def __init__(self, embedder, input_size, hidden_size, vocab_size, n_layers=1, dropout=0.5):
        super(RNNDecoder, self).__init__()
        self.n_layers = n_layers
        self.input_size = input_size    # size of the input to the RNN (e.g., embedding dim)
        self.hidden_size = hidden_size  # RNN hidden size
        self.vocab_size = vocab_size    # RNN output size (vocab size)
        self.dropout = dropout

        self.embedding = embedder
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        self.project = nn.Linear(hidden_size, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Orthogonal GRU weight matrices, uniform projection weights, zero projection bias."""
        for w in self.rnn.parameters():
            if w.dim() > 1:
                weight_init.orthogonal_(w)
        self.project.weight.data.uniform_(-0.1, 0.1)
        nn.init.constant_(self.project.bias, 0.)

    def forward(self, init_h, inputs=None, lens=None, enc_hids=None, src_pad_mask=None, context=None):
        '''Run the decoder over a whole input sequence.

        init_h: initial hidden state for decoder [batch_sz x hid_sz]
        inputs: inputs to the decoder
        lens: input lengths (not used by this implementation)
        enc_hids: enc_hids for attention use (not used by this implementation)
        src_pad_mask: not used by this implementation
        context: optional [batch_sz x ctx_sz] vector concatenated to the
            input of every time step

        return: ``(decoded, h)`` with ``decoded`` the vocabulary logits
            [batch_sz x seq_len x vocab_size] and ``h`` the final GRU state.
        '''
        if self.embedding is not None:
            inputs = self.embedding(inputs)  # [batch_sz x seqlen x emb_sz]
        batch_size, maxlen, _ = inputs.size()
        inputs = F.dropout(inputs, self.dropout, self.training)
        h = init_h.unsqueeze(0)  # [1 x batch_sz x hid_sz]

        if context is not None:
            repeated_context = context.unsqueeze(1).repeat(1, maxlen, 1)  # [batch_sz x max_len x hid_sz]
            inputs = torch.cat([inputs, repeated_context], 2)

        hids, h = self.rnn(inputs, h)
        # Flatten the time steps before the linear layer, then restore them.
        decoded = self.project(hids.contiguous().view(-1, self.hidden_size))
        decoded = decoded.view(batch_size, maxlen, self.vocab_size)
        return decoded, h

    @torch.no_grad()  # the sampled ids carry no gradient, so no graph is needed
    def sampling(self, init_h, enc_hids, src_pad_mask, context, maxlen, to_numpy=True):
        """
        Sample one response per batch item, token by token.

        Each token is drawn with ``torch.multinomial`` from the softmax over
        the vocabulary, so the output is stochastic (not a greedy argmax).

        :param init_h: [batch_sz x hid_sz]
        :param enc_hids: not used by this implementation
        :param src_pad_mask: not used by this implementation
        :param context: not used by this implementation
        :param maxlen: number of decoding steps
        :param to_numpy: return numpy arrays instead of tensors
        :return: ``(decoded_words, sample_lens)``: sampled ids
            [batch_sz x maxlen] and, per item, the number of tokens sampled
            before the first EOS_ID
        """
        device = init_h.device
        batch_size = init_h.size(0)
        decoded_words = torch.zeros((batch_size, maxlen), dtype=torch.long, device=device)
        sample_lens = torch.zeros((batch_size), dtype=torch.long, device=device)
        len_inc = torch.ones((batch_size), dtype=torch.long, device=device)

        x = torch.zeros((batch_size, 1), dtype=torch.long, device=device).fill_(SOS_ID)  # [batch_sz x 1] (1=seq_len)
        h = init_h.unsqueeze(0)  # [1 x batch_sz x hid_sz]
        for di in range(maxlen):
            if self.embedding is not None:
                x = self.embedding(x)  # [batch_sz x 1 x emb_sz]
            h_n, h = self.rnn(x, h)    # h_n: [batch_sz x 1 x hid_sz] h: [1 x batch_sz x hid_sz]

            logits = self.project(h_n).squeeze(1)               # [batch_sz x vocab_sz]
            x = torch.multinomial(F.softmax(logits, dim=1), 1)  # [batch_sz x 1]
            # Squeeze only the sample dimension so a batch of one keeps its
            # batch dimension.
            tokens = x.squeeze(1)
            decoded_words[:, di] = tokens
            len_inc = len_inc * (tokens != EOS_ID).long()  # stop counting once EOS is met
            sample_lens = sample_lens + len_inc

        if to_numpy:
            decoded_words = decoded_words.cpu().numpy()
            sample_lens = sample_lens.cpu().numpy()
        return decoded_words, sample_lens

class MyModel(nn.Module):
    '''The basic Hierarchical Recurrent Encoder-Decoder model.

    An utterance encoder and a context encoder turn the dialog context into
    one vector, which initializes the decoder that generates the response.
    All three share a single word-embedding table.
    '''
    def __init__(self, config, vocab_size):
        """
        config: hyper-parameter dictionary (keys read here: maxlen, clip,
            init_w, emb_size, rnn_hid_utt, rnn_hid_ctx, n_layers, dropout, lr).
        vocab_size: number of words in the vocabulary.
        """
        super(MyModel, self).__init__()
        self.vocab_size = vocab_size
        self.maxlen = config['maxlen']
        self.clip = config['clip']
        self.init_w = config['init_w']

        # Embedding table: vocab_size rows of emb_size; PAD_ID is the padding index.
        self.embedder = nn.Embedding(vocab_size, config['emb_size'], padding_idx=PAD_ID)
        # utterance encoder: bidirectional, encodes one utterance to a vector
        self.utt_encoder = RNNEncoder(self.embedder, config['emb_size'], config['rnn_hid_utt'], True,
                                      config['n_layers'], config['dropout'])
        # context encoder: encodes the sequence of utterance vectors to a vector
        self.context_encoder = ContextEncoder(self.utt_encoder, config['rnn_hid_utt']*2,
                                              config['rnn_hid_ctx'], 1, config['dropout'])
        # utterance decoder, initialized with the context vector
        self.decoder = RNNDecoder(self.embedder, config['emb_size'], config['rnn_hid_ctx'], vocab_size, 1, config['dropout'])
        # self.parameters() yields every parameter exactly once. Concatenating
        # the encoder and decoder parameter lists would list the shared
        # embedding weight twice.
        self.optimizer = optim.Adam(self.parameters(), lr=config['lr'])

    def forward(self, context, context_lens, utt_lens, response, res_lens):
        """Return the cross-entropy loss of predicting `response` from the context.

        The decoder is fed ``response[:, :-1]`` and trained to predict
        ``response[:, 1:]``; padded target positions are excluded from the loss.
        """
        c = self.context_encoder(context, context_lens, utt_lens)
        output, _ = self.decoder(c, response[:,:-1], res_lens-1)  # output: [batch x seq_len x n_tokens]
        dec_target = response[:,1:].clone()
        dec_target[response[:,1:]==PAD_ID] = -100  # -100 is CrossEntropyLoss's default ignore_index
        loss = nn.CrossEntropyLoss()(output.view(-1, self.vocab_size), dec_target.view(-1))
        return loss

    def train_batch(self, context, context_lens, utt_lens, response, res_lens):
        """Run one optimization step on a batch and return ``{'train_loss': float}``."""
        self.context_encoder.train()
        self.decoder.train()

        loss = self.forward(context, context_lens, utt_lens, response, res_lens)

        self.optimizer.zero_grad()  # reset the gradients
        loss.backward()
        # Clip the global gradient norm to prevent exploding gradients in RNNs.
        # Each parameter is counted once in the norm.
        nn.utils.clip_grad_norm_(self.parameters(), self.clip)
        self.optimizer.step()

        return {'train_loss': loss.item()}

    def valid(self, context, context_lens, utt_lens, response, res_lens):
        """Return ``{'valid_loss': float}`` for a batch without updating the model."""
        # Dropout behaves differently in training and evaluation, so switch
        # the sub-modules to evaluation mode first.
        self.context_encoder.eval()
        self.decoder.eval()
        with torch.no_grad():  # no gradients are needed for validation
            loss = self.forward(context, context_lens, utt_lens, response, res_lens)
        return {'valid_loss': loss.item()}

    def sample(self, context, context_lens, utt_lens, n_samples):
        """Sample `n_samples` responses for every context in the batch.

        return: ``(sample_words, sample_lens)`` as numpy arrays of shape
            [(batch_sz*n_samples) x maxlen] and [(batch_sz*n_samples)]; the
            samples of one context are stored in consecutive rows.
        """
        self.context_encoder.eval()
        self.decoder.eval()
        with torch.no_grad():
            c = self.context_encoder(context, context_lens, utt_lens)
            # One decoder run per requested sample: repeat every context
            # vector n_samples times and decode up to self.maxlen tokens.
            c_repeated = c.repeat_interleave(n_samples, dim=0)
            sample_words, sample_lens = self.decoder.sampling(
                c_repeated, None, None, None, self.maxlen, to_numpy=True)
        return sample_words, sample_lens

## 6. Evaluation
We provide the evaluation script as well as the BLEU score metric. 

**Do not change code in this block**

In [6]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from collections import Counter

class Metrics:
    """Evaluation metrics for generated responses (sentence-level BLEU).
    """
    def __init__(self):
        super(Metrics, self).__init__()

    def sim_bleu(self, hyps, ref):
        """
        Score every hypothesis against one reference with smoothed 4-gram BLEU.

        :param ref - a list of tokens of the reference
        :param hyps - a list of hypotheses, each one a list of tokens
    
        :return maxbleu - recall bleu (highest score among the hypotheses)
        :return avgbleu - precision bleu (mean score over the hypotheses)
        """
        scores = []
        for hyp in hyps:
            try:
                scores.append(sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method7,
                                        weights=[1./4, 1./4, 1./4, 1./4]))
            except:
                # Any failure while scoring a hypothesis counts as a score of 0.
                scores.append(0.0)
        return np.max(scores), np.mean(scores)
    
def evaluate(model, metrics, test_loader, vocab, repeat, f_eval):
    """Sample responses for the test set and report BLEU statistics.

    model: object providing ``parameters()`` and ``sample(context, context_lens, utt_lens, repeat)``.
    metrics: object providing ``sim_bleu(hyps, ref)``.
    test_loader: iterable of batches ``(context, context_lens, utt_lens, floors, response, res_lens)``.
    vocab: dictionary mapping each word to its integer id.
    repeat: value passed to ``model.sample`` as the number of samples.
    f_eval: optional writable text file receiving the detailed results; None disables it.

    return: dict with the keys 'avg_len', 'recall_bleu', 'prec_bleu' and 'f1_bleu'.
    """
    ivocab = {v: k for k, v in vocab.items()}
    device = next(model.parameters()).device
    
    recall_bleus, prec_bleus, avg_lens  = [], [], []
        
    dlg_id = 0
    for context, context_lens, utt_lens, floors, response, res_lens in tqdm(test_loader): 
        
        # Stop once more than 5000 batches have been evaluated.
        if dlg_id > 5000: break
        
#        max_ctx_len = max(context_lens)
        max_ctx_len = context.size(1)
        # Drop the first token column of every utterance and shorten the utterance lengths by one.
        # NOTE(review): confirm which token sits in column 0 at this point (the comment below says <sos>).
        context, utt_lens, floors = context[:,:max_ctx_len,1:], utt_lens[:,:max_ctx_len]-1, floors[:,:max_ctx_len] 
                         # remove empty utts and the sos token in the context and reduce the context length
        ctx, ctx_lens = context, context_lens  # keep references to the pre-transfer tensors for printing
        context, context_lens, utt_lens \
            = [tensor.to(device) for tensor in [context, context_lens, utt_lens]]

#################################################
        # Replace zero lengths by one; presumably because sequence packing rejects empty sequences -- TODO confirm.
        utt_lens[utt_lens==0]=1
#################################################
        
        with torch.no_grad():
            sample_words, sample_lens = model.sample(context, context_lens, utt_lens, repeat)
        # nparray: [repeat x seq_len]       
        
        pred_sents, _ = indexes2sent(sample_words, vocab)
        pred_tokens = [sent.split(' ') for sent in pred_sents]   
        # Only the first response of the batch is used as the reference; SOS_ID is the ignored token here.
        ref_str, _ =indexes2sent(response[0].numpy(), vocab, SOS_ID)
        #ref_str = ref_str.encode('utf-8')
        ref_tokens = ref_str.split(' ')
        
        max_bleu, avg_bleu = metrics.sim_bleu(pred_tokens, ref_tokens)
        recall_bleus.append(max_bleu)
        prec_bleus.append(avg_bleu)
        
        avg_lens.append(np.mean(sample_lens))

        response, res_lens = [tensor.to(device) for tensor in [response, res_lens]]
        
        ## Write concrete results to a text file
        dlg_id += 1 
        if f_eval is not None:
            f_eval.write("Batch {:d} \n".format(dlg_id))
            # print the context (at most the last five utterances of the first dialog)
            start = np.maximum(0, ctx_lens[0]-5)
            for t_id in range(start, ctx_lens[0], 1):
                # NOTE(review): for 1-D input indexes2sent returns a (sentence, length) tuple,
                # so the tuple itself is what gets formatted below.
                context_str = indexes2sent(ctx[0, t_id].numpy(), vocab)
                f_eval.write("Context {:d}-{:d}: {}\n".format(t_id, floors[0, t_id], context_str))
            #print the ground truth response    
            f_eval.write("Target >> {}\n".format(ref_str.replace(" ' ", "'")))
            for res_id, pred_sent in enumerate(pred_sents):
                f_eval.write("Sample {:d} >> {}\n".format(res_id, pred_sent.replace(" ' ", "'")))
            f_eval.write("\n")
    prec_bleu= float(np.mean(prec_bleus))
    recall_bleu = float(np.mean(recall_bleus))
    # F1 is the harmonic mean of precision and recall BLEU; the small constant avoids division by zero.
    result = {'avg_len':float(np.mean(avg_lens)),
              'recall_bleu': recall_bleu, 'prec_bleu': prec_bleu, 
              'f1_bleu': 2*(prec_bleu*recall_bleu) / (prec_bleu+recall_bleu+10e-12),
             }
    
    if f_eval is not None:
        for k, v in result.items():
            f_eval.write(str(k) + ':'+ str(v)+' ')
        f_eval.write('\n')
    print("Done testing")
    print(result)
    
    return result


## 7. Training
The training script here.

In [7]:
import argparse
from datetime import datetime
from tensorboardX import SummaryWriter # install tensorboardX (pip install tensorboardX) before importing this package

def _strip_context_sos(context, utt_lens):
    """Drop the first token of every context utterance and shorten lengths by one.

    Position 0 of each utterance is sliced away (the original inline comment
    describes it as the SOS token) and ``utt_lens`` is decremented to match.
    """
    max_ctx_len = context.size(1)  # full context width; the commented-out alternative was max(context_lens)
    return context[:, :max_ctx_len, 1:], utt_lens[:, :max_ctx_len] - 1


def train(args, model=None, pad=0):
    """Train the dialog model, periodically evaluating it on the validation set.

    Args:
        args: parsed argument namespace. Reads ``seed``, ``gpu_id``, ``visual``,
            ``data_path``, ``dataset``, ``model``, ``reload_from``,
            ``log_every``, ``valid_every`` and ``eval_every``.
        model: optional model instance to train. When ``None`` a new
            ``MyModel(config, n_tokens)`` is created.
        pad: not used by this function; kept so existing callers keep working.

    Returns:
        The model object as it is after the last training step (this is the
        live instance, not necessarily the best checkpoint written to disk).

    Side effects:
        Appends log records to ``./output/logs.txt``; when ``args.visual`` is
        set, writes tensorboard events under ``./output/logs/<timestamp>`` and
        the configuration to ``./output/config_<timestamp>.json``; writes
        evaluation samples to ``./output/tmp_results/iter<N>.txt``; saves the
        best model so far through ``save_model(model, 0)``.
    """
    # Mirror log records into a file. The handler is detached again in the
    # ``finally`` clause so that calling train() repeatedly (e.g. re-running a
    # notebook cell) does not stack handlers and duplicate every log line.
    fh = logging.FileHandler("./output/logs.txt")
    logger.addHandler(fh)
    tb_writer = None

    try:
        timestamp = datetime.now().strftime('%Y%m%d%H%M')
        if args.visual:
            tb_writer = SummaryWriter(f"./output/logs/{timestamp}")

        # Set the random seed manually for reproducibility.
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        # Use the requested GPU if CUDA is available, otherwise fall back to the CPU.
        device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
        print(device)

        config = get_config()

        if args.visual:
            # Save the configuration next to the tensorboard logs.
            with open(f'./output/config_{timestamp}.json', 'w') as f_config:
                json.dump(config, f_config)

        ###############################################################################
        # Load data
        ###############################################################################
        data_path = args.data_path + args.dataset + '/'
        # config['diaglen'] and config['maxlen'] are passed to every dataset;
        # per the original comments they are the number of utterances kept in
        # the context window and the maximum utterance length.
        train_set = DialogDataset(os.path.join(data_path, 'train.h5'), config['diaglen'], config['maxlen'])
        valid_set = DialogDataset(os.path.join(data_path, 'valid.h5'), config['diaglen'], config['maxlen'])
        # NOTE(review): test_set is not used below; it is still loaded so that
        # the function's observable behaviour (I/O, messages) stays as before.
        test_set = DialogDataset(os.path.join(data_path, 'test.h5'), config['diaglen'], config['maxlen'])
        vocab = load_dict(os.path.join(data_path, 'vocab.json'))
        ivocab = {v: k for k, v in vocab.items()}
        n_tokens = len(ivocab)
        metrics = Metrics()  # helper object handed to evaluate()
        print("Loaded data!")

        ###############################################################################
        # Define the models
        ###############################################################################
        if model is None:
            model = MyModel(config, n_tokens)

        if args.reload_from >= 0:
            load_model(model, args.reload_from)  # load parameters from checkpoint

        model = model.to(device)

        logger.info("Training...")
        best_perf = -1
        itr_global = 1
        # Resume after the reloaded epoch; uses the same ">= 0" test as the
        # checkpoint loading above so the two can never disagree.
        start_epoch = args.reload_from + 1 if args.reload_from >= 0 else 1

        for epoch in range(start_epoch, config['epochs'] + 1):
            epoch_start_time = time.time()
            itr_start_time = time.time()

            # Re-create the loader every epoch (shuffled). drop_last=True
            # discards the final batch when it is smaller than batch_size.
            train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'],
                                                       shuffle=True, num_workers=1, drop_last=True)
            n_iters = len(train_loader)
            itr = 1
            for batch in train_loader:  # loop through all batches in training data
                model.train()
                context, context_lens, utt_lens, floors, response, res_lens = batch
                context, utt_lens = _strip_context_sos(context, utt_lens)
                batch_gpu = [tensor.to(device) for tensor in [context, context_lens, utt_lens, response, res_lens]]
                train_results = model.train_batch(*batch_gpu)

                if itr % args.log_every == 0:
                    elapsed = time.time() - itr_start_time
                    log = '%s|%s@gpu%d epo:[%d/%d] iter:[%d/%d] step_time:%ds elapsed:%s'\
                    %(args.model, args.dataset, args.gpu_id, epoch, config['epochs'],
                             itr, n_iters, elapsed, timeSince(epoch_start_time, itr/n_iters))
                    logger.info(log)
                    logger.info(train_results)
                    if args.visual:
                        tb_writer.add_scalar('train_loss', train_results['train_loss'], itr_global)

                    itr_start_time = time.time()

                # Validation-loss pass. The hard-coded ``and False`` keeps this
                # branch disabled, exactly as in the original.
                # NOTE(review): the reason is not visible here; confirm before re-enabling.
                if itr % args.valid_every == 0 and False:
                    logger.info('Validation ')
                    valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=config['batch_size'], shuffle=True, num_workers=1)
                    model.eval()
                    valid_losses = []
                    for context, context_lens, utt_lens, floors, response, res_lens in valid_loader:
                        context, utt_lens = _strip_context_sos(context, utt_lens)
                        valid_batch = [tensor.to(device) for tensor in [context, context_lens, utt_lens, response, res_lens]]
                        valid_results = model.valid(*valid_batch)
                        valid_losses.append(valid_results['valid_loss'])
                    if args.visual: tb_writer.add_scalar('valid_loss', np.mean(valid_losses), itr_global)
                    logger.info({'valid_loss':np.mean(valid_losses)})

                itr += 1
                itr_global += 1

                if itr_global % args.eval_every == 0:  # evaluate the model in the validation set
                    model.eval()
                    logger.info("Evaluating in the validation set..")

                    valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=1, shuffle=False, num_workers=1)

                    repeat = 10
                    # The context manager closes (and flushes) the result file
                    # even if evaluate() raises.
                    with open(f"./output/tmp_results/iter{itr_global}.txt", "w") as f_eval:
                        eval_results = evaluate(model, metrics, valid_loader, vocab, repeat, f_eval)
                    bleu = eval_results['recall_bleu']
                    if bleu > best_perf:
                        # Remember the new best score so that a later, worse
                        # evaluation cannot overwrite the best checkpoint.
                        best_perf = bleu
                        save_model(model, 0)
                    if args.visual:
                        tb_writer.add_scalar('recall_bleu', bleu, itr_global)

            # end of epoch ----------------------------

        return model
    finally:
        logger.removeHandler(fh)
        fh.close()
        if tb_writer is not None:
            tb_writer.close()


## 8. Main Function (Training)
You can change the default arguments by setting the `default` attribute.

In [8]:

if __name__ == '__main__':
    # One (flags, options) entry per command-line argument, in registration
    # order. Edit the ``default`` values here to change the run configuration.
    argument_specs = [
        # Path Arguments
        (('--data_path',), dict(type=str, default='./data/', help='location of the data corpus')),
        (('--model',), dict(type=str, default='MyModel', help='model name')),
        (('--dataset',), dict(type=str, default='weibo', help='name of dataset.')),
        (('-v', '--visual'), dict(action='store_true', default=False,
                                  help='visualize training status in tensorboard')),
        (('--reload_from',), dict(type=int, default=-1, help='reload from a trained ephoch')),
        (('--gpu_id',), dict(type=int, default=1, help='GPU ID')),
        # Evaluation Arguments
        (('--log_every',), dict(type=int, default=100,
                                help='interval to log autoencoder training results')),
        (('--valid_every',), dict(type=int, default=1000, help='interval to validation')),
        (('--eval_every',), dict(type=int, default=2000,
                                 help='interval to evaluation to concrete results')),
        (('--seed',), dict(type=int, default=1111, help='random seed')),
    ]

    parser = argparse.ArgumentParser(description='Dialog Pytorch')
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    # An explicit empty argument list is parsed, so every option takes its
    # default (presumably to keep the notebook kernel's own argv out of it).
    args = parser.parse_args(args=[])
    print(vars(args))

    # Create the output directories used during training if they are missing.
    for output_dir in ('./output/models', './output/tmp_results'):
        os.makedirs(output_dir, exist_ok=True)

    torch.backends.cudnn.benchmark = True      # let cuDNN auto-tune its algorithms for speed
    torch.backends.cudnn.deterministic = True  # restrict cuDNN to deterministic algorithms

    model = train(args)

{'data_path': './data/', 'model': 'MyModel', 'dataset': 'weibo', 'visual': False, 'reload_from': -1, 'gpu_id': 1, 'log_every': 100, 'valid_every': 1000, 'eval_every': 2000, 'seed': 1111}
cpu
loading data...


OSError: ``./data/weibo/train.h5`` does not exist

## 9. Main Function (Test)

**Please do not change code here except the default arguments**

In [None]:

def test(args):
    """Evaluate the saved model on the test split and write the results to disk.

    Args:
        args: parsed argument namespace. Reads ``seed``, ``data_path``,
            ``dataset`` and ``n_samples``. ``reload_from`` is not consulted:
            the checkpoint id is fixed to 0, the id ``train`` saves under.

    Side effects:
        Writes the evaluation output to ``./output/results.txt``.
    """
    conf = get_config()
    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    else:
        print("Note that our pre-trained models require CUDA to evaluate.")

    # Load data
    data_path = args.data_path + args.dataset + '/'
    test_set = DialogDataset(data_path + 'test.h5', conf['diaglen'], conf['maxlen'])
    test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=1, shuffle=False, num_workers=1)
    vocab = load_dict(data_path + 'vocab.json')
    n_tokens = len(vocab)

    metrics = Metrics()

    # Load model checkpoints
    model = MyModel(conf, n_tokens)
    load_model(model, 0)
    #model=model.to(device)
    model.eval()  # put the module in evaluation mode

    repeat = args.n_samples

    # The context manager guarantees the results file is flushed and closed,
    # even if evaluate() raises part-way through.
    with open("./output/results.txt", "w") as f_eval:
        evaluate(model, metrics, test_loader, vocab, repeat, f_eval)

if __name__ == "__main__":
    # One (flag, options) entry per command-line argument, in registration
    # order. Only the ``default`` values are meant to be edited.
    argument_specs = [
        ('--data_path', dict(type=str, default='./data/', help='location of the data corpus')),
        ('--dataset', dict(type=str, default='weibo', help='name of dataset, SWDA or DailyDial')),
        ('--model', dict(type=str, default='MyModel', help='model name')),
        ('--reload_from', dict(type=int, default=0, help='directory to load models from')),
        ('--n_samples', dict(type=int, default=10, help='Number of responses to sampling')),
        ('--seed', dict(type=int, default=1111, help='random seed')),
    ]

    parser = argparse.ArgumentParser(description='PyTorch DialogGAN for Eval')
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)

    # An explicit empty argument list is parsed, so every option takes its default.
    args = parser.parse_args(args=[])
    print(vars(args))
    test(args)