# 用RNN做POS tagging

姓名: \[write-your-name-here\]

在这份作业中，你会用一个bidirectional recurrent neural network来做POS tagging。

In [None]:
# import necessary libraries and set the random seeds

import copy
import os
import random
import shutil
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset
from torchtext import data

# Model dimensions and mini-batch size shared by the cells below.
EMBEDDING_DIM, HIDDEN_DIM = 300, 200
BATCH_SIZE = 128

# Detect GPU support once; later cells consult this flag.
USE_CUDA = torch.cuda.is_available()

# Seed every random number generator in use with the same value so that
# repeated runs start from the same state.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)


载入POS tagging的训练和dev数据集。这些文件都是tab分隔的text和POS tag数据。

In [None]:
def load_datasets():
    """Load the train/dev POS tagging data and wrap it in bucketed iterators.

    Reads ``train.txt`` and ``dev.txt`` from the current directory as
    tab-separated files whose columns are mapped to the ``text`` and ``tags``
    fields. ``dev.txt`` is used for both the validation and the test split.

    Returns:
        A ``(text, tags, dataloaders)`` tuple: the two ``data.Field`` objects
        with vocabularies built from the training split only, and a dict
        mapping ``'train'``, ``'validation'`` and ``'test'`` to their
        iterators.
    """
    # include_lengths=True is why the training/evaluation loops read the
    # token tensor as batch.text[0].
    text = data.Field(include_lengths=True)
    tags = data.Field()
    columns = [('text', text), ('tags', tags)]
    train_data, val_data, test_data = data.TabularDataset.splits(
        path='./', train='train.txt', validation='dev.txt', test='dev.txt',
        fields=columns, format='tsv')

    # Batches are bucketed by sentence length.
    train_loader, dev_loader, test_loader = data.BucketIterator.splits(
        (train_data, val_data, test_data),
        batch_sizes=(BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
        sort_key=lambda x: len(x.text))

    text.build_vocab(train_data)
    tags.build_vocab(train_data)
    # Hand out the iterator that was actually built for the test split rather
    # than silently reusing the validation iterator.
    dataloaders = {'train': train_loader,
                   'validation': dev_loader,
                   'test': test_loader}
    return text, tags, dataloaders

text, tags, dataloaders = load_datasets()
# One extra slot is added on top of the entries in the text vocabulary.
text_vocab_size = 1 + len(text.vocab.stoi)
# One entry is left out of the tag vocabulary; per the original note this
# gives 42 tags (not including the <pad> token).
tag_vocab_size = len(tags.vocab.stoi) - 1
print(text_vocab_size, tag_vocab_size, sep='\n')

In [None]:
class POSDataset(Dataset):
    """Sentence/tag dataset read from two parallel text files.

    ``path`` must contain ``sentences.txt`` and ``tags.txt``. Every line is
    split on single spaces and converted to indices with the matching
    vocabulary's ``toIdx`` method; item ``i`` pairs line ``i`` of both files.
    """

    def __init__(self, path, sen_vocab, tag_vocab):
        super().__init__()
        self.sen_vocab = sen_vocab
        self.tag_vocab = tag_vocab
        self.num_classes = tag_vocab.size()

        self.sentences = self._read_indexed_lines(
            os.path.join(path, 'sentences.txt'), self.sen_vocab)
        self.tags = self._read_indexed_lines(
            os.path.join(path, 'tags.txt'), self.tag_vocab)

        # Both files must describe the same number of examples.
        assert(len(self.sentences) == len(self.tags))

    @staticmethod
    def _read_indexed_lines(file_path, vocab):
        """Return one LongTensor of vocabulary indices per line of the file."""
        with open(file_path, 'r') as handle:
            return [
                torch.LongTensor(vocab.toIdx(row.rstrip('\n').split(' ')))
                for row in handle
            ]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        # Returns the (sentence, tags) tensors stored at the same position.
        return self.sentences[index], self.tags[index]

In [None]:
def sequence_mask(sequence_length, max_len=None):
    """Build a boolean padding mask from a 1-D tensor of sequence lengths.

    Args:
        sequence_length: 1-D integer tensor holding one length per sequence.
        max_len: Number of mask columns. Defaults to the largest length in
            ``sequence_length`` (0 for an empty batch).

    Returns:
        A ``(batch_size, max_len)`` boolean tensor on the same device as
        ``sequence_length``; entry ``[i, j]`` is True exactly when
        ``j < sequence_length[i]``.
    """
    if max_len is None:
        # max() is undefined on an empty tensor, so an empty batch gets width 0.
        max_len = sequence_length.max() if sequence_length.numel() else 0
    # Accept a 0-dim tensor as well as a plain int for the width.
    max_len = int(max_len)
    # torch.arange replaces the deprecated torch.range and is created directly
    # on the input's device instead of being copied there afterwards.
    positions = torch.arange(max_len, device=sequence_length.device)
    # Broadcasting (1, max_len) against (batch, 1) yields (batch, max_len).
    return positions.unsqueeze(0) < sequence_length.unsqueeze(1)


# run one epoch of training
def train(model, train_loader, loss_fn, optimizer, use_gpu=False):
    """Run one training epoch over ``train_loader`` and return ``(loss, acc)``.

    This is a homework skeleton: the per-batch work is left as a TODO. The
    code around the TODO relies on it to

    - bind ``loss`` (its ``.item()`` is printed every 100 steps),
    - bind ``batch_size`` (used by the early-stop check below), and
    - update ``running_loss``, ``running_corrects`` and ``example_count``.

    Until that is done the function raises ``NameError`` inside the loop, or
    ``ZeroDivisionError`` after it because ``example_count`` is still 0.

    ``loss_fn``, ``optimizer`` and ``use_gpu`` are not read by the code that
    is present; they are available to the TODO section.

    Returns:
        ``(loss, acc)`` where ``loss`` is ``running_loss / example_count`` and
        ``acc`` is ``running_corrects / example_count`` as a percentage.
    """
    model.train()  # Set model to training mode
    running_loss = 0.0
    running_corrects = 0
    example_count = 0
    step = 0
    # Iterate over data.
    for batch in train_loader:
        # Swap the first two dimensions of both tensors (presumably
        # sequence-first -> batch-first; confirm against the Field settings).
        sentences = batch.text[0].transpose(1, 0)
        tags = batch.tags.transpose(1, 0)
        ''' Implement the code to train the model. 
            - Prepare the input data (text, tags, mask) to the correct format and shape
            - Run the forward method of the model
            - Compute the loss
            - Run backward on loss for back propagation
            - Run the optimizer to update the model parameters. 
            - Compute the number of correct predictions
        '''
        # TODO
        
        
        
        step += 1
        # Progress report every 100 batches; `loss` must come from the TODO code.
        if step % 100 == 0:
            print('loss: {}, running_corrects: {}, example_count: {}, acc: {}'.format(loss.item(), 
                            running_corrects, example_count, (running_corrects / example_count) * 100))
        # End the epoch early once step * batch_size reaches 40000.
        # NOTE: `batch_size` is not defined in this function; the TODO code
        # has to provide it.
        if step * batch_size >= 40000:
            break
    # From here on `loss` is rebound to the epoch-level average.
    loss = running_loss / example_count
    acc = (running_corrects / example_count) * 100
    print(loss)
    print(acc)
    print('Train Loss: {:.4f} Acc: {:2.3f} ({}/{})'.format(loss, acc, running_corrects, example_count))
    return loss, acc


def validate(model, val_loader, loss_fn, use_gpu=False):
    """Evaluate ``model`` on ``val_loader`` and return ``(loss, acc)``.

    This is a homework skeleton: the per-batch evaluation is left as a TODO
    that must update ``running_loss``, ``running_corrects`` and
    ``example_count``. While ``example_count`` stays 0 the averages below
    raise ``ZeroDivisionError``.

    ``loss_fn`` and ``use_gpu`` are not read by the code that is present; they
    are available to the TODO section.

    Returns:
        ``(loss, acc)`` where ``loss`` is ``running_loss / example_count`` and
        ``acc`` is ``running_corrects / example_count`` as a percentage.
    """
    model.eval()  # Set model to evaluate mode
    running_loss = 0.0
    running_corrects = 0
    example_count = 0
    # Iterate over data.
    # Gradients are disabled for the whole evaluation loop.
    with torch.no_grad():
        for batch in val_loader:
            # Swap the first two dimensions of both tensors (presumably
            # sequence-first -> batch-first; confirm against the Field settings).
            sentences = batch.text[0].transpose(1, 0)
            tags = batch.tags.transpose(1, 0) 
            ''' Similar to training, do the following to evaluate the model.  
            - Prepare the input data (text, tags, mask) to the correct format and shape
            - Run the forward method of the model
            - Compute the loss
            - Compute the number of correct predictions
            '''
            # TODO
            

    loss = running_loss / example_count
    acc = (running_corrects / example_count) * 100
    print(loss)
    print(acc)
    print('Validation Loss: {:.4f} Acc: {:2.3f} ({}/{})'.format(loss, acc, running_corrects, example_count))
    return loss, acc


def train_model(model, data_loaders, criterion, optimizer, scheduler, save_dir, num_epochs=25, use_gpu=False):
    """Train for ``num_epochs`` epochs and return the model with its best weights.

    Each epoch runs ``train`` on ``data_loaders['train']`` and ``validate`` on
    ``data_loaders['validation']``, writes a checkpoint through
    ``save_checkpoint`` and then steps ``scheduler``. The weights of the epoch
    with the highest validation accuracy are loaded back into ``model`` before
    it is returned.

    Args:
        model: Module to train; modified in place.
        data_loaders: Mapping with at least ``'train'`` and ``'validation'``.
        criterion: Loss function forwarded to ``train`` and ``validate``.
        optimizer: Optimizer forwarded to ``train``.
        scheduler: Object whose ``step()`` is called once per epoch.
        save_dir: Directory handed to ``save_checkpoint``.
        num_epochs: Number of epochs to run.
        use_gpu: Forwarded to ``train`` and ``validate``.

    Returns:
        ``model`` with the best validation weights loaded.
    """
    print('Training Model with use_gpu={}...'.format(use_gpu))
    since = time.time()

    # state_dict() returns references to the live parameter tensors, so the
    # snapshot has to be deep-copied; otherwise later optimizer steps would
    # silently overwrite the "best" weights.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        train_begin = time.time()
        train_loss, train_acc = train(model, data_loaders['train'], criterion, optimizer, use_gpu)
        train_time = time.time() - train_begin
        print('Epoch Train Time: {:.0f}m {:.0f}s'.format(train_time // 60, train_time % 60))

        validation_begin = time.time()
        val_loss, val_acc = validate(model, data_loaders['validation'], criterion, use_gpu)
        validation_time = time.time() - validation_begin
        print('Epoch Validation Time: {:.0f}m {:.0f}s'.format(validation_time // 60, validation_time % 60))

        # Keep a detached copy of the weights whenever validation accuracy improves.
        is_best = val_acc > best_acc
        if is_best:
            best_acc = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())

        save_checkpoint(save_dir, {
            'epoch': epoch,
            'best_acc': best_acc,
            'state_dict': model.state_dict(),
            # 'optimizer': optimizer.state_dict(),
        }, is_best)

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)

    return model


def save_checkpoint(save_dir, state, is_best):
    """Write ``state`` into ``save_dir`` and keep a copy of the best checkpoint.

    Args:
        save_dir: Directory for the checkpoint files; created if missing. An
            empty string means the current working directory.
        state: Object handed to ``torch.save``.
        is_best: When true, the fresh checkpoint is also copied to
            ``model_best.pth.tar`` in the same directory.
    """
    # torch.save does not create missing directories, so make sure the target
    # exists before the first checkpoint is written.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # os.path.join avoids the doubled separator that plain concatenation
    # produced for directories such as "./save/".
    savepath = os.path.join(save_dir, 'checkpoint.pth.tar')
    torch.save(state, savepath)
    if is_best:
        shutil.copyfile(savepath, os.path.join(save_dir, 'model_best.pth.tar'))


def test_model(model, test_loader, use_gpu=False):
    """Measure tagging accuracy of ``model`` on ``test_loader``.

    This is a homework skeleton: the per-batch prediction code is left as a
    TODO that must update ``running_corrects`` and ``example_count``. While
    ``example_count`` stays 0 the accuracy computation raises
    ``ZeroDivisionError``.

    ``use_gpu`` is not read by the code that is present; it is available to
    the TODO section.

    Returns:
        ``running_corrects / example_count`` as a percentage.
    """
    model.eval()  # Set model to evaluate mode
    running_corrects = 0
    example_count = 0
    test_begin = time.time()
    # Iterate over data.
    # Gradients are disabled for the whole evaluation loop.
    with torch.no_grad():
        for batch in test_loader:
            # Swap the first two dimensions of both tensors (presumably
            # sequence-first -> batch-first; confirm against the Field settings).
            sentences = batch.text[0].transpose(1, 0)
            tags = batch.tags.transpose(1, 0)
            ''' Similar to dev, except do we not need to compute the loss here
            '''
            # TODO
            

    acc = (running_corrects / example_count) * 100
    print('Test Acc: {:2.3f} ({}/{})'.format(acc, running_corrects, example_count))
    test_time = time.time() - test_begin
    print('Test Time: {:.0f}m {:.0f}s'.format(test_time // 60, test_time % 60))
    return acc

# Define the model

In [None]:
class POSTagger(nn.Module):
    """Skeleton of the tagging model; both methods are left as TODOs.

    As written, ``__init__`` registers no layers and ``forward`` returns
    ``tag_scores``, which is not bound until the TODO is filled in.
    """

    # The notebook instantiates this class with rnn_class="lstm"; how that
    # value selects the recurrent layer is up to the TODO code.
    def __init__(self, rnn_class, embedding_dim, hidden_dim, vocab_size, target_size, num_layers):
        super(POSTagger, self).__init__()
        ''' Define your model here
            Basically, your model only need three components:
            - an embedding layer
            - a bidirectional RNN (LSTM, GRU) that takes the embeddings and outputs hidden states
            - a final linear prediction layer to convert hidden states to tag scores
            Optionally, define extra layers such as dropout to prevent overfitting. 
        ''' 
        # TODO
        
        
    def forward(self, sentences):
        ''' Define your forward method
        ''' 
        # TODO
        
        
        # `tag_scores` has to be produced by the TODO code above.
        return tag_scores
    
# Build a three-layer tagger from the notebook-level sizes.
model = POSTagger(
    rnn_class="lstm",
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=text_vocab_size,
    target_size=tag_vocab_size,
    num_layers=3,
)
# Move the model to the GPU when one is available.
model = model.cuda() if USE_CUDA else model

In [None]:
# Optimisation hyper-parameters.
LR = 0.001
# With a decay factor of 1 the StepLR schedule leaves the learning rate unchanged.
GAMMA = 1.
STEP_SIZE = 10
NUM_EPOCHS = 10
SAVE_DIR = "./save/"
# reduction='sum' is the supported spelling of the deprecated size_average=False:
# per-element losses are summed rather than averaged.
loss_fn = nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=LR)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)
model = train_model(model, dataloaders, loss_fn, optimizer, exp_lr_scheduler, SAVE_DIR, NUM_EPOCHS, use_gpu=USE_CUDA)
