## Group: Oasis
### Members: 
### 1. Nattapat Juthaprachakul, njuthapr
### 2. Siyu Wu, sw246

# chunker: default program

In [1]:
from default import *
import os

## Run the default solution on dev

In [4]:
# Build the baseline tagger from the training file and the saved-model prefix;
# the third argument is the suffix of the model files.
chunker = LSTMTagger(
    os.path.join('data', 'train.txt.gz'),
    os.path.join('data', 'chunker'),
    '.tar',
)
# Tag the dev input; the result is consumed per sentence by the evaluation cell.
decoder_output = chunker.decode('data/input/dev.txt')

100%|██████████| 1027/1027 [00:02<00:00, 459.66it/s]


## Evaluate the default output

In [5]:
import conlleval

# Flatten the per-sentence predictions into one long tag sequence.
flat_output = []
for sent in decoder_output:
    flat_output.extend(sent)

# Collect the reference tags the same way: one whitespace-separated token at a time.
true_seqs = []
with open(os.path.join('data','reference','dev.out')) as r:
    for sent in conlleval.read_file(r):
        true_seqs.extend(sent.split())

# Kept as the last expression so the notebook displays the returned scores.
conlleval.evaluate(true_seqs, flat_output)

processed 23663 tokens with 11896 phrases; found: 11672 phrases; correct: 8568.
accuracy:  84.35%; (non-O)
accuracy:  85.65%; precision:  73.41%; recall:  72.02%; FB1:  72.71
             ADJP: precision:  36.49%; recall:  11.95%; FB1:  18.00  74
             ADVP: precision:  71.36%; recall:  39.45%; FB1:  50.81  220
            CONJP: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  70.33%; recall:  76.80%; FB1:  73.42  6811
               PP: precision:  92.40%; recall:  87.14%; FB1:  89.69  2302
              PRT: precision:  65.00%; recall:  57.78%; FB1:  61.18  40
             SBAR: precision:  84.62%; recall:  41.77%; FB1:  55.93  117
               VP: precision:  63.66%; recall:  58.25%; FB1:  60.83  2108


(73.40644276901988, 72.02420981842637, 72.70875763747455)

## Documentation

Write some beautiful documentation of your program here.

In [None]:
# Index of the first character of a word.
INDEX_BEG = 0
# Number of character-level feature columns per word: encoding_sentence joins
# three vectors of len(string.printable) (= 100) columns each.
LENGTH_VECTOR_ENCODING = 300
# dtype = torch.cuda.FloatTensor
# Make the CUDA float tensor type torch's default tensor type.
# NOTE(review): presumably this fails on a machine without a CUDA device — confirm.
torch.set_default_tensor_type('torch.cuda.FloatTensor')


################################################################
'''Helper functions'''

def encode_one_char(sentence,vector_2d,first_char=True):
    '''One-hot encode the first or the last character of every word, in place.

    Row i of vector_2d belongs to word i of sentence; the column that is set
    is the character's position in string.printable (100 entries).

    Args:
        sentence: sequence of word strings.
        vector_2d: one row per word, each indexable up to
            len(string.printable); rows are expected to start zeroed.
        first_char: encode the first character when True, the last otherwise.

    Returns:
        The same vector_2d object, updated in place.
    '''
    # iterate word by word in the sentence
    for word,vector in zip(sentence,vector_2d):
        # The unknown-word placeholder and empty tokens carry no character
        # information, so their rows stay all-zero.
        if not word or word == '[UNK]':
            continue

        letter = word[0] if first_char else word[-1]
        index_strPrintable = string.printable.find(letter) # letter to index
        # find() returns -1 for a character outside string.printable; indexing
        # with -1 would wrongly mark the last column, so leave the row at zero.
        if index_strPrintable < 0:
            continue
        vector[index_strPrintable] = 1.0
    return vector_2d

def encode_internal_chars(sentence,vector_2d):
    '''Count the internal characters of every word, in place.

    Internal characters are all characters except the first and the last one.
    Row i of vector_2d belongs to word i of sentence; each internal character
    adds 1.0 to the column given by its position in string.printable
    (100 entries), so repeated characters accumulate.

    Args:
        sentence: sequence of word strings.
        vector_2d: one row per word, each indexable up to
            len(string.printable); rows are expected to start zeroed.

    Returns:
        The same vector_2d object, updated in place.
    '''
    # iterate word by word in the sentence
    for word,vector in zip(sentence,vector_2d):
        # The unknown-word placeholder is skipped, and a word shorter than
        # three characters has no internal characters at all.
        if word == '[UNK]' or len(word) < 3:
            continue

        for letter in word[1:-1]:
            index_strPrintable = string.printable.find(letter) # letter to index
            # find() returns -1 for a character outside string.printable;
            # indexing with -1 would wrongly bump the last column, so skip it.
            if index_strPrintable < 0:
                continue
            vector[index_strPrintable] += 1.0
    return vector_2d

def encoding_sentence(sentence):
    '''Build the character-level feature tensor for every word of a sentence.

    For each word three vectors of len(string.printable) columns are computed
    (first character, internal characters, last character) and joined
    column-wise in that order.

    Args:
        sentence: sequence of word strings.

    Returns:
        A float tensor with one row per word and 3 * len(string.printable)
        columns, moved to the GPU with .cuda().
    '''
    shape = (len(sentence), len(string.printable))

    # first character of every word
    first_part = encode_one_char(sentence, np.zeros(shape), first_char=True)
    # last character of every word
    last_part = encode_one_char(sentence, np.zeros(shape), first_char=False)
    # everything between the first and the last character
    middle_part = encode_internal_chars(sentence, np.zeros(shape))

    # column order: first | internal | last
    stacked = np.concatenate((first_part, middle_part, last_part), axis=1)

    # numpy array -> float tensor on the GPU
    return torch.tensor(stacked, dtype=torch.float).cuda()
############################################################


class LSTMTaggerModel(nn.Module):
    '''Word embedding -> single-direction LSTM -> linear layer -> log-softmax.'''

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size,char_encoding):
        '''Create the layers.

        Args:
            embedding_dim: size of each word embedding.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output tags.
            char_encoding: when True the LSTM input is widened by
                LENGTH_VECTOR_ENCODING columns for the character features.
        '''
        # Seed before any layer is created so the initial weights are reproducible.
        torch.manual_seed(1)
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Input width of the LSTM: the word embedding alone, or the embedding
        # plus the character-level columns (128 + 300 = 428 with the defaults).
        lstm_input_dim = embedding_dim
        if char_encoding:
            lstm_input_dim += LENGTH_VECTOR_ENCODING
        self.lstm = nn.LSTM(lstm_input_dim, hidden_dim, bidirectional=False)

        # Disabled experiment: a second LSTM over the character-level vectors.
        # self.lstm_encoding = nn.LSTM(LENGTH_VECTOR_ENCODING, hidden_dim, bidirectional=False)

        # Maps every hidden state to one score per tag.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence,encoding_tensor=None):
        '''Return log-probabilities over the tag set for every word.

        Args:
            sentence: tensor of word indices, one per word.
            encoding_tensor: optional character-level features, one row per
                word; concatenated to the embeddings along dim 1 when given.

        Returns:
            Tensor with len(sentence) rows of log-softmax tag scores.
        '''
        n_words = len(sentence)
        lstm_input = self.word_embeddings(sentence)

        # Disabled experiment: run the character-level vectors through their
        # own LSTM before joining them with the embeddings.
        # if encoding_tensor is not None:
        #     reshaped_encoding_tensor = torch.reshape(encoding_tensor,(-1,1,LENGTH_VECTOR_ENCODING))
        #     lstm_out_encoding, _ = self.lstm_encoding(reshaped_encoding_tensor)
        #     encoding_tensor = torch.reshape(lstm_out_encoding,(-1,LENGTH_VECTOR_ENCODING))

        # Append the character-level columns to each word embedding.
        if encoding_tensor is not None:
            lstm_input = torch.cat([lstm_input, encoding_tensor], dim=1)

        # The LSTM is fed a batch dimension of size 1: (words, 1, features).
        hidden_states, _ = self.lstm(lstm_input.view(n_words, 1, -1))
        scores = self.hidden2tag(hidden_states.view(n_words, -1))
        return F.log_softmax(scores, dim=1)

class LSTMTagger:
    '''Builds vocabularies from a CoNLL training file and trains an LSTMTaggerModel.

    The model and every training tensor are moved to the GPU with .cuda().

    NOTE(review): the notebook cells call chunker.decode(...), which is not part
    of this listing — presumably defined in the full chunker module; confirm.
    '''

    def __init__(self, trainfile, modelfile, modelsuffix, unk="[UNK]", epochs=10, embedding_dim=128, hidden_dim=64,char_encoding=False):
        '''Read the training data, index words and tags, create model and optimizer.

        Args:
            trainfile: path of the training file; read through gzip when the
                name ends in '.gz', as plain text otherwise.
            modelfile: checkpoint filename without suffix.
            modelsuffix: suffix appended to every checkpoint filename.
            unk: token handed to prepare_sequence for unknown entries.
            epochs: number of passes over the training data in train().
            embedding_dim: word embedding size.
            hidden_dim: LSTM hidden state size.
            char_encoding: when True, character-level features from
                encoding_sentence are concatenated to the word embeddings.
        '''
        self.unk = unk
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.modelfile = modelfile
        self.modelsuffix = modelsuffix
        self.training_data = []
        if trainfile.endswith('.gz'):
            with gzip.open(trainfile, 'rt') as f:
                self.training_data = read_conll(f)
        else:
            with open(trainfile, 'r') as f:
                self.training_data = read_conll(f)

        self.word_to_ix = {} # replaces words with an index (one-hot vector)
        self.tag_to_ix = {} # replace output labels / tags with an index
        self.ix_to_tag = [] # during inference we produce tag indices so we have to map it back to a tag

        # Indices are assigned in order of first appearance in the training data.
        for sent, tags in self.training_data:
            for word in sent:
                if word not in self.word_to_ix:
                    self.word_to_ix[word] = len(self.word_to_ix)
            for tag in tags:
                if tag not in self.tag_to_ix:
                    self.tag_to_ix[tag] = len(self.tag_to_ix)
                    self.ix_to_tag.append(tag)

        # The value must go through a %s placeholder: without one the logging
        # module cannot merge the argument into the message.
        logging.info("word_to_ix: %s", self.word_to_ix)
        logging.info("tag_to_ix: %s", self.tag_to_ix)
        logging.info("ix_to_tag: %s", self.ix_to_tag)

        # Flag: use character-level encoding or not.
        self.char_encoding = char_encoding
        # self.model = LSTMTaggerModel(self.embedding_dim, self.hidden_dim, len(self.word_to_ix), len(self.tag_to_ix))
        self.model = LSTMTaggerModel(self.embedding_dim, self.hidden_dim, len(self.word_to_ix), len(self.tag_to_ix),char_encoding=char_encoding).cuda()

        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01)


    def train(self):
        '''Train with SGD and NLLLoss, one sentence per step, saving every epoch.

        A checkpoint is written after each epoch to
        modelfile + <epoch number> + modelsuffix; the final epoch omits the
        number so the last checkpoint is modelfile + modelsuffix.
        '''
        loss_function = nn.NLLLoss()

        self.model.train()
        loss = float("inf")
        for epoch in range(self.epochs):
            for sentence, tags in tqdm.tqdm(self.training_data):

                # Step 1. Remember that Pytorch accumulates gradients.
                # We need to clear them out before each instance
                self.model.zero_grad()

                # Step 2. Get our inputs ready for the network, that is, turn them into
                # Tensors of word indices.
                # sentence_in = prepare_sequence(sentence, self.word_to_ix, self.unk)
                sentence_in = prepare_sequence(sentence, self.word_to_ix, self.unk).cuda()

                # targets = prepare_sequence(tags, self.tag_to_ix, self.unk)
                targets = prepare_sequence(tags, self.tag_to_ix, self.unk).cuda()

                # Step 3. Run our forward pass.
                # tag_scores = self.model(sentence_in)
                # With character-level encoding, build the per-word character
                # vectors that the model concatenates with the embeddings.
                if self.char_encoding:
                    encoding_tensor = encoding_sentence(sentence)
                else:
                    encoding_tensor = None

                tag_scores = self.model(sentence_in,encoding_tensor=encoding_tensor)

                # Step 4. Compute the loss, gradients, and update the parameters by
                #  calling optimizer.step()
                loss = loss_function(tag_scores, targets)
                loss.backward()
                self.optimizer.step()

            if epoch == self.epochs-1:
                epoch_str = '' # last epoch so do not use epoch number in model filename
            else:
                epoch_str = str(epoch)
            savefile = self.modelfile + epoch_str + self.modelsuffix
            print("saving model file: {}".format(savefile), file=sys.stderr)
            # The vocabularies are stored with the weights so the checkpoint
            # carries its own index mappings.
            torch.save({
                        'epoch': epoch,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'loss': loss,
                        'unk': self.unk,
                        'word_to_ix': self.word_to_ix,
                        'tag_to_ix': self.tag_to_ix,
                        'ix_to_tag': self.ix_to_tag,
                    }, savefile)


## Code explanation

The most important code that we implemented is `encoding_sentence`, which encodes each word in the sentence as three vectors: two vectors represent the first and the last character of the word (`encode_one_char`), and one vector represents the internal characters (`encode_internal_chars`), i.e. all characters between the first and the last one. Once we have these three encoded vectors, we concatenate them with the word embedding. We also tried feeding the encoded vectors through an LSTM with 64 or 100 hidden units before joining them with the word embedding; however, we commented this code out because the technique did not improve our score.

## Run our solution on the dev.txt

In [4]:
from chunker import *
import os

# Default paths are relative to the chunker directory (the same layout the
# baseline cells use) instead of one user's absolute home directory, so the
# notebook runs on any checkout.
# NOTE(review): parse_args() reads sys.argv; inside a notebook kernel these are
# the kernel's own arguments — confirm this is intended.
optparser = optparse.OptionParser()
optparser.add_option("-i", "--inputfile", dest="inputfile", default=os.path.join('data', 'input', 'dev.txt'), help="produce chunking output for this input file")
optparser.add_option("-t", "--trainfile", dest="trainfile", default=os.path.join('data', 'train.txt.gz'), help="training data for chunker")
optparser.add_option("-m", "--modelfile", dest="modelfile", default=os.path.join('data', 'chunker'), help="filename without suffix for model files")
optparser.add_option("-s", "--modelsuffix", dest="modelsuffix", default='.tar', help="filename suffix for model files")
# type="int" so a value given on the command line is parsed as a number
# rather than kept as a string.
optparser.add_option("-e", "--epochs", dest="epochs", type="int", default=5, help="number of epochs [fix at 5]")
optparser.add_option("-u", "--unknowntoken", dest="unk", default='[UNK]', help="unknown word token")
optparser.add_option("-f", "--force", dest="force", action="store_true", default=False, help="force training phase (warning: can be slow)")
optparser.add_option("-l", "--logfile", dest="logfile", default=None, help="log file for debugging")
optparser.add_option("-o", "--outputfile", dest="outputfile", default='output.txt', help="print result to output file")

(opts, _) = optparser.parse_args()
modelfile = opts.modelfile


# Character-level encoding is switched on for our solution. opts.epochs is
# parsed above but not forwarded here, so the tagger keeps its own default.
chunker = LSTMTagger(opts.trainfile, modelfile, opts.modelsuffix, opts.unk,char_encoding=True)
decoder_output = chunker.decode(opts.inputfile)


100%|██████████| 1027/1027 [00:01<00:00, 617.66it/s]


## Evaluate our output

In [5]:
# Flatten the per-sentence predictions into one long tag sequence.
flat_output = [ output for sent in decoder_output for output in sent ]
import conlleval
# Read the reference tags from the path relative to the chunker directory
# (same location the baseline evaluation uses) rather than from one user's
# absolute home directory.
true_seqs = []
with open(os.path.join('data','reference','dev.out')) as r:
    for sent in conlleval.read_file(r):
        true_seqs += sent.split()
# Kept as the last expression so the notebook displays the returned scores.
conlleval.evaluate(true_seqs, flat_output)

processed 23663 tokens with 11896 phrases; found: 11968 phrases; correct: 9279.
accuracy:  87.23%; (non-O)
accuracy:  88.26%; precision:  77.53%; recall:  78.00%; FB1:  77.77
             ADJP: precision:  43.33%; recall:  17.26%; FB1:  24.68  90
             ADVP: precision:  70.99%; recall:  46.73%; FB1:  56.36  262
            CONJP: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  76.23%; recall:  81.80%; FB1:  78.92  6693
               PP: precision:  91.11%; recall:  87.34%; FB1:  89.19  2340
              PRT: precision:  66.67%; recall:  48.89%; FB1:  56.41  33
             SBAR: precision:  84.38%; recall:  45.57%; FB1:  59.18  128
               VP: precision:  69.78%; recall:  73.35%; FB1:  71.52  2422


(77.5317513368984, 78.00100874243444, 77.76567214213878)

## Analysis

Do some analysis of the results. What ideas did you try? What worked and what did not?

The most important thing we implemented is a character-level representation of each word, which we concatenate with the word embedding to form the input vector of the chunker RNN. More specifically, we use three vectors (each of size 100) to represent a word: the vector of the first character, the vector of the last character, and the vector of the internal characters. We then concatenate these three vectors to produce the final character-level representation of the word.

Our method improves the scores considerably: the FB1 score increases from 72.7 with the default method to 77.7 with our method. We also tried passing the character-level vectors through an LSTM with a hidden layer of 64 or 100 units; however, the results were worse than those of the version we implemented.

For the training procedure, we trained these models on an Nvidia 1050Ti GPU; the results and training times are as follows:

    Default->  FB1:72.7, training time: 18mins
    Encoded character-level vectors-> FB1:77.766, training time: 18 mins
    Encoded character-level vectors with LSTM (64 hidden units)-> FB1:76.79, training time:28 mins
    Encoded character-level vectors with LSTM (100 hidden units)-> FB1: 76.62, training time:30 mins
 
Therefore, these results show that concatenating the encoded character-level vectors with the word embedding is the best method in terms of both training time and accuracy.