# Download data

In [0]:
# import dataset
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
val_df = pd.read_csv('val.csv')

In [0]:
# get labels
train_label_raw = list(train_df.NER)
val_label_raw = list(val_df.NER)

In [0]:
# split the labels 
train_labels = []
for label in train_label_raw:
    train_labels.append(label.split(" "))

val_labels = []
for label in val_label_raw:
    val_labels.append(label.split(" "))


# 1. Preproccessing

In [0]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

# lemmatize the words and lower the word
def get_lemmatized_tokens(dataframe):
  """
  Extract sentences from dataframe and output stemmed tokens and related NER
  Extract labels from dataframe and tokenize it
  """
  
  sentences = list(dataframe.Sentence)

  tokenized_sentences = []
  for sentence in sentences:
    tokens = sentence.split(" ")
    for x in tokens:
        x.lower()
    tokenized_sentences.append(tokens)

  lemmatized_sentences = []
  lemmatizer = WordNetLemmatizer()

  for sentence in tokenized_sentences:
    lemmatized = [lemmatizer.lemmatize(word, pos='v') for word in sentence]
    lemmatized_sentences.append(lemmatized)

  return lemmatized_sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
train_tokens = get_lemmatized_tokens(train_df)
val_tokens = get_lemmatized_tokens(val_df)
test_tokens = get_lemmatized_tokens(test_df)

In [0]:
all_tokens = train_tokens + val_tokens + test_tokens

In [0]:
# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz
# make mapping between word and index , tag and index
word_to_ix = {}
for sentence in all_tokens:
    for word in sentence:
        word = word.lower()
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

word_list = list(word_to_ix.keys())

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}

for tags in train_labels:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

# 2. Input Embedding

##2.1. POS-Tag

In [0]:
# function to do the pos tagging for each sentence 
nltk.download('averaged_perceptron_tagger')
def generate_pos_tag(doc):
  """

  :param doc_words: words in doc
  :return: pos tags
  """
  pos_tags = []
  for sentence in doc:
    tags = []
    for word, tag in nltk.pos_tag(sentence):
      tags.append(tag)
    pos_tags.append(tags)
 
  return pos_tags

pos_tags = generate_pos_tag(all_tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [0]:
# use word2vec to train the embeddings for pos
from gensim.models import Word2Vec
w2v_pos = Word2Vec(sentences=pos_tags, 
                   size=20, 
                   window=3,
                   min_count=1,
                   workers=4,
                   sg=1
    
)

##2.2. Parse Tree

In [0]:
# Reference Lab 7 Code: https://colab.research.google.com/drive/1r6LqTob5l1W3hFpmg6ZQ5XgyYulOfCuc

import spacy

#load the spacy api with the pre-trained statistical models for English. English multi-task CNN trained on OntoNotes
nlp = spacy.load("en_core_web_sm")

def parse_sentence(data):
    parse_sentences_temp=[]
    for sentence in data:
        parse = nlp(' '.join(sentence))
        deps = []
        for x in parse:
          deps.append(x.dep_)

        parse_sentences_temp.append(deps[:len(sentence)])

    return parse_sentences_temp


In [0]:
parse_sentences=parse_sentence(all_tokens)

In [0]:
# use word2vec to train the embeddings for pos
from gensim.models import Word2Vec
w2v_parse_tree = Word2Vec(sentences=parse_sentences, 
                   size=20, 
                   window=3,
                   min_count=1,
                   workers=4,
                   sg=1
    
)

In [0]:
#make a dic about the word and pos,  word and parse tree
word_2_pos={}
for i in range(0,len(all_tokens)):
    for x in range(0,len(all_tokens[i])):
        word_2_pos[all_tokens[i][x]] = w2v_pos[pos_tags[i][x]]

word_2_pt={}
for i in range(0,len(all_tokens)):
    for x in range(0,len(all_tokens[i])):
        word_2_pt[all_tokens[i][x]] = w2v_parse_tree[parse_sentences[i][x]]       


  after removing the cwd from sys.path.
  if __name__ == '__main__':


##2.3. Word Embedding

In [0]:
# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz
#load the glove pre-trained word embedding 
import gensim.downloader as api
import numpy as np
word_emb_model = api.load("glove-twitter-100") 




  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# splice all embeddings pos, parse tree and word embedding 
EMBEDDING_DIM = 140

embedding_matrix = []
for word in word_list:
    try:
        word_embedding_temp=word_emb_model.wv[word]
        word_embedding_temp.extend(word_2_pos[word])
        word_embedding_temp.extend(word_2_pt[word])

        embedding_matrix.append(word_embedding_temp)
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if sys.path[0] == '':


(12551, 140)

In [0]:
# Reference Lab9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz
# transfer word to index
def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

train_input_index =  to_index(train_tokens,word_to_ix)
train_output_index = to_index(train_labels,tag_to_ix)
val_input_index = to_index(val_tokens,word_to_ix)
val_output_index = to_index(val_labels,tag_to_ix)
test_input_index = to_index(test_tokens,word_to_ix)


In [0]:
tag_to_ix

{'<START>': 0,
 '<STOP>': 1,
 'I-LOC': 6,
 'I-MISC': 4,
 'I-ORG': 3,
 'I-PER': 5,
 'O': 2}

# 3. Model 

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

# Reference: Lab9 Code https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

#BiLSTM and CRF model with attention
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, attention_method = None):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.attention_method = attention_method

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        self.dropout = nn.Dropout(0.2)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size) if not attention_method else nn.Linear(hidden_dim * 2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))
        print(self.transitions.shape)

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))
        


    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        ##self attention part
        if self.attention_method:
            lstm_out = torch.squeeze(lstm_out, 1)
            left_self = lstm_out.view(1, lstm_out.size(0), lstm_out.size(1))
            right_self = left_self.view(left_self.size(0), left_self.size(2), left_self.size(1))
            if "scale" in self.attention_method.lower():
                weight_att = nn.functional.softmax(torch.bmm(left_self, right_self) * 1/np.sqrt(self.hidden_dim * 2),dim=-1)
            else:
                weight_att = nn.functional.softmax(torch.bmm(left_self, right_self),dim=-1)

            output = torch.bmm(weight_att, left_self)
            concat_output = torch.cat((output, left_self), dim = -1)
            lstm_out = concat_output.view(len(sentence), self.hidden_dim * 2)
        else:
            lstm_out = lstm_out.view(len(sentence), self.hidden_dim)

        lstm_out = self.dropout(lstm_out)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  
        # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
# get the F1 score of the model on specific data set
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
def cal_acc(model, input_index, output_index):
    ground_truth=[]
    predicted=[]
    for x,y in zip(input_index,output_index):
        input_tensor = torch.tensor(x).to(device)
        _,output = model(input_tensor)
        ground_truth.extend(y)
        predicted.extend(output)

    f1score=f1_score(ground_truth,predicted,average = 'micro')
    return ground_truth, predicted, f1score

#4. Testing

##4.1. Different Embedding Model

###4.1.1. Word Embedding + POS-Tag + Parse Tree

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 200

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, attention_method = None).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.015, weight_decay=1e-4)

torch.Size([7, 7])


In [0]:
train_data=train_input_index+val_input_index
train_label=train_output_index+val_output_index

In [0]:
"""Each epoch will take about 2-3 minutes"""
import datetime

# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz
# train the model 
for epoch in range(15):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    max_val_acc=0
    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index)

    #pick the best model which has the biggest F1 score on val data
    if(val_acc>max_val_acc):
        best_model = model
        max_val_acc=val_acc

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val loss: %.2f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 23357.76, train f1 score: 0.8192, val loss: 4589.63, val f1 score: 0.7742, time: 144.35s
Epoch:2, Training loss: 12087.14, train f1 score: 0.8848, val loss: 2804.40, val f1 score: 0.8408, time: 143.29s
Epoch:3, Training loss: 7193.52, train f1 score: 0.9215, val loss: 2762.07, val f1 score: 0.8630, time: 142.87s
Epoch:4, Training loss: 5347.25, train f1 score: 0.9337, val loss: 3029.42, val f1 score: 0.8653, time: 142.82s
Epoch:5, Training loss: 3981.20, train f1 score: 0.9548, val loss: 2862.78, val f1 score: 0.8809, time: 145.22s
Epoch:6, Training loss: 3150.10, train f1 score: 0.9673, val loss: 2774.34, val f1 score: 0.8837, time: 144.42s
Epoch:7, Training loss: 2627.87, train f1 score: 0.9728, val loss: 2801.75, val f1 score: 0.8892, time: 143.15s
Epoch:8, Training loss: 2215.53, train f1 score: 0.9734, val loss: 3089.81, val f1 score: 0.8890, time: 144.20s
Epoch:9, Training loss: 1963.97, train f1 score: 0.9811, val loss: 2835.24, val f1 score: 0.9048, time

### 4.1.2. Word Embedding + Pos-Tag

In [0]:
EMBEDDING_DIM = 120
# splice the pos and word embedding
embedding_matrix = []
for word in word_list:
    try:
        word_embedding_temp=word_emb_model.wv[word]
        word_embedding_temp.extend(word_2_pos[word])

        embedding_matrix.append(word_embedding_temp)
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

  


(12551, 120)

In [0]:
model_w_p = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, attention_method = None).to(device)
optimizer1= optim.SGD(model_w_p.parameters(), lr=0.015, weight_decay=1e-4)

torch.Size([7, 7])


In [0]:
"""Each epoch will take about 2-3 minutes"""
import datetime

# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz

for epoch in range(15):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model_w_p.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model_w_p.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model_w_p.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer1.step()

        train_loss+=loss.item()

    max_val_acc=0
    model_w_p.eval()
    _, _, train_acc = cal_acc(model_w_p,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model_w_p,val_input_index,val_output_index)

    if(val_acc>max_val_acc):
        best_model = model_w_p
        max_val_acc=val_acc

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model_w_p.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val loss: %.2f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 23395.54, train f1 score: 0.8143, val loss: 4958.59, val f1 score: 0.7664, time: 140.38s
Epoch:2, Training loss: 12758.37, train f1 score: 0.8782, val loss: 2894.08, val f1 score: 0.8363, time: 138.89s
Epoch:3, Training loss: 7656.35, train f1 score: 0.9203, val loss: 2753.11, val f1 score: 0.8684, time: 141.06s
Epoch:4, Training loss: 5464.88, train f1 score: 0.9386, val loss: 2764.70, val f1 score: 0.8736, time: 143.27s
Epoch:5, Training loss: 4060.91, train f1 score: 0.9581, val loss: 2682.26, val f1 score: 0.8829, time: 139.41s
Epoch:6, Training loss: 3200.77, train f1 score: 0.9605, val loss: 3049.03, val f1 score: 0.8827, time: 140.48s
Epoch:7, Training loss: 2653.24, train f1 score: 0.9669, val loss: 3155.78, val f1 score: 0.8839, time: 139.07s
Epoch:8, Training loss: 2257.03, train f1 score: 0.9719, val loss: 2727.83, val f1 score: 0.8816, time: 141.81s
Epoch:9, Training loss: 1923.67, train f1 score: 0.9800, val loss: 2757.10, val f1 score: 0.8936, time

### 4.1.3 word embedding + Parse Tree

In [0]:
EMBEDDING_DIM = 120
# splice the word embedding and parse tree
embedding_matrix = []
for word in word_list:
    try:
        word_embedding_temp=word_emb_model.wv[word]
        word_embedding_temp.extend(word_2_pt[word])

        embedding_matrix.append(word_embedding_temp)
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

  


(12551, 120)

In [0]:
model_w_pt = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, attention_method = None).to(device)
optimizer2= optim.SGD(model_w_pt.parameters(), lr=0.015, weight_decay=1e-4)

torch.Size([7, 7])


In [0]:
"""Each epoch will take about 2-3 minutes"""
import datetime

# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz
# train
for epoch in range(15):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model_w_pt.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model_w_pt.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model_w_pt.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer2.step()

        train_loss+=loss.item()

    max_val_acc=0
    model_w_pt.eval()
    _, _, train_acc = cal_acc(model_w_pt,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model_w_pt,val_input_index,val_output_index)

    if(val_acc>max_val_acc):
        best_model = model_w_p
        max_val_acc=val_acc

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model_w_pt.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val loss: %.2f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 23453.19, train f1 score: 0.8133, val loss: 5131.74, val f1 score: 0.7676, time: 141.28s
Epoch:2, Training loss: 12855.27, train f1 score: 0.8777, val loss: 2827.09, val f1 score: 0.8415, time: 143.11s
Epoch:3, Training loss: 7199.73, train f1 score: 0.9274, val loss: 2579.76, val f1 score: 0.8747, time: 140.90s
Epoch:4, Training loss: 4852.70, train f1 score: 0.9483, val loss: 2717.20, val f1 score: 0.8793, time: 141.62s
Epoch:5, Training loss: 3508.58, train f1 score: 0.9565, val loss: 2842.28, val f1 score: 0.8801, time: 140.97s
Epoch:6, Training loss: 2831.20, train f1 score: 0.9669, val loss: 2758.99, val f1 score: 0.8816, time: 143.68s
Epoch:7, Training loss: 2366.47, train f1 score: 0.9714, val loss: 2928.31, val f1 score: 0.8826, time: 140.46s
Epoch:8, Training loss: 2120.89, train f1 score: 0.9737, val loss: 3124.29, val f1 score: 0.8949, time: 142.96s
Epoch:9, Training loss: 1822.63, train f1 score: 0.9782, val loss: 2774.40, val f1 score: 0.8884, time

##4.2. Different Attention Strategy

### 4.2.1. word embedding + Pos-Tag + Scale Dot Attention Score

In [0]:
EMBEDDING_DIM = 120

embedding_matrix = []
for word in word_list:
    try:
        word_embedding_temp=word_emb_model.wv[word]
        word_embedding_temp.extend(word_2_pos[word])

        embedding_matrix.append(word_embedding_temp)
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

  


(12551, 120)

In [0]:
#using scale score to calculate the attention score
model_w_p = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, attention_method = "scale").to(device)
optimizer1 = optim.SGD(model_w_p.parameters(), lr=0.015, weight_decay=1e-4)

torch.Size([7, 7])


In [0]:
"""Each epoch will take about 2-3 minutes"""
import datetime

# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz

for epoch in range(15):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model_w_p.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model_w_p.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model_w_p.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer1.step()

        train_loss+=loss.item()

    max_val_acc=0
    model_w_p.eval()
    _, _, train_acc = cal_acc(model_w_p,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model_w_p,val_input_index,val_output_index)

    if(val_acc>max_val_acc):
        best_model = model_w_p
        max_val_acc=val_acc

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model_w_p.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val loss: %.2f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 23573.80, train f1 score: 0.8129, val loss: 5284.52, val f1 score: 0.7663, time: 136.23s
Epoch:2, Training loss: 13324.92, train f1 score: 0.8788, val loss: 2800.39, val f1 score: 0.8463, time: 134.56s
Epoch:3, Training loss: 7771.00, train f1 score: 0.9150, val loss: 2819.19, val f1 score: 0.8645, time: 134.16s
Epoch:4, Training loss: 5553.85, train f1 score: 0.9359, val loss: 2776.16, val f1 score: 0.8733, time: 134.43s
Epoch:5, Training loss: 4091.56, train f1 score: 0.9536, val loss: 2728.55, val f1 score: 0.8806, time: 134.92s
Epoch:6, Training loss: 3122.22, train f1 score: 0.9675, val loss: 2667.39, val f1 score: 0.9001, time: 137.51s
Epoch:7, Training loss: 2547.59, train f1 score: 0.9732, val loss: 2710.82, val f1 score: 0.8980, time: 135.96s
Epoch:8, Training loss: 2139.21, train f1 score: 0.9793, val loss: 2719.76, val f1 score: 0.9058, time: 135.75s
Epoch:9, Training loss: 1823.31, train f1 score: 0.9818, val loss: 2474.06, val f1 score: 0.9079, time

In [0]:
# get the details about the performance of our model
y_true,y_pred,_ = cal_acc(best_model,val_input_index,val_output_index)
from sklearn.metrics import classification_report
print(classification_report(y_true,y_pred,digits=4))

              precision    recall  f1-score   support

           2     0.9284    0.9870    0.9568      5790
           3     0.7143    0.4912    0.5821       285
           4     0.8030    0.5668    0.6646       187
           5     0.9444    0.7566    0.8401       875
           6     0.9353    0.8282    0.8785       419

    accuracy                         0.9224      7556
   macro avg     0.8651    0.7260    0.7844      7556
weighted avg     0.9194    0.9224    0.9176      7556



### 4.2.2. Word embedding + Pos-Tag + Dot Attention Score

In [0]:
#using dot score to calculate the attention score
model_w_p_dot = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, attention_method = "dot").to(device)
optimizer3 = optim.SGD(model_w_p_dot.parameters(), lr=0.015, weight_decay=1e-4)

torch.Size([7, 7])


In [0]:
"""Each epoch will take about 2-3 minutes"""
import datetime

# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model_w_p_dot.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model_w_p_dot.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model_w_p_dot.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer3.step()

        train_loss+=loss.item()

    max_val_acc=0
    model_w_p_dot.eval()
    _, _, train_acc = cal_acc(model_w_p_dot,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model_w_p_dot,val_input_index,val_output_index)

    if(val_acc>max_val_acc):
        best_model = model_w_p_dot
        max_val_acc=val_acc

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model_w_p_dot.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val loss: %.2f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 23603.51, train f1 score: 0.8125, val loss: 5250.11, val f1 score: 0.7657, time: 136.32s
Epoch:2, Training loss: 13329.52, train f1 score: 0.8805, val loss: 2876.43, val f1 score: 0.8446, time: 137.73s
Epoch:3, Training loss: 7823.88, train f1 score: 0.9157, val loss: 2805.69, val f1 score: 0.8657, time: 134.86s
Epoch:4, Training loss: 5626.14, train f1 score: 0.9387, val loss: 2732.35, val f1 score: 0.8733, time: 134.66s
Epoch:5, Training loss: 4178.54, train f1 score: 0.9510, val loss: 2919.32, val f1 score: 0.8804, time: 134.81s
Epoch:6, Training loss: 3309.43, train f1 score: 0.9623, val loss: 2897.64, val f1 score: 0.8846, time: 135.17s
Epoch:7, Training loss: 2619.83, train f1 score: 0.9730, val loss: 2605.73, val f1 score: 0.9010, time: 136.54s
Epoch:8, Training loss: 2271.14, train f1 score: 0.9767, val loss: 2634.22, val f1 score: 0.8968, time: 134.64s
Epoch:9, Training loss: 1971.51, train f1 score: 0.9788, val loss: 2857.49, val f1 score: 0.8968, time

In [0]:
y_true,y_pred,_ = cal_acc(best_model,val_input_index,val_output_index)
from sklearn.metrics import classification_report
print(classification_report(y_true,y_pred,digits=4))

              precision    recall  f1-score   support

           2     0.9273    0.9872    0.9563      5790
           3     0.7525    0.5333    0.6242       285
           4     0.8906    0.6096    0.7238       187
           5     0.9400    0.7337    0.8241       875
           6     0.9340    0.8449    0.8872       419

    accuracy                         0.9235      7556
   macro avg     0.8889    0.7418    0.8031      7556
weighted avg     0.9217    0.9235    0.9189      7556



## different layers model

### 2 layers LSTM

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

# Reference: Lab9 Code https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, attention_method = None):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.attention_method = attention_method

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=2, bidirectional=True)

        self.dropout = nn.Dropout(0.2)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size) if not attention_method else nn.Linear(hidden_dim * 2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))
        print(self.transitions.shape)

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(4, 1, self.hidden_dim // 2).to(device),
                torch.randn(4, 1, self.hidden_dim // 2).to(device))
        


    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        ##self attention part
        if self.attention_method:
            lstm_out = torch.squeeze(lstm_out, 1)
            left_self = lstm_out.view(1, lstm_out.size(0), lstm_out.size(1))
            right_self = left_self.view(left_self.size(0), left_self.size(2), left_self.size(1))
            if "scale" in self.attention_method.lower():
                weight_att = nn.functional.softmax(torch.bmm(left_self, right_self) * 1/np.sqrt(self.hidden_dim * 2),dim=-1)
            else:
                weight_att = nn.functional.softmax(torch.bmm(left_self, right_self),dim=-1)

            output = torch.bmm(weight_att, left_self)
            concat_output = torch.cat((output, left_self), dim = -1)
            lstm_out = concat_output.view(len(sentence), self.hidden_dim * 2)
        else:
            lstm_out = lstm_out.view(len(sentence), self.hidden_dim)

        lstm_out = self.dropout(lstm_out)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  
        # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [45]:
model_2_layer = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, attention_method = "scale").to(device)
optimizer4 = optim.SGD(model_2_layer.parameters(), lr=0.03, weight_decay=1e-4)

torch.Size([7, 7])


In [46]:
"""Each epoch will take about 2-3 minutes"""
import datetime

# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model_2_layer.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model_2_layer.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model_2_layer.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer4.step()

        train_loss+=loss.item()

    max_val_acc=0
    model_2_layer.eval()
    _, _, train_acc = cal_acc(model_2_layer,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model_2_layer,val_input_index,val_output_index)

    if(val_acc>max_val_acc):
        best_model = model_2_layer
        max_val_acc=val_acc

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model_2_layer.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val loss: %.2f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 23932.20, train f1 score: 0.8119, val loss: 6012.05, val f1 score: 0.7663, time: 186.30s
Epoch:2, Training loss: 17897.54, train f1 score: 0.8446, val loss: 3442.49, val f1 score: 0.8065, time: 182.20s
Epoch:3, Training loss: 8896.50, train f1 score: 0.9152, val loss: 2601.96, val f1 score: 0.8808, time: 190.15s
Epoch:4, Training loss: 5631.97, train f1 score: 0.9414, val loss: 2144.41, val f1 score: 0.8985, time: 192.82s
Epoch:5, Training loss: 3933.50, train f1 score: 0.9543, val loss: 2318.39, val f1 score: 0.9052, time: 189.63s
Epoch:6, Training loss: 2974.72, train f1 score: 0.9562, val loss: 2391.04, val f1 score: 0.9031, time: 180.36s
Epoch:7, Training loss: 2374.33, train f1 score: 0.9659, val loss: 2400.77, val f1 score: 0.9112, time: 191.10s
Epoch:8, Training loss: 1919.19, train f1 score: 0.9478, val loss: 3173.46, val f1 score: 0.8629, time: 179.49s
Epoch:9, Training loss: 1588.40, train f1 score: 0.9761, val loss: 2549.12, val f1 score: 0.9026, time

In [47]:
y_true,y_pred,_ = cal_acc(best_model,val_input_index,val_output_index)
from sklearn.metrics import classification_report
print(classification_report(y_true,y_pred,digits=4))

              precision    recall  f1-score   support

           2     0.9151    0.9870    0.9497      5790
           3     0.4970    0.5825    0.5363       285
           4     0.8652    0.6524    0.7439       187
           5     0.9456    0.4571    0.6163       875
           6     0.8668    0.8544    0.8606       419

    accuracy                         0.8948      7556
   macro avg     0.8180    0.7067    0.7414      7556
weighted avg     0.8990    0.8948    0.8855      7556



Try to feed val data set in our *model*

In [59]:
"""Each epoch will take about 2-3 minutes"""
import datetime

# Reference Lab 9 Code: https://colab.research.google.com/drive/1yVy7T9DNB9lJo3NgFdsHAuEsI0msAuPz

for epoch in range(4):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model_2_layer.train()
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model_2_layer.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model_2_layer.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer4.step()

        train_loss+=loss.item()

    max_val_acc=0
    model_2_layer.eval()
    _, _, train_acc = cal_acc(model_2_layer,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model_2_layer,val_input_index,val_output_index)

    if(val_acc>max_val_acc):
        best_model = model_2_layer
        max_val_acc=val_acc

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model_2_layer.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train f1 score: %.4f, val loss: %.2f, val f1 score: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 1494.99, train f1 score: 0.9743, val loss: 652.81, val f1 score: 0.9637, time: 69.85s
Epoch:2, Training loss: 453.68, train f1 score: 0.9818, val loss: 226.90, val f1 score: 0.9858, time: 69.93s
Epoch:3, Training loss: 220.74, train f1 score: 0.9831, val loss: 133.57, val f1 score: 0.9914, time: 70.32s
Epoch:4, Training loss: 127.63, train f1 score: 0.9846, val loss: 75.30, val f1 score: 0.9960, time: 69.78s


# Testing

In [76]:
import torch
torch.save(best_model,"model.pt")

  "type " + obj.__name__ + ". It won't be checked "


In [0]:
ix_2_tag={}
for x in tag_to_ix.keys():
    ix_2_tag[tag_to_ix[x]] = x

In [0]:
def predict(model, input_index):
    predicted=[]
    for x in input_index:
        input_tensor = torch.tensor(x).to(device)
        _,output = model(input_tensor)
        predicted.extend(output)
    return predicted


In [0]:
ix_2_tag

{0: '<START>',
 1: '<STOP>',
 2: 'O',
 3: 'I-ORG',
 4: 'I-MISC',
 5: 'I-PER',
 6: 'I-LOC'}

make a predition on our test data set

In [0]:
predition = predict(best_model,test_input_index)


In [0]:
fianl_predition=[]
for x in predition:
    fianl_predition.append(ix_2_tag[x])
    

In [0]:
fianl_predition

In [0]:
fianl_id = range(0,46666)

make a CSV file

In [0]:
test_prediction = {'Id':fianl_id ,'Predicted':fianl_predition }

df = pd.DataFrame(test_prediction)

df.to_csv('result.csv', index=False)