### Import libraries

In [1]:
import os
import sys
import numpy as np
import re
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
import math
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import time
import copy
import pickle
import glob
import spacy
from torch.utils.data import TensorDataset, DataLoader

### Key Features

In [2]:
Embedding_feature = 'GLOVE' #Skipgram / GLOVE
POS_feature = 'No' #No / Yes
Dataset_domain = 'Laptops' #Laptops / Restaurants
POS_hidden_dim = 128 #256 / 128

### Read in data and tagging IOB to training and testing data

In [3]:
#To extract the soup object and extract the relevant aspect terms from xml file
def soup2dict(sentence_nodes):

    sentences = []
    i = 0
    for n in sentence_nodes:
        i += 1
        sentence = {}
        aspect_term = []
        sentence['id'] = i
        sentence['text'] = n.find('text').string
        if n.find('aspectTerms'):
            for c in n.find('aspectTerms').contents:
                if c.name == 'aspectTerm':
                    if c['term'] not in aspect_term:
                        aspect_term.append(c['term'])

        sentence['aspect'] = aspect_term
        sentences.append(sentence)
        
    return sentences

In [4]:
#data processing
def split2words(s_text):

    s_text = re.sub('([.,!?()])', r' \1 ', s_text) # match the punctuation characters and surround them by spaces,
    s_text = re.sub('\s{2,}', ' ', s_text)         # collapse multiple spaces to one space
    words = s_text.lower().split()
    return words

In [5]:
#to tag each words to IOB based on the aspect terms
def tagging_IOB(s, aspects):

    tags = ['O'] * len(s)
    
    #print(s)

    for aspect in aspects:
        aspect_tokens = split2words(aspect)
        aspect_len = len(aspect_tokens)
        len_counter = 0
        pre_index = -math.inf
        for word in s: 
            #print('word is', word)
            if len_counter <aspect_len:
                if word in aspect_tokens: 
                    len_counter += 1
                    cur_index = s.index(word) 
                    if cur_index - pre_index == 1: # inside an aspect term
                        tags[cur_index] = 'I'
                    else:                       # beginning of an aspect term
                        tags[cur_index] = 'B'
                    pre_index = cur_index 
    return tags

In [6]:
#to convert into df format for saving the files
def dict2df(sentences):

    data = pd.DataFrame()
    for s in sentences:
        sentence = {}
        sentence['Sentence #'] = s['id']
        sentence['Word'] = split2words(s['text'])  # split text to words
        s_length = len(sentence['Word']) # the length of sentence, used to generate tag
        if len(s['aspect'])==0 or s['aspect'][0] == 'NULL': # tagging: if no aspect term
            sentence['Tag'] = ['O'] * s_length
        else:                                               # IOB format tag if aspect exist
            aspect_terms = [x.lower() for x in s['aspect']]  
            sentence['Tag'] = tagging_IOB(sentence['Word'], aspect_terms)

        # convert each setence to dataframe 
        sentence_df = pd.DataFrame.from_dict(sentence)
        data = data.append(sentence_df, ignore_index=True)
    
    return data

In [7]:
#to read the file and process the data based on above defined functions
def read_data(file_path):
    # convert xml raw data to soup
    soup = None
    with file_path.open(encoding="utf-8") as f:
        soup = BeautifulSoup(f.read().strip(), "lxml-xml")
    if soup is None:
        raise Exception("Can't read xml file")
    sentence_nodes = soup.find_all("sentence")

    # convert soup object to a list of dictionaries and df
    sentences = soup2dict(sentence_nodes)
    data = dict2df(sentences)
    
    return data

In [8]:
#read and save the data
if Dataset_domain == 'Restaurants': #Laptops
    test_path = Path.cwd().joinpath('Restaurants_Test.xml')
    train_path = Path.cwd().joinpath('Restaurants_Train.xml')
else:
    test_path = Path.cwd().joinpath('Laptops_Test.xml')
    train_path = Path.cwd().joinpath('Laptops_Train.xml') 
    
data_test = read_data(test_path)
data_train = read_data(train_path)

save_file = Path.cwd().joinpath('test.csv')
data_test.to_csv(save_file, index=False)
save_file = Path.cwd().joinpath('train.csv')
data_train.to_csv(save_file, index=False)

### Process data for Learning

In [9]:
#to read from previous saved file
train_path = Path.cwd().joinpath('train.csv')
test_path = Path.cwd().joinpath('test.csv')

# Read data
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

In [10]:
# create a vocab 
word_train = set(data_train['Word'].values)
word_test = set(data_test['Word'].values)
words = list(word_train.union(word_test))

In [11]:
#create a mapping of words in vocab to index
word2idx = {}
word2idx['<UNK>'] = 0
for i, word in enumerate(words):
    word2idx[word] = i + 1

n_words = len(word2idx)
word2idx

{'<UNK>': 0,
 'inside': 1,
 'steer': 2,
 'expecting': 3,
 'reached': 4,
 'handling': 5,
 'avid': 6,
 'overheats': 7,
 'routed': 8,
 'destroy': 9,
 '53%': 10,
 '"wlan"': 11,
 'these': 12,
 "arn't": 13,
 'laid': 14,
 'plain': 15,
 'serves': 16,
 'fell': 17,
 'rolls': 18,
 'incident': 19,
 'froze': 20,
 'dark': 21,
 'construction': 22,
 'seconds': 23,
 'velcro': 24,
 'difficult': 25,
 'impressive': 26,
 'run-on': 27,
 'lifestyle': 28,
 "wouldn't": 29,
 'press': 30,
 'manufacturer': 31,
 'gradual': 32,
 'dollar': 33,
 'fingertips': 34,
 'bell': 35,
 'identified': 36,
 '3': 37,
 'models': 38,
 'moment': 39,
 'coupons': 40,
 'boot-up': 41,
 'supposed': 42,
 'receptacle': 43,
 'talking': 44,
 'laptop"': 45,
 'pop': 46,
 '$600': 47,
 'touchpad': 48,
 'good;': 49,
 'anymore': 50,
 'name': 51,
 'cons': 52,
 'drivers': 53,
 'moved': 54,
 'kinds': 55,
 'ahold': 56,
 'past;': 57,
 'compared': 58,
 'seeing': 59,
 'restrictions': 60,
 'responding': 61,
 'another': 62,
 'cell': 63,
 'starters': 64,
 '

In [12]:
# define a sentence class to get all the sentences
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [13]:
#further format each sentence to the required format
data = data_train
getter = SentenceGetter(data)
sentences = getter.sentences # get all sentences

sent_tokens = [[w[0] for w in s] for s in sentences]
sent_tags = [[w[1] for w in s] for s in sentences]
training_data = list(zip(sent_tokens, sent_tags))

getter_test = SentenceGetter(data_test)
sentences_test = getter_test.sentences # get all sentences

sent_tokens_test = [[w[0] for w in s] for s in sentences_test]
sent_tags_test = [[w[1] for w in s] for s in sentences_test]
testing_data = list(zip(sent_tokens_test, sent_tags_test))

### Embedding

In [14]:
#read the pretrained glove embeds and extract relevant word vectors in vocab
def get_glove_embeds():
    glove_path = '../glove.840B.300d.txt'
    embeddings_index = {}
    with open(glove_path, encoding='utf8') as f:
        for i, line in tqdm(enumerate(f)):
            values = line.split()
            word = ''.join(values[:-300])
            coefs = np.asarray(values[-300:], dtype='float32')
            embeddings_index[word] = coefs
    EMBEDDING_DIM = 300
    embedding_matrix = np.zeros((len(words)+1, EMBEDDING_DIM))

    for word, i in word2idx.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_matrix[i] = embeddings_index['unk']
        else:
            embedding_matrix[i] = embedding_vector
    
    embedding_matrix = torch.from_numpy(embedding_matrix).float()
    return embedding_matrix, embeddings_index

In [15]:
# Define the skipgram and training of skipgram 
# Reference Prof Chris Lecture 6 Lecture Notebook
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, embed_dim):
        super(Skipgram, self).__init__()
        self.V = nn.Embedding(vocab_size, embed_dim, max_norm=1)
        self.U = nn.Linear(embed_dim, vocab_size)

    def forward(self, centers):
        out = self.V(centers)
        out = self.U(out)
        #out = F.log_softmax(out, dim=1)
        return out

def get_skipgram_embeds():
    
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_sm")
    all_tokens = sent_tokens + sent_tokens_test
    tokens = [item for sublist in all_tokens for item in sublist]
    #print(len(tokens))
    window_size = 3

    # Given the window size, we can directly infer the required sizes for the 2 Numpy arrays
    skipgram = np.zeros(((len(tokens)-(2*window_size))*(2*window_size), 2), dtype=np.int32)

    # Loop through list of tokens
    for center_idx, pos in enumerate(range(window_size, len(tokens)-window_size)):

        # Get current center word and current context words
        center = tokens[pos]
        context = tokens[pos-window_size:pos] + tokens[pos+1:pos+window_size+1]

        # Loop over all context words to generate the 2*window_size (center_word, context_word)-pairs
        for idx, c in enumerate(context):
            skipgram_sample = np.array([  word2idx[center] if center in word2idx else 0 ,  word2idx[c] if c in word2idx else 0 ])
            skipgram[(center_idx*window_size*2)+idx] = skipgram_sample
    
    data = skipgram
    data = data.astype('int32')
    num_samples, num_indices = data.shape
    #print('Number of samples: {}'.format(num_samples))
    vocab_size = len(word2idx)
    
    X = torch.Tensor(data[:,0]).long()
    y = torch.Tensor(data[:,-1]).long()
    
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=256, shuffle=True)
    use_cuda = True
    device = torch.device("cuda:0" if use_cuda else "cpu")
    
    embed_dim = 300

    model_skipgram = Skipgram(vocab_size, embed_dim)

    # Move th model to GPU, if available (by default it "stays" on the CPU)
    model_skipgram.to(device)
    
    num_epochs = 20

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_skipgram.parameters(), lr=0.001)

    for epoch in range(num_epochs):

        epoch_loss = 0.0

        for idx, (x, y) in enumerate((dataloader)):
            x, y = x.to(device), y.to(device)

            logits = model_skipgram(x)
            loss = criterion(logits, y)

            model_skipgram.zero_grad()
            loss.backward()

            #nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

            epoch_loss += loss.item()

        print('Epoch: {}: Loss: {} '.format(epoch,epoch_loss))
    
    embedding_matrix = model_skipgram.U.weight
    return embedding_matrix

In [16]:
# read glove or skipgram word embeddings
if Embedding_feature == 'GLOVE':
    embedding_matrix, embeddings_index = get_glove_embeds()
elif Embedding_feature == 'Skipgram':
    embedding_matrix = get_skipgram_embeds()

if Embedding_feature == 'GLOVE':
    word_true = 0
    for word in words:
        if word in embeddings_index:
            word_true += 1

    print(len(words))
    print(word_true)

2196017it [03:00, 12188.24it/s]

5011
4458





### Process POS tags

In [17]:
#use nltk to retrieve each sentence POS tags

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

tokenized_sentences = []
tagged_sentences = []

sentList = [x[0] for x in training_data] +  [x[0] for x in testing_data]

for sentence in sentList:
    tagged_sentence = nltk.pos_tag(sentence)
    tagged_sentences.append(tagged_sentence)
    
print(tagged_sentences[0])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhida\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\zhida\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('i', 'JJ'), ('charge', 'NN'), ('it', 'PRP'), ('at', 'IN'), ('night', 'NN'), ('and', 'CC'), ('skip', 'NN'), ('taking', 'VBG'), ('the', 'DT'), ('cord', 'NN'), ('with', 'IN'), ('me', 'PRP'), ('because', 'IN'), ('of', 'IN'), ('the', 'DT'), ('good', 'JJ'), ('battery', 'NN'), ('life', 'NN'), ('.', '.')]


In [18]:
#get all possible POS tags in vocab
POS_sent_tags = [[w[1] for w in s] for s in tagged_sentences]
POS_list_set = set().union(*POS_sent_tags)
all_POS_tags = (list(POS_list_set))

print(all_POS_tags)

['WDT', 'VBD', ',', 'FW', 'JJS', 'JJR', 'RP', 'JJ', 'VB', 'CD', 'VBG', '(', 'UH', 'POS', 'RBR', '.', 'IN', 'VBZ', 'LS', 'MD', 'VBN', 'RBS', "''", 'SYM', 'PRP$', 'VBP', ')', 'DT', 'NNS', 'EX', 'PRP', 'PDT', '#', 'WRB', 'NN', 'WP', 'NNP', ':', '$', 'TO', 'CC', 'RB']


In [19]:
#create a mapping for POS and index
POS2idx = {}
counter = 0
punctuation_str = '!"#$%&\'()*+, -./:;<=>?@[\\]^_`{|}~'
for i, tag in enumerate(all_POS_tags):
    if tag in punctuation_str:
        POS2idx[tag] = 0
    else:
        POS2idx[tag] = counter
        counter +=1

num_POS_tags = counter
print(num_POS_tags)
print(POS2idx)

35
{'WDT': 0, 'VBD': 1, ',': 0, 'FW': 2, 'JJS': 3, 'JJR': 4, 'RP': 5, 'JJ': 6, 'VB': 7, 'CD': 8, 'VBG': 9, '(': 0, 'UH': 10, 'POS': 11, 'RBR': 12, '.': 0, 'IN': 13, 'VBZ': 14, 'LS': 15, 'MD': 16, 'VBN': 17, 'RBS': 18, "''": 19, 'SYM': 20, 'PRP$': 21, 'VBP': 22, ')': 0, 'DT': 23, 'NNS': 24, 'EX': 25, 'PRP': 26, 'PDT': 27, '#': 0, 'WRB': 28, 'NN': 29, 'WP': 30, 'NNP': 31, ':': 0, '$': 0, 'TO': 32, 'CC': 33, 'RB': 34}


### Defining BiLSTM-CRF

In [20]:
#this section define the BiLSTM_CRF model
#References - PyTorch documenation and PyTorch Tutorial https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    #print(seq)
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, embedding_matrix, Embedding_feature, POS_feature, num_POS_tags, POS_hidden_dim, device):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.word_embeds = nn.Embedding.from_pretrained(embedding_matrix)
        assert self.word_embeds.weight.shape == embedding_matrix.shape
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        
        #POS embedding and LSTM
        pos_vocab_size = num_POS_tags
        pos_embedding_dim = 10
        pos_lstm_hidden_dim = POS_hidden_dim
        self.pos_hidden_dim = pos_lstm_hidden_dim
        self.pos_embeds = nn.Embedding(pos_vocab_size, pos_embedding_dim)
        self.pos_lstm = nn.LSTM(pos_embedding_dim, pos_lstm_hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        if POS_feature == 'Yes':
            self.hidden2tag = nn.Linear(hidden_dim+pos_lstm_hidden_dim, self.tagset_size)
        else:
            self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()
        self.pos_hidden = self.init_pos_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2,device=device),
                torch.randn(2, 1, self.hidden_dim // 2,device=device))
    
    def init_pos_hidden(self):
        return (torch.randn(2, 1, self.pos_hidden_dim // 2,device=device),
                torch.randn(2, 1, self.pos_hidden_dim // 2,device=device))
    
    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.,device=device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, pos_seq):
        #for sentence
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        
        #for pos
        self.pos_hidden = self.init_pos_hidden()
        pos_embeds = self.pos_embeds(pos_seq).view(len(pos_seq), 1, -1)
        pos_lstm_out, self.pos_hidden = self.pos_lstm(pos_embeds, self.pos_hidden)
        pos_lstm_out = pos_lstm_out.view(len(pos_seq), self.pos_hidden_dim)
        
        if POS_feature == 'Yes':
            combined_lstm_out = torch.cat((lstm_out,pos_lstm_out),1)
        else:
            combined_lstm_out = lstm_out
    
        lstm_feats = self.hidden2tag(combined_lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1,device=device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.,device=device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags, pos_seq):

        feats = self._get_lstm_features(sentence, pos_seq)

        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        
        score, tag_seq = self._viterbi_decode(feats)
        
        return forward_score - gold_score, tag_seq

    def forward(self, sentence, pos_seq): 
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence, pos_seq)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [21]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 300
HIDDEN_DIM = 256

tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

In [22]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
print(device)

cuda:0


In [23]:
#this function return all the aspect terms in the sentence based on the corresponding BIO tags
def get_aspect(sentence,tags):
    #sentence: list of tokens / words
    #tags: 0-Beginning, 1-Inside, 2-Outside
    #sentence = ['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.']
    #tags = [0, 0, 0, 0, 0, 0, 0, 0, 1]
    aspects = []
    aspect = ''
    #print('sentence',sentence)
    #print('len',len(sentence))
    for i in range(len(sentence)):
        #print(i)
        if tags[i] == 0:
            if i>1:
                if tags[i-1] == 1 or tags[i-1] == 0:
                    aspect = aspect.lstrip(' ')
                    aspects.append(aspect)
                    aspect = ''
            aspect = sentence[i]
        elif tags[i] == 1:
            aspect = aspect+' '+sentence[i]
        elif tags[i] == 2:
            if i>0:
                if tags[i-1] == 1 or tags[i-1] == 0:
                    aspect = aspect.lstrip(' ')
                    aspects.append(aspect)
                    aspect = ''
        if i == len(sentence)-1 and aspect != '':
            aspects.append(aspect)
            aspect = ''
    
    #print(aspects)
    return aspects

### Train Model

In [24]:
#to compute F1 score
def F1score(pred_aspectTerms, gold_aspectTerms):
    True_positives = 0
    num_predictions = 0
    num_gold = 0
    for i in range(len(pred_aspectTerms)):
        preds = pred_aspectTerms[i]
        num_predictions += len(preds)
        golds = gold_aspectTerms[i].copy()
        num_gold += len(golds)
        for j in range(len(preds)):
            if preds[j] in golds:
                True_positives += 1
                golds.remove(preds[j])
    if True_positives == 0:
        return 0
    precision = True_positives / num_predictions
    recall = True_positives / num_gold
    
    return 2 * (precision * recall) / (precision + recall)

In [25]:
#to train the model based on the optimiser and max num epochs
#F1 score and accuracy are output for each epoch
def train_model(model, optimizer, num_epochs):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_f1 = 0.0
    best_epoch = -1
    results = []

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            pred_aspectTerms = [] 
            gold_aspectTerms = []

            if phase == 'train':
                data = training_data
            else:
                data = testing_data
                
            # Iterate over data.
            i = 0
            for sentence, tags in (data):
                sentence_in = prepare_sequence(sentence, word2idx).to(device)
                targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long).to(device)
                pos_seq = prepare_sequence(list(zip(*nltk.pos_tag(sentence)))[1],POS2idx).to(device)
                
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    loss, pred_tag = model.neg_log_likelihood(sentence_in, targets, pos_seq)


                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                ground_truth_tags = torch.tensor([tag_to_ix[t] for t in data[i][1]], dtype=torch.long)
                running_loss += loss.item()
                if ground_truth_tags.tolist() == pred_tag:
                    running_corrects += 1
                i += 1
                predicted_aspects_term = get_aspect(sentence,pred_tag)
                ground_truth_aspects_term = get_aspect(sentence,ground_truth_tags.tolist())
                pred_aspectTerms.append(predicted_aspects_term)
                gold_aspectTerms.append(ground_truth_aspects_term)
                #print('pred',pred_aspectTerms)
                #print('gold',gold_aspectTerms)
                
            epoch_loss = running_loss / len(data)
            epoch_acc = running_corrects/ len(data)
            epoch_f1 = F1score(pred_aspectTerms,gold_aspectTerms)

            print('{} Loss: {:.4f} F1: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_f1, epoch_acc))
            result = [phase, epoch_loss, epoch_f1, epoch_acc]
            results.append(result)
            # deep copy the model
            if phase == 'val' and epoch_f1 > best_f1:
                best_f1 = epoch_f1
                best_model_wts = copy.deepcopy(model.state_dict())
                best_epoch = epoch
                
        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best f1 score: {best_f1:4f}')
    print(f'Best epoch: {best_epoch}')
    df = pd.DataFrame(results, columns = ['Phase', 'Loss','F1','Acc'])
    filename = './'+Dataset_domain+'_Embed_'+Embedding_feature+'_POS_'+POS_feature+'_POSdim_'+str(POS_hidden_dim)+'.csv'
    df.to_csv(filename) 
    
    
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [41]:
#define model, optimiser and execute training
model = BiLSTM_CRF(len(word2idx), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, embedding_matrix, Embedding_feature, POS_feature, num_POS_tags, POS_hidden_dim, device).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4) 
num_epochs = 30
model = train_model(model, optimizer, num_epochs)

Epoch 0/29
----------
train Loss: 2.1511 F1: 0.4627 Acc: 0.6201
val Loss: 2.1069 F1: 0.3963 Acc: 0.5750

Epoch 1/29
----------
train Loss: 1.3304 F1: 0.6617 Acc: 0.7083
val Loss: 1.7792 F1: 0.5354 Acc: 0.6388

Epoch 2/29
----------
train Loss: 1.1474 F1: 0.7059 Acc: 0.7411
val Loss: 1.6977 F1: 0.5670 Acc: 0.6538

Epoch 3/29
----------
train Loss: 1.0424 F1: 0.7333 Acc: 0.7572
val Loss: 1.6469 F1: 0.5962 Acc: 0.6737

Epoch 4/29
----------
train Loss: 0.9635 F1: 0.7507 Acc: 0.7680
val Loss: 1.5959 F1: 0.6097 Acc: 0.6700

Epoch 5/29
----------
train Loss: 0.8902 F1: 0.7758 Acc: 0.7874
val Loss: 1.4931 F1: 0.6392 Acc: 0.6963

Epoch 6/29
----------
train Loss: 0.8157 F1: 0.7915 Acc: 0.8005
val Loss: 1.4424 F1: 0.6574 Acc: 0.7050

Epoch 7/29
----------
train Loss: 0.7600 F1: 0.8087 Acc: 0.8150
val Loss: 1.3620 F1: 0.6895 Acc: 0.7338

Epoch 8/29
----------
train Loss: 0.7064 F1: 0.8275 Acc: 0.8301
val Loss: 1.3547 F1: 0.6894 Acc: 0.7300

Epoch 9/29
----------
train Loss: 0.6491 F1: 0.8391 Acc

In [42]:
#save the model that yield best result
ROOT_DIR = os.path.abspath(os.curdir)
torch.save(model.state_dict(), os.path.join(ROOT_DIR, 'parameters.pt'))
print('model saved')

model saved


### Evaluate Test set

In [45]:
#save the results for the test set for further error analysis
s = []
gt = []
pred = []
data_save = []

for i in range(len(testing_data)):
    
    sentence = testing_data[i][0]
    tags = testing_data[i][1]
    
    sentence_in = prepare_sequence(sentence, word2idx).to(device)
    targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long).to(device)
    pos_seq = prepare_sequence(list(zip(*nltk.pos_tag(sentence)))[1],POS2idx).to(device)
    loss, pred_tag = model.neg_log_likelihood(sentence_in, targets, pos_seq)
    ground_truth_tags = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
    predicted_aspects_term = get_aspect(sentence,pred_tag)
    ground_truth_aspects_term = get_aspect(sentence,ground_truth_tags.tolist())
    
    data_save.append([sentence,ground_truth_aspects_term,predicted_aspects_term])
    s.append(testing_data[i][1])
    gt.append(ground_truth_aspects_term)
    pred.append(predicted_aspects_term)

# Create the pandas DataFrame
df = pd.DataFrame(data_save, columns = ['sentence', 'ground_truth','pred'])
df.to_csv('Laptop_NO_POS_test.csv')  

In [48]:
#Compare results based on different number of aspect terms in each review
cat_a_gt = []
cat_b_gt = []
cat_c_gt = []
cat_a_pred = []
cat_b_pred = []
cat_c_pred = []
for i in range(len(s)):
    if len(gt[i]) == 0: #if zero aspect term
        cat_a_gt.append(gt[i])
        cat_a_pred.append(pred[i])
    elif len(gt[i]) == 1:  #if 1 aspect term
        cat_b_gt.append(gt[i])
        cat_b_pred.append(pred[i])
    elif len(gt[i]) > 1: #if >1 aspect term
        cat_c_gt.append(gt[i])
        cat_c_pred.append(pred[i])
print(F1score(cat_a_pred,cat_a_gt))
print(F1score(cat_b_pred,cat_b_gt))
print(F1score(cat_c_pred,cat_c_gt))

0
0.7520325203252033
0.6761768901569187


In [49]:
#Compare results based on if there are multi-word aspect terms in the review
cat_a_gt = []
cat_b_gt = []
cat_a_pred = []
cat_b_pred = []
for i in range(len(s)):
    if ' ' in ''.join(gt[i]):  #if consist multi-word aspect terms
        cat_a_gt.append(gt[i])
        cat_a_pred.append(pred[i])
    else:
        cat_b_gt.append(gt[i])
        cat_b_pred.append(pred[i])
print(F1score(cat_a_pred,cat_a_gt))
print(F1score(cat_b_pred,cat_b_gt))

0.6162624821683309
0.8223552894211577
