## Data import

In [3]:
# Import Training data and Testing data
# The data is from https://www.kaggle.com/competitions/2022-comp5046-a2/data
import numpy as np
# Download dataset from kaggle using API provided https://github.com/Kaggle/kaggle-api
!pip install kaggle
! mkdir ~/.kaggle
# make sure you imported kaggle.json to /content directory
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c 2022-comp5046-a2
# Unzip dataset
!apt install unzip
!unzip  /content/2022-comp5046-a2.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading 2022-comp5046-a2.zip to /content
  0% 0.00/376k [00:00<?, ?B/s]
100% 376k/376k [00:00<00:00, 85.0MB/s]
Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-21ubuntu1.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Archive:  /content/2022-comp5046-a2.zip
  inflating: sample.csv              
  inflating: test_without_labels.csv  
  inflating: train.csv               
  inflating: val.csv                 


In [55]:
# Read training dataset and dislay 10 samples from it.
import pandas as pd
import numpy as np
training_data = pd.read_csv('/content/train.csv', encoding = "ISO-8859-1")
testing_data = pd.read_csv('/content/val.csv', encoding = "ISO-8859-1")
validation_data = pd.read_csv('/content/test_without_labels.csv', encoding = "ISO-8859-1")
# training_data.head(10)

In [56]:
# prepare training set and testing set, transfer them to sentence list
training_sent = training_data['sents'].tolist()
training_label = [label.split(' ') for label in training_data['labels'].tolist()]

testing_sent = testing_data['sents'].tolist()
testing_label = [label.split(' ') for label in testing_data['labels'].tolist()]

validation_sent = testing_data['sents'].tolist()
# print(training_sent[:10])
# print(training_label[:10])

## Data Preprocess

In [57]:
def to_lowercase(sentences):
  return [sentence.lower() for sentence in sentences]
def tokenize(sentences):
  result = []
  for sentence in sentences:
    n = sentence.split(' ')
    result.append(n)
  return result

from nltk.stem.snowball import *
stemmer = SnowballStemmer('english')

def stemming(data):
  return [[stemmer.stem(word) for word in sentence] for sentence in data]

training_data_after_preprocess = stemming(tokenize(to_lowercase(training_sent)))
testing_data_after_preprocess = stemming(tokenize(to_lowercase(testing_sent)))

train_data = training_data_after_preprocess
target_y_train = training_label
validation_data = testing_data_after_preprocess
target_y_validation = testing_label

# for word embedding
corpus = train_data+validation_data
# print(len(train_data))
# print(len(target_y_train))

# Word Embedding

In [58]:
# word dict
word_to_ix = {}
word_to_ix['UNKNOWN'] = 0
for sentence in corpus:
    for word in sentence:
        word = word.lower()
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
word_list = list(word_to_ix.keys())

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in training_label+testing_label:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

## POS TAGGING

In [59]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, word_tokenize
def PoS_embedding(corpus):
  result = []
  for text in corpus:
    result.append(nltk.pos_tag(text))
  return result;
# get a list of sentences with its pos tags
pos_taggings = PoS_embedding(corpus)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [60]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
pos_word_list = []
tag_list = []
for i in range(len(pos_taggings)):
  for j in range(len(pos_taggings[i])):
    pos_word_list.append(pos_taggings[i][j][0])
    tag_list.append(pos_taggings[i][j][1])
values = array(tag_list)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
pos_tagging_dict = dict(zip(pos_word_list, onehot_encoded))

## Domain Feature

In [52]:
from gensim.models import FastText

EMBEDDING_DIM = 128
word_emb_model = FastText(sentences=training_data_after_preprocess+testing_data_after_preprocess, size=EMBEDDING_DIM)
embedding_matrix = []
for word in word_list:
    try:
        embedding_matrix.append(word_emb_model.wv[word])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)

In [61]:
import pandas as pd
import numpy as np
cyberbully_data = pd.read_csv('/content/youtube_parsed_dataset.csv')
cyberbully_text = cyberbully_data['Text']
cyberbully_sent = cyberbully_data['Text'].tolist()
cyberbully_sent[0]
cyberbully_text_after_preprocess = stemming(tokenize(to_lowercase(cyberbully_sent)))

feature_word_to_ix = {}
feature_word_to_ix['UNKNOWN'] = 0
for sentence in cyberbully_text_after_preprocess:
    for word in sentence:
        word = word.lower()
        if word not in feature_word_to_ix:
            feature_word_to_ix[word] = len(feature_word_to_ix)
feature_word_list = list(feature_word_to_ix.keys())


EMBEDDING_DIM = 128
feature_domain_embedding_model = FastText(sentences=cyberbully_text_after_preprocess, size=EMBEDDING_DIM)
feature_domain_embedding_matrix = []
for word in feature_word_list:
    try:
        feature_domain_embedding_matrix.append(feature_domain_embedding_model.wv[word])
    except:
        feature_domain_embedding_matrix.append([0]*EMBEDDING_DIM)
feature_domain_embedding_matrix = np.array(feature_domain_embedding_matrix)
feature_domain_embedding_matrix.shape

(69270, 128)

# Input Concatenation

In [63]:
# Get the final embedding table(should be the combination of 3 aspects)
syntactic_textual_model = word_emb_model
semantic_textual_model = pos_tagging_dict
domain_model = feature_domain_embedding_model

# The embedding table would be like a list of dict [{'word1':[word2vec_1]}, {'word2':[word2vec_2]}, {'word1':[word2vec_2]}, {'word1':[word2vec_2]}] so that we can use it in later embedding layer


import numpy as np
emb_dim1 = word_emb_model.vector_size
emb_dim2 = 39
emb_dim3 = feature_domain_embedding_model.vector_size

# Embedding lookup table via concatenation
emb_table_w2v = []
for i, word in enumerate(word_list):
    if word in word_emb_model:
        emb_table_w2v.append(word_emb_model[word])
    else:
        emb_table_w2v.append([0]*(emb_dim1))
emb_table = np.array(emb_table_w2v)

# Embedding lookup table via concatenation
emb_table_w2v_pos = []

for i, word in enumerate(word_list):
    if word in word_emb_model and word in pos_tagging_dict:
        emb_table_w2v_pos.append(list(semantic_textual_model[word])+list(syntactic_textual_model[word]))
    else:
        emb_table_w2v_pos.append(np.array([0]*(emb_dim1+emb_dim2)))
emb_table_w2v_pos = np.array(emb_table_w2v_pos)

# Embedding lookup table via concatenation
emb_table_w2v_pos_feature = []
for i, word in enumerate(word_list):
    if word in word_emb_model and word in pos_tagging_dict and word in feature_domain_embedding_model:

        emb_table_w2v_pos_feature.append(list(semantic_textual_model[word])+list(syntactic_textual_model[word])+list(domain_model[word]))
    else:
        emb_table_w2v_pos_feature.append([0]*(emb_dim1+emb_dim2+emb_dim3))
emb_table_w2v_pos_feature = np.array(emb_table_w2v_pos_feature)





In [64]:
print(emb_table_w2v_pos_feature.shape)

(9682, 295)


# Sentence Encoding

In [65]:
def to_index(data, to_ix, type):
    input_index_list = []
    for sent in data:
      result = []
      for w in sent:
        if type == 'tag':
          result.append(to_ix[w])
        else:
          if w in word_list:
            result.append(to_ix[w])
          else:
            result.append(to_ix['UNKNOWN'])
      input_index_list.append(result)
    return input_index_list

train_input_index =  to_index(train_data,word_to_ix,'word')
train_output_index = to_index(target_y_train,tag_to_ix,'tag')
val_input_index = to_index(validation_data,word_to_ix,'word')
val_output_index = to_index(target_y_validation,tag_to_ix,'tag')

# Train Data& Test Data



## Seq2Seq Model

## Model Define

In [66]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

In [67]:
device = device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## BiLSTM Encoder

In [68]:
# LSTM Encoder
# input -> torch.Size([seq_length, embedding_size]) 
# output -> torch.Size([seq_length, hidden_size*2])
# final_hidden -> torch.Size([layers_num*2, hidden_size])
## Encoder LSTM

class LstmEncoder(nn.Module):
  def __init__(self, hidden_size, layers_num, word_embedding_matrix):
    super(LstmEncoder, self).__init__()
    self.bidirectional = True
    self.word_embeds = nn.Embedding(word_embedding_matrix.shape[0], word_embedding_matrix.shape[1])
    self.word_embeds.weight.data.copy_(torch.from_numpy(word_embedding_matrix))
    self.hidden_dim = hidden_size
    self.layers_num = layers_num
    self.lstm = nn.LSTM(word_embedding_matrix.shape[1], hidden_size, num_layers=layers_num, 
                        bidirectional=self.bidirectional)

  def forward(self, input, hidden, cell):
    input = self.word_embeds(input)
    output, (final_hidden, final_cell) = self.lstm(input, (hidden, cell))
    return output, final_hidden, final_cell

## BiLSTM Decoder

In [69]:
# LSTM Decoder
## Decoder LSTM

class LstmDecoder(nn.Module):
  def __init__(self, hidden_size, embedding_size, layers_num):
    super(LstmDecoder, self).__init__()
    self.bidirectional = True
    self.hidden_dim = hidden_size
    self.layers_num = layers_num
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=layers_num, 
                        bidirectional=self.bidirectional)

  def forward(self, input, hidden, cell):
    output, (final_hidden, final_cell) = self.lstm(input, (hidden, cell))
    return output, final_hidden, final_cell

## BiLSTM with CRF Model

In [119]:
torch.manual_seed(1)

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method):
        super(BiLSTM_CRF, self).__init__()
        self.with_crf = with_crf
        self.use_baseline = use_baseline
        self.attention_method = attention_method
        self.hidden_dim = hidden_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.layers_num = layers_num
        self.bidirectional = True
        self.lstm = LstmEncoder(hidden_dim, layers_num, word_embedding_matrix)

        self.decoder = LstmDecoder(hidden_dim, hidden_dim*2*(self.layers_num+1), layers_num)
        self.attention_size = self.hidden_dim        # just for vT * tanh(W1h+W2s)
        self.v = torch.rand(1, self.attention_size, requires_grad=True)

        self.attention_W1 = nn.Linear(2*self.layers_num*self.hidden_dim,self.hidden_dim)
        self.attention_W2 = nn.Linear(2*self.layers_num*self.hidden_dim,self.hidden_dim)
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        self.hidden, self.cell = self.init_hidden()


    def init_hidden(self):
      n = 1
      if self.bidirectional:
        n = 2
      return (torch.zeros(n*self.layers_num,self.hidden_dim), torch.zeros(n*self.layers_num,self.hidden_dim))
        
    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence

        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        # encoding sentence and get memory, final hidden state

        encoder_hidden, encoder_cell = self.init_hidden()
        # memory is each step hidden state(output) seq_length * (2 hidden_dim)
        # final_hidden 
        memory, final_hidden, final_cell = self.lstm(sentence, self.hidden, self.cell) 

        if self.use_baseline:
          lstm_feats = self.hidden2tag(memory)
          return lstm_feats
        else:
          # attention weight
          

          # attn_weight = F.softmax(torch.bmm(final_hidden.reshape(1,-1).unsqueeze(0),memory.repeat(1,self.layers_num).unsqueeze(0).permute(0,2,1)).squeeze(0),1)
          attn_weight = self._calculate_attention(final_hidden, memory)

          # decoder sentence with the memory and final hidden state
          deocder_outputs = torch.zeros(sentence.size()[0], self.hidden_dim*2)
          decoder_hidden = final_hidden
          decoder_cell = encoder_cell
          for di in range(sentence.size(0)):
            context = torch.bmm(attn_weight.unsqueeze(0),memory.unsqueeze(0)).squeeze(0)      #context
            decoder_input = torch.cat((decoder_hidden.reshape(1,-1),context), dim=1)        # concenate context with hidden state
            # print(decoder_input.size())
            decoder_output, decoder_hidden, decoder_cell = self.decoder(decoder_input, decoder_hidden, decoder_cell)
            # decoder_input = decoder_output
            deocder_outputs[di] = decoder_output
            # attn_weight = F.softmax(torch.bmm(decoder_output.unsqueeze(0),memory.unsqueeze(0).permute(0,2,1)).squeeze(0),1)
            attn_weight = self._calculate_attention(decoder_hidden, memory)

          lstm_feats = self.hidden2tag(deocder_outputs)
          return lstm_feats

    def _calculate_attention(self, input, memory):
        # input is like query vector
        # memory is from encoder
        input = input.reshape(1,-1).unsqueeze(0)
        # print(input.size())
        # seq_length * (hidden_dim*2)   8*100 but here we need to repeat them to 8*layers*2*hidden
        memory = memory.repeat(1,self.layers_num).unsqueeze(0).permute(0,2,1)
        # print(memory.size())
        if self.attention_method == 1:       # dot product
          attn_weight = torch.bmm(input, memory).squeeze(0)
          attn_weight = F.softmax(attn_weight, dim=1)
          return attn_weight
        if self.attention_method == 2:       # scaled dot product
          scale = 1.0/np.sqrt(input.size()[1])
          attn_weight = torch.bmm(input, memory).squeeze(0) * scale
          attn_weight = F.softmax(attn_weight, dim=1)
          return attn_weight
        else:                  # tanh
          part_h = self.attention_W1(memory.permute(0,2,1))
          part_s = self.attention_W2(input)
          part_s = part_s.repeat(1,part_h.size()[1],1)
          attn_weight = torch.bmm(self.v.unsqueeze(0), torch.tanh(part_h + part_s).permute(0,2,1)).squeeze(0)
          attn_weight = F.softmax(attn_weight, dim=1)
          return attn_weight



    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def crossEntropyLoss(self, sentence, tags):
        criteria = nn.CrossEntropyLoss()
        feats = self._get_lstm_features(sentence)
        return criteria(feats, tags)

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)
        without_crf_seq = F.softmax(lstm_feats, dim=1)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        if self.with_crf:
          return score, tag_seq
        else:
          return torch.argmax(without_crf_seq,dim=1)

## Calculate Accuracy

In [91]:
import numpy as np
from sklearn.metrics import f1_score
def cal_acc(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        if model.with_crf:
          score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        else:
          pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    score = f1_score(ground_truth, predicted, average='weighted')
    return predicted, ground_truth, accuracy, score

## Tran Model

In [100]:
"""Each epoch will take about 1-2 minutes"""

import datetime
def train(model, combination):
  f1_score = 0
  for epoch in range(EPOCH):  
      time1 = datetime.datetime.now()
      train_loss = 0

      model.train()
      for i, idxs in enumerate(train_input_index):
          tags_index = train_output_index[i]
          # Step 1. Remember that Pytorch accumulates gradients.
          # We need to clear them out before each instance
          model.zero_grad()

          # Step 2. Get our inputs ready for the network, that is,
          # turn them into Tensors of word indices.
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          # print(sentence_in.size())
          # print(sentence_in.view(1,-1).size())
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          # Step 3. Run our forward pass.
          if WITH_CRF:
            loss = model.neg_log_likelihood(sentence_in, targets)
          else:
            loss = model.crossEntropyLoss(sentence_in, targets)
          # Step 4. Compute the loss, gradients, and update the parameters by
          # calling optimizer.step()
          loss.backward()
          optimizer.step()

          train_loss+=loss.item()
          # if i%200 == 0:
          #   print("Epoch:%d, Batch number: %d, tol number: %d" %(epoch+1, i, len(train_input_index)))
          # break
      model.eval()
      # for i in train_input_index:
        # print(type(i))
      # train_input_index_eval = [torch.tensor(sentence, dtype=torch.long).view(1,-1) for sentence in train_input_index]
      # val_input_index_eval = [torch.tensor(sentence, dtype=torch.long).view(1,-1) for sentence in val_input_index]
      # Call the cal_acc functions you implemented as required
      _, _, train_acc, train_f1 = cal_acc(model,train_input_index,train_output_index)
      _, _, val_acc, val_f1 = cal_acc(model,val_input_index,val_output_index)

      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()
      f1_score = val_f1
      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))
  print('Validation F1 Score: '+ str(f1_score))
  torch.save(model, combination+'.pt')

# The log below is the sample output for this section
# Please make sure you keep your own running log for submission

In [101]:
Word_Embedding_Model = emb_table_w2v_pos        # emb_table, emb_table_w2v_pos, emb_table_w2v_pos_feature
VOCAB_SIZE = Word_Embedding_Model.shape[0]
EMBEDDING_DIM = Word_Embedding_Model.shape[1]
HIDDEN_SIZE = 50
LAYER_NUM = 1
OUTPUT_SIZE = len(tag_to_ix)                  # 9
MAX_SEQ_LENGTH = max(len(i) for i in train_input_index)  # 57
ATTENTION_METHOD = 3
EPOCH = 2
WITH_CRF = True
USE_BASELINE = True
#tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method


In [129]:
from sklearn.metrics import classification_report
target_names = ['O','T','P','SEPA','S','D','C']

## Train and Test the model

In [102]:
# test for word embedding
# fastText
# tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method
model = BiLSTM_CRF(tag_to_ix=tag_to_ix, 
          hidden_dim=50, 
          word_embedding_matrix=emb_table, 
          layers_num=1, 
          with_crf=True, 
          use_baseline=True, 
          attention_method=1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
train(model, 'baseline_word2vec')

Epoch:1, Training loss: 29293.42, train acc: 0.9805, val loss: 2697.69, val acc: 0.9785, time: 557.60s
Epoch:2, Training loss: 4934.80, train acc: 0.9916, val loss: 1745.01, val acc: 0.9879, time: 563.09s
Validation F1 Score: 0.9879031482732025


In [130]:
model = torch.load('baseline_word2vec.pt')
predicted, ground_truth, accuracy, score = cal_acc(model,val_input_index,val_output_index)
print(classification_report(ground_truth, predicted, target_names=target_names))

              precision    recall  f1-score   support

           O       0.99      0.99      0.99     18985
           T       0.98      0.97      0.97      1469
           P       1.00      1.00      1.00      3936
        SEPA       1.00      1.00      1.00      3603
           S       0.97      0.97      0.97      3322
           D       0.92      0.90      0.91       398
           C       0.98      0.98      0.98      1641

    accuracy                           0.99     33354
   macro avg       0.98      0.97      0.97     33354
weighted avg       0.99      0.99      0.99     33354



In [103]:
# test for word embedding
# fastText+POS
# tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method
model = BiLSTM_CRF(tag_to_ix=tag_to_ix, 
          hidden_dim=50, 
          word_embedding_matrix=emb_table_w2v_pos, 
          layers_num=1, 
          with_crf=True, 
          use_baseline=True, 
          attention_method=1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
train(model, 'baseline_word2vec_pos')

Epoch:1, Training loss: 24806.16, train acc: 0.9837, val loss: 2358.55, val acc: 0.9818, time: 598.88s
Epoch:2, Training loss: 4546.99, train acc: 0.9914, val loss: 1814.33, val acc: 0.9877, time: 576.09s
Validation F1 Score: 0.9876530994618894


In [131]:
model = torch.load('baseline_word2vec_pos.pt')
predicted, ground_truth, accuracy, score = cal_acc(model,val_input_index,val_output_index)
print(classification_report(ground_truth, predicted, target_names=target_names))

              precision    recall  f1-score   support

           O       0.99      0.99      0.99     18985
           T       0.98      0.97      0.97      1469
           P       1.00      1.00      1.00      3936
        SEPA       1.00      1.00      1.00      3603
           S       0.97      0.97      0.97      3322
           D       0.92      0.89      0.91       398
           C       0.98      0.98      0.98      1641

    accuracy                           0.99     33354
   macro avg       0.98      0.97      0.97     33354
weighted avg       0.99      0.99      0.99     33354



In [104]:
# test for word embedding
# fastText+POS
# tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method
model = BiLSTM_CRF(tag_to_ix=tag_to_ix, 
          hidden_dim=50, 
          word_embedding_matrix=emb_table_w2v_pos_feature, 
          layers_num=1, 
          with_crf=True, 
          use_baseline=True, 
          attention_method=1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
train(model, 'baseline_word2vec_pos_domain')

Epoch:1, Training loss: 22456.81, train acc: 0.9769, val loss: 2919.05, val acc: 0.9744, time: 728.57s
Epoch:2, Training loss: 5404.35, train acc: 0.9903, val loss: 1872.08, val acc: 0.9863, time: 723.99s
Validation F1 Score: 0.9862201854073053


In [132]:
model = torch.load('baseline_word2vec_pos_domain.pt')
predicted, ground_truth, accuracy, score = cal_acc(model,val_input_index,val_output_index)
print(classification_report(ground_truth, predicted, target_names=target_names))

              precision    recall  f1-score   support

           O       0.99      0.99      0.99     18985
           T       0.97      0.96      0.97      1469
           P       1.00      1.00      1.00      3936
        SEPA       1.00      1.00      1.00      3603
           S       0.97      0.96      0.97      3322
           D       0.91      0.86      0.88       398
           C       0.98      0.97      0.98      1641

    accuracy                           0.99     33354
   macro avg       0.97      0.96      0.97     33354
weighted avg       0.99      0.99      0.99     33354



In [108]:
# test for attention
# Dot product attention
# tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method
model = BiLSTM_CRF(tag_to_ix=tag_to_ix, 
          hidden_dim=50, 
          word_embedding_matrix=emb_table_w2v_pos_feature, 
          layers_num=1, 
          with_crf=True, 
          use_baseline=False, 
          attention_method=1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
train(model, 'fastText_dot_product_attn')

Epoch:1, Training loss: 73686.96, train acc: 0.8323, val loss: 16010.16, val acc: 0.8260, time: 1005.66s
Epoch:2, Training loss: 36076.14, train acc: 0.9114, val loss: 9992.22, val acc: 0.9027, time: 1026.19s
Validation F1 Score: 0.8974388813789972


In [133]:
model = torch.load('fastText_dot_product_attn.pt')
predicted, ground_truth, accuracy, score = cal_acc(model,val_input_index,val_output_index)
print(classification_report(ground_truth, predicted, target_names=target_names))

              precision    recall  f1-score   support

           O       0.91      0.96      0.93     18985
           T       0.87      0.82      0.85      1469
           P       0.92      0.82      0.87      3936
        SEPA       0.89      0.87      0.88      3603
           S       0.94      0.88      0.91      3322
           D       0.94      0.04      0.07       398
           C       0.77      0.86      0.81      1641

    accuracy                           0.90     33354
   macro avg       0.89      0.75      0.76     33354
weighted avg       0.90      0.90      0.90     33354



In [109]:
# test for attention
# Scaled Dot product attention
# tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method
model = BiLSTM_CRF(tag_to_ix=tag_to_ix, 
          hidden_dim=50, 
          word_embedding_matrix=emb_table, 
          layers_num=1, 
          with_crf=True, 
          use_baseline=False, 
          attention_method=2).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
train(model, 'fastText_scaled_dot_product_attn')

Epoch:1, Training loss: 72002.44, train acc: 0.8716, val loss: 13303.04, val acc: 0.8649, time: 981.77s
Epoch:2, Training loss: 34056.31, train acc: 0.8651, val loss: 14032.07, val acc: 0.8586, time: 995.24s
Validation F1 Score: 0.8522524238648989


In [134]:
model = torch.load('fastText_scaled_dot_product_attn.pt')
predicted, ground_truth, accuracy, score = cal_acc(model,val_input_index,val_output_index)
print(classification_report(ground_truth, predicted, target_names=target_names))

              precision    recall  f1-score   support

           O       0.89      0.92      0.91     18985
           T       0.83      0.75      0.78      1469
           P       0.73      0.81      0.77      3936
        SEPA       0.91      0.83      0.87      3603
           S       0.79      0.89      0.83      3322
           D       1.00      0.00      0.01       398
           C       0.86      0.57      0.69      1641

    accuracy                           0.86     33354
   macro avg       0.86      0.68      0.69     33354
weighted avg       0.86      0.86      0.85     33354



In [110]:
# test for attention
# Bahdanau attention
# tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method
model = BiLSTM_CRF(tag_to_ix=tag_to_ix, 
          hidden_dim=50, 
          word_embedding_matrix=emb_table, 
          layers_num=1, 
          with_crf=True, 
          use_baseline=False, 
          attention_method=3).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
train(model, 'fastText_Bahdanau_attn')

Epoch:1, Training loss: 74585.16, train acc: 0.8636, val loss: 13048.20, val acc: 0.8596, time: 1030.44s
Epoch:2, Training loss: 29108.64, train acc: 0.8721, val loss: 17604.20, val acc: 0.8695, time: 1025.33s
Validation F1 Score: 0.8646863275922325


In [135]:
model = torch.load('fastText_Bahdanau_attn.pt')
predicted, ground_truth, accuracy, score = cal_acc(model,val_input_index,val_output_index)
print(classification_report(ground_truth, predicted, target_names=target_names))

              precision    recall  f1-score   support

           O       0.89      0.94      0.91     18985
           T       0.81      0.73      0.77      1469
           P       0.82      0.78      0.80      3936
        SEPA       0.82      0.80      0.81      3603
           S       0.94      0.84      0.89      3322
           D       0.77      0.10      0.18       398
           C       0.80      0.79      0.79      1641

    accuracy                           0.87     33354
   macro avg       0.84      0.71      0.74     33354
weighted avg       0.87      0.87      0.86     33354



In [120]:
# test for stack layers 2 LAYERS
# Bahdanau attention
# tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method
model = BiLSTM_CRF(tag_to_ix=tag_to_ix, 
          hidden_dim=50, 
          word_embedding_matrix=emb_table, 
          layers_num=2, 
          with_crf=True, 
          use_baseline=False, 
          attention_method=1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
train(model, 'fastText_dot_product_attn_2layers')

Epoch:1, Training loss: 111052.31, train acc: 0.7566, val loss: 23950.70, val acc: 0.7550, time: 988.88s
Epoch:2, Training loss: 48665.40, train acc: 0.8637, val loss: 14111.33, val acc: 0.8605, time: 967.81s
Validation F1 Score: 0.8518655995462657


In [136]:
model = torch.load('fastText_dot_product_attn_2layers.pt')
predicted, ground_truth, accuracy, score = cal_acc(model,val_input_index,val_output_index)
print(classification_report(ground_truth, predicted, target_names=target_names))

              precision    recall  f1-score   support

           O       0.88      0.96      0.92     18985
           T       0.48      0.49      0.49      1469
           P       0.91      0.82      0.86      3936
        SEPA       0.92      0.82      0.87      3603
           S       0.90      0.84      0.87      3322
           D       0.00      0.00      0.00       398
           C       0.53      0.42      0.47      1641

    accuracy                           0.86     33354
   macro avg       0.66      0.62      0.64     33354
weighted avg       0.85      0.86      0.85     33354



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
# test for without CRF
# Bahdanau attention
# tag_to_ix, hidden_dim, word_embedding_matrix, layers_num, with_crf, use_baseline, attention_method
model = BiLSTM_CRF(tag_to_ix=tag_to_ix, 
          hidden_dim=50, 
          word_embedding_matrix=emb_table, 
          layers_num=1, 
          with_crf=False, 
          use_baseline=False, 
          attention_method=1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
train(model, 'fastText_dot_product_attn_1layers_withoutcrf')

Epoch:1, Training loss: 87461.89, train acc: 0.8564, val loss: 13360.14, val acc: 0.8492, time: 767.11s
Epoch:2, Training loss: 44315.81, train acc: 0.7809, val loss: 18999.40, val acc: 0.7756, time: 799.26s
Validation F1 Score: 0.775821488048554


In [137]:
model = torch.load('fastText_dot_product_attn_1layers_withoutcrf.pt')
predicted, ground_truth, accuracy, score = cal_acc(model,val_input_index,val_output_index)
print(classification_report(ground_truth, predicted, target_names=target_names))

              precision    recall  f1-score   support

           O       0.87      0.84      0.86     18985
           T       0.43      0.72      0.54      1469
           P       0.72      0.74      0.73      3936
        SEPA       0.60      0.69      0.64      3603
           S       0.83      0.76      0.80      3322
           D       0.00      0.00      0.00       398
           C       0.66      0.55      0.60      1641

    accuracy                           0.78     33354
   macro avg       0.59      0.62      0.60     33354
weighted avg       0.78      0.78      0.78     33354



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [112]:
# torch.save(model, 'mymodel.pt')
# the_saved_model = torch.load('mymodel.pt')
# ix_tag = {
#     0:'<START>',
#     1:'<STOP>',
#     2:'O',
#     3:'T',
#     4:'P',
#     5:'SEPA',
#     6:'S',
#     7:'D',
#     8:'C'
# }

# text = 'retard' #@param {type:"string"}
# result = []
# # try:
# data_after_preprocessing = tokenize(to_lowercase([text]))
# data_encoded = to_index(data_after_preprocessing,word_to_ix,'word')
# # print(torch.tensor(data_encoded[0], dtype=torch.long))
# # print(torch.tensor(train_input_index[5]))
# predit = the_saved_model(torch.tensor(data_encoded[0], dtype=torch.long)).tolist()
# result.append([ix_tag[ix] for ix in predit])
# print(result)
# # except:
# #   print('error')
