In [1]:
# Mount Google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# import libraries
import csv
import torch
import numpy as np
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence

In [3]:
class Dataset():
  def __init__(self, filename):
      self.sentences, self.score = self.read_data(filename)
  def read_data(self, filename):
      sentences = []
      scores = []

      current_sentence = []

      with open(filename, encoding='utf8') as f:
        csvreader = csv.reader(f, delimiter=',')
        for row in csvreader:
          comment_num = row[0]
          comment = tokenizer(row[1])
          label = row[2]
          sentences.append(comment)
          scores.append(label)
      
      return sentences,scores

In [7]:
sentences = []
scores = []
tokenizer = get_tokenizer("spacy")

# open data
with open('/content/drive/My Drive/NLP_259/final_labels.csv', encoding='utf8') as f:
  csvreader = csv.reader(f, delimiter=',')
  for row in csvreader:
    comment = tokenizer(row[1])
    label = row[2]
    sentences.append(comment)
    scores.append(label)

# Get ride of header 
sentences = sentences[1:]
scores = scores[1:]

# Make float
scores_new = []
for item in scores:
    scores_new.append(float(item))

print("Size of sentences and scores:", len(sentences),len(scores_new))
print ("Example: ", sentences[0][:10])

Size of sentences and scores: 120 120
Example:  ['I', 'was', 'in', 'the', 'Navy', ',', 'and', 'I', 'used', 'to']


In [8]:
def read_embeddings(filename, vocab_size=10000):
  """
  Utility function, loads in the `vocab_size` most common embeddings from `filename`
  
  Arguments:
  - filename:     path to file
                  automatically infers correct embedding dimension from filename
  - vocab_size:   maximum number of embeddings to load

  Returns 
  - embeddings:   torch.FloatTensor matrix of size (vocab_size x word_embedding_dim)
  - vocab:        dictionary mapping word (str) to index (int) in embedding matrix
  """

  # get the embedding size from the first embedding
  with open(filename, encoding="utf-8") as file:
    word_embedding_dim = len(file.readline().split(" ")) - 1

  vocab = {}

  embeddings = np.zeros((vocab_size, word_embedding_dim))

  with open(filename, encoding="utf-8") as file:
    for idx, line in enumerate(file):

      if idx + 2 >= vocab_size:
        break

      cols = line.rstrip().split(" ")
      val = np.array(cols[1:])
      word = cols[0]
      embeddings[idx + 2] = val
      vocab[word] = idx + 2
  
  # a FloatTensor is a multidimensional matrix
  # that contains 32-bit floats in every entry
  # https://pytorch.org/docs/stable/tensors.html
  return torch.FloatTensor(embeddings), vocab

In [10]:
# this loads the 10,000 most common word 50-dimensional embeddings
embedding_file = '/content/drive/My Drive/NLP_259/glove.6B.50d.txt'
vocab_size = 10000
embeddings, vocab = read_embeddings(embedding_file, vocab_size)

In [None]:
# # This batching Process
# batch_size = 10
# PAD_INDEX = 0             # reserved for padding words
# UNKNOWN_INDEX = 1         # reserved for unknown words

# np.random.seed(159) # don't change this, for reproducibility
# shuffle = np.random.permutation(range(len(sentences)))

# #grabs the relevant data from the random permutation
# shuffled_sentences = [sentences[i] for i in shuffle]
# shuffled_scores = [scores_new[i] for i in shuffle]

# # batched length, batched sentences, batched y-label
# batched_lengths, batched_sent_idxs, batched_scores = [], [], []

# #creates batches
# N = len(shuffled_sentences)
# if N % batch_size == 0:
#   num_batches = N // batch_size
# else:
#   num_batches = N // batch_size + 1

# # loop through batches
# for b in range(num_batches):
#   start = b * batch_size # Batch starting splitting index 
#   stop = min((b+1) * batch_size, len(shuffled_sentences)) # batch ending index
  
#   # At the batch level
#   #calculates the max lengths of response and past turn sequences for this batch
#   # this is for padding purposes
#   max_seq_len = max([len(s) for s in shuffled_sentences[start:stop]])
#   sent_idx = np.zeros((stop-start, max_seq_len)) # matrix to capture all sentences index
#   sent_lengths = np.zeros((stop-start))
#   y_labels = np.zeros((stop-start))

#   # Within the batch
#   for i in range(start, stop):
#     #gathers the corresponding data
#     current_sent = shuffled_sentences[i]
#     #this captures the lengths 
#     sent_lengths[i - start] = len(current_sent)
#     #y-label
#     y_labels[i - start] = shuffled_scores[i]

#     #this gets the vocabulary IDs for each word in the past_turn and response
#     #UNKNOWN_INDEX is used if the word is out of vocabulary
#     for j in range(len(current_sent)):
#       if current_sent[j].lower() in vocab:
#         sent_idx[i - start][j] = vocab[current_sent[j].lower()]
#       else:
#         sent_idx[i - start][j] = UNKNOWN_INDEX      


#   batched_lengths.append(sent_lengths)
#   batched_sent_idxs.append(sent_idx)
#   batched_scores.append(y_labels)

  


In [11]:
PAD_INDEX = 0             # reserved for padding words
UNKNOWN_INDEX = 1         # reserved for unknown words

max_seq_len = max([len(s) for s in sentences])
sent_idxs = np.zeros((len(sentences), max_seq_len))
sent_lengths = np.zeros((len(sentences)))

for i in range(len(sentences)):
  current_sent = sentences[i]
  sent_lengths[i] = len(current_sent)
  for j in range(len(current_sent)):
    if current_sent[j].lower() in vocab:
      sent_idxs[i][j] = vocab[current_sent[j].lower()]
    else:
      sent_idxs[i][j] = UNKNOWN_INDEX  

len(sent_idxs)

120

In [12]:
# Sanity Check
sent_idxs[0]
sent_lengths

array([ 72.,  79.,  56.,  13.,  35.,  35.,  23.,  89.,  58.,  50.,  19.,
        26.,  63.,  45.,  93.,   5.,  25.,  21., 371., 143., 177., 153.,
        33.,  17.,  44.,  16.,  13., 107.,  55.,  92.,  16.,  40.,  83.,
        12.,  64., 100.,  21.,   8., 114.,  75.,  74.,   8.,  21., 242.,
        20., 185.,  28., 115.,  66.,  16.,  15.,  38.,  70.,  20.,  24.,
        13., 146., 149.,  82.,  96., 229.,  40.,  64.,  19.,  58.,   9.,
       203.,  83., 147.,  59.,  45.,  37.,  90.,  61.,  71.,  38.,  48.,
         7., 142.,  21.,  13.,  16.,  53.,  48.,  42.,  30.,  38.,  45.,
        74.,  37.,  31.,  11.,  19.,   7.,  13.,  27.,  95.,  32., 108.,
        57.,  25.,  67., 265.,  22.,   4.,  20.,  16., 191.,  26.,  15.,
       248., 630.,  63.,  13.,  58.,  86.,  19.,  44., 198.,  64.])

In [13]:
# Create Attention layer
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim 
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)
        
        weighted_input = x * torch.unsqueeze(a, -1)

        return torch.sum(weighted_input, 1) , a

In [14]:
# Make one layer LSTM with attention layer

class Attention_Net(nn.Module):
    def __init__(self,embeddings, maxlen):
        super(Attention_Net, self).__init__()
        embed_size = embeddings.shape[1]
        self.hidden_size = 64
        self.trained_att_weights = torch.zeros(maxlen)
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=False)
        # self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        # self.embedding.weight.requires_grad = False
        # self.embedding_dropout = nn.Dropout2d(0.1)

        self.lstm = nn.LSTM(embed_size, self.hidden_size, bidirectional=True, batch_first=False)
        # self.lstm2 = nn.GRU(128*2, 64, bidirectional=True, batch_first=True)

        self.attention_layer = Attention(self.hidden_size * 2, maxlen)
        
        self.linear = nn.Linear(self.hidden_size * 2 , 32)
        self.relu = nn.ReLU()
        self.out = nn.Linear(32, 1)

    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0))
        h_lstm, _ = self.lstm(h_embedding)
        # h_lstm, _ = self.lstm2(h_lstm)
        h_lstm_atten, _ = self.attention_layer(h_lstm)
        conc = self.relu(self.linear(h_lstm_atten))
        out = self.out(conc)
        return out
    
    def get_attentweights_with_x(self,x):
        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0))
        h_lstm, _ = self.lstm(h_embedding)
        # h_lstm, _ = self.lstm2(h_lstm)
        _, trained_weight = self.attention_layer(h_lstm)
        return trained_weight

In [15]:
# Parameter for training
MAX_LENGTH = int(max(sent_lengths))
model = Attention_Net(embeddings, MAX_LENGTH)

In [16]:
# Forward Prop
forward_test = torch.LongTensor(sent_idxs)
forward_test.shape
model.forward(forward_test)[:10]

tensor([[0.0966],
        [0.0975],
        [0.0980],
        [0.0985],
        [0.0987],
        [0.0988],
        [0.0983],
        [0.0984],
        [0.0984],
        [0.0980]], grad_fn=<SliceBackward>)

In [17]:
# Training
LEARNING_RATE = 1e-3
num_epochs = 100
# let's train the model 
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# loss_function = nn.CrossEntropyLoss(ignore_index = 1)
loss_function = nn.L1Loss(reduction='sum')

print("**** TRAINING *****")
for i in range(num_epochs):
  model.train()
  total_loss = 0
  # compute the logits
  logits = model.forward(torch.LongTensor(sent_idxs)).view(-1)
  # move labels to GPU memory
  labels = torch.LongTensor(scores_new)
  # print(logits.shape)
  # print(labels.shape)
  loss = loss_function(logits, labels)
  #loss = loss_function(logits, labels)
  total_loss += loss
  # propagate gradients backward
  loss.backward()
  optimizer.step()
  # set model gradients to zero before performing next forward pass
  model.zero_grad()

  if (i + 1) % 5 == 0:
    print("Epoch {} | Loss: {}".format(i + 1, total_loss))

**** TRAINING *****
Epoch 5 | Loss: 46.73244857788086
Epoch 10 | Loss: 46.43602752685547
Epoch 15 | Loss: 46.158573150634766
Epoch 20 | Loss: 46.16778564453125
Epoch 25 | Loss: 46.089176177978516
Epoch 30 | Loss: 46.0499153137207
Epoch 35 | Loss: 46.027503967285156
Epoch 40 | Loss: 46.00292205810547
Epoch 45 | Loss: 45.99393844604492
Epoch 50 | Loss: 45.980979919433594
Epoch 55 | Loss: 45.96519470214844
Epoch 60 | Loss: 45.94905090332031
Epoch 65 | Loss: 45.95695495605469
Epoch 70 | Loss: 45.96592330932617
Epoch 75 | Loss: 45.922019958496094
Epoch 80 | Loss: 45.918663024902344
Epoch 85 | Loss: 45.88459014892578
Epoch 90 | Loss: 45.84555435180664
Epoch 95 | Loss: 45.81193923950195
Epoch 100 | Loss: 45.78041458129883


In [18]:
# Sanity check on the attention weights
forward_test = torch.LongTensor(sent_idxs)
wd_weights = model.get_attentweights_with_x(forward_test)

In [19]:
wd_weights[0][0:30]

tensor([0.0014, 0.0014, 0.0014, 0.0014, 0.0017, 0.0014, 0.0014, 0.0014, 0.0014,
        0.0014, 0.0014, 0.0014, 0.0014, 0.0016, 0.0015, 0.0014, 0.0014, 0.0014,
        0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014,
        0.0014, 0.0014, 0.0014], grad_fn=<SliceBackward>)

In [None]:
# Credit to Text-Attention-Heatmap-Visualization project by Jie Yang
# !git clone https://github.com/jiesutd/Text-Attention-Heatmap-Visualization

Cloning into 'Text-Attention-Heatmap-Visualization'...
remote: Enumerating objects: 40, done.[K
remote: Total 40 (delta 0), reused 0 (delta 0), pack-reused 40[K
Unpacking objects: 100% (40/40), done.


In [20]:
# -*- coding: utf-8 -*-
# @Author: Jie Yang
# @Date:   2019-03-29 16:10:23
# @Last Modified by:   Jie Yang,     Contact: jieynlp@gmail.com
# @Last Modified time: 2019-04-12 09:56:12


## convert the text/attention list to latex code, which will further generates the text heatmap based on attention weights.
import numpy as np

latex_special_token = ["!@#$%^&*()"]

def generate(text_list, attention_list, latex_file, color='red', rescale_value = False):
	assert(len(text_list) == len(attention_list))
	if rescale_value:
		attention_list = rescale(attention_list)
	word_num = len(text_list)
	text_list = clean_word(text_list)
	with open(latex_file,'w') as f:
		f.write(r'''\documentclass[varwidth]{standalone}
\special{papersize=210mm,297mm}
\usepackage{color}
\usepackage{tcolorbox}
\usepackage{CJK}
\usepackage{adjustbox}
\tcbset{width=0.9\textwidth,boxrule=0pt,colback=red,arc=0pt,auto outer arc,left=0pt,right=0pt,boxsep=5pt}
\begin{document}
\begin{CJK*}{UTF8}{gbsn}'''+'\n')
		string = r'''{\setlength{\fboxsep}{0pt}\colorbox{white!0}{\parbox{0.9\textwidth}{'''+"\n"
		for idx in range(word_num):
			string += "\\colorbox{%s!%s}{"%(color, attention_list[idx])+"\\strut " + text_list[idx]+"} "
		string += "\n}}}"
		f.write(string+'\n')
		f.write(r'''\end{CJK*}
\end{document}''')

def rescale(input_list):
	the_array = np.asarray(input_list)
	the_max = np.max(the_array)
	the_min = np.min(the_array)
	rescale = (the_array - the_min)/(the_max-the_min)*100
	return rescale.tolist()


def clean_word(word_list):
	new_word_list = []
	for word in word_list:
		for latex_sensitive in ["\\", "%", "&", "^", "#", "_",  "{", "}"]:
			if latex_sensitive in word:
				word = word.replace(latex_sensitive, '\\'+latex_sensitive)
		new_word_list.append(word)
	return new_word_list


if __name__ == '__main__':
	## This is a demo:

	sent = '''the USS Ronald Reagan - an aircraft carrier docked in Japan - during his tour of the region, vowing to "defeat any attack and meet any use of conventional or nuclear weapons with an overwhelming and effective American response".
North Korea and the US have ratcheted up tensions in recent weeks and the movement of the strike group had raised the question of a pre-emptive strike by the US.
On Wednesday, Mr Pence described the country as the "most dangerous and urgent threat to peace and security" in the Asia-Pacific.'''
	sent = '''我 回忆 起 我 曾经 在 大学 年代 ， 我们 经常 喜欢 玩 “ Hawaii guitar ” 。 说起 Guitar ， 我 想起 了 西游记 里 的 琵琶精 。
	今年 下半年 ， 中 美 合拍 的 西游记 即将 正式 开机 ， 我 继续 扮演 美猴王 孙悟空 ， 我 会 用 美猴王 艺术 形象 努力 创造 一 个 正能量 的 形象 ， 文 体 两 开花 ， 弘扬 中华 文化 ， 希望 大家 能 多多 关注 。'''
	words = sent.split()
	word_num = len(words)
	attention = [(x+1.)/word_num*100 for x in range(word_num)]
	import random
	random.seed(42)
	random.shuffle(attention)
	color = 'red'
	generate(words, attention, "sample.tex", color)

In [21]:
sent_1 = sentences[0]
attention_1 = wd_weights[0][:72]
attention_1 = [float(i) * 10000 for i in attention_1]
generate(sent_1, attention_1, "sent_1_v4.tex", 'blue')

In [22]:
sent_83 = sentences[83]
attention_83 = wd_weights[0][:48]
attention_83 = [float(i) * 10000 for i in attention_83]
generate(sent_83, attention_83, "sent_83.tex", color)