This notebook contains the main model for the NLP project: a bidirectional LSTM (BiLSTM) classifier, as specified in our report.

# Import Libraries

In [0]:
import pandas as pd 
import numpy as np
from torch import nn
from torch import optim
from torch import autograd
import torch.nn.functional as F
import torch
from torchtext import data
import re
!pip install sentistrength
from sentistrength import PySentiStr
from sklearn.preprocessing import normalize

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user, then build a PyDrive client from the
# application-default credentials. Later cells use `drive` to fetch files by id.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
from typing import List, Tuple
from itertools import islice
import math

In [0]:
# This is the LIAR-PLUS training data
# The Drive file with this id is written to local 'train2.tsv' and then parsed as a
# header-less, tab-separated table whose first column becomes the index, so the
# remaining columns are addressed by integer label (e.g. 2, 3, 15) in later cells.

liar_id = "1znUzJPalC4z9MPmfovFLqp8S0x608Kfx"
downloaded = drive.CreateFile({'id':liar_id}) 
downloaded.GetContentFile('train2.tsv') 
train_data=pd.read_csv('train2.tsv',delimiter='\t',encoding='utf-8', header = None, index_col=0)

In [0]:
# This is the LIAR-PLUS dev data
# Same procedure as for the training split: fetch by Drive id into 'val2.tsv',
# then read it as a header-less TSV with the first column as the index.

liar_id = "1OBONJoZ05il1_xHlvZVWpsZg7GHt5xqs"
downloaded = drive.CreateFile({'id':liar_id}) 
downloaded.GetContentFile('val2.tsv') 
val_data=pd.read_csv('val2.tsv',delimiter='\t',encoding='utf-8', header = None, index_col=0)

In [0]:
# This is the LIAR-PLUS test data
# Same procedure as for the training split: fetch by Drive id into 'test2.tsv',
# then read it as a header-less TSV with the first column as the index.

liar_test_id = "1ywwBKp9b-lowXy-wfYcIZmhuVHW5xDAv"
downloaded = drive.CreateFile({'id':liar_test_id}) 
downloaded.GetContentFile('test2.tsv') 
test_data=pd.read_csv('test2.tsv',delimiter='\t',encoding='utf-8', header = None, index_col=0)

In [0]:
# This is the smaller GloVe embeddings 
# The file is saved locally as "glove.6B.50d.txt"; the 50-d size is relied on by the
# cell that builds the `weights` matrix (it allocates 50 columns).

glove_data_id = "1wvEbRqdbwPmgfdJpxnKsk4xHQ73kko5s"
glove_downloaded = drive.CreateFile({'id': glove_data_id})
glove_downloaded.GetContentFile("glove.6B.50d.txt")

In [0]:
# Process GloVe embeddings
# Each line of the file is "<word> <v1> <v2> ...". The loop fills four parallel structures:
#   words           - list of words, in file order
#   vectorsglove    - list of float32 vectors, in the same order as `words`
#   word2idx        - word -> position in `words` / `vectorsglove`
#   embeddings_dict - word -> vector
# `glove` (built after the loop) is another word -> vector mapping derived from the lists.

idx = 0
words =[]
word2idx = {}
embeddings_dict = {}
vectorsglove = []
# The encoding is given explicitly so that parsing does not depend on the platform's
# default locale (the LIAR-PLUS TSVs are likewise read as UTF-8).
with open("glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word to index; skip it instead of raising IndexError.
            continue
        word = values[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector
        vectorsglove.append(vector)

glove = {w: vectorsglove[word2idx[w]] for w in words}

In [0]:
# Get vocab

def get_vocab(data):
  """Collect the distinct tokens of every statement + justification pair.

  Args:
    data: table indexable as data[3] (statements) and data[15] (justifications),
      each supporting integer lookup for 0 .. len-1.

  Returns:
    List of unique tokens in order of first appearance. Tokens are lower-cased,
    punctuation is replaced by a space, and the text is split on single spaces,
    so the empty string '' can appear as a token.

  Rows whose justification is NaN are skipped entirely (their statement words are
  not added either).
  """
  statements = data[3]
  justifications = data[15]

  vocab = []
  seen = set()   # O(1) membership test; `vocab` keeps the first-seen order
  for i in range(len(statements)):
    # Tokenize statement: remove punctuation and split on whitespace
    sample = re.sub(r'[^\w\s]',' ',statements[i])
    words = re.split(r' ',sample.lower())

    # Tokenize justification
    if justifications[i] != justifications[i]: continue     # NaN != NaN: justification is empty
    sample = re.sub(r'[^\w\s]',' ',justifications[i])
    justs = re.split(r' ',sample.lower())

    # Add new words to vocab
    for word in words + justs:
      if word not in seen:
        seen.add(word)
        vocab.append(word)

  return vocab

# Get vocab of train data
# Only the training split contributes words, so dev/test words that never occur in
# training have no entry in this vocabulary.
# NOTE(review): "10156 samples" equals the training-set size hard-coded for batching
# later in the notebook; it is presumably not the vocabulary size -- confirm.
train_vocab = get_vocab(train_data) # 10156 samples

In [0]:
# Build the embedding matrix for the training vocabulary.
# Row i of `weights` is the 50-d vector for train_vocab[i]: the GloVe vector when the
# word is known, otherwise a fresh sample from a normal distribution (scale 0.6).

weights = np.zeros((len(train_vocab),50))

for i, word in enumerate(train_vocab):
  if word in glove:
    weights[i] = glove[word]
  else:
    weights[i] = np.random.normal(scale=.6,size=(50, ))

In [0]:
# Two-way lookup tables between vocabulary words and their integer ids.
# A word's id is its position in train_vocab.

id2word = dict(enumerate(train_vocab))
word2id = {token: token_id for token_id, token in id2word.items()}

# Process Inputs (These cells were already run to process the data; no need to run them again)

In [0]:
# This is EmoLex 
'''
emolex_id = "13M2WhB_etUUYUmdYDR0eQ3nbxbfJYhnG"
emolex_downloaded = drive.CreateFile({'id': emolex_id})
emolex_downloaded.GetContentFile("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
''';

In [0]:
# Process EmoLex
# Dictionary where each key is a word and each value is a 10-entry vector (one entry per lexicon line for that word)
'''
emo_dict = {}
with open("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt", 'r') as f:
    vector = []
    for line in f:
        values = line.split()
        if len(values) == 0: continue
        word = values[0]
        vector.append(values[2])
        if len(vector) == 10:
          emo_dict[word] = vector
          vector = []
''';

In [0]:
# Hedge lexicon (only has one-word hedges)
# In the visible notebook this list is referenced only by the disabled format_input
# cell, which counts how many statement tokens appear in it.

hedges = ['largely', 'generally','often','rarely','sometimes','frequently','occasionally','seldom','usually','most','several','some',
         'almost','practically','apparently','virtually','basically','approximately','roughly','somewhat','somehow','partially',
         'actually','like','something','someone','somebody','somewhere','think','thinks','thought','believe','believed','believes',
         'consider','considers','considered','assume','assumes','assumed','understand','understands','understood','find','found',
         'finds','appear','appears','appeared','seem','seems','seemed','suppose','supposes','supposed','guess','guesses','guessed',
         'estimate','estimates','estimated','speculate','speculates','speculated','suggest','suggests','suggested','may','could',
         'should','might','surely','probably','likely','maybe','perhaps','unsure','probable','unlikely','possibly','possible',
         'read','say','says','necessarily','much','bunch','couple','few','little','really','about','around','can','effectively',
         'evidently','fairly','hopefully','mainly','mostly','overall','presumably','pretty','quite','rather','supposedly','tend',
         'doubt','indicate','will','must','would','certainly','definitely','clearly','conceivably','certain','definite','clear',
         'assumption','possibility','probability','many','improbable','always','rare','doubtful','suggestive','diagnostic',
         'inconclusive','apparent','alleged','allege','presumable']

In [0]:
# This is SentiStrength stuff
'''
sentijar_id = "1JeskOiyOdVUZ7rjhFKajOp-52OUcn64P"
senti_downloaded = drive.CreateFile({'id':sentijar_id}) 
senti_downloaded.GetContentFile('SentiStrength.jar') 

senti = PySentiStr()
senti.setSentiStrengthPath("/content/SentiStrength.jar")    
senti.setSentiStrengthLanguageFolderPath("/content/SentStrength_Data/")   # Have to reupload this folder
''';

In [0]:
# Process input data
'''
def format_input(data):
  # Returns 3 np arrays: statement embeddings, justification embeddings, features (semantic strength + pos/neg, emotion, 
  # amount of hedging, prior history x 5)
  statements = data[3]
  justifications = data[15]
  prior1 = data[9]; prior2 = data[10]; prior3 = data[11]; prior4 = data[12]; prior5 = data[13]

  sts = []
  jsts = []
  feats = []

  for i in range(len(statements)):
    # Tokenize statement: remove punctuation and split on whitespace
    sample = re.sub(r'[^\w\s]',' ',statements[i])
    words = re.split(r' ',sample.lower())

    # Tokenize justification
    if justifications[i] != justifications[i]: continue     # If justification is empty
    sample = re.sub(r'[^\w\s]',' ',justifications[i])
    justs = re.split(r' ',sample.lower())

    # GloVe embedding for each word in statement
    vectors = []
    for word in words:
      if word in embeddings_dict: 
        vectors.append(embeddings_dict[word])
    sts.append(vectors)
    
    # GloVe embedding for each word in justification
    vectors = []
    for word in justs:
      if word in embeddings_dict: 
        vectors.append(embeddings_dict[word])
    jsts.append(vectors)
    
    ifeats = []
    # Sentiment strength for the statement
    ifeats.extend(senti.getSentiment(statements[i]))

    # Sum of 8-dim emotion vectors for each word in statement
    emo = []
    for word in words:
      if word in emo_dict: emo.append([float(x) for x in emo_dict[word]])
    emo = np.array(emo)
    if len(emo) > 0:
      ifeats.extend(np.mean(emo,axis=0))
    else:
      ifeats.extend([0,0,0,0,0,0,0,0])

    # Hedging
    hedge = 0.0
    for word in words:
      if word in hedges: hedge += 1.0
    ifeats.append(hedge)

    # Prior history 5-dim vector
    ifeats.extend(np.array([float(prior1[i]),float(prior2[i]),float(prior3[i]),float(prior4[i]),float(prior5[i])]))

    feats.append(np.array(ifeats))
    
  return np.array(sts), np.array(jsts), np.array(feats)

# Format train and test data
#train_st, train_jst, train = format_input(train_data) # 10156 samples
#dev_st, dev_jst, dev = format_input(val_data)         # 1280 samples
#test_st, test_jst, test = format_input(test_data)     # 1258 samples
''';

In [0]:
# This cell has already been run, so there is no need to run it again.
# Save processed input to (Tessa's) drive

#f = open("/content/drive/My Drive/dev_st.csv","w")
#for statement in dev_st:
#  f.write('\n')
#  for word in statement:
#    w = ""
#    for each in word:
#      w += str(each) + ","
#    f.write(w)
#f.close()

#f = open("/content/drive/My Drive/dev.csv","w")
#for statement in dev:
#  f.write('\n')
#  w = ""
#  for each in statement:
#    w += str(each) + ","
#  f.write(w)
#f.close()

# Import Processed Input

In [0]:
# Load processed input from drive

def load_feats_drive(fname):
  """Parse a saved feature file into a list of float rows.

  Each non-blank line is expected to look like "v1,v2,...,vN," (note the trailing
  comma): the piece after the last comma is discarded and every remaining field is
  converted with float(). NaN fields become 0.0.

  Args:
    fname: path of the text file to read.

  Returns:
    List of lists of floats, one per parsed line. Any row whose length is not 17
    gets two 0.0 values prepended.
    NOTE(review): presumably this pads 15-value rows up to 17 -- confirm against
    the code that wrote the files.

  Raises:
    ValueError: if a field cannot be parsed as a float.
  """
  new_arr = []
  # `with` closes the file even when a field fails to parse.
  with open(fname,"r") as f:
    for statement in f:
      if len(statement) == 1: continue   # one-character line, i.e. a bare newline
      st = []
      for each in statement.split(',')[:-1]:
        value = float(each)
        st.append(0.0 if value != value else value)   # NaN != NaN
      if len(st) != 17:
        st.insert(0,0.0)
        st.insert(0,0.0)
      new_arr.append(st)
  return new_arr

# Fetch the saved per-sample feature files for each split from Drive (written to
# local train.csv / dev.csv / test.csv), then parse them with load_feats_drive.
train_id = "1-hxnB0ngKi4Ti1WgDOAmwrnwLtEOqObe"
downloaded = drive.CreateFile({'id':train_id}) 
downloaded.GetContentFile('train.csv') 
dev_id = "1-6Os5O0E7coeK5vf8OT1Ga5Jqj61UAk2"
downloaded = drive.CreateFile({'id':dev_id}) 
downloaded.GetContentFile('dev.csv') 
test_id = "1-dK6qgdFo4IcmALUewlHCjAYTtIRAYG0"
downloaded = drive.CreateFile({'id':test_id}) 
downloaded.GetContentFile('test.csv') 

# The *_u lists are the raw rows; the next cell rescales them into train/dev/test.
train_u = load_feats_drive("train.csv") # William thinks 'u' means unnormalized. 
dev_u =   load_feats_drive("dev.csv")
test_u =  load_feats_drive("test.csv")

In [0]:
# Scale/normalize the feature vectors

def _scale_by_row_sum(rows):
  """Divide every value of a row by that row's sum; rows summing to 0 become all 0.0."""
  scaled = []
  for row in rows:
    total = sum(row)
    if total != 0:
      scaled.append(np.array([float(value)/total for value in row]))
    else:
      scaled.append(np.array([0.0 for _ in row]))
  return np.array(scaled)

# NOTE(review): this divides by the plain sum, not a norm, so a row with a negative
# sum has its signs flipped -- confirm this is intended.
train = _scale_by_row_sum(train_u)
dev   = _scale_by_row_sum(dev_u)
test  = _scale_by_row_sum(test_u)

In [0]:
# Create list of hedge words
# amount of hedging of a sample is measured by the number of hedge words used in the statement
# Note: a comma was missing after 'possibility', which made Python join it with the
# next literal into the single entry 'possibilityestimate'.
# NOTE(review): some entries are repeated ('estimate', 'likely', 'believe') and some are
# multi-word phrases, which a per-token membership test would never match -- confirm intent.
hedging = ['may', 'might', 'can', 'could','would', 'should', 'seem', 'appear', 'believe', 'assume', 'suggest', 'claim','possibility',
           'estimate', 'tend', 'think', 'argue', 'indicate', 'propose', 'speculate', 'possible', 'probable','likely','assumption',
           'estimate', 'suggestion', 'perhaps', 'possibly','probably','practically', 'likely', 'presumably','virtually', 'apparently',
           'approximately','roughly', 'about', 'often','occasionally','generally', 'usually','somewhat', 'somehow','a lot of','believe', 
           'to our knowledge', 'it is our', 'view that', 'we feel that']

In [0]:
# Get ground truth labels

def format_ans(data):
  """Map the six truthfulness labels of a split to class ids 0..5.

  Reads labels from data[2] and justifications from data[15]. Rows whose
  justification is NaN are skipped. A label outside the six known strings prints
  a message and contributes no entry.

  Returns:
    np.array of class ids, one per kept row with a known label.
  """
  label_to_id = {
      'pants-fire': 0,
      'false': 1,
      'barely-true': 2,
      'half-true': 3,
      'mostly-true': 4,
      'true': 5,
  }
  lab = data[2]
  justs = data[15]
  answers = []
  for row in range(len(data)):
    if justs[row] != justs[row]:   # NaN justification
      continue
    class_id = label_to_id.get(lab[row])
    if class_id is None:
      print("The label inputted is not a known label.")
    else:
      answers.append(class_id)
  return np.array(answers)

# Class ids for each split, in train / dev / test order
train_labels, dev_labels, test_labels = [
    format_ans(split) for split in (train_data, val_data, test_data)
]

In [0]:
# Get sentences

def get_words(data):
  """Tokenize each statement together with its justification.

  Reads statements from data[3] and justifications from data[15]. Text is
  stripped of punctuation (replaced by a space), lower-cased and split on single
  spaces. Rows whose justification is NaN are skipped.

  Returns:
    List of token lists: statement tokens followed by justification tokens.
  """
  statements = data[3]
  justifications = data[15]

  def tokenize(text):
    return re.split(r' ', re.sub(r'[^\w\s]',' ',text).lower())

  combined = []
  for i in range(len(statements)):
    statement_tokens = tokenize(statements[i])
    justification = justifications[i]
    if justification != justification:   # NaN: no justification for this row
      continue
    combined.append(statement_tokens + tokenize(justification))
  return combined

# Token lists for each split, in train / dev / test order
train_words, dev_words, test_words = [
    get_words(split) for split in (train_data, val_data, test_data)
]

In [0]:
# Convert sentences to id numbers and pad to 200 words

def word_to_id(sents):
  """Turn token lists into a (len(sents), 200) integer id matrix.

  Ids come from the module-level word2id; words missing from it map to 0.
  Sentences shorter than 200 tokens are left-padded with 0, longer ones keep
  only their first 200 tokens. An empty sentence yields an all-zero row.
  """
  id_matrix = np.zeros((len(sents),200),dtype=int)
  for row, sentence in enumerate(sents):
    if len(sentence) == 0:
      continue
    ids = np.array([word2id.get(word, 0) for word in sentence])
    id_matrix[row, -len(sentence):] = ids[:200]
  return id_matrix

# Padded id matrices for each split, in train / dev / test order
train_w, dev_w, test_w = [
    word_to_id(tokens) for tokens in (train_words, dev_words, test_words)
]

In [0]:
# Each split is bundled as [words, features, labels], every entry a CUDA tensor:
#   words    - one row of integer word ids per sample
#   features - one row of float features per sample
#   labels   - one integer class id per sample

train_f = [torch.from_numpy(arr).cuda() for arr in (train_w, train, train_labels)]
dev_f   = [torch.from_numpy(arr).cuda() for arr in (dev_w, dev, dev_labels)]
test_f  = [torch.from_numpy(arr).cuda() for arr in (test_w, test, test_labels)]

# Define Models and Train/Test functions

In [0]:
# Define our BiLSTM class

class BiLSTM(nn.Module):
    """Bidirectional LSTM classifier over word ids plus 17 extra features per sample.

    The embedding layer is initialised from the module-level `weights` array and
    frozen. The LSTM output at the last time step is concatenated with the
    17-value feature vector and fed to a single linear layer.

    Args:
        embedding_dim: size of each word embedding (must match `weights`).
        hidden_dim: LSTM hidden size per direction.
        vocab_size: number of rows in the embedding table (must match `weights`).
        label_size: number of output classes.
        batch_size: default batch size used by init_hidden().
        use_gpu: if True, init_hidden() creates its tensors on CUDA.
        dropout: stored on the instance; no dropout layer is applied.
    """
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size, use_gpu=True, dropout=0.5):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.use_gpu = use_gpu
        self.dropout = dropout
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.load_state_dict({'weight': torch.from_numpy(weights)})
        self.embeddings.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim*2+17, label_size)
        #self.hidden2label = nn.Linear(hidden_dim*2, label_size)###
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        """Return a fresh zero (h, c) pair of shape (2, batch_size, hidden_dim).

        Args:
            batch_size: batch dimension of the state; defaults to self.batch_size.
        """
        if batch_size is None:
            batch_size = self.batch_size
        shape = (2, batch_size, self.hidden_dim)
        # First is the hidden h
        # Second is the cell c
        h = torch.zeros(shape, requires_grad=True)
        c = torch.zeros(shape, requires_grad=True)
        if self.use_gpu:
            return (h.cuda(), c.cuda())
        return (h, c)

    def forward(self, batch, features):
        """Compute log-probabilities over the labels.

        Args:
            batch: integer word ids, one row per sample (batch-major).
            features: extra features, one row of 17 values per sample.

        Returns:
            Tensor of shape (len(batch), label_size) holding log-softmax scores.
        """
        # The embedding output is batch-major, while the LSTM (batch_first=False)
        # expects time-major input. The axes must be swapped with transpose: a
        # .view() would only reinterpret memory and mix tokens of different samples.
        x = self.embeddings(batch).transpose(0, 1).contiguous()
        n_samples = x.size(1)
        if self.hidden[0].size(1) != n_samples:
            # Partial batch: the stored state has the wrong batch dimension.
            self.hidden = self.init_hidden(n_samples)
        # Keep the state on the same device as the input (no-op when it already is).
        self.hidden = tuple(state.to(x.device) for state in self.hidden)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        last_step = lstm_out[-1].float()
        feats = features.float().to(last_step.device)
        concat = torch.cat((last_step, feats), 1)
        y = self.hidden2label(concat)
        #y = self.hidden2label(lstm_out[-1]) ###
        probs = F.log_softmax(y, 1)
        return probs

In [0]:
# Define training and testing functions

def get_accuracy(truth, pred):
    """Return (six-way accuracy, two-way accuracy) as fractions of len(truth).

    The two-way score counts a prediction as right when truth and prediction fall
    in the same half of the label scale: both in {0, 1, 2} or both in {3, 4, 5}.
    """
    def lower_half(value):
        return value == 0 or value == 1 or value == 2

    def upper_half(value):
        return value == 3 or value == 4 or value == 5

    exact = 0
    same_half = 0
    for i in range(len(truth)):
        actual, guess = truth[i], pred[i]
        if actual == guess:
            exact += 1.0
        if (lower_half(actual) and lower_half(guess)) or (upper_half(actual) and upper_half(guess)):
            same_half += 1.0
    total = len(truth)
    return exact/total, same_half/total

def get_f1(truth, pred):
  """Return (macro F1 over the six classes, F1 of the two-way split).

  The macro score averages the per-class F1 over all six classes; a class with no
  hits contributes 0.0. For the two-way score the positive class is "label in
  {0, 1, 2}"; a ratio with a zero denominator is taken as 0.0.
  """
  def lower_half(value):
    return value == 0 or value == 1 or value == 2

  def upper_half(value):
    return value == 3 or value == 4 or value == 5

  hits = [0]*6       # truth == c and pred == c
  missed = [0]*6     # truth == c but pred != c  (false negatives)
  spurious = [0]*6   # pred == c but truth != c  (false positives)
  low_hits = 0
  low_missed = 0
  low_spurious = 0

  for i in range(len(truth)):
    actual, guess = truth[i], pred[i]
    for c in range(6):
      if (actual == c and guess == c): hits[c] += 1.0
      if (actual == c and guess != c): missed[c] += 1.0
      if (actual != c and guess == c): spurious[c] += 1.0
    if lower_half(actual) and lower_half(guess): low_hits += 1.0
    if lower_half(actual) and upper_half(guess): low_missed += 1.0
    if upper_half(actual) and lower_half(guess): low_spurious += 1.0

  per_class = []
  for c in range(6):
    precision = hits[c]/(hits[c]+spurious[c]) if (hits[c]+spurious[c] != 0) else 0.0
    recall = hits[c]/(hits[c]+missed[c]) if (hits[c]+missed[c] != 0) else 0.0
    if recall+precision != 0:
      per_class.append(2*(recall*precision)/(recall+precision))
    else:
      per_class.append(0.0)
  f1 = sum(per_class)/6.0

  low_precision = low_hits/(low_hits+low_spurious) if (low_hits+low_spurious != 0) else 0.0
  low_recall = low_hits/(low_hits+low_missed) if (low_hits+low_missed != 0) else 0.0
  bf1 = 2*(low_recall*low_precision)/(low_recall+low_precision) if (low_recall+low_precision != 0) else 0.0

  return f1, bf1

def evaluate(model, batches, data, loss_function, optimizer, e):
    """Run one pass over `batches`, training the model unless `e` is true.

    Args:
        model: module exposing init_hidden() and called as model(words, features).
        batches: list of (start, end) row ranges; the caller's list is not modified.
        data: [words, features, labels], each sliceable by row range.
        loss_function: callable(pred, labels) returning a scalar loss tensor.
        optimizer: stepped after every batch when training; unused when evaluating.
        e: True for evaluation (model.eval(), no parameter updates),
           False for training.

    Returns:
        Training:   (avg_loss, acc, acc2)
        Evaluation: (avg_loss, acc, acc2, f1, bf1)
        where avg_loss is the mean batch loss, acc/acc2 come from get_accuracy and
        f1/bf1 from get_f1.
    """
    if e: model.eval()
    else: model.train()
    avg_loss = 0.0
    batches = batches.copy()
    #np.random.shuffle(batches)
    truth_res = []
    pred_res = []
    for start, end in batches:
        model.hidden = model.init_hidden()
        stjst, feats, labels = data[0][start:end], data[1][start:end], data[2][start:end]
        # No autograd graph is needed when only evaluating.
        with torch.set_grad_enabled(not e):
            pred = model(stjst, feats)
            loss = loss_function(pred, labels)
        # Collect plain Python ints, so the metric loops compare ints rather than
        # indexing (possibly CUDA) tensors element by element.
        truth_res.extend(labels.tolist())
        pred_res.extend(pred.max(1)[1].tolist())
        avg_loss += loss.item()
        if not e:
            model.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss /= len(batches)
    acc, acc2 = get_accuracy(truth_res, pred_res)
    if e: 
      f1, bf1 = get_f1(truth_res, pred_res)
      return avg_loss, acc, acc2, f1, bf1
    return avg_loss, acc, acc2

# Train and Test Model

In [0]:
# Implement model

def _batch_ranges(split, size):
    """Return (start, end) row ranges of width `size` covering the rows of `split`."""
    n_rows = min(len(t) for t in split)
    return [(start, start + size) for start in range(0, n_rows, size)]

device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
batchsize = 64
bilstm = BiLSTM(embedding_dim=50, hidden_dim=32, vocab_size=len(train_vocab), label_size=6, use_gpu=True, batch_size=batchsize)
bilstm.to(device)

# Train and test
print('Model:')
best_model = bilstm
optimizer = optim.Adam(bilstm.parameters(), lr=0.01)
loss_function = nn.NLLLoss()
epochs = 10
best_dev_acc = 0.0
# Weights at the best dev accuracy. `best_model = bilstm` alone is only an alias
# of the live model, so a copy of the parameters has to be kept explicitly.
best_state = None
# Batches: derived from the tensors instead of hard-coded split sizes
# (these were 10156 / 1280 / 1258 for train / dev / test).
train_iter = _batch_ranges(train_f, batchsize)
dev_iter =   _batch_ranges(dev_f, batchsize)
test_iter =  _batch_ranges(test_f, batchsize)

for epoch in range(epochs):
    # The last training range is dropped ([:-1]), as in the original schedule.
    avg_loss, acc, acc2 = evaluate(bilstm, train_iter[:-1], train_f, loss_function, optimizer, e=False)
    print('Train: loss %.2f acc %.1f' % (avg_loss, acc*100))
    dev_avg_loss, dev_acc, dev_acc2, f1, bf1 = evaluate(bilstm, dev_iter, dev_f, loss_function, optimizer, e=True)
    print('Dev: loss %.2f acc %.1f' % (dev_avg_loss, dev_acc*100))
    if dev_acc > best_dev_acc:
        best_dev_acc = dev_acc
        best_state = {name: tensor.detach().clone() for name, tensor in bilstm.state_dict().items()}
        #torch.save(best_state, '/content/bilstm_best_model.pth')

# Restore the best-dev weights before the final test run; otherwise the test
# numbers would come from the last epoch.
if best_state is not None:
    bilstm.load_state_dict(best_state)
best_model = bilstm
# NOTE(review): test_iter[:-1] leaves the last test range out of the reported metrics.
test_avg_loss, test_acc, test_acc2, f1, bf1 = evaluate(best_model, test_iter[:-1], test_f, loss_function, optimizer, e=True)
print('Final Test: loss %.2f acc %.2f 2-way acc %.2f f1 %.2f 2-way f1 %.2f' % (test_avg_loss, test_acc*100, test_acc2*100, f1, bf1))