In [1]:
from __future__ import print_function

import time
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np
import  random

Using TensorFlow backend.


# 1. LSTM Language Models Trained with Log Loss (20 points)

## 1.1 Data Cleaning

In [21]:
B= 32
def dataset_preparation(data):
    corpus = data.split("\n")
    
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        input_sequences.append(token_list)
    max_sequence_len= 22
    k = B- (len(input_sequences) % B)
    for i in range(k):
        input_sequences.append(np.zeros(len(input_sequences[-1])))
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='post'))
    predictors, label = input_sequences[:,:-1],input_sequences[:,1:].copy()
    for i in range(predictors.shape[0]):
        for j in range(predictors.shape[1]):
            if predictors[i][j] == 2:
                predictors[i][j] = 0
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

tokenizer = Tokenizer(lower=False,filters='')
data = open('bobsue.voc.txt').read()
corpus = data.split("\n")
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)
print("total words: ",end="")
print(total_words)

data = open('bobsue.lm.train.txt').read()
dev_data = open('bobsue.lm.dev.txt').read()
test_data = open('bobsue.lm.test.txt').read()
predictors, label, max_sequence_len = dataset_preparation(data)
d_predictors, d_label, max_sequence_len = dataset_preparation(dev_data)
t_predictors, t_label, t_max_sequence_len = dataset_preparation(test_data)


total words: 1498


## 1.2 LSTM Model Implementation

In [43]:
def create_model(predictors, label, X_val,Y_val, max_sequence_len, total_words):
    model = Sequential()
    #mask_zero to neglect padding during training  
    model.add(Embedding(total_words, 200, input_length=max_sequence_len-1,mask_zero=True))
    model.add(LSTM(200,return_sequences=True)) 

    model.add(Dense(total_words))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
    earlystop = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0, patience=3, verbose=1, mode='auto')
    model.fit(predictors, label, epochs=10, batch_size=B,validation_data=(X_val, Y_val),verbose=1, callbacks=[earlystop])
    return model 


## 1.3 Training and Tuning 

In [33]:
model = create_model(predictors, label,d_predictors, d_label, max_sequence_len, total_words)

Train on 6048 samples, validate on 768 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# 2. Error Analysis (15 points) 

## 2.1 Implement Error Function

In [34]:
def evaluate(model,p ,t):
    #Manually implement a evaluate to check test accuracym, so that we can make sure num of predictions is correct
    total = 0 
    k = model.predict(p)
    k = list(k)
    for i in range(len(k)):
        k[i] = k[i].argmax(axis=1)
    k = np.array(k)
    m = t.argmax(axis=2)
    correct = 0
    err_count = {}
    for i in range(k.shape[0]):
        for j in range(k.shape[1]):
            #if label is neither padding nor <s>
            if m[i][j] != 0 and m[i][j] != 1 : 
                total+=1 
                if k[i][j] == m[i][j]:
                    correct+=1 
                else: 
                    if (m[i][j],k[i][j]) in err_count:
                        err_count[(m[i][j],k[i][j])] +=1
                    else:
                        err_count[(m[i][j],k[i][j])] =1
                        
    return total,correct/total,err_count


total,acc,err_count = evaluate(model,t_predictors,t_label)
print("total predictions: ", end="")
print(total)
print("accuracy: ", end="")
print(acc)

total predictions: 8059
accuracy: 0.31666459858543244


## 2.2 Top 35 most frequent errors with counts

In [40]:
sort_err = sorted(err_count.items(),key=lambda x:x[1],reverse=True)
def translate(a):
    x,y = a 
    return (tokenizer.index_word[x],tokenizer.index_word[y])
top35 = sort_err[0:35]

print("----Top 35 Error----")
print("First item of pair is truth, second item is prediction")

for t,cnt in top35:
    true, pred = translate(t)
    print((true,pred),end=" ")
    print("Count: ",end="")
    print(cnt)
#Answer to 2.3 See Latex file 


----Top 35 Error----
First item of pair is truth, second item is prediction
('He', 'Bob') Count: 136
('She', 'Bob') Count: 112
('Sue', 'Bob') Count: 106
('to', '.') Count: 51
('had', 'was') Count: 48
('decided', 'was') Count: 45
('his', 'the') Count: 43
('and', '.') Count: 42
('her', 'the') Count: 38
('in', '.') Count: 34
('for', '.') Count: 33
('her', 'a') Count: 28
('she', 'he') Count: 28
(',', '.') Count: 28
('His', 'Bob') Count: 27
('.', 'to') Count: 27
('got', 'was') Count: 26
('the', 'a') Count: 25
('.', 'and') Count: 25
('One', 'Bob') Count: 25
('a', 'the') Count: 23
('a', 'to') Count: 23
('The', 'Bob') Count: 23
('it', 'the') Count: 22
('But', 'Bob') Count: 21
('Her', 'Bob') Count: 21
("'s", 'was') Count: 20
('!', '.') Count: 20
('wanted', 'was') Count: 20
('went', 'was') Count: 20
('When', 'Bob') Count: 19
('They', 'Bob') Count: 19
('the', '.') Count: 19
('he', 'Bob') Count: 18
('for', 'to') Count: 18


# 3. Binary Log Loss Implementation and Experimentation (20 points)

## 3.1 Implementation of Binary Log Loss

In [40]:
import keras.backend as K
import tensorflow as tf
import math
import keras


B = 32
def custom_gather(a, b):
    unstacked_a = tf.unstack(a, axis=0)
    unstacked_b = tf.unstack(b, axis=0)
    gathered = [tf.gather(x, y) for x, y in zip(unstacked_a, unstacked_b)]
    return tf.stack(gathered, axis=0)


def bin_loss(r):
    def _bin_loss(y_true, y_pred):
        #Binary Log loss
        d = B*21    
        index = [i for i in range(total_words)]
        NEG = np.zeros((r,total_words))
        u_prob = np.array(total_words*[1])
        u_prob = u_prob/sum(u_prob)
        for i in range(total_words):
            temp = u_prob.copy()
            NEG[:,i]= np.random.choice(index,r,p=temp) + 1
        neg= tf.cast(tf.convert_to_tensor(NEG),tf.int32)
        y_true = K.reshape(y_true,(d,total_words))

        y_true2 = K.cast(y_true,tf.int32)
        y_pred = K.reshape(y_pred,(d,total_words))
        MASK = K.batch_dot(y_true2,mask,axes=1)
        MASK = K.cast(MASK,tf.float32)
        good_loss = -K.dot(K.transpose(K.log(K.sigmoid(K.batch_dot(y_pred,y_true,axes=1)))),MASK)

        slices = K.dot(y_true2,K.transpose(neg))
        bad = custom_gather(y_pred,slices)
        bad_loss = K.log(K.ones(shape=(d, r)) - K.sigmoid(bad))

        MASK = K.repeat_elements(MASK,rep=r,axis=1)
        bad_loss = K.batch_dot(bad_loss,MASK,axes=1)
        loss =(good_loss-K.sum(bad_loss)/r)
        return loss
    return _bin_loss

#I used bias at scoring function
def create_model2(predictors, label,X_val, Y_val, max_sequence_len, total_words,f):

    model = Sequential()
    model.add(Embedding(total_words, 200, input_length=max_sequence_len-1,mask_zero=True))
    model.add(LSTM(200,return_sequences=True)) 
    model.add(Dense(total_words,use_bias=True))
    model.compile(loss=f, optimizer='adam', metrics=['categorical_accuracy'])
    earlystop = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0, patience=5, verbose=1, mode='auto')
    model.fit(predictors, label, epochs=20, batch_size=B,validation_data=(X_val, Y_val),verbose=1, callbacks=[earlystop])
    
    return model 


## 3.2 UNIF

In [32]:
#With some experiment, It turns out SGD is better for r=20 case than adam
def create_model2(predictors, label,X_val, Y_val, max_sequence_len, total_words,f):

    model = Sequential()
    model.add(Embedding(total_words, 200, input_length=max_sequence_len-1,mask_zero=True))
    model.add(LSTM(200,return_sequences=True)) 
    model.add(Dense(total_words,use_bias=True))
    model.compile(loss=f, optimizer='SGD', metrics=['categorical_accuracy'])
    earlystop = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0, patience=5, verbose=1, mode='auto')
    model.fit(predictors, label, epochs=20, batch_size=B,validation_data=(X_val, Y_val),verbose=1, callbacks=[earlystop])
    
    return model 
# Sample 20 words uniformaly
r = 20
mask = np.ones((B*21,total_words))
for i in range(mask.shape[0]):
    mask[i][0] = 0
mask1 = mask
mask= tf.cast(tf.convert_to_tensor(mask1),tf.int32)
model2_20 = create_model2(predictors, label,d_predictors, d_label, max_sequence_len, total_words,bin_loss(r))

Train on 6048 samples, validate on 768 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 00013: early stopping


In [44]:
# Sample 100 words uniformaly
r = 100
mask = np.ones((B*21,total_words))
for i in range(mask.shape[0]):
    mask[i][0] = 0
mask1 = mask
mask= tf.cast(tf.convert_to_tensor(mask1),tf.int32)
model2_100 = create_model2(predictors, label,d_predictors, d_label, max_sequence_len, total_words,bin_loss(r))

Train on 6048 samples, validate on 768 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [41]:
# Sample 500 words uniformaly
r = 500
mask = np.ones((B*21,total_words))
for i in range(mask.shape[0]):
    mask[i][0] = 0
mask1 = mask
mask= tf.cast(tf.convert_to_tensor(mask1),tf.int32)
model2_500 = create_model2(predictors, label,d_predictors, d_label, max_sequence_len, total_words,bin_loss(r))

Train on 6048 samples, validate on 768 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
total,acc,err_count = evaluate(model2_500,t_predictors,t_label)
print("r = 500, total predictions: ", end="")
print(total)
print("accuracy: ", end="")
print(acc)
total,acc,err_count = evaluate(model2_100,t_predictors,t_label)
print("r = 100, total predictions: ", end="")
print(total)
print("accuracy: ", end="")
print(acc)

r = 500, total predictions: 8059
accuracy: 0.28216900359846137
r = 100, total predictions: 8059
accuracy: 0.2543739918103983



In [47]:
total,acc,err_count = evaluate(model2_20,t_predictors,t_label)
print("r = 20, total predictions: ", end="")
print(total)
print("accuracy: ", end="")
print(acc)


r = 20, total predictions: 8059
accuracy: 0.19779128924184142


## 3.3 UNIG-F

In [64]:
def bin_loss_unig(r,u_prob):
    def _bin_loss_unig(y_true, y_pred):
        #Binary Log loss
        d = B*21    
        index = [i for i in range(total_words)]
        NEG = np.zeros((r,total_words))
        for i in range(total_words):
            temp = u_prob.copy()
            temp = temp/sum(temp)
            NEG[:,i]= np.random.choice(index,r,p=temp) + 1
            
        neg= tf.cast(tf.convert_to_tensor(NEG),tf.int32)
        
        y_true = K.reshape(y_true,(d,total_words))

        y_true2 = K.cast(y_true,tf.int32)
        y_pred = K.reshape(y_pred,(d,total_words))
        MASK = K.batch_dot(y_true2,mask,axes=1)
        MASK = K.cast(MASK,tf.float32)
        good_loss = -K.dot(K.transpose(K.log(K.sigmoid(K.batch_dot(y_pred,y_true,axes=1)))),MASK)

        slices = K.dot(y_true2,K.transpose(neg))
        bad = custom_gather(y_pred,slices)
        bad_loss = K.log(K.ones(shape=(d, r)) - K.sigmoid(bad))

        MASK = K.repeat_elements(MASK,rep=r,axis=1)
        bad_loss = K.batch_dot(bad_loss,MASK,axes=1)
        loss =(good_loss-K.sum(bad_loss)/r)
    
        return loss
    return _bin_loss_unig


In [58]:
prob = np.zeros(1498)
corpus = data.split("\n")
total = 0 
for i in corpus:
    k = i.split(" ")
    for w in k:
        if w != "<s>":
            total += 1 
            prob[tokenizer.word_index[w] - 1] =  prob[tokenizer.word_index[w] - 1] + 1
prob = prob / total

def unig_f(p,f):
    p = np.power(p,f)
    arr1 = p / sum(p)
    return arr1


In [59]:
f = 0.4
r= 20
u_prob = unig_f(prob,f)

mask = np.ones((B*21,total_words))
for i in range(mask.shape[0]):
    mask[i][0] = 0
mask1 = mask
mask= tf.cast(tf.convert_to_tensor(mask1),tf.int32)
model2_f1 = create_model2(predictors, label,d_predictors, d_label, max_sequence_len, total_words,bin_loss_unig(r,u_prob))

Train on 6048 samples, validate on 768 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [65]:
f = 0.25
r= 20
u_prob = unig_f(prob,f)

mask = np.ones((B*21,total_words))
for i in range(mask.shape[0]):
    mask[i][0] = 0
mask1 = mask
mask= tf.cast(tf.convert_to_tensor(mask1),tf.int32)
model2_f4 = create_model2(predictors, label,d_predictors, d_label, max_sequence_len, total_words,bin_loss_unig(r,u_prob))

Train on 6048 samples, validate on 768 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: early stopping


In [73]:
total,acc,err_count = evaluate(model2_f4,t_predictors,t_label)
print("r = 20, total predictions: ", end="")
print(total)
print("accuracy: ", end="")
print(acc)

r = 20, total predictions: 8059
accuracy: 0.20486412706291104


# 4. Using a Larger Context (15 points)

## 4.1 Implementation and training

In [50]:
max_sequence_len = 37
def dataset_preparation(data):

    # basic cleanup
    corpus = data.split("\n")
    # tokenization	
    # create input sequences using list of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        t = False
        for i in range(1, len(token_list)-1):
            if token_list[i] == 1:
                t = True
            if t:
                n_gram_sequence = token_list[:i+2]
                input_sequences.append(n_gram_sequence)
    # pad sequences 
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # create predictors and label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len, total_words
tokenizer = Tokenizer(lower=False,filters='\t')
data = open('bobsue.voc.txt').read()
corpus = data.split("\n")
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)+1
print("total words: ",end="")
print(total_words-1)

data = open('bobsue.prevsent.train.tsv').read()

dev_data = open('bobsue.prevsent.dev.tsv').read()
test_data = open('bobsue.prevsent.test.tsv').read()
predictors, label, max_sequence_len1,total_words1 = dataset_preparation(data)
d_predictors, d_label, max_sequence_len2,total_words2 = dataset_preparation(dev_data)
t_predictors, t_label, t_max_sequence_len3,total_words3 = dataset_preparation(test_data)


total words: 1498


In [48]:
def create_model3(predictors, label, X_val,Y_val, max_sequence_len, total_words):
    model = Sequential()
    #mask_zero to neglect padding during training  
    model.add(Embedding(total_words, 200, input_length=max_sequence_len-1))
    model.add(LSTM(200))
    model.add(Dense(total_words))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
    earlystop = EarlyStopping(monitor='val_categorical_accuracy', min_delta=0, patience=3, verbose=1, mode='auto')
    model.fit(predictors, label, epochs=10, batch_size=B,validation_data=(X_val, Y_val),verbose=1, callbacks=[earlystop])
    return model 

In [51]:
model_prev = create_model3(predictors, label,d_predictors, d_label, max_sequence_len, total_words)

Train on 65331 samples, validate on 7957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: early stopping


## 4.2 Error analysis 

In [53]:
def evaluate2(model,p ,t):
    #Manually implement a evaluate to check test accuracym, so that we can make sure num of predictions is correct
    total = 0 
    k = model.predict(p)
    k = k.argmax(axis=1)
    m = t.argmax(axis=1)
    correct = 0
    err_count = {}
    for i in range(k.shape[0]):
            #if label is neither padding nor <s>
            if m[i] != 0 and m[i] != 1 : 
                total+=1 
                if k[i] == m[i]:
                    correct+=1 
                else: 
                    if (m[i],k[i]) in err_count:
                        err_count[(m[i],k[i])] +=1
                    else:
                        err_count[(m[i],k[i])] =1
                        
    return total,correct/total,err_count


total,acc,err_count = evaluate2(model_prev,t_predictors,t_label)
print("total predictions: ", end="")
print(total)
print("accuracy: ", end="")
print(acc)

total predictions: 8059
accuracy: 0.35190470281672664


In [54]:
sort_err = sorted(err_count.items(),key=lambda x:x[1],reverse=True)
def translate(a):
    x,y = a 
    return (tokenizer.index_word[x],tokenizer.index_word[y])
top35 = sort_err[0:35]

print("----Top 35 Error----")
print("First item of pair is truth, second item is prediction")

for t,cnt in top35:
    true, pred = translate(t)
    print((true,pred),end=" ")
    print("Count: ",end="")
    print(cnt)


----Top 35 Error----
First item of pair is truth, second item is prediction
('.', 'to') Count: 38
('had', 'was') Count: 33
('to', '.') Count: 32
('and', '.') Count: 30
('decided', 'was') Count: 28
('He', 'Bob') Count: 27
('for', '.') Count: 23
('was', 'had') Count: 23
('the', 'her') Count: 22
('his', 'the') Count: 22
('Sue', 'Bob') Count: 21
('got', 'was') Count: 20
('.', 'and') Count: 20
('the', 'a') Count: 19
('Her', 'She') Count: 19
('Sue', 'She') Count: 18
('Bob', 'He') Count: 18
('in', '.') Count: 18
('!', '.') Count: 17
(',', '.') Count: 17
('a', 'to') Count: 17
('he', 'to') Count: 17
('her', 'a') Count: 16
('for', 'to') Count: 16
('the', 'his') Count: 16
("'s", 'was') Count: 16
('went', 'was') Count: 16
('.', 'for') Count: 15
('She', 'Sue') Count: 14
('Bob', 'Sue') Count: 14
('and', 'to') Count: 14
('her', 'the') Count: 13
('His', 'He') Count: 13
('on', '.') Count: 13
('His', 'Bob') Count: 13
