In [1]:
import sys, os, re, json
import getpass
import sys
import time
import itertools

import numpy as np
from numpy import *
import tensorflow as tf
from q2_initialization import xavier_weight_init
import utils2 as du
import ner as ner
from utils import data_iterator
from model import LanguageModel

In [2]:
class Config(object):
    """Hyperparameters and dataset settings for the NER window model.

    A single Config instance is handed to the model when it is constructed;
    every value below is read as a plain class attribute.
    """
    # --- input shape ---
    window_size = 3     # tokens per input window
    embed_size = 50     # width of each word embedding
    label_size = 5      # number of output classes
    batch_size = 64     # examples per minibatch

    # --- network ---
    hidden_size = 100   # units in the tanh hidden layer
    dropout = 0.9       # value fed to the dropout placeholder while training

    # --- optimisation ---
    lr = 0.001          # learning rate given to the Adam optimizer
    l2 = 0.001          # weight of the L2 penalty on the layer matrices
    max_epochs = 24     # upper bound on the number of training epochs
    early_stopping = 2  # epochs without a better validation loss tolerated before stopping

In [3]:
# Module-level hyperparameter object; the data-loading cells and test_NER()
# below read it directly.
config = Config()
# When True, the train and dev sets are truncated to their first 1024 examples.
debug = False

### Load Data

In [4]:
### Load the starter word vectors
def invert_dict(d):
    """Return a new dict mapping each value of `d` back to its key.

    If several keys share a value, the key that appears last in `d` wins.
    """
    return dict(zip(d.values(), d.keys()))

def load_wv(vocabfile, wvfile):
    """Read the word-vector matrix and build the vocabulary lookups.

    Args:
        vocabfile: path to a text file with one word per line; a word's
            zero-based line number becomes its index.
        wvfile: path to a whitespace-separated numeric text file readable by
            numpy's loadtxt.

    Returns:
        (wv, word_to_num, num_to_word): the loaded float array, a dict from
        word to index, and a dict from index to word.
    """
    vectors = np.loadtxt(wvfile, dtype=float)

    num_to_word = {}
    with open(vocabfile) as vocab_fd:
        for index, raw_line in enumerate(vocab_fd):
            num_to_word[index] = raw_line.strip()

    word_to_num = invert_dict(num_to_word)
    return vectors, word_to_num, num_to_word

# Starter word vectors plus the word <-> index lookups for the vocabulary.
wv, word_to_num, num_to_word = load_wv('data/ner/vocab.txt', 'data/ner/wordVectors.txt')

# Tag inventory: a tag's position in this list is its integer label.
tagnames = ['O', 'LOC', 'MISC', 'ORG', 'PER']
num_to_tag = {index: tag for index, tag in enumerate(tagnames)}
tag_to_num = {tag: index for index, tag in enumerate(tagnames)}

In [5]:
# Load the training set

def load_dataset(fname):
    """Read a tab-separated token file into a list of sentences.

    A line matching ``-DOCSTART-`` followed by at least one more character, or
    a blank line, ends the current sentence. Every other line is stripped and
    split on the first tab, so a normal line becomes ``[token, rest]``.

    Args:
        fname: path of the file to read.

    Returns:
        A list of sentences, each a non-empty list of the split lines. Empty
        sentences are never emitted, including at end of file, so a file that
        contains no token lines yields ``[]``.
    """
    docs = []
    cur = []
    with open(fname) as fd:
        for line in fd:
            # new sentence on -DOCSTART- or blank line
            if re.match(r"-DOCSTART-.+", line) or not line.strip():
                if cur:
                    docs.append(cur)
                cur = []
            else:  # read in tokens
                cur.append(line.strip().split("\t", 1))
    # Flush the running buffer only if it holds tokens; a file that ends with
    # a separator line must not produce a trailing empty sentence.
    if cur:
        docs.append(cur)
    return docs

def flatten1(lst):
    """Flatten exactly one level of nesting and return the result as a list."""
    return [item for group in lst for item in group]

def canonicalize_digits(word):
    """Replace every digit in a purely non-alphabetic token with ``DG``.

    Tokens containing at least one alphabetic character are returned
    unchanged. Otherwise each digit becomes the literal ``DG``; if the result
    starts with ``DG``, commas are removed as well.

    Args:
        word: the token to normalise.

    Returns:
        The normalised token.
    """
    # A list (not a generator) is passed on purpose: this notebook does
    # `from numpy import *`, which may rebind `any` to numpy's version, and
    # that one does not consume generators the way the builtin does.
    if any([c.isalpha() for c in word]):
        return word
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "")  # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    """Lower-case a token and map it onto the known vocabulary.

    Args:
        word: the raw token.
        wordset: optional container of known words (anything supporting
            ``in``). ``None`` disables the vocabulary check; an empty
            container is still treated as a vocabulary.
        digits: when True, a token not already in `wordset` is passed through
            ``canonicalize_digits`` before the final vocabulary check.

    Returns:
        The lower-cased (and possibly digit-normalised) token, or the literal
        ``"UUUNKKK"`` when a vocabulary is given and the token is not in it.
    """
    word = word.lower()
    if digits:
        # A known word is returned as-is, before any digit rewriting.
        if wordset is not None and word in wordset:
            return word
        word = canonicalize_digits(word)  # try to canonicalize numbers
    # Identity checks against None: `!=` / `==` would invoke the container's
    # own comparison operators, which need not return a plain bool.
    if wordset is None or word in wordset:
        return word
    return "UUUNKKK"  # unknown token

# For window models
def seq_to_windows(words, tags, word_to_num, tag_to_num, left=1, right=1):
    """Turn a token/tag sequence into index windows centred on each token.

    Tokens equal to ``"<s>"`` or ``"</s>"`` never act as a window centre,
    though they can appear inside a neighbour's window.

    Args:
        words: sequence of tokens; every token inside a window must be a key
            of `word_to_num`.
        tags: sequence of tag names aligned with `words`.
        word_to_num: mapping from token to integer index.
        tag_to_num: mapping from tag name to integer label.
        left, right: number of context tokens on each side (passed through
            ``int()``).

    Returns:
        (X, y) as numpy arrays: one row of ``left + 1 + right`` word indices
        per centre token, and the matching integer labels.
    """
    n_before, n_after = int(left), int(right)
    delimiters = ("<s>", "</s>")

    windows, labels = [], []
    for centre, token in enumerate(words):
        if token in delimiters:
            continue  # sentence delimiters are context only
        labels.append(tag_to_num[tags[centre]])
        # NOTE(review): assumes the sequence is padded so that every centre
        # token has n_before/n_after neighbours - confirm against callers.
        windows.append([word_to_num[words[pos]]
                        for pos in range(centre - n_before, centre + n_after + 1)])
    return np.array(windows), np.array(labels)
    
def pad_sequence(seq, left=1, right=1):
    """Return `seq` with sentence-delimiter pairs added on both sides.

    ``int(left)`` copies of ``("<s>", "")`` are prepended and ``int(right)``
    copies of ``("</s>", "")`` are appended; `seq` must be a list.
    """
    start_pad = [("<s>", "")] * int(left)
    end_pad = [("</s>", "")] * int(right)
    return start_pad + seq + end_pad
    
def docs_to_windows(docs, word_to_num, tag_to_num, wsize=3):
    """Convert loaded sentences into window/label arrays.

    Each sentence is padded with delimiter tokens, all sentences are
    concatenated, tokens are canonicalised against `word_to_num`, tags are cut
    at the first ``|``, and the result is passed to ``seq_to_windows``.

    Args:
        docs: list of sentences, each a list of ``(token, tag)`` pairs.
        word_to_num: mapping from token to integer index; also used as the
            known-word set during canonicalisation.
        tag_to_num: mapping from tag name to integer label.
        wsize: total window width; ``(wsize - 1) // 2`` context tokens are
            used on each side.

    Returns:
        The ``(X, y)`` pair produced by ``seq_to_windows``.
    """
    # Floor division keeps the pad an int; plain `/` yields a float on
    # Python 3 and only worked because callees re-cast it with int().
    pad = (wsize - 1) // 2
    padded = flatten1([pad_sequence(seq, left=pad, right=pad) for seq in docs])

    # Nothing to unzip: zip(*[]) yields no values and the unpack below would
    # raise, so hand the empty sequences straight through.
    if not padded:
        return seq_to_windows([], [], word_to_num, tag_to_num, pad, pad)

    words, tags = zip(*padded)
    words = [canonicalize_word(w, word_to_num) for w in words]
    tags = [t.split("|")[0] for t in tags]
    return seq_to_windows(words, tags, word_to_num, tag_to_num, pad, pad)

### Box Main
# Training set: read the sentences, then build windowed inputs and labels.
docs = load_dataset('data/ner/train')
X_train, y_train = docs_to_windows(docs, word_to_num, tag_to_num, wsize=config.window_size)
if debug:
    # keep only the first 1024 examples for quick experiments
    X_train, y_train = X_train[:1024], y_train[:1024]

In [6]:
# Dev set (used for tuning hyperparameters): same pipeline as the training set.
docs = load_dataset('data/ner/dev')
X_dev, y_dev = docs_to_windows(docs, word_to_num, tag_to_num, wsize=config.window_size)
if debug:
    # keep only the first 1024 examples for quick experiments
    X_dev, y_dev = X_dev[:1024], y_dev[:1024]

In [7]:
# Load the test set (dummy labels only)
# `docs` is rebound here; it no longer refers to the dev sentences.
# y_test is built from the dummy labels so the test data goes through the
# same (X, y) pipeline as the train and dev sets.
docs = load_dataset('data/ner/test.masked')
X_test, y_test = docs_to_windows(docs, word_to_num, tag_to_num, wsize=config.window_size)

In [14]:
class NERModel(LanguageModel):
    """Window-based NER classifier: embedding lookup, tanh hidden layer, softmax.

    The whole TensorFlow graph is built in ``__init__`` from the
    hyperparameters in ``config``. The number of embedding rows is taken from
    the module-level ``wv`` array; the datasets are passed in by the caller of
    ``run_epoch`` / ``predict``.
    """

    def add_placeholders(self):
        """Create the graph inputs.

        Sets ``self.input_placeholder`` (int32, ``[None, window_size]``),
        ``self.labels_placeholder`` (float32, ``[None, label_size]``) and
        ``self.dropout_placeholder`` (float32, no static shape).
        """
        self.input_placeholder = tf.placeholder(
            tf.int32, shape=[None, self.config.window_size], name='Input')
        self.labels_placeholder = tf.placeholder(
            tf.float32, shape=[None, self.config.label_size], name='Target')
        self.dropout_placeholder = tf.placeholder(tf.float32, name='Dropout')

    def create_feed_dict(self, input_batch, dropout, label_batch=None):
        """Build the feed dict for one session run.

        Args:
            input_batch: value for the input placeholder.
            dropout: value for the dropout placeholder; omitted when None.
            label_batch: value for the labels placeholder; omitted when None.

        Returns:
            A dict mapping placeholders to the supplied values.
        """
        feed_dict = {self.input_placeholder: input_batch}
        if label_batch is not None:
            feed_dict[self.labels_placeholder] = label_batch
        if dropout is not None:
            feed_dict[self.dropout_placeholder] = dropout
        return feed_dict

    def add_embedding(self):
        """Look up the embeddings of every word in each window.

        Returns:
            A tensor reshaped to ``[-1, window_size * embed_size]``: the
            window's embeddings laid out side by side.
        """
        # The embedding lookup is currently only implemented for the CPU
        with tf.device('/cpu:0'):
            # `wv` only supplies the row count here; the variable is created
            # without an explicit initializer.
            embedding = tf.get_variable('Embedding', [len(wv), self.config.embed_size])
            window = tf.nn.embedding_lookup(embedding, self.input_placeholder)
            window = tf.reshape(window, [-1, self.config.window_size * self.config.embed_size])
            return window

    def add_model(self, window):
        """Add the hidden and output layers on top of the window tensor.

        Computes ``h = tanh(window W + b1)`` and ``y = h U + b2``. When
        ``config.l2`` is truthy, a penalty on W and on U is added to the
        ``'total_loss'`` collection.

        Args:
            window: tensor of shape ``[-1, window_size * embed_size]``.

        Returns:
            The output scores after dropout.
        """
        with tf.variable_scope('Layer1', initializer=xavier_weight_init()):
            W = tf.get_variable(
                'W', [self.config.window_size * self.config.embed_size, self.config.hidden_size])
            b1 = tf.get_variable('b1', [self.config.hidden_size])
            h = tf.nn.tanh(tf.matmul(window, W) + b1)
            if self.config.l2:
                tf.add_to_collection('total_loss', 0.5 * self.config.l2 * tf.nn.l2_loss(W))

        with tf.variable_scope('Layer2', initializer=xavier_weight_init()):
            U = tf.get_variable('U', [self.config.hidden_size, self.config.label_size])
            b2 = tf.get_variable('b2', [self.config.label_size])
            y = tf.matmul(h, U) + b2
            if self.config.l2:
                tf.add_to_collection('total_loss', 0.5 * self.config.l2 * tf.nn.l2_loss(U))

        # NOTE(review): dropout is applied to the output scores rather than to
        # the hidden layer - confirm this is intended.
        output = tf.nn.dropout(y, self.dropout_placeholder)
        return output

    def add_loss_op(self, y):
        """Build the training loss.

        Args:
            y: unnormalised class scores, one row per example.

        Returns:
            The sum of everything in the ``'total_loss'`` collection: the mean
            softmax cross-entropy plus any penalties added by ``add_model``.
        """
        # Keyword arguments: newer TensorFlow releases reject positional use
        # of this op, and the names are accepted by older releases as well.
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=y, labels=self.labels_placeholder))
        tf.add_to_collection('total_loss', cross_entropy)
        loss = tf.add_n(tf.get_collection('total_loss'))
        return loss

    def add_training_op(self, loss):
        """Return an Adam minimisation op for `loss` using ``config.lr``."""
        optimizer = tf.train.AdamOptimizer(self.config.lr)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return train_op

    def __init__(self, config):
        """Build the full graph: placeholders, model, loss, metrics, train op.

        Args:
            config: object exposing the hyperparameters read by the
                ``add_*`` methods (see ``Config``).
        """
        self.config = config
        self.add_placeholders()
        window = self.add_embedding()
        y = self.add_model(window)

        self.loss = self.add_loss_op(y)
        self.predictions = tf.nn.softmax(y)
        one_hot_prediction = tf.argmax(self.predictions, 1)
        correct_prediction = tf.equal(
            tf.argmax(self.labels_placeholder, 1), one_hot_prediction)
        # number of correctly classified examples in the fed batch
        self.correct_predictions = tf.reduce_sum(tf.cast(correct_prediction, 'int32'))
        self.train_op = self.add_training_op(self.loss)

    def run_epoch(self, session, input_data, input_labels, shuffle=True, verbose=True):
        """Train for one pass over the data.

        Args:
            session: TensorFlow session holding the initialised variables.
            input_data: window index array for the whole set.
            input_labels: integer labels aligned with `input_data`.
            shuffle: forwarded to ``data_iterator``.
            verbose: falsy disables progress output; otherwise progress is
                written every `verbose` steps (``True`` acts as 1).

        Returns:
            (mean batch loss, fraction of examples classified correctly).
        """
        orig_X, orig_y = input_data, input_labels
        dp = self.config.dropout
        # We're interested in keeping track of the loss and accuracy during training
        total_loss = []
        total_correct_examples = 0
        total_processed_examples = 0
        # Whole number of batches; true division would report a fractional
        # step count in the progress line.
        total_steps = int(np.ceil(len(orig_X) / float(self.config.batch_size)))
        batches = data_iterator(orig_X, orig_y, batch_size=self.config.batch_size,
                                label_size=self.config.label_size, shuffle=shuffle)
        for step, (x, y) in enumerate(batches):
            feed = self.create_feed_dict(input_batch=x, dropout=dp, label_batch=y)
            loss, total_correct, _ = session.run(
                [self.loss, self.correct_predictions, self.train_op], feed_dict=feed)
            total_processed_examples += len(x)
            total_correct_examples += total_correct
            total_loss.append(loss)
            if verbose and step % verbose == 0:
                sys.stdout.write('\r{} / {} : loss = {}'.format(
                    step, total_steps, np.mean(total_loss)))
                sys.stdout.flush()
        if verbose:
            sys.stdout.write('\r')
            sys.stdout.flush()
        return np.mean(total_loss), total_correct_examples / float(total_processed_examples)

    def predict(self, session, X, y=None):
        """Predict a label index for every example in `X`.

        Args:
            session: TensorFlow session holding the trained variables.
            X: window index array.
            y: optional integer labels; when given, the loss is computed for
                every batch that ``data_iterator`` yields labels for.

        Returns:
            (mean batch loss, list of predicted label indices). The loss is
            NaN when no batch carried labels.
        """
        # We deactivate dropout by setting it to 1
        dp = 1
        losses = []
        results = []
        # Test for presence, not truthiness: np.any(y) is False for a label
        # array made up entirely of class 0, which is a valid labelling.
        if y is not None:
            data = data_iterator(X, y, batch_size=self.config.batch_size,
                                 label_size=self.config.label_size, shuffle=False)
        else:
            data = data_iterator(X, batch_size=self.config.batch_size,
                                 label_size=self.config.label_size, shuffle=False)
        for step, (x, batch_y) in enumerate(data):
            feed = self.create_feed_dict(input_batch=x, dropout=dp)
            if np.any(batch_y):
                feed[self.labels_placeholder] = batch_y
                loss, preds = session.run([self.loss, self.predictions], feed_dict=feed)
                losses.append(loss)
            else:
                preds = session.run(self.predictions, feed_dict=feed)
            predicted_indices = preds.argmax(axis=1)
            results.extend(predicted_indices)
        # np.mean([]) is NaN but also emits a RuntimeWarning; return NaN
        # directly when there is nothing to average.
        mean_loss = np.mean(losses) if losses else np.nan
        return mean_loss, results

def print_confusion(confusion, num_to_tag):
    """Print a confusion matrix followed by per-tag precision and recall.

    Args:
        confusion: square numpy array indexed ``[true label, guessed label]``.
        num_to_tag: dict from label index to tag name; tags are reported in
            ascending index order.
    """
    # column sums: how often each tag was guessed
    guessed_per_tag = confusion.sum(axis=0)
    # row sums: how often each tag was the true label
    true_per_tag = confusion.sum(axis=1)

    print(confusion)
    for idx in sorted(num_to_tag):
        hits = confusion[idx, idx]
        precision = hits / float(guessed_per_tag[idx])
        recall = hits / float(true_per_tag[idx])
        print('Tag: {} - P {:2.4f} / R {:2.4f}'.format(num_to_tag[idx], precision, recall))

def calculate_confusion(config, predicted_indices, y_indices):
    """Count (true label, guessed label) pairs into a square int32 matrix.

    Args:
        config: object whose ``label_size`` gives the matrix dimension.
        predicted_indices: guessed label index for each example.
        y_indices: true label index for each example; its length determines
            how many examples are counted.

    Returns:
        A ``label_size x label_size`` numpy int32 array where entry
        ``[t, g]`` is the number of examples with true label ``t`` that were
        guessed as ``g``.
    """
    n_labels = config.label_size
    counts = np.zeros((n_labels, n_labels), dtype=np.int32)
    for position, true_label in enumerate(y_indices):
        counts[true_label, predicted_indices[position]] += 1
    return counts

def save_predictions(predictions, filename):
    """Write each prediction on its own line of `filename`, overwriting it."""
    with open(filename, "w") as out_file:
        out_file.writelines(str(item) + "\n" for item in predictions)

def test_NER():
    """Train the NER model with early stopping, then write test predictions.

    Uses the module-level ``config`` and the ``X_train``/``y_train``,
    ``X_dev``/``y_dev`` and ``X_test``/``y_test`` arrays. After every epoch
    the dev loss is computed; the weights are saved to
    ``./weights/ner.weights`` whenever it improves. Training stops once more
    than ``config.early_stopping`` epochs have passed since the best dev loss.
    The best weights are then restored and the test-set predictions are
    written to ``q2_test.predicted``.
    """
    with tf.Graph().as_default():
        model = NERModel(config)

        # Prefer the current initializer; fall back to the older name only
        # when the running TensorFlow does not provide it.
        make_init = (getattr(tf, 'global_variables_initializer', None)
                     or tf.initialize_all_variables)
        init = make_init()
        saver = tf.train.Saver()

        with tf.Session() as session:
            best_val_loss = float('inf')
            best_val_epoch = 0

            session.run(init)
            for epoch in range(config.max_epochs):
                print('Epoch {}'.format(epoch))
                start = time.time()

                train_loss, train_acc = model.run_epoch(session, X_train, y_train)
                val_loss, predictions = model.predict(session, X_dev, y_dev)
                print('Training loss: {}'.format(train_loss))
                print('Training acc: {}'.format(train_acc))
                print('Validation loss: {}'.format(val_loss))
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_val_epoch = epoch
                    if not os.path.exists("./weights"):
                        os.makedirs("./weights")

                    saver.save(session, './weights/ner.weights')
                # Stop before reporting this epoch's confusion matrix once
                # the dev loss has failed to improve for too long.
                if epoch - best_val_epoch > config.early_stopping:
                    break

                confusion = calculate_confusion(config, predictions, y_dev)
                print_confusion(confusion, num_to_tag)
                print('Total time: {}'.format(time.time() - start))

            # reload the checkpoint with the best dev loss before predicting
            saver.restore(session, './weights/ner.weights')
            print('Test')
            print('=-=-=')
            print('Writing predictions to q2_test.predicted')
            _, predictions = model.predict(session, X_test, y_test)
            save_predictions(predictions, "q2_test.predicted")

In [15]:
# Run the full train / validate / predict pipeline defined in test_NER().
test_NER()

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch 0
3181 / 3181.578125 : loss = 0.30291557312011723Training loss: 0.3029155731201172
Training acc: 0.9214079097931942
Validation loss: 0.1950983703136444
[[42340    90    92   142    95]
 [  193  1767    10    76    48]
 [  270    38   925    18    17]
 [  580   160    35  1211   106]
 [  453    43     5    36  2612]]
Tag: O - P 0.9659 / R 0.9902
Tag: LOC - P 0.8422 / R 0.8438
Tag: MISC - P 0.8669 / R 0.7295
Tag: ORG - P 0.8166 / R 0.5789
Tag: PER - P 0.9076 / R 0.8295
Total time: 254.55922269821167
Epoch 1
3181 / 3181.578125 : loss = 0.16783928871154785Training loss: 0.16783928871154785
Training acc: 0.9637070832576208
Validation loss: 0.1892177313566208
[[42408    59    52   154    86]
 [  179  1767    16    91    41]
 [  313    32   868    36    19]
 [  565    92    29  1314    92]
 [  448    42     2    37  2620]]
Tag: O - P 0.9657 / R 0.9918
Tag: LOC - P 0.8870 / R 0.8438
Tag: MISC - P 0.8976 / R 0.6845


  ret = ret.dtype.type(ret / rcount)
