In [38]:
from __future__ import division

import sys
import time
import logging
import StringIO
from collections import defaultdict, Counter, OrderedDict
import numpy as np
from numpy import array, zeros, allclose

# NOTE(review): FDIM is not referenced by any code in this cell -- confirm it is still needed.
FDIM = 4

# Casing features: every value in CASES is prefixed with P_CASE to form
# the dictionary key of the corresponding casing feature.
P_CASE = "CASE:"
CASES = [
    "aa",
    "AA",
    "Aa",
    "aA",
]

# Special vocabulary entries: sentence boundaries, the fallback used for
# words missing from the dictionary, and the stand-in for all-digit words.
START_TOKEN, END_TOKEN = "<s>", "</s>"
UNK = "UUUNKKK"
NUM = "NNNUMMM"

# Entity labels; a label's position in this list is its integer id.
LBLS = ["PER", "ORG", "LOC", "MISC", "O"]

class Config:
    """
    Hyperparameters and output locations for one training run.

    The class attributes are shared defaults. Each instance additionally
    carries the output file paths derived from @output_path.
    """
    n_word_features = 2   # NOTE(review): presumably the [word id, casing id] pair per token -- confirm
    window_size = 1       # NOTE(review): presumably the context size on each side of a token -- confirm
    n_window_feature = 0  # left at 0; nothing in this cell derives it from the values above

    n_classes = 5
    dropout = 0.5
    embed_size = 50
    hidden_size = 200
    batch_size = 2048
    n_epochs = 10
    lr = 0.001

    def __init__(self, output_path=None):
        """
        Derive every output file path from a single directory prefix.

        @output_path is used verbatim as a string prefix, so it should end
        with a path separator. When it is omitted (or empty), a
        timestamped directory under "results/window/" is used instead.
        """
        if output_path:
            self.output_path = output_path
        else:
            # `datetime` is not among this cell's imports, so the default
            # branch used to fail with a NameError; import it where it is
            # needed so the class works on a fresh kernel.
            from datetime import datetime
            self.output_path = "results/window/{:%Y%m%d_%H%M%S}/".format(datetime.now())

        self.model_output = self.output_path + "model.weights"
        self.eval_output = self.output_path + "results.txt"
        self.log_output = self.output_path + "log"
        self.conll_output = self.output_path + "window_predictions.conll"


def casing(word):
    """
    Describe the capitalisation pattern of @word.

    @returns "aa" for all-lowercase, "AA" for all-uppercase, "Aa" when only
    the first character is known to be uppercase, and "aA" for everything
    else (including words without any cased character). An empty word is
    returned unchanged.
    """
    if not word:
        return word
    if word.islower():
        return "aa"
    if word.isupper():
        return "AA"
    return "Aa" if word[0].isupper() else "aA"
def normalize(word):
    """
    Collapse @word to its dictionary form: words made up only of digits
    become the NUM placeholder, every other word is lowercased.
    """
    return NUM if word.isdigit() else word.lower()

def build_dict(words, max_words=None, offset=0):
    """
    Assign an integer id to every distinct item of @words, most frequent first.

    @words is any iterable of hashable items.
    @max_words keeps only that many of the most frequent items; a falsy
    value (None or 0) keeps all of them.
    @offset is the id given to the most frequent item; ids count up from there.
    @returns a dict mapping item -> id.
    """
    frequencies = Counter(words)
    ranked = frequencies.most_common(max_words) if max_words else frequencies.most_common()

    token_ids = {}
    for rank, (token, _count) in enumerate(ranked):
        token_ids[token] = offset + rank
    return token_ids
    
class ModelHelper(object):
    """
    Converts raw (sentence, labels) examples into integer features.

    Every word is represented by two ids drawn from the same dictionary
    @tok2id: the id of its normalized form and the id of its casing feature.
    """
    def __init__(self, tok2id, max_length):
        """
        @tok2id maps normalized words, casing features and the special
        tokens to integer ids.
        @max_length is the length of the longest sentence seen by build().
        """
        self.tok2id = tok2id
        # Feature pairs for the sentence-boundary tokens; both use the
        # lowercase casing feature.
        self.START = [tok2id[START_TOKEN], tok2id[P_CASE + "aa"]]
        self.END = [tok2id[END_TOKEN], tok2id[P_CASE + "aa"]]
        self.max_length = max_length

    @classmethod
    def build(cls, data):
        """
        Build a helper from @data, a sequence of (sentence, labels) pairs.

        @data is traversed twice, so it must be re-iterable (e.g. a list),
        and it must be non-empty because the longest sentence is measured.
        """
        # Word ids start at 1, so id 0 is never handed out here.
        tok2id = build_dict((normalize(word) for sentence, _ in data for word in sentence), offset=1, max_words=10000)
        # Ids 1..len(tok2id) are taken at this point, so the next free id is
        # len(tok2id) + 1. Starting at len(tok2id) made the first casing
        # feature share its id with the least frequent word.
        tok2id.update(build_dict([P_CASE + c for c in CASES], offset=len(tok2id) + 1))
        tok2id.update(build_dict([START_TOKEN, END_TOKEN, UNK], offset=len(tok2id) + 1))

        max_length = max(len(sentence) for sentence, _ in data)

        return cls(tok2id, max_length)

    def vectorize_example(self, sentence, labels=None):
        """
        @returns (features, labels) where features holds one
        [word id, casing id] pair per word of @sentence. Words missing from
        the dictionary fall back to the UNK id.
        """
        sentence_ = [[self.tok2id.get(normalize(word), self.tok2id[UNK]), self.tok2id[P_CASE + casing(word)]] for word in sentence]
        if labels:
            labels_ = [LBLS.index(l) for l in labels]
            return sentence_, labels_
        else:
            # NOTE(review): without labels this yields the label *string*
            # LBLS[-1] per word, whereas the branch above yields integer
            # ids -- confirm that callers expect this asymmetry.
            return sentence_, [LBLS[-1] for _ in sentence]

    def vectorize(self, data):
        """Apply vectorize_example to every (sentence, labels) pair of @data."""
        return [self.vectorize_example(sentence, labels) for sentence, labels in data]

In [39]:
def read_conll(fstream):
    """
    Parse the lines of @fstream (e.g. the result of `open(fname, 'r')`) as
    tab-separated "token<TAB>label" rows. Blank lines and lines starting
    with -DOCSTART- end the current sentence.
    @returns a list of (tokens, labels) pairs, one per non-empty sentence;
    @tokens and @labels are parallel lists of strings.
    """
    examples = []
    tokens, labels = [], []
    for raw_line in fstream:
        row = raw_line.strip()
        is_boundary = (not row) or row.startswith("-DOCSTART-")
        if not is_boundary:
            token, label = row.split("\t")
            tokens.append(token)
            labels.append(label)
            continue
        # Sentence boundary: store what has been collected so far (if
        # anything) and start over with fresh lists.
        if tokens:
            assert len(tokens) == len(labels)
            examples.append((tokens, labels))
        tokens, labels = [], []
    # The stream may end without a trailing blank line.
    if tokens:
        assert len(tokens) == len(labels)
        examples.append((tokens, labels))
    return examples


In [42]:
# Locations of the CoNLL-formatted splits; both point at the same tiny
# sample file here.
data_train = "./data/tiny.conll"
data_dev = "./data/tiny.conll"

# Read each split inside a `with` block so the file handle is closed as
# soon as parsing is done instead of staying open for the kernel's lifetime.
with open(data_train, "r") as fstream:
    train = read_conll(fstream)
train = train[:5]  # keep only the first five sentences for inspection
with open(data_dev, "r") as fstream:
    dev = read_conll(fstream)

# Single-argument print(...) behaves the same as the print statement.
print(train)
helper = ModelHelper.build(train)
print(helper.tok2id)
train_data_ = helper.vectorize(train)
print(train_data_)



[(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['ORG', 'O', 'MISC', 'O', 'O', 'O', 'MISC', 'O', 'O']), (['Peter', 'Blackburn'], ['PER', 'PER']), (['BRUSSELS', '1996-08-22'], ['LOC', 'O']), (['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.'], ['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'MISC', 'O', 'O', 'O', 'O', 'O', 'MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']), (['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.'], ['LOC', 'O', 'O', 'O', '