In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [6]:
from torch import nn
import json

In [7]:
!head -n 10 eng.train

-DOCSTART- -X- -X- O

EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O


In [8]:
with open('eng.train') as fl:
    sents = fl.read().split('\n\n')[1:-1]
len(sents)

14986

In [10]:
sentences = []
# [[(word, label), (...), ...] , [...], [...]]
ner_tags = {}
for meta in sents:
    pairs = []
    for line in meta.split('\n'):
        entries = line.split()
        # word, NER tag
        pairs.append([entries[0], entries[-1]])
        ner_tags[entries[-1]] = True
    sentences.append(pairs)
    
ner_tags = list(ner_tags.keys())

print(len(sentences))
print(len(ner_tags))
print(sorted(ner_tags))

14986
8
['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


### Decide on a IOB2 tag ordering which we will use to build and read 1-hot vectors

In [13]:
lookup = ['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'I-PER', 'O']
for ent in lookup:
    assert ent in ner_tags # sanity check
ldict = { tag: ind for ind, tag in enumerate(lookup) }

In [14]:
with open('lookup.json', 'w') as fl:
    json.dump(ldict, fl, indent=4)

In [37]:
# From now on, we will use index values to refer to NER tags
!cat lookup.json

{
    "I-LOC": 1,
    "O": 7,
    "I-ORG": 3,
    "B-MISC": 5,
    "B-LOC": 0,
    "B-ORG": 2,
    "I-MISC": 6,
    "B-PER": 4
}

### Load Google embeddings

In [49]:
# http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
import gensim
from time import time

In [50]:
t0 = time()
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  
print('Loaded: %.2fs' % (time() - t0))

Loaded: 145.99s


In [84]:
missing = {}
uniques = {}
for sent in sentences:
    for word, tag in sent:
        if word not in model:
            if word not in missing:
                missing[word] = True
#                 print('Not in embedding:', word)
                uniques[word] = model['unk']
        else:
            uniques[word] = model[word]

uniques['unk'] = model['unk'] # also save unk for later use

print('Unique words (found in embedding):', len(uniques))
print('Missing in embedding:', len(missing))

Unique words (found in embedding): 23625
Missing in embedding: 5900


In [85]:
subset_words = sorted(list(uniques.keys()))
subset_words[-5:]

['zinc', 'zinfandel', 'zlotys', 'zone', 'zvezda']

In [86]:
with open('word_list.txt', 'w') as fl:
    fl.write('\n'.join(subset_words))

In [87]:
import numpy as np
emat = np.zeros((len(subset_words), 300))

In [88]:
for si, word in enumerate(subset_words):
    emat[si, :] = uniques[word]

In [89]:
np.save('word_embeds.npy', emat)