
Commit

isolating changes
kelseyball committed Nov 27, 2017
1 parent d5fe763 commit 267dcfb
Showing 3 changed files with 7 additions and 43 deletions.
1 change: 0 additions & 1 deletion mimick/make_dataset.py
@@ -72,7 +72,6 @@ def read_text_embs(files):
in_vocab += 1
training_instances.append(Instance(charseq(word, c2i), emb))
training_char_count = len(c2i)

print "Total in Embeddings vocabulary:", len(words)
print "Training set character count: ", training_char_count

9 changes: 2 additions & 7 deletions mimick/model.py
@@ -199,8 +199,7 @@ def old_save(self, file_name):
members_to_save.append(self.lstm_to_rep_bias)
members_to_save.append(self.mlp_out)
members_to_save.append(self.mlp_out_bias)
#self.model.save(file_name, members_to_save)
self.model.save(file_name)
self.model.save(file_name, members_to_save)

# character mapping saved separately
cPickle.dump(self.c2i, open(file_name[:-4] + '.c2i', 'w'))
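
The save calls in this hunk switch between DyNet's one-argument and two-argument forms; the two-argument model.save(file_name, members_to_save) is the DyNet 1.x interface, while DyNet 2.0 drops it in favor of saving the whole collection. For comparison only, a minimal sketch of the DyNet 2.0 ParameterCollection save/populate pattern (an assumption about the newer API, not part of this commit):

import dynet as dy

# Sketch assuming DyNet 2.0: save the entire collection, then restore it
# into an identically structured collection with populate().
pc = dy.ParameterCollection()
W = pc.add_parameters((128, 64))
pc.save("checkpoint.model")          # writes every parameter in the collection

pc2 = dy.ParameterCollection()
W2 = pc2.add_parameters((128, 64))   # re-declare with the same shapes, in the same order
pc2.populate("checkpoint.model")     # loads the saved values back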
@@ -232,7 +231,6 @@ def dist(instance, vec):
# Argument parsing
# ===-----------------------------------------------------------------------===
parser = argparse.ArgumentParser()

parser.add_argument("--dataset", required=True, help=".pkl file to use")
parser.add_argument("--vocab", required=True, help="total vocab to output")
parser.add_argument("--output", help="file with all embeddings")
@@ -300,7 +298,6 @@ def dist(instance, vec):

# Load words to write
vocab_words = {}

if populate_test_insts_from_vocab:
train_words = [wordify(w, i2c) for w in training_instances]
with codecs.open(options.vocab, "r", "utf-8") as vocab_file:
@@ -384,7 +381,6 @@ def dist(instance, vec):

root_logger.info("\n")
root_logger.info("Epoch {} complete".format(epoch + 1))

# here used to be a learning rate update, no longer supported in dynet 2.0
print trainer.status()
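
The comment above marks where a per-epoch learning-rate update used to live before DyNet 2.0 removed that mechanism. A minimal sketch of how a decay could still be applied, assuming the DyNet 2.0 Python trainers expose a settable learning_rate attribute (an assumption, not part of this commit):

import dynet as dy

pc = dy.ParameterCollection()
trainer = dy.SimpleSGDTrainer(pc, learning_rate=0.01)

base_lr, decay = 0.01, 0.05
for epoch in range(20):
    # ... run the epoch's forward/backward passes and trainer.update() calls ...
    # Assumed: DyNet 2.0 trainers expose a settable learning_rate attribute.
    trainer.learning_rate = base_lr / (1 + decay * (epoch + 1))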

@@ -435,8 +431,7 @@ def dist(instance, vec):
if rand < showcase_size:
showcase[rand] = word

if (len(test_instances) > 0): root_logger.info("Average norm for trained: {}".format(inferred_vec_norms / len(test_instances)))

root_logger.info("Average norm for trained: {}".format(inferred_vec_norms / len(test_instances)))

if options.debug:
similar_words = {}
40 changes: 5 additions & 35 deletions model.py
@@ -5,7 +5,7 @@
from collections import Counter
from _collections import defaultdict
from evaluate_morphotags import Evaluator
from sys import maxsize
from sys import maxint

import collections
import argparse
@@ -14,7 +14,6 @@
import logging
import progressbar
import os
import codecs
import dynet as dy
import numpy as np

@@ -97,19 +96,6 @@ def __init__(self, tagset_sizes, num_lstm_layers, hidden_dim, word_embeddings, n
self.mlp_out[att] = self.model.add_parameters((set_size, set_size), name=att+"O")
self.mlp_out_bias[att] = self.model.add_parameters(set_size, name=att+"Ob")


def dump_embeddings(self, filename):
with codecs.open(filename, "w", "utf-8") as writer:
#writer.write("{} {}\n".format(self.vocab_size, self.word_embedding_dim))
writer.write("{} {}\n".format(self.words_lookup.shape()[0], self.words_lookup.shape()[1]))
for w,i in w2i.items():
wemb = dy.lookup(self.words_lookup, i, update=self.we_update)
writer.write(w + " ")
for i in wemb.npvalue():
writer.write(str(i) + " ")
writer.write("\n")


def word_rep(self, word, char_ids):
'''
:param word: index of word in lookup table
Expand All @@ -126,17 +112,10 @@ def word_rep(self, word, char_ids):
def build_tagging_graph(self, sentence, word_chars):
dy.renew_cg()

<<<<<<< HEAD
if word_chars is not None:
embeddings = [self.word_rep(w, chars) for w, chars in zip(sentence, word_chars)]
else:
embeddings = [self.word_rep(w, word_chars) for w in sentence]
=======
if word_chars == None:
embeddings = [self.word_rep(w, None) for w in sentence]
else:
embeddings = [self.word_rep(w, chars) for w, chars in zip(sentence, word_chars)]
>>>>>>> upstream/master

lstm_out = self.word_bi_lstm.transduce(embeddings)
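
The line counts in this hunk header (17 lines down to 10) suggest the seven deleted lines are the conflict markers plus the HEAD branch, so the resolved build_tagging_graph keeps the upstream/master version:

def build_tagging_graph(self, sentence, word_chars):
    dy.renew_cg()

    if word_chars == None:
        embeddings = [self.word_rep(w, None) for w in sentence]
    else:
        embeddings = [self.word_rep(w, chars) for w, chars in zip(sentence, word_chars)]

    lstm_out = self.word_bi_lstm.transduce(embeddings)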

@@ -224,8 +203,7 @@ def old_save(self, file_name):
members_to_save.extend(utils.sortvals(self.lstm_to_tags_bias))
members_to_save.extend(utils.sortvals(self.mlp_out))
members_to_save.extend(utils.sortvals(self.mlp_out_bias))
#self.model.save(file_name, members_to_save)
self.model.save(file_name)
self.model.save(file_name, members_to_save)

with open(file_name + "-atts", 'w') as attdict:
attdict.write("\t".join(sorted(self.attributes)))
@@ -262,8 +240,8 @@ def get_word_chars(sentence, i2w, c2i):
parser.add_argument("--num-epochs", default=20, dest="num_epochs", type=int, help="Number of full passes through training set (default - 20)")
parser.add_argument("--num-lstm-layers", default=2, dest="lstm_layers", type=int, help="Number of LSTM layers (default - 2)")
parser.add_argument("--hidden-dim", default=128, dest="hidden_dim", type=int, help="Size of LSTM hidden layers (default - 128)")
parser.add_argument("--training-sentence-size", default=maxsize, dest="training_sentence_size", type=int, help="Instance count of training set (default - unlimited)")
parser.add_argument("--token-size", default=maxsize, dest="token_size", type=int, help="Token count of training set (default - unlimited)")
parser.add_argument("--training-sentence-size", default=maxint, dest="training_sentence_size", type=int, help="Instance count of training set (default - unlimited)")
parser.add_argument("--token-size", default=maxint, dest="token_size", type=int, help="Token count of training set (default - unlimited)")
parser.add_argument("--learning-rate", default=0.01, dest="learning_rate", type=float, help="Initial learning rate (default - 0.01)")
parser.add_argument("--dropout", default=-1, dest="dropout", type=float, help="Amount of dropout to apply to LSTM part of graph (default - off)")
parser.add_argument("--no-we-update", dest="no_we_update", action="store_true", help="Word Embeddings aren't updated")
@@ -305,7 +283,7 @@ def get_word_chars(sentence, i2w, c2i):
options.training_sentence_size, options.token_size, options.learning_rate, options.dropout, options.loss_prop))

if options.debug:
print("DEBUG MODE")
print "DEBUG MODE"

# ===-----------------------------------------------------------------------===
# Read in dataset
@@ -349,8 +327,6 @@ def get_word_chars(sentence, i2w, c2i):
else:
word_embeddings = None



tag_set_sizes = { att: len(t2i) for att, t2i in t2is.items() }

if options.loss_prop:
@@ -423,7 +399,6 @@ def get_word_chars(sentence, i2w, c2i):
# log epoch's train phase
logging.info("\n")
logging.info("Epoch {} complete".format(epoch + 1))

# here used to be a learning rate update, no longer supported in dynet 2.0
print trainer.status()

@@ -525,11 +500,6 @@ def get_word_chars(sentence, i2w, c2i):

# epoch loop ends


# dump embeddings after tagger training
emb_dump = options.log_dir + "/trained-embeddings.txt"
model.dump_embeddings(emb_dump)

# evaluate test data (once)
logging.info("\n")
logging.info("Number test instances: {}".format(len(test_instances)))
