From c263461b6958c15fb8fd04baba71e77ae986407b Mon Sep 17 00:00:00 2001 From: Trevor Cohn Date: Tue, 1 Nov 2016 16:25:05 +1100 Subject: [PATCH 1/5] Transitioned to dynet and fixed various warnings --- .gitmodules | 6 ++-- CMakeLists.txt | 8 ++--- cnn | 1 - dynet | 1 + src/CMakeLists.txt | 21 +++++++++-- src/attentional.cc | 48 ++++++++++++------------- src/attentional.h | 85 ++++++++++++++++++++++---------------------- src/biattentional.cc | 36 +++++++++---------- src/expr-xtra.h | 20 +++++------ 9 files changed, 118 insertions(+), 108 deletions(-) delete mode 160000 cnn create mode 160000 dynet diff --git a/.gitmodules b/.gitmodules index 17ad0b8..e21e53f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "cnn"] - path = cnn - url = https://github.com/clab/cnn.git +[submodule "dynet"] + path = dynet + url = https://github.com/clab/dynet.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 226dd53..7ca3813 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ project(mantis) cmake_minimum_required(VERSION 2.8 FATAL_ERROR) -set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cnn/cmake) +set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/dynet/cmake) # CNN uses Eigen which exploits modern CPU architectures. To get the # best possible performance, the following are recommended: @@ -29,7 +29,7 @@ endif() enable_testing() include_directories(${CMAKE_CURRENT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/cnn) + ${PROJECT_SOURCE_DIR}/dynet) function(find_cudnn) set(CUDNN_ROOT "" CACHE PATH "CUDNN root path") @@ -111,9 +111,9 @@ include_directories(${EIGEN3_INCLUDE_DIR}) FIND_PACKAGE(Threads REQUIRED) set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cnn/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dynet/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) include_directories(${CMAKE_CURRENT_BINARY_DIR}) -add_subdirectory(cnn/cnn) +add_subdirectory(dynet/dynet) add_subdirectory(src) enable_testing() diff --git a/cnn b/cnn deleted file mode 160000 index ec75eb8..0000000 --- a/cnn +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ec75eb85932f964e5c3ad79515b4b2c45c84f9a3 diff --git a/dynet b/dynet new file mode 160000 index 0000000..8904224 --- /dev/null +++ b/dynet @@ -0,0 +1 @@ +Subproject commit 8904224f7c5788246035e78fb4abe5b7df6aeba3 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 42bb893..2a0df7a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,14 +2,29 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8) foreach(TARGET attentional biattentional) ADD_EXECUTABLE(${TARGET} ${TARGET}.cc) - target_link_libraries(${TARGET} cnn ${LIBS}) if(UNIX AND NOT APPLE) target_link_libraries(${TARGET} rt) endif() if (WITH_CUDA_BACKEND) - add_dependencies(${TARGET} cnncuda) - target_link_libraries(${TARGET} cnncuda) + set(CUDA_SEPARABLE_COMPILATION ON) + list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_52,code=sm_52;-gencode;arch=compute_52,code=compute_52;-std=c++11;-DVERBOSE;-DEIGEN_USE_GPU;-DHAVE_CUDA") + if(CMAKE_COMPILER_IS_GNUCXX) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) + # gcc 4.9 or later versions raise SEGV due to the optimization problem. + # Use -O1 instead for now. + list(APPEND CUDA_NVCC_FLAGS "-O1") + else() + list(APPEND CUDA_NVCC_FLAGS "-O2") + endif() + else() + list(APPEND CUDA_NVCC_FLAGS "-O2") + endif() + add_dependencies(${TARGET} gdynet dynetcuda) + target_link_libraries(${TARGET} gdynet dynetcuda) CUDA_ADD_CUBLAS_TO_TARGET(${TARGET}) + else() + add_dependencies(${TARGET} dynet) + target_link_libraries(${TARGET} dynet ${LIBS}) endif (WITH_CUDA_BACKEND) endforeach() diff --git a/src/attentional.cc b/src/attentional.cc index e76c7f0..4d5ffcd 100644 --- a/src/attentional.cc +++ b/src/attentional.cc @@ -10,7 +10,7 @@ #include using namespace std; -using namespace cnn; +using namespace dynet; using namespace boost::program_options; unsigned LAYERS = 1; // 2 @@ -19,8 +19,8 @@ unsigned ALIGN_DIM = 32; // 128 unsigned SRC_VOCAB_SIZE = 0; unsigned TGT_VOCAB_SIZE = 0; -cnn::Dict sd; -cnn::Dict td; +dynet::Dict sd; +dynet::Dict td; int kSRC_SOS; int kSRC_EOS; int kTGT_SOS; @@ -45,7 +45,7 @@ template int main_body(variables_map vm); int main(int argc, char** argv) { - cnn::initialize(argc, argv); + dynet::initialize(argc, argv); // command line processing variables_map vm; @@ -112,8 +112,8 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel, bool doco, float coverage, bool display, bool fert); template void test_rescore(Model &model, AM_t &am, Corpus &testing, bool doco); -template void test_decode(Model &model, AM_t &am, std::string test_file, bool doco, int beam); -template void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k); +template void test_decode(Model &model, AM_t &am, std::string test_file, bool doco, unsigned beam); +template void test_kbest_arcs(Model &model, AM_t &am, string test_file, unsigned top_k); template void fert_stats(Model &model, AM_t &am, Corpus &devel, bool global_fert); const Sentence* context(const Corpus &corpus, unsigned i); @@ -146,8 +146,6 @@ int main_body(variables_map vm) if (vm.count("lstm")) flavour = "LSTM"; else if (vm.count("gru")) flavour = "GRU"; - typedef vector Sentence; - typedef pair SentencePair; Corpus training, devel, testing; string line; cerr << "Reading training data from " << vm["train"].as() << "...\n"; @@ -234,19 +232,19 @@ int main_body(variables_map vm) vm["epochs"].as(), doco, vm["coverage"].as(), vm.count("display"), fert); else if (vm.count("kbest")) - test_kbest_arcs(model, am, vm["kbest"].as(), vm["topk"].as()); + test_kbest_arcs(model, am, vm["kbest"].as(), vm["topk"].as()); else if (vm.count("test")) { if (vm.count("rescore")) test_rescore(model, am, testing, doco); else // test - test_decode(model, am, vm["test"].as(), doco, vm["beam"].as()); + test_decode(model, am, vm["test"].as(), doco, vm["beam"].as()); } else if (vm.count("fert-stats")) fert_stats(model, am, devel, vm.count("fertility")); delete sgd; - //cnn::Free(); + //dynet::Free(); return EXIT_SUCCESS; } @@ -264,9 +262,9 @@ void test_rescore(Model &model, AM_t &am, Corpus &testing, bool doco) tie(ssent, tsent, docid) = testing[i]; ComputationGraph cg; - am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(testing, i) : nullptr); + auto iloss = am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(testing, i) : nullptr); - double loss = as_scalar(cg.forward()); + double loss = as_scalar(cg.forward(iloss)); cout << i << " |||"; for (auto &w: ssent) cout << " " << sd.convert(w); @@ -286,10 +284,8 @@ void test_rescore(Model &model, AM_t &am, Corpus &testing, bool doco) } template -void test_decode(Model &model, AM_t &am, string test_file, bool doco, int beam) +void test_decode(Model &model, AM_t &am, string test_file, bool doco, unsigned beam) { - double tloss = 0; - int tchars = 0; int lno = 0; cerr << "Reading test examples from " << test_file << endl; @@ -339,7 +335,7 @@ void test_decode(Model &model, AM_t &am, string test_file, bool doco, int beam) } template -void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k) +void test_kbest_arcs(Model &model, AM_t &am, string test_file, unsigned top_k) { // only suitable for monolingual setting, of predicting a sentence given preceeding sentence cerr << "Reading test examples from " << test_file << endl; @@ -369,7 +365,7 @@ void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k) errs.push_back(i_err); } Expression i_nerr = sum(errs); - double loss = as_scalar(cg.incremental_forward()); + double loss = as_scalar(cg.incremental_forward(i_nerr)); //cout << last_last_id << ":" << last_id << " |||"; //for (auto &w: source) cout << " " << sd.convert(w); @@ -483,9 +479,9 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel, } bool first = true; - int report = 0; + unsigned report = 0; unsigned lines = 0; - int epoch = 0; + unsigned epoch = 0; Sentence ssent, tsent; int docid; @@ -497,8 +493,8 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel, tie(ssent, tsent, docid) = devel[i]; ComputationGraph cg; Expression alignment; - am.BuildGraph(ssent, tsent, cg, &alignment, (doco) ? context(devel, i) : nullptr); - cg.forward(); + auto iloss = am.BuildGraph(ssent, tsent, cg, &alignment, (doco) ? context(devel, i) : nullptr); + cg.forward(iloss); cout << "\n====== SENTENCE " << i << " =========\n"; am.display_ascii(ssent, tsent, cg, alignment, sd, td); @@ -586,7 +582,7 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel, objective = objective + fertility_nll; // perform forward computation for aggregate objective - cg.forward(); + cg.forward(objective); // grab the parts of the objective loss += as_scalar(cg.get_value(xent.i)); @@ -595,7 +591,7 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel, if (fert) loss_fert += as_scalar(cg.get_value(fertility_nll.i)); - cg.backward(); + cg.backward(objective); sgd.update(); ++lines; @@ -627,8 +623,8 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel, for (unsigned i = 0; i < devel.size(); ++i) { tie(ssent, tsent, docid) = devel[i]; ComputationGraph cg; - am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(devel, i) : nullptr, nullptr, nullptr); - dloss += as_scalar(cg.forward()); + auto idloss = am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(devel, i) : nullptr, nullptr, nullptr); + dloss += as_scalar(cg.forward(idloss)); dchars += tsent.size() - 1; } if (dloss < best) { diff --git a/src/attentional.h b/src/attentional.h index 50497eb..6e23924 100644 --- a/src/attentional.h +++ b/src/attentional.h @@ -1,14 +1,14 @@ #pragma once -#include "cnn/nodes.h" -#include "cnn/cnn.h" -#include "cnn/training.h" -#include "cnn/timing.h" -#include "cnn/rnn.h" -#include "cnn/gru.h" -#include "cnn/lstm.h" -#include "cnn/dict.h" -#include "cnn/expr.h" +#include "dynet/nodes.h" +#include "dynet/dynet.h" +#include "dynet/training.h" +#include "dynet/timing.h" +#include "dynet/rnn.h" +#include "dynet/gru.h" +#include "dynet/lstm.h" +#include "dynet/dict.h" +#include "dynet/expr.h" #include "expr-xtra.h" #include @@ -21,7 +21,7 @@ #define RNN_H0_IS_ZERO -namespace cnn { +namespace dynet { template struct AttentionalModel { @@ -51,12 +51,12 @@ struct AttentionalModel { Dict &tdict, const std::vector* ctx=0); std::vector beam_decode(const std::vector &source, ComputationGraph& cg, - int beam_width, Dict &tdict, const std::vector* ctx=0); + unsigned beam_width, Dict &tdict, const std::vector* ctx=0); std::vector sample(const std::vector &source, ComputationGraph& cg, Dict &tdict, const std::vector* ctx=0); - void add_fertility_params(cnn::Model* model, unsigned hidden_dim, bool _rnn_src_embeddings); + void add_fertility_params(dynet::Model* model, unsigned hidden_dim, bool _rnn_src_embeddings); LookupParameter p_cs; LookupParameter p_ct; @@ -125,7 +125,7 @@ struct AttentionalModel { KTHXBYE(expression) template -AttentionalModel::AttentionalModel(cnn::Model* model, +AttentionalModel::AttentionalModel(dynet::Model* model, unsigned vocab_size_src, unsigned _vocab_size_tgt, unsigned layers, unsigned hidden_dim, unsigned align_dim, bool _rnn_src_embeddings, bool _giza_positional, bool _giza_markov, bool _giza_fertility, bool _doc_context, @@ -195,7 +195,7 @@ AttentionalModel::AttentionalModel(cnn::Model* model, } template -void AttentionalModel::add_fertility_params(cnn::Model* model, unsigned hidden_dim, bool _rnn_src_embeddings) +void AttentionalModel::add_fertility_params(dynet::Model* model, unsigned hidden_dim, bool _rnn_src_embeddings) { if (_rnn_src_embeddings) { p_Wfhid = model->add_parameters({hidden_dim, 2*hidden_dim}); @@ -495,8 +495,8 @@ AttentionalModel::display_ascii(const std::vector &source, const s // display the alignment //float I = target.size() - 1; //float J = source.size() - 1; - float I = target.size(); - float J = source.size(); + unsigned I = target.size(); + unsigned J = source.size(); //vector symbols{"\u2588","\u2589","\u258A","\u258B","\u258C","\u258D","\u258E","\u258F"}; vector symbols{".","o","*","O","@"}; int num_symbols = symbols.size(); @@ -516,18 +516,18 @@ AttentionalModel::display_ascii(const std::vector &source, const s cout.setf(ios_base::adjustfield, ios_base::left); cout << setw(12) << "source" << " "; cout.setf(ios_base::adjustfield, ios_base::right); - for (int j = 0; j < J; ++j) + for (unsigned j = 0; j < J; ++j) cout << setw(2) << j << ' '; cout << endl; - for (int i = 0; i < I; ++i) { + for (unsigned i = 0; i < I; ++i) { cout.setf(ios_base::adjustfield, ios_base::left); //cout << setw(12) << td.convert(target[i+1]) << " "; cout << setw(12) << td.convert(target[i]) << " "; cout.setf(ios_base::adjustfield, ios_base::right); float max_v = 0; int max_j = -1; - for (int j = 0; j < J; ++j) { + for (unsigned j = 0; j < J; ++j) { float v = TensorTools::AccessElement(a, Dim({(unsigned int)j, (unsigned int)i})); string symbol; for (int s = 0; s <= num_symbols; ++s) { @@ -547,7 +547,7 @@ AttentionalModel::display_ascii(const std::vector &source, const s cout << setw(20) << "max Pr=" << setprecision(3) << setw(5) << max_v << " @ " << max_j << endl; } cout << resetiosflags(ios_base::adjustfield); - for (int j = 0; j < J; ++j) + for (unsigned j = 0; j < J; ++j) cout << j << ":" << sd.convert(source[j]) << ' '; cout << endl; } @@ -560,21 +560,21 @@ AttentionalModel::display_tikz(const std::vector &source, const st using namespace std; // display the alignment - float I = target.size(); - float J = source.size(); + unsigned I = target.size(); + unsigned J = source.size(); const Tensor &a = cg.get_value(alignment.i); cout << a.d[0] << " x " << a.d[1] << endl; cout << "\\begin{tikzpicture}[scale=0.5]\n"; - for (int j = 0; j < J; ++j) + for (unsigned j = 0; j < J; ++j) cout << "\\node[anchor=west,rotate=90] at (" << j+0.5 << ", " << I+0.2 << ") { " << sd.convert(source[j]) << " };\n"; - for (int i = 0; i < I; ++i) + for (unsigned i = 0; i < I; ++i) cout << "\\node[anchor=west] at (" << J+0.2 << ", " << I-i-0.5 << ") { " << td.convert(target[i]) << " };\n"; float eps = 0.01; - for (int i = 0; i < I; ++i) { - for (int j = 0; j < J; ++j) { + for (unsigned i = 0; i < I; ++i) { + for (unsigned j = 0; j < J; ++j) { float v = TensorTools::AccessElement(a, Dim({(unsigned int)j, (unsigned int)i})); //int val = int(pow(v, 0.5) * 100); int val = int(v * 100); @@ -589,7 +589,7 @@ AttentionalModel::display_tikz(const std::vector &source, const st template std::vector AttentionalModel::greedy_decode(const std::vector &source, ComputationGraph& cg, - cnn::Dict &tdict, const std::vector* ctx) + dynet::Dict &tdict, const std::vector* ctx) { const int sos_sym = tdict.convert(""); const int eos_sym = tdict.convert(""); @@ -598,7 +598,7 @@ AttentionalModel::greedy_decode(const std::vector &source, Computa target.push_back(sos_sym); //std::cerr << tdict.convert(target.back()); - int t = 0; + unsigned t = 0; start_new_instance(source, cg, ctx); while (target.back() != eos_sym) { @@ -607,7 +607,7 @@ AttentionalModel::greedy_decode(const std::vector &source, Computa // find the argmax next word (greedy) unsigned w = 0; - auto dist = as_vector(cg.incremental_forward()); // evaluates last expression, i.e., ydist + auto dist = as_vector(cg.incremental_forward(ydist)); auto pr_w = dist[w]; for (unsigned x = 1; x < dist.size(); ++x) { if (dist[x] > pr_w) { @@ -647,11 +647,11 @@ struct Hypothesis { template std::vector -AttentionalModel::beam_decode(const std::vector &source, ComputationGraph& cg, int beam_width, - cnn::Dict &tdict, const std::vector* ctx) +AttentionalModel::beam_decode(const std::vector &source, ComputationGraph& cg, + unsigned beam_width, dynet::Dict &tdict, const std::vector* ctx) { - const int sos_sym = tdict.convert(""); - const int eos_sym = tdict.convert(""); + const unsigned sos_sym = tdict.convert(""); + const unsigned eos_sym = tdict.convert(""); start_new_instance(source, cg, ctx); @@ -661,7 +661,7 @@ AttentionalModel::beam_decode(const std::vector &source, Computati std::vector vocab(boost::copy_range>(boost::irange(0u, vocab_size_tgt))); std::vector completed; - for (int steps = 0; completed.size() < beam_width && steps < 2*source.size(); ++steps) { + for (unsigned steps = 0; completed.size() < beam_width && steps < 2*source.size(); ++steps) { std::vector new_chart; for (auto &hprev: chart) { @@ -672,8 +672,7 @@ AttentionalModel::beam_decode(const std::vector &source, Computati Expression ydist = softmax(i_scores); // compiler warning, but see below // find the top k best next words - unsigned w = 0; - auto dist = as_vector(cg.incremental_forward()); // evaluates last expression, i.e., ydist + auto dist = as_vector(cg.incremental_forward(ydist)); std::partial_sort(vocab.begin(), vocab.begin()+beam_width, vocab.end(), [&dist](unsigned v1, unsigned v2) { return dist[v1] > dist[v2]; }); @@ -709,7 +708,7 @@ AttentionalModel::beam_decode(const std::vector &source, Computati template std::vector -AttentionalModel::sample(const std::vector &source, ComputationGraph& cg, cnn::Dict &tdict, +AttentionalModel::sample(const std::vector &source, ComputationGraph& cg, dynet::Dict &tdict, const std::vector *ctx) { const int sos_sym = tdict.convert(""); @@ -727,7 +726,7 @@ AttentionalModel::sample(const std::vector &source, ComputationGra Expression ydist = softmax(i_scores); // in rnnlm.cc there's a loop around this block -- why? can incremental_forward fail? - auto dist = as_vector(cg.incremental_forward()); + auto dist = as_vector(cg.incremental_forward(ydist)); double p = rand01(); unsigned w = 0; for (; w < dist.size(); ++w) { @@ -759,11 +758,11 @@ AttentionalModel::display_fertility(const std::vector &source, Dic Expression vbias = concatenate(std::vector(slen, parameter(cg, p_bfvar))); Expression fhid = tanh(transpose(fbias + parameter(cg, p_Wfhid) * src)); Expression mu = mbias + fhid * parameter(cg, p_Wfmu); - auto mu_vec = as_vector(cg.incremental_forward()); // evaluates last expression + auto mu_vec = as_vector(cg.incremental_forward(mu)); Expression var = exp(vbias + fhid * parameter(cg, p_Wfvar)); - auto var_vec = as_vector(cg.incremental_forward()); // evaluates last expression + auto var_vec = as_vector(cg.incremental_forward(var)); - for (int j = 1; j < slen-1; ++j) + for (unsigned j = 1; j < slen-1; ++j) std::cout << sd.convert(source[j]) << '\t' << mu_vec[j] << '\t' << var_vec[j] << '\n'; } @@ -776,9 +775,9 @@ AttentionalModel::display_empirical_fertility(const std::vector &s BuildGraph(source, target, cg, &alignment); Expression totals = sum_cols(alignment); - auto totals_vec = as_vector(cg.incremental_forward()); // evaluates last expression + auto totals_vec = as_vector(cg.incremental_forward(totals)); - for (int j = 0; j < slen; ++j) + for (unsigned j = 0; j < slen; ++j) std::cout << sd.convert(source[j]) << '\t' << totals_vec[j] << '\n'; } diff --git a/src/biattentional.cc b/src/biattentional.cc index 7ccae64..b4dc898 100644 --- a/src/biattentional.cc +++ b/src/biattentional.cc @@ -10,7 +10,7 @@ #include using namespace std; -using namespace cnn; +using namespace dynet; using namespace boost::program_options; unsigned LAYERS = 2; @@ -25,8 +25,8 @@ bool GIZA_F = true; bool DOC = false; bool FERT = false; -cnn::Dict sd; -cnn::Dict td; +dynet::Dict sd; +dynet::Dict td; int kSRC_SOS; int kSRC_EOS; int kTGT_SOS; @@ -53,7 +53,7 @@ struct BidirAttentionalModel { m_trace_weight = trace_weight; } - void add_fertility_params(cnn::Model* model) + void add_fertility_params(dynet::Model* model) { s2t_model.add_fertility_params(model, HIDDEN_DIM, BIDIR); t2s_model.add_fertility_params(model, HIDDEN_DIM, BIDIR); @@ -107,12 +107,12 @@ struct BidirAttentionalModel { assert(lparams.size() == 2*sm.lookup_parameters_list().size()); for (const auto &p : sm.lookup_parameters_list()) { for (unsigned i = 0; i < p->values.size(); ++i) - memcpy(lparams[lid]->values[i].v, &p->values[i].v[0], sizeof(cnn::real) * p->values[i].d.size()); + memcpy(lparams[lid]->values[i].v, &p->values[i].v[0], sizeof(dynet::real) * p->values[i].d.size()); lid++; } for (const auto &p : tm.lookup_parameters_list()) { for (unsigned i = 0; i < p->values.size(); ++i) - memcpy(lparams[lid]->values[i].v, &p->values[i].v[0], sizeof(cnn::real) * p->values[i].d.size()); + memcpy(lparams[lid]->values[i].v, &p->values[i].v[0], sizeof(dynet::real) * p->values[i].d.size()); lid++; } assert(lid == lparams.size()); @@ -120,9 +120,9 @@ struct BidirAttentionalModel { unsigned did = 0; auto &dparams = model.parameters_list(); for (const auto &p : sm.parameters_list()) - memcpy(dparams[did++]->values.v, &p->values.v[0], sizeof(cnn::real) * p->values.d.size()); + memcpy(dparams[did++]->values.v, &p->values.v[0], sizeof(dynet::real) * p->values.d.size()); for (const auto &p : tm.parameters_list()) - memcpy(dparams[did++]->values.v, &p->values.v[0], sizeof(cnn::real) * p->values.d.size()); + memcpy(dparams[did++]->values.v, &p->values.v[0], sizeof(dynet::real) * p->values.d.size()); assert(did == dparams.size()); } }; @@ -157,7 +157,7 @@ Corpus read_corpus(const string &filename) } int main(int argc, char** argv) { - cnn::initialize(argc, argv); + dynet::initialize(argc, argv); // command line processing variables_map vm; @@ -288,8 +288,8 @@ int main(int argc, char** argv) { unsigned i = 0; for (auto& spair : dev) { ComputationGraph cg; - am.build_graph(spair.first, spair.second, cg); - dloss += as_scalar(cg.incremental_forward()); + auto idloss = am.build_graph(spair.first, spair.second, cg); + dloss += as_scalar(cg.incremental_forward(idloss)); dloss_s2t += as_scalar(cg.get_value(am.s2t_xent.i)); dloss_t2s += as_scalar(cg.get_value(am.t2s_xent.i)); dloss_trace += as_scalar(cg.get_value(am.trace_bonus.i)); @@ -324,14 +324,14 @@ int main(int argc, char** argv) { unsigned dchars_s = 0, dchars_t = 0, dchars_tt = 0; for (unsigned i = 0; i < testing.size(); ++i) { ComputationGraph cg; - am.build_graph(testing[i].first, testing[i].second, cg); + auto idloss = am.build_graph(testing[i].first, testing[i].second, cg); dchars_s += testing[i].first.size() - 1; dchars_t += testing[i].second.size() - 1; dchars_tt += std::max(testing[i].first.size(), testing[i].second.size()) - 1; // max or min? //cg.forward(); - dloss += as_scalar(cg.forward()); + dloss += as_scalar(cg.forward(idloss)); double loss_s2t = as_scalar(cg.get_value(am.s2t_xent.i)); double loss_t2s = as_scalar(cg.get_value(am.t2s_xent.i)); @@ -388,12 +388,12 @@ int main(int argc, char** argv) { chars_t += spair.second.size() - 1; chars_tt += std::max(spair.first.size(), spair.second.size()) - 1; // max or min? ++si; - am.build_graph(spair.first, spair.second, cg); - loss += as_scalar(cg.forward()); + auto iloss = am.build_graph(spair.first, spair.second, cg); + loss += as_scalar(cg.forward(iloss)); loss_s2t += as_scalar(cg.get_value(am.s2t_xent.i)); loss_t2s += as_scalar(cg.get_value(am.t2s_xent.i)); loss_trace += as_scalar(cg.get_value(am.trace_bonus.i)); - cg.backward(); + cg.backward(iloss); sgd.update(1.0f); ++lines; @@ -421,8 +421,8 @@ int main(int argc, char** argv) { unsigned dchars_s = 0, dchars_t = 0, dchars_tt = 0; for (auto& spair : dev) { ComputationGraph cg; - am.build_graph(spair.first, spair.second, cg); - dloss += as_scalar(cg.incremental_forward()); + auto idloss = am.build_graph(spair.first, spair.second, cg); + dloss += as_scalar(cg.incremental_forward(idloss)); dloss_s2t += as_scalar(cg.get_value(am.s2t_xent.i)); dloss_t2s += as_scalar(cg.get_value(am.t2s_xent.i)); dloss_trace += as_scalar(cg.get_value(am.trace_bonus.i)); diff --git a/src/expr-xtra.h b/src/expr-xtra.h index be32269..2dcb585 100644 --- a/src/expr-xtra.h +++ b/src/expr-xtra.h @@ -1,16 +1,16 @@ #pragma once -#include "cnn/nodes.h" -#include "cnn/cnn.h" -#include "cnn/training.h" -#include "cnn/timing.h" -#include "cnn/rnn.h" -#include "cnn/gru.h" -#include "cnn/lstm.h" -#include "cnn/dict.h" -#include "cnn/expr.h" +#include "dynet/nodes.h" +#include "dynet/dynet.h" +#include "dynet/training.h" +#include "dynet/timing.h" +#include "dynet/rnn.h" +#include "dynet/gru.h" +#include "dynet/lstm.h" +#include "dynet/dict.h" +#include "dynet/expr.h" -using namespace cnn; +using namespace dynet; // Chris -- this should be a library function Expression arange(ComputationGraph &cg, unsigned begin, unsigned end, bool log_transform, std::vector *aux_mem) From 128f45b29f380dd22b6972549c6026501622ebe5 Mon Sep 17 00:00:00 2001 From: Trevor Cohn Date: Tue, 1 Nov 2016 17:08:02 +1100 Subject: [PATCH 2/5] mkl build compatibility --- CMakeLists.txt | 61 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ca3813..1f68fde 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,27 +3,60 @@ cmake_minimum_required(VERSION 2.8 FATAL_ERROR) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/dynet/cmake) -# CNN uses Eigen which exploits modern CPU architectures. To get the -# best possible performance, the following are recommended: -# 1. use very recent versions of gcc or Clang to build -# 2. use very recent versions of Eigen (ideally the dev version) -# 3. try compiler options like -march=native or other architecture -# flags (the compiler does not always make the best configuration -# decisions without help) - -# Cross-compiler, cross-platform options +function(find_mkl) + set(MKL_ARCH intel64) + find_path(MKL_INCLUDE_DIR mkl.h + PATHS ${MKL_ROOT} ${MKL_ROOT}/include) + find_library(MKL_CORE_LIB NAMES mkl_intel_lp64 mkl_intel_thread mkl_core + PATHS ${MKL_ROOT} ${MKL_ROOT}/lib/${MKL_ARCH} + DOC "MKL core library path") + + find_library(MKL_COMPILER_LIB NAMES iomp5 libiomp5md + PATHS ${MKL_ROOT} ${MKL_ROOT}/../compiler/lib/${MKL_ARCH} #Windows + ${MKL_ROOT}/../compilers_and_libraries/linux/lib/${MKL_ARCH}_lin #Linux + DOC "MKL compiler lib (for threaded MKL)") + + if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_COMPILER_LIB) + get_filename_component(MKL_CORE_LIB_DIR ${MKL_CORE_LIB} DIRECTORY) + get_filename_component(MKL_COMPILER_LIB_DIR ${MKL_COMPILER_LIB} DIRECTORY) + get_filename_component(MKL_COMPILER_LIB_FILE ${MKL_COMPILER_LIB} NAME) + message(STATUS "Found MKL\n * include: ${MKL_INCLUDE_DIR},\n * core library dir: ${MKL_CORE_LIB_DIR},\n * compiler library: ${MKL_COMPILER_LIB}") + + # Due to a conflict with /MT and /MD, MSVC needs mkl_intel_lp64 linked last, or we can change individual + # projects to use /MT (mkl_intel_lp64 linked with /MT, default MSVC projects use /MD), or we can instead + # link to the DLL versions. For now I'm opting for this solution which seems to work with projects still + # at their default /MD. Linux build requires the mkl_intel_lp64 to be linked first. So...: + if(MSVC) + set(LIBS ${LIBS} mkl_intel_thread mkl_core mkl_intel_lp64 ${MKL_COMPILER_LIB_FILE} PARENT_SCOPE) + else() + set(LIBS ${LIBS} mkl_intel_lp64 mkl_intel_thread mkl_core ${MKL_COMPILER_LIB_FILE} PARENT_SCOPE) + endif() + include_directories(${MKL_INCLUDE_DIR}) + link_directories(${MKL_CORE_LIB_DIR} ${MKL_COMPILER_LIB_DIR}) + set(MKL_LINK_DIRS ${MKL_CORE_LIB_DIR} ${MKL_COMPILER_LIB_DIR} PARENT_SCOPE) # Keeping this for python build + else() + message(FATAL_ERROR "Failed to find MKL in path: ${MKL_ROOT} (Did you set MKL_ROOT properly?)") + endif() +endfunction() + +######## Cross-compiler, cross-platform options set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_FAST_MATH") +if (MKL OR MKL_ROOT) + find_mkl() # sets include/lib directories and sets ${LIBS} needed for linking + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_USE_MKL_ALL") +endif() + -# Platform-specific options +######## Platform-specific options if(WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX") # Disable min/max macros in windef.h + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX") # Disable min/max macros in windef.h endif() -# Compiler-specific options +######## Compiler-specific options if(MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W1 -DEIGEN_HAS_C99_MATH /MP") # -Wall produces 20k warnings + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W1 -DEIGEN_HAS_C99_MATH /MP") # -Wall produces 20k warnings else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -funroll-loops -Wall -std=c++11 -Ofast -g -march=native") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -funroll-loops -Wall -std=c++11 -Ofast -g -march=native") endif() enable_testing() From 1415bdd81c9b4aa8646a95ab20ae3381fcf05916 Mon Sep 17 00:00:00 2001 From: Cong Duy Vu Hoang Date: Thu, 22 Dec 2016 17:57:18 +1100 Subject: [PATCH 3/5] update latest dynet --- src/attentional.cc | 36 +++++++++++++++++++++++++----------- src/attentional.h | 10 +++++----- src/biattentional.cc | 4 ++-- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/src/attentional.cc b/src/attentional.cc index 4d5ffcd..0580b8a 100644 --- a/src/attentional.cc +++ b/src/attentional.cc @@ -65,6 +65,10 @@ int main(int argc, char** argv) { ("layers,l", value()->default_value(LAYERS), "use layers for RNN components") ("align,a", value()->default_value(ALIGN_DIM), "use dimensions for alignment projection") ("hidden,h", value()->default_value(HIDDEN_DIM), "use dimensions for recurrent hidden states") + ("sgd_trainer", value()->default_value(0), "use specific SGD trainer (0: vanilla SGD; 1: momentum SGD; 2: Adagrad; 3: AdaDelta; 4: Adam)") + ("lr_eta", value()->default_value(0.01f), "SGD learning rate value (e.g., 0.01 for simple SGD trainer)") + ("lr_eta_decay", value()->default_value(2.0f), "SGD learning rate decay value") + ("sparse_updates", value()->default_value(true), "enable/disable sparse update(s) for lookup parameter(s)") ("topk,k", value()->default_value(100), "use top kbest entries, used with --kbest") ("epochs,e", value()->default_value(50), "maximum number of training epochs") ("gru", "use Gated Recurrent Unit (GRU) for recurrent structure; default RNN") @@ -202,15 +206,25 @@ int main_body(variables_map vm) cerr << "Parameters will be written to: " << fname << endl; - Model model; - //bool use_momentum = false; - Trainer* sgd = nullptr; - //if (use_momentum) - //sgd = new MomentumSGDTrainer(&model); - //else - sgd = new SimpleSGDTrainer(&model); - sgd->eta = 0.01f; - //sgd = new AdadeltaTrainer(&model); + Model model; + Trainer* sgd = nullptr; + unsigned sgd_type = vm["sgd_trainer"].as(); + if (sgd_type == 1) + sgd = new MomentumSGDTrainer(model, vm["lr_eta"].as()); + else if (sgd_type == 2) + sgd = new AdagradTrainer(model, vm["lr_eta"].as()); + else if (sgd_type == 3) + sgd = new AdadeltaTrainer(model); + else if (sgd_type == 4) + sgd = new AdamTrainer(model, vm["lr_eta"].as()); + else if (sgd_type == 0)//Vanilla SGD trainer + sgd = new SimpleSGDTrainer(model, vm["lr_eta"].as()); + else + assert("Unknown SGD trainer type! (0: vanilla SGD; 1: momentum SGD; 2: Adagrad; 3: AdaDelta; 4: Adam)"); + sgd->eta_decay = vm["lr_eta_decay"].as(); + sgd->sparse_updates_enabled = vm["sparse_updates"].as(); + if (!sgd->sparse_updates_enabled) + cerr << "Sparse updates for lookup parameter(s) to be disabled!" << endl; cerr << "%% Using " << flavour << " recurrent units" << endl; AttentionalModel am(&model, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, @@ -300,7 +314,7 @@ void test_decode(Model &model, AM_t &am, string test_file, bool doco, unsigned b if (doco) source = read_numbered_sentence(line, &sd, num); else - source = read_sentence(line, &sd); + source = read_sentence(line, sd); if (source.front() != kSRC_SOS && source.back() != kSRC_EOS) { cerr << "Sentence in " << test_file << ":" << lno << " didn't start or end with , \n"; @@ -655,7 +669,7 @@ Corpus read_corpus(const string &filename, bool doco) if (doco) read_numbered_sentence_pair(line, &source, &sd, &target, &td, identifiers); else - read_sentence_pair(line, &source, &sd, &target, &td); + read_sentence_pair(line, source, sd, target, td); corpus.push_back(SentencePair(source, target, identifiers[0])); stoks += source.size(); ttoks += target.size(); diff --git a/src/attentional.h b/src/attentional.h index 6e23924..41d7e35 100644 --- a/src/attentional.h +++ b/src/attentional.h @@ -130,9 +130,9 @@ AttentionalModel::AttentionalModel(dynet::Model* model, unsigned align_dim, bool _rnn_src_embeddings, bool _giza_positional, bool _giza_markov, bool _giza_fertility, bool _doc_context, bool _global_fertility) -: builder(layers, (_rnn_src_embeddings) ? 3*hidden_dim : 2*hidden_dim, hidden_dim, model), - builder_src_fwd(1, hidden_dim, hidden_dim, model), - builder_src_bwd(1, hidden_dim, hidden_dim, model), +: builder(layers, (_rnn_src_embeddings) ? 3*hidden_dim : 2*hidden_dim, hidden_dim, *model), + builder_src_fwd(1, hidden_dim, hidden_dim, *model), + builder_src_bwd(1, hidden_dim, hidden_dim, *model), rnn_src_embeddings(_rnn_src_embeddings), giza_positional(_giza_positional), giza_markov(_giza_markov), giza_fertility(_giza_fertility), doc_context(_doc_context), @@ -460,13 +460,13 @@ Expression AttentionalModel::BuildGraph(const std::vector &source, /* log-Normal distribution */ Expression log_fert = log(i_total_trim); Expression delta = log_fert - mu_trim; - Expression exponent = cdiv(-cwise_multiply(delta, delta), 2.0f * var_trim); + Expression exponent = cdiv(-cmult(delta, delta), 2.0f * var_trim); Expression partition = -log_fert - 0.5 * log(2.0f * var_trim * 3.14159265359); *fertility = -sum_cols(transpose(partition + exponent)); #else /* Normal distribution */ Expression delta = i_total_trim - mu_trim; - Expression exponent = cdiv(-cwise_multiply(delta, delta), 2.0f * var_trim); + Expression exponent = cdiv(-cmult(delta, delta), 2.0f * var_trim); Expression partition = -0.5 * log(2.0f * var_trim * 3.14159265359); *fertility = -sum_cols(transpose(partition + exponent)); // note that as this is the value of the normal density, the errors diff --git a/src/biattentional.cc b/src/biattentional.cc index b4dc898..48b8360 100644 --- a/src/biattentional.cc +++ b/src/biattentional.cc @@ -141,7 +141,7 @@ Corpus read_corpus(const string &filename) while(getline(in, line)) { ++lc; Sentence source, target; - read_sentence_pair(line, &source, &sd, &target, &td); + read_sentence_pair(line, source, sd, target, td); corpus.push_back(SentencePair(source, target)); stoks += source.size(); ttoks += target.size(); @@ -253,7 +253,7 @@ int main(int argc, char** argv) { double best = 9e+99; Model model; - SimpleSGDTrainer sgd(&model); + SimpleSGDTrainer sgd(model); BidirAttentionalModel am(&model, 0.1); bool add_fer = false; From 562fa994f8fbbee70208134e51e01a00a6ac7e96 Mon Sep 17 00:00:00 2001 From: Cong Duy Vu Hoang Date: Thu, 22 Dec 2016 20:56:10 +1100 Subject: [PATCH 4/5] dynet --- dynet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dynet b/dynet index 8904224..b9308e4 160000 --- a/dynet +++ b/dynet @@ -1 +1 @@ -Subproject commit 8904224f7c5788246035e78fb4abe5b7df6aeba3 +Subproject commit b9308e4af080f5b6eaa15db3b20f6578c52d54e4 From f54f41344843f9ca5c7c6effa0fe77cd597584af Mon Sep 17 00:00:00 2001 From: Cong Duy Vu Hoang Date: Tue, 17 Jan 2017 11:17:35 +1100 Subject: [PATCH 5/5] fixed bug for beam search decoding --- src/attentional.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/attentional.h b/src/attentional.h index 41d7e35..8d71a6a 100644 --- a/src/attentional.h +++ b/src/attentional.h @@ -329,9 +329,14 @@ Expression AttentionalModel::add_input(int trg_tok, int t, ComputationG // alignment input Expression i_wah_rep; if (t > 0) { - //auto i_h_tm1 = builder.final_h().back(); - auto i_h_tm1 = concatenate(builder.final_h()); + Expression i_h_tm1; + if (prev_state) + i_h_tm1 = concatenate(builder.get_h(*prev_state));// This is required for beam search decoding implementation. + else + i_h_tm1 = concatenate(builder.final_h()); + Expression i_wah = i_Wa * i_h_tm1; + // want numpy style broadcasting, but have to do this manually i_wah_rep = concatenate_cols(std::vector(slen, i_wah)); }