From c263461b6958c15fb8fd04baba71e77ae986407b Mon Sep 17 00:00:00 2001
From: Trevor Cohn <tcohn@unimelb.edu.au>
Date: Tue, 1 Nov 2016 16:25:05 +1100
Subject: [PATCH 1/5] Transitioned to dynet and fixed various warnings

---
 .gitmodules          |  6 ++--
 CMakeLists.txt       |  8 ++---
 cnn                  |  1 -
 dynet                |  1 +
 src/CMakeLists.txt   | 21 +++++++++--
 src/attentional.cc   | 48 ++++++++++++-------------
 src/attentional.h    | 85 ++++++++++++++++++++++----------------------
 src/biattentional.cc | 36 +++++++++----------
 src/expr-xtra.h      | 20 +++++------
 9 files changed, 118 insertions(+), 108 deletions(-)
 delete mode 160000 cnn
 create mode 160000 dynet

diff --git a/.gitmodules b/.gitmodules
index 17ad0b8..e21e53f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "cnn"]
-	path = cnn
-	url = https://github.com/clab/cnn.git
+[submodule "dynet"]
+	path = dynet
+	url = https://github.com/clab/dynet.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 226dd53..7ca3813 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 project(mantis)
 cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
 
-set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cnn/cmake)
+set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/dynet/cmake)
 
 # CNN uses Eigen which exploits modern CPU architectures. To get the
 # best possible performance, the following are recommended:
@@ -29,7 +29,7 @@ endif()
 enable_testing()
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}
-                    ${PROJECT_SOURCE_DIR}/cnn)
+                    ${PROJECT_SOURCE_DIR}/dynet)
 
 function(find_cudnn)
   set(CUDNN_ROOT "" CACHE PATH "CUDNN root path")
@@ -111,9 +111,9 @@ include_directories(${EIGEN3_INCLUDE_DIR})
 FIND_PACKAGE(Threads REQUIRED)
 set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT})
 
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cnn/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dynet/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
-add_subdirectory(cnn/cnn)
+add_subdirectory(dynet/dynet)
 add_subdirectory(src)
 enable_testing()
diff --git a/cnn b/cnn
deleted file mode 160000
index ec75eb8..0000000
--- a/cnn
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ec75eb85932f964e5c3ad79515b4b2c45c84f9a3
diff --git a/dynet b/dynet
new file mode 160000
index 0000000..8904224
--- /dev/null
+++ b/dynet
@@ -0,0 +1 @@
+Subproject commit 8904224f7c5788246035e78fb4abe5b7df6aeba3
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 42bb893..2a0df7a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,14 +2,29 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 
 foreach(TARGET attentional biattentional)
   ADD_EXECUTABLE(${TARGET} ${TARGET}.cc)
-  target_link_libraries(${TARGET} cnn ${LIBS})
   if(UNIX AND NOT APPLE)
     target_link_libraries(${TARGET} rt)
   endif()
   if (WITH_CUDA_BACKEND)
-    add_dependencies(${TARGET} cnncuda)
-    target_link_libraries(${TARGET} cnncuda)
+	  set(CUDA_SEPARABLE_COMPILATION ON)
+	  list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_52,code=sm_52;-gencode;arch=compute_52,code=compute_52;-std=c++11;-DVERBOSE;-DEIGEN_USE_GPU;-DHAVE_CUDA")
+	  if(CMAKE_COMPILER_IS_GNUCXX)
+	    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
+	      # gcc 4.9 or later versions raise SEGV due to the optimization problem.
+	      # Use -O1 instead for now.
+	      list(APPEND CUDA_NVCC_FLAGS "-O1")
+	    else()
+	      list(APPEND CUDA_NVCC_FLAGS "-O2")
+	    endif()
+	  else()
+	    list(APPEND CUDA_NVCC_FLAGS "-O2")
+	  endif()
+	  add_dependencies(${TARGET} gdynet dynetcuda)
+	  target_link_libraries(${TARGET} gdynet dynetcuda ${LIBS})  # keep ${LIBS} on the CUDA path too, as the removed unconditional link line did
     CUDA_ADD_CUBLAS_TO_TARGET(${TARGET})
+  else()
+	  add_dependencies(${TARGET} dynet)
+	  target_link_libraries(${TARGET} dynet ${LIBS})
   endif (WITH_CUDA_BACKEND)
 endforeach()
 
diff --git a/src/attentional.cc b/src/attentional.cc
index e76c7f0..4d5ffcd 100644
--- a/src/attentional.cc
+++ b/src/attentional.cc
@@ -10,7 +10,7 @@
 #include <boost/program_options/variables_map.hpp>
 
 using namespace std;
-using namespace cnn;
+using namespace dynet;
 using namespace boost::program_options;
 
 unsigned LAYERS = 1; // 2
@@ -19,8 +19,8 @@ unsigned ALIGN_DIM = 32;   // 128
 unsigned SRC_VOCAB_SIZE = 0;
 unsigned TGT_VOCAB_SIZE = 0;
 
-cnn::Dict sd;
-cnn::Dict td;
+dynet::Dict sd;
+dynet::Dict td;
 int kSRC_SOS;
 int kSRC_EOS;
 int kTGT_SOS;
@@ -45,7 +45,7 @@ template <class rnn_t>
 int main_body(variables_map vm);
 
 int main(int argc, char** argv) {
-    cnn::initialize(argc, argv);
+    dynet::initialize(argc, argv);
 
     // command line processing
     variables_map vm; 
@@ -112,8 +112,8 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
         bool doco, float coverage, bool display, bool fert);
 
 template <class AM_t> void test_rescore(Model &model, AM_t &am, Corpus &testing, bool doco);
-template <class AM_t> void test_decode(Model &model, AM_t &am, std::string test_file, bool doco, int beam);
-template <class AM_t> void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k);
+template <class AM_t> void test_decode(Model &model, AM_t &am, std::string test_file, bool doco, unsigned beam);
+template <class AM_t> void test_kbest_arcs(Model &model, AM_t &am, string test_file, unsigned top_k);
 template <class AM_t> void fert_stats(Model &model, AM_t &am, Corpus &devel, bool global_fert);
 
 const Sentence* context(const Corpus &corpus, unsigned i);
@@ -146,8 +146,6 @@ int main_body(variables_map vm)
     if (vm.count("lstm"))	flavour = "LSTM";
     else if (vm.count("gru"))	flavour = "GRU";
 
-    typedef vector<int> Sentence;
-    typedef pair<Sentence, Sentence> SentencePair;
     Corpus training, devel, testing;
     string line;
     cerr << "Reading training data from " << vm["train"].as<string>() << "...\n";
@@ -234,19 +232,19 @@ int main_body(variables_map vm)
                 vm["epochs"].as<int>(), doco, vm["coverage"].as<float>(), vm.count("display"),
                 fert);
     else if (vm.count("kbest"))
-    	test_kbest_arcs(model, am, vm["kbest"].as<string>(), vm["topk"].as<int>());
+    	test_kbest_arcs(model, am, vm["kbest"].as<string>(), static_cast<unsigned>(vm["topk"].as<int>())); // "topk" is stored as int; as<unsigned>() would throw boost::bad_any_cast
     else if (vm.count("test")) {
         if (vm.count("rescore"))
             test_rescore(model, am, testing, doco);
         else // test
-            test_decode(model, am, vm["test"].as<string>(), doco, vm["beam"].as<int>());
+            test_decode(model, am, vm["test"].as<string>(), doco, static_cast<unsigned>(vm["beam"].as<int>())); // read with the stored type (int), then convert for the unsigned parameter
     }
     else if (vm.count("fert-stats"))
         fert_stats(model, am, devel, vm.count("fertility"));
 
     delete sgd;
 
-    //cnn::Free();
+    //dynet::Free();
 
     return EXIT_SUCCESS;
 }
@@ -264,9 +262,9 @@ void test_rescore(Model &model, AM_t &am, Corpus &testing, bool doco)
         tie(ssent, tsent, docid) = testing[i];
 
 	ComputationGraph cg;
-        am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(testing, i) : nullptr);
+        auto iloss = am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(testing, i) : nullptr);
 
-	double loss = as_scalar(cg.forward());
+	double loss = as_scalar(cg.forward(iloss));
         cout << i << " |||";
 	for (auto &w: ssent)
 	    cout << " " << sd.convert(w);
@@ -286,10 +284,8 @@ void test_rescore(Model &model, AM_t &am, Corpus &testing, bool doco)
 }
 
 template <class AM_t>
-void test_decode(Model &model, AM_t &am, string test_file, bool doco, int beam)
+void test_decode(Model &model, AM_t &am, string test_file, bool doco, unsigned beam)
 {
-    double tloss = 0;
-    int tchars = 0;
     int lno = 0;
 
     cerr << "Reading test examples from " << test_file << endl;
@@ -339,7 +335,7 @@ void test_decode(Model &model, AM_t &am, string test_file, bool doco, int beam)
 }
 
 template <class AM_t>
-void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k)
+void test_kbest_arcs(Model &model, AM_t &am, string test_file, unsigned top_k)
 {
     // only suitable for monolingual setting, of predicting a sentence given preceeding sentence
     cerr << "Reading test examples from " << test_file << endl;
@@ -369,7 +365,7 @@ void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k)
                     errs.push_back(i_err);
                 }
                 Expression i_nerr = sum(errs);
-                double loss = as_scalar(cg.incremental_forward());
+                double loss = as_scalar(cg.incremental_forward(i_nerr));
 
                 //cout << last_last_id << ":" << last_id << " |||";
                 //for (auto &w: source) cout << " " << sd.convert(w);
@@ -483,9 +479,9 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
     }
 
     bool first = true;
-    int report = 0;
+    unsigned report = 0;
     unsigned lines = 0;
-    int epoch = 0;
+    unsigned epoch = 0;
     Sentence ssent, tsent;
     int docid;
 
@@ -497,8 +493,8 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
             tie(ssent, tsent, docid) = devel[i];
             ComputationGraph cg;
             Expression alignment;
-            am.BuildGraph(ssent, tsent, cg, &alignment, (doco) ? context(devel, i) : nullptr);
-            cg.forward();
+            auto iloss = am.BuildGraph(ssent, tsent, cg, &alignment, (doco) ? context(devel, i) : nullptr);
+            cg.forward(iloss);
 
             cout << "\n====== SENTENCE " << i << " =========\n";
             am.display_ascii(ssent, tsent, cg, alignment, sd, td);
@@ -586,7 +582,7 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
                 objective = objective + fertility_nll;
 
             // perform forward computation for aggregate objective
-            cg.forward();
+            cg.forward(objective);
 
             // grab the parts of the objective
             loss += as_scalar(cg.get_value(xent.i));
@@ -595,7 +591,7 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
             if (fert) 
                 loss_fert += as_scalar(cg.get_value(fertility_nll.i));
 
-            cg.backward();
+            cg.backward(objective);
 	    sgd.update();
             ++lines;
 
@@ -627,8 +623,8 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
             for (unsigned i = 0; i < devel.size(); ++i) {
                 tie(ssent, tsent, docid) = devel[i];
                 ComputationGraph cg;
-                am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(devel, i) : nullptr, nullptr, nullptr);
-                dloss += as_scalar(cg.forward());
+                auto idloss = am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(devel, i) : nullptr, nullptr, nullptr);
+                dloss += as_scalar(cg.forward(idloss));
                 dchars += tsent.size() - 1;
             }
             if (dloss < best) {
diff --git a/src/attentional.h b/src/attentional.h
index 50497eb..6e23924 100644
--- a/src/attentional.h
+++ b/src/attentional.h
@@ -1,14 +1,14 @@
 #pragma once
 
-#include "cnn/nodes.h"
-#include "cnn/cnn.h"
-#include "cnn/training.h"
-#include "cnn/timing.h"
-#include "cnn/rnn.h"
-#include "cnn/gru.h"
-#include "cnn/lstm.h"
-#include "cnn/dict.h"
-#include "cnn/expr.h"
+#include "dynet/nodes.h"
+#include "dynet/dynet.h"
+#include "dynet/training.h"
+#include "dynet/timing.h"
+#include "dynet/rnn.h"
+#include "dynet/gru.h"
+#include "dynet/lstm.h"
+#include "dynet/dict.h"
+#include "dynet/expr.h"
 #include "expr-xtra.h"
 
 #include <algorithm>
@@ -21,7 +21,7 @@
 
 #define RNN_H0_IS_ZERO
 
-namespace cnn {
+namespace dynet {
 
 template <class Builder>
 struct AttentionalModel {
@@ -51,12 +51,12 @@ struct AttentionalModel {
             Dict &tdict, const std::vector<int>* ctx=0);
 
     std::vector<int> beam_decode(const std::vector<int> &source, ComputationGraph& cg, 
-            int beam_width, Dict &tdict, const std::vector<int>* ctx=0);
+            unsigned beam_width, Dict &tdict, const std::vector<int>* ctx=0);
 
     std::vector<int> sample(const std::vector<int> &source, ComputationGraph& cg, 
             Dict &tdict, const std::vector<int>* ctx=0);
 
-    void add_fertility_params(cnn::Model* model, unsigned hidden_dim, bool _rnn_src_embeddings);
+    void add_fertility_params(dynet::Model* model, unsigned hidden_dim, bool _rnn_src_embeddings);
 
     LookupParameter p_cs;
     LookupParameter p_ct;
@@ -125,7 +125,7 @@ struct AttentionalModel {
     KTHXBYE(expression) 
 
 template <class Builder>
-AttentionalModel<Builder>::AttentionalModel(cnn::Model* model,
+AttentionalModel<Builder>::AttentionalModel(dynet::Model* model,
     unsigned vocab_size_src, unsigned _vocab_size_tgt, unsigned layers, unsigned hidden_dim, 
     unsigned align_dim, bool _rnn_src_embeddings, bool _giza_positional, 
     bool _giza_markov, bool _giza_fertility, bool _doc_context,
@@ -195,7 +195,7 @@ AttentionalModel<Builder>::AttentionalModel(cnn::Model* model,
 }
 
 template <class Builder>
-void AttentionalModel<Builder>::add_fertility_params(cnn::Model* model, unsigned hidden_dim, bool _rnn_src_embeddings)
+void AttentionalModel<Builder>::add_fertility_params(dynet::Model* model, unsigned hidden_dim, bool _rnn_src_embeddings)
 {
     if (_rnn_src_embeddings) {
          p_Wfhid = model->add_parameters({hidden_dim, 2*hidden_dim});
@@ -495,8 +495,8 @@ AttentionalModel<Builder>::display_ascii(const std::vector<int> &source, const s
     // display the alignment
     //float I = target.size() - 1;
     //float J = source.size() - 1;
-    float I = target.size();
-    float J = source.size();
+    unsigned I = target.size();
+    unsigned J = source.size();
     //vector<string> symbols{"\u2588","\u2589","\u258A","\u258B","\u258C","\u258D","\u258E","\u258F"};
     vector<string> symbols{".","o","*","O","@"};
     int num_symbols = symbols.size();
@@ -516,18 +516,18 @@ AttentionalModel<Builder>::display_ascii(const std::vector<int> &source, const s
     cout.setf(ios_base::adjustfield, ios_base::left);
     cout << setw(12) << "source" << "  ";
     cout.setf(ios_base::adjustfield, ios_base::right);
-    for (int j = 0; j < J; ++j) 
+    for (unsigned j = 0; j < J; ++j) 
         cout << setw(2) << j << ' ';
     cout << endl;
 
-    for (int i = 0; i < I; ++i) {
+    for (unsigned i = 0; i < I; ++i) {
         cout.setf(ios_base::adjustfield, ios_base::left);
         //cout << setw(12) << td.convert(target[i+1]) << "  ";
         cout << setw(12) << td.convert(target[i]) << "  ";
         cout.setf(ios_base::adjustfield, ios_base::right);
         float max_v = 0;
         int max_j = -1;
-        for (int j = 0; j < J; ++j) {
+        for (unsigned j = 0; j < J; ++j) {
             float v = TensorTools::AccessElement(a, Dim({(unsigned int)j, (unsigned int)i}));
             string symbol;
             for (int s = 0; s <= num_symbols; ++s) {
@@ -547,7 +547,7 @@ AttentionalModel<Builder>::display_ascii(const std::vector<int> &source, const s
         cout << setw(20) << "max Pr=" << setprecision(3) << setw(5) << max_v << " @ " << max_j << endl;
     }
     cout << resetiosflags(ios_base::adjustfield);
-    for (int j = 0; j < J; ++j) 
+    for (unsigned j = 0; j < J; ++j) 
         cout << j << ":" << sd.convert(source[j]) << ' ';
     cout << endl;
 }
@@ -560,21 +560,21 @@ AttentionalModel<Builder>::display_tikz(const std::vector<int> &source, const st
     using namespace std;
 
     // display the alignment
-    float I = target.size();
-    float J = source.size();
+    unsigned I = target.size();
+    unsigned J = source.size();
 
     const Tensor &a = cg.get_value(alignment.i);
     cout << a.d[0] << " x " << a.d[1] << endl;
 
     cout << "\\begin{tikzpicture}[scale=0.5]\n";
-    for (int j = 0; j < J; ++j) 
+    for (unsigned j = 0; j < J; ++j) 
         cout << "\\node[anchor=west,rotate=90] at (" << j+0.5 << ", " << I+0.2 << ") { " << sd.convert(source[j]) << " };\n";
-    for (int i = 0; i < I; ++i) 
+    for (unsigned i = 0; i < I; ++i) 
         cout << "\\node[anchor=west] at (" << J+0.2 << ", " << I-i-0.5 << ") { " << td.convert(target[i]) << " };\n";
 
     float eps = 0.01;
-    for (int i = 0; i < I; ++i) {
-        for (int j = 0; j < J; ++j) {
+    for (unsigned i = 0; i < I; ++i) {
+        for (unsigned j = 0; j < J; ++j) {
             float v = TensorTools::AccessElement(a, Dim({(unsigned int)j, (unsigned int)i}));
             //int val = int(pow(v, 0.5) * 100);
             int val = int(v * 100);
@@ -589,7 +589,7 @@ AttentionalModel<Builder>::display_tikz(const std::vector<int> &source, const st
 template <class Builder>
 std::vector<int>
 AttentionalModel<Builder>::greedy_decode(const std::vector<int> &source, ComputationGraph& cg, 
-        cnn::Dict &tdict, const std::vector<int>* ctx)
+        dynet::Dict &tdict, const std::vector<int>* ctx)
 {
     const int sos_sym = tdict.convert("<s>");
     const int eos_sym = tdict.convert("</s>");
@@ -598,7 +598,7 @@ AttentionalModel<Builder>::greedy_decode(const std::vector<int> &source, Computa
     target.push_back(sos_sym); 
 
     //std::cerr << tdict.convert(target.back());
-    int t = 0;
+    unsigned t = 0;
     start_new_instance(source, cg, ctx);
     while (target.back() != eos_sym) 
     {
@@ -607,7 +607,7 @@ AttentionalModel<Builder>::greedy_decode(const std::vector<int> &source, Computa
 
         // find the argmax next word (greedy)
         unsigned w = 0;
-        auto dist = as_vector(cg.incremental_forward()); // evaluates last expression, i.e., ydist
+        auto dist = as_vector(cg.incremental_forward(ydist));
         auto pr_w = dist[w];
         for (unsigned x = 1; x < dist.size(); ++x) {
             if (dist[x] > pr_w) {
@@ -647,11 +647,11 @@ struct Hypothesis {
 
 template <class Builder>
 std::vector<int>
-AttentionalModel<Builder>::beam_decode(const std::vector<int> &source, ComputationGraph& cg, int beam_width, 
-        cnn::Dict &tdict, const std::vector<int>* ctx)
+AttentionalModel<Builder>::beam_decode(const std::vector<int> &source, ComputationGraph& cg, 
+	unsigned beam_width, dynet::Dict &tdict, const std::vector<int>* ctx)
 {
-    const int sos_sym = tdict.convert("<s>");
-    const int eos_sym = tdict.convert("</s>");
+    const unsigned sos_sym = tdict.convert("<s>");
+    const unsigned eos_sym = tdict.convert("</s>");
 
     start_new_instance(source, cg, ctx);
 
@@ -661,7 +661,7 @@ AttentionalModel<Builder>::beam_decode(const std::vector<int> &source, Computati
     std::vector<unsigned> vocab(boost::copy_range<std::vector<unsigned>>(boost::irange(0u, vocab_size_tgt)));
     std::vector<Hypothesis> completed;
 
-    for (int steps = 0; completed.size() < beam_width && steps < 2*source.size(); ++steps) {
+    for (unsigned steps = 0; completed.size() < beam_width && steps < 2*source.size(); ++steps) {
         std::vector<Hypothesis> new_chart;
 
         for (auto &hprev: chart) {
@@ -672,8 +672,7 @@ AttentionalModel<Builder>::beam_decode(const std::vector<int> &source, Computati
             Expression ydist = softmax(i_scores); // compiler warning, but see below
 
             // find the top k best next words
-            unsigned w = 0;
-            auto dist = as_vector(cg.incremental_forward()); // evaluates last expression, i.e., ydist
+            auto dist = as_vector(cg.incremental_forward(ydist));
             std::partial_sort(vocab.begin(), vocab.begin()+beam_width, vocab.end(), 
                     [&dist](unsigned v1, unsigned v2) { return dist[v1] > dist[v2]; });
 
@@ -709,7 +708,7 @@ AttentionalModel<Builder>::beam_decode(const std::vector<int> &source, Computati
 
 template <class Builder>
 std::vector<int>
-AttentionalModel<Builder>::sample(const std::vector<int> &source, ComputationGraph& cg, cnn::Dict &tdict,
+AttentionalModel<Builder>::sample(const std::vector<int> &source, ComputationGraph& cg, dynet::Dict &tdict,
         const std::vector<int> *ctx)
 {
     const int sos_sym = tdict.convert("<s>");
@@ -727,7 +726,7 @@ AttentionalModel<Builder>::sample(const std::vector<int> &source, ComputationGra
         Expression ydist = softmax(i_scores);
 
 	// in rnnlm.cc there's a loop around this block -- why? can incremental_forward fail?
-        auto dist = as_vector(cg.incremental_forward());
+        auto dist = as_vector(cg.incremental_forward(ydist));
 	double p = rand01();
         unsigned w = 0;
         for (; w < dist.size(); ++w) {
@@ -759,11 +758,11 @@ AttentionalModel<Builder>::display_fertility(const std::vector<int> &source, Dic
     Expression vbias = concatenate(std::vector<Expression>(slen, parameter(cg, p_bfvar)));
     Expression fhid = tanh(transpose(fbias + parameter(cg, p_Wfhid) * src));  
     Expression mu = mbias + fhid * parameter(cg, p_Wfmu);
-    auto mu_vec = as_vector(cg.incremental_forward()); // evaluates last expression
+    auto mu_vec = as_vector(cg.incremental_forward(mu)); 
     Expression var = exp(vbias + fhid * parameter(cg, p_Wfvar));
-    auto var_vec = as_vector(cg.incremental_forward()); // evaluates last expression
+    auto var_vec = as_vector(cg.incremental_forward(var)); 
 
-    for (int j = 1; j < slen-1; ++j) 
+    for (unsigned j = 1; j < slen-1; ++j) 
         std::cout << sd.convert(source[j]) << '\t' << mu_vec[j] << '\t' << var_vec[j] << '\n';
 }
 
@@ -776,9 +775,9 @@ AttentionalModel<Builder>::display_empirical_fertility(const std::vector<int> &s
     BuildGraph(source, target, cg, &alignment);
 
     Expression totals = sum_cols(alignment);
-    auto totals_vec = as_vector(cg.incremental_forward()); // evaluates last expression
+    auto totals_vec = as_vector(cg.incremental_forward(totals));
 
-    for (int j = 0; j < slen; ++j) 
+    for (unsigned j = 0; j < slen; ++j) 
         std::cout << sd.convert(source[j]) << '\t' << totals_vec[j] << '\n';
 }
 
diff --git a/src/biattentional.cc b/src/biattentional.cc
index 7ccae64..b4dc898 100644
--- a/src/biattentional.cc
+++ b/src/biattentional.cc
@@ -10,7 +10,7 @@
 #include <boost/program_options/variables_map.hpp>
 
 using namespace std;
-using namespace cnn;
+using namespace dynet;
 using namespace boost::program_options;
 
 unsigned LAYERS = 2;
@@ -25,8 +25,8 @@ bool GIZA_F = true;
 bool DOC = false;
 bool FERT = false;
 
-cnn::Dict sd;
-cnn::Dict td;
+dynet::Dict sd;
+dynet::Dict td;
 int kSRC_SOS;
 int kSRC_EOS;
 int kTGT_SOS;
@@ -53,7 +53,7 @@ struct BidirAttentionalModel {
         m_trace_weight = trace_weight;   
     }
 
-    void add_fertility_params(cnn::Model* model)
+    void add_fertility_params(dynet::Model* model)
     {
     	s2t_model.add_fertility_params(model, HIDDEN_DIM, BIDIR);
     	t2s_model.add_fertility_params(model, HIDDEN_DIM, BIDIR);
@@ -107,12 +107,12 @@ struct BidirAttentionalModel {
 	assert(lparams.size() == 2*sm.lookup_parameters_list().size());
 	for (const auto &p : sm.lookup_parameters_list())  {
 	    for (unsigned i = 0; i < p->values.size(); ++i) 
-		memcpy(lparams[lid]->values[i].v, &p->values[i].v[0], sizeof(cnn::real) * p->values[i].d.size());
+		memcpy(lparams[lid]->values[i].v, &p->values[i].v[0], sizeof(dynet::real) * p->values[i].d.size());
 	    lid++;
 	}
 	for (const auto &p : tm.lookup_parameters_list()) {
 	    for (unsigned i = 0; i < p->values.size(); ++i) 
-		memcpy(lparams[lid]->values[i].v, &p->values[i].v[0], sizeof(cnn::real) * p->values[i].d.size());
+		memcpy(lparams[lid]->values[i].v, &p->values[i].v[0], sizeof(dynet::real) * p->values[i].d.size());
 	    lid++;
 	}
 	assert(lid == lparams.size());
@@ -120,9 +120,9 @@ struct BidirAttentionalModel {
 	unsigned did = 0;
 	auto &dparams = model.parameters_list();
 	for (const auto &p : sm.parameters_list()) 
-	    memcpy(dparams[did++]->values.v, &p->values.v[0], sizeof(cnn::real) * p->values.d.size());
+	    memcpy(dparams[did++]->values.v, &p->values.v[0], sizeof(dynet::real) * p->values.d.size());
 	for (const auto &p : tm.parameters_list()) 
-	    memcpy(dparams[did++]->values.v, &p->values.v[0], sizeof(cnn::real) * p->values.d.size());
+	    memcpy(dparams[did++]->values.v, &p->values.v[0], sizeof(dynet::real) * p->values.d.size());
 	assert(did == dparams.size());
     }
 };
@@ -157,7 +157,7 @@ Corpus read_corpus(const string &filename)
 }
 
 int main(int argc, char** argv) {
-    cnn::initialize(argc, argv);
+    dynet::initialize(argc, argv);
 
     // command line processing
     variables_map vm; 
@@ -288,8 +288,8 @@ int main(int argc, char** argv) {
             unsigned i = 0;
             for (auto& spair : dev) {
                 ComputationGraph cg;
-                am.build_graph(spair.first, spair.second, cg);
-                dloss += as_scalar(cg.incremental_forward());
+                auto idloss = am.build_graph(spair.first, spair.second, cg);
+                dloss += as_scalar(cg.incremental_forward(idloss));
                 dloss_s2t += as_scalar(cg.get_value(am.s2t_xent.i));
                 dloss_t2s += as_scalar(cg.get_value(am.t2s_xent.i));
                 dloss_trace += as_scalar(cg.get_value(am.trace_bonus.i));
@@ -324,14 +324,14 @@ int main(int argc, char** argv) {
     	unsigned dchars_s = 0, dchars_t = 0, dchars_tt = 0;
     	for (unsigned i = 0; i < testing.size(); ++i) {
             ComputationGraph cg;
-            am.build_graph(testing[i].first, testing[i].second, cg);
+            auto idloss = am.build_graph(testing[i].first, testing[i].second, cg);
 
             dchars_s += testing[i].first.size() - 1;
             dchars_t += testing[i].second.size() - 1;
             dchars_tt += std::max(testing[i].first.size(), testing[i].second.size()) - 1; // max or min?
 
             //cg.forward();
-            dloss += as_scalar(cg.forward());
+            dloss += as_scalar(cg.forward(idloss));
 
             double loss_s2t = as_scalar(cg.get_value(am.s2t_xent.i));
             double loss_t2s = as_scalar(cg.get_value(am.t2s_xent.i));
@@ -388,12 +388,12 @@ int main(int argc, char** argv) {
             chars_t += spair.second.size() - 1;
             chars_tt += std::max(spair.first.size(), spair.second.size()) - 1; // max or min?
             ++si;
-            am.build_graph(spair.first, spair.second, cg);
-            loss += as_scalar(cg.forward());
+            auto iloss = am.build_graph(spair.first, spair.second, cg);
+            loss += as_scalar(cg.forward(iloss));
             loss_s2t += as_scalar(cg.get_value(am.s2t_xent.i));
             loss_t2s += as_scalar(cg.get_value(am.t2s_xent.i));
             loss_trace += as_scalar(cg.get_value(am.trace_bonus.i));
-            cg.backward();
+            cg.backward(iloss);
             sgd.update(1.0f);
             ++lines;
 
@@ -421,8 +421,8 @@ int main(int argc, char** argv) {
             unsigned dchars_s = 0, dchars_t = 0, dchars_tt = 0;
             for (auto& spair : dev) {
                 ComputationGraph cg;
-                am.build_graph(spair.first, spair.second, cg);
-                dloss += as_scalar(cg.incremental_forward());
+                auto idloss = am.build_graph(spair.first, spair.second, cg);
+                dloss += as_scalar(cg.incremental_forward(idloss));
                 dloss_s2t += as_scalar(cg.get_value(am.s2t_xent.i));
                 dloss_t2s += as_scalar(cg.get_value(am.t2s_xent.i));
                 dloss_trace += as_scalar(cg.get_value(am.trace_bonus.i));
diff --git a/src/expr-xtra.h b/src/expr-xtra.h
index be32269..2dcb585 100644
--- a/src/expr-xtra.h
+++ b/src/expr-xtra.h
@@ -1,16 +1,16 @@
 #pragma once
 
-#include "cnn/nodes.h"
-#include "cnn/cnn.h"
-#include "cnn/training.h"
-#include "cnn/timing.h"
-#include "cnn/rnn.h"
-#include "cnn/gru.h"
-#include "cnn/lstm.h"
-#include "cnn/dict.h"
-#include "cnn/expr.h"
+#include "dynet/nodes.h"
+#include "dynet/dynet.h"
+#include "dynet/training.h"
+#include "dynet/timing.h"
+#include "dynet/rnn.h"
+#include "dynet/gru.h"
+#include "dynet/lstm.h"
+#include "dynet/dict.h"
+#include "dynet/expr.h"
 
-using namespace cnn;
+using namespace dynet;
 
 // Chris -- this should be a library function
 Expression arange(ComputationGraph &cg, unsigned begin, unsigned end, bool log_transform, std::vector<float> *aux_mem) 

From 128f45b29f380dd22b6972549c6026501622ebe5 Mon Sep 17 00:00:00 2001
From: Trevor Cohn <tcohn@unimelb.edu.au>
Date: Tue, 1 Nov 2016 17:08:02 +1100
Subject: [PATCH 2/5] Add MKL build compatibility

---
 CMakeLists.txt | 61 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ca3813..1f68fde 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,27 +3,60 @@ cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
 
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/dynet/cmake)
 
-# CNN uses Eigen which exploits modern CPU architectures. To get the
-# best possible performance, the following are recommended:
-#   1. use very recent versions of gcc or Clang to build
-#   2. use very recent versions of Eigen (ideally the dev version)
-#   3. try compiler options like -march=native or other architecture
-#      flags (the compiler does not always make the best configuration
-#      decisions without help)
-
-# Cross-compiler, cross-platform options
+function(find_mkl)  # Locate Intel MKL via MKL_ROOT; on success export LIBS and MKL_LINK_DIRS to the caller, otherwise abort configuration
+  set(MKL_ARCH intel64)  # library subdirectory name used in the search paths below
+  find_path(MKL_INCLUDE_DIR mkl.h
+            PATHS ${MKL_ROOT} ${MKL_ROOT}/include)
+  find_library(MKL_CORE_LIB NAMES mkl_intel_lp64 mkl_intel_thread mkl_core  # finding any one of these names satisfies the search
+               PATHS ${MKL_ROOT} ${MKL_ROOT}/lib/${MKL_ARCH}
+               DOC "MKL core library path")
+
+  find_library(MKL_COMPILER_LIB NAMES iomp5 libiomp5md
+               PATHS ${MKL_ROOT} ${MKL_ROOT}/../compiler/lib/${MKL_ARCH}              #Windows
+                     ${MKL_ROOT}/../compilers_and_libraries/linux/lib/${MKL_ARCH}_lin #Linux
+               DOC "MKL compiler lib (for threaded MKL)")
+
+  if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_COMPILER_LIB)
+    get_filename_component(MKL_CORE_LIB_DIR ${MKL_CORE_LIB} DIRECTORY)  # NOTE(review): DIRECTORY mode needs CMake >= 2.8.12, but the project only requires 2.8 -- confirm
+    get_filename_component(MKL_COMPILER_LIB_DIR ${MKL_COMPILER_LIB} DIRECTORY)
+    get_filename_component(MKL_COMPILER_LIB_FILE ${MKL_COMPILER_LIB} NAME)
+    message(STATUS "Found MKL\n   * include: ${MKL_INCLUDE_DIR},\n   * core library dir: ${MKL_CORE_LIB_DIR},\n   * compiler library: ${MKL_COMPILER_LIB}")
+
+    # Due to a conflict with /MT and /MD, MSVC needs mkl_intel_lp64 linked last, or we can change individual
+    # projects to use /MT (mkl_intel_lp64 linked with /MT, default MSVC projects use /MD), or we can instead
+    # link to the DLL versions. For now I'm opting for this solution which seems to work with projects still
+    # at their default /MD. Linux build requires the mkl_intel_lp64 to be linked first. So...:
+    if(MSVC)
+      set(LIBS ${LIBS} mkl_intel_thread mkl_core mkl_intel_lp64 ${MKL_COMPILER_LIB_FILE} PARENT_SCOPE)  # PARENT_SCOPE: LIBS is updated in the caller, not inside this function
+    else()
+      set(LIBS ${LIBS} mkl_intel_lp64 mkl_intel_thread mkl_core ${MKL_COMPILER_LIB_FILE} PARENT_SCOPE)
+    endif()
+    include_directories(${MKL_INCLUDE_DIR})
+    link_directories(${MKL_CORE_LIB_DIR} ${MKL_COMPILER_LIB_DIR})
+    set(MKL_LINK_DIRS ${MKL_CORE_LIB_DIR} ${MKL_COMPILER_LIB_DIR} PARENT_SCOPE) # Keeping this for python build
+  else()
+    message(FATAL_ERROR "Failed to find MKL in path: ${MKL_ROOT} (Did you set MKL_ROOT properly?)")  # FATAL_ERROR stops the configure step here
+  endif()
+endfunction()
+
+######## Cross-compiler, cross-platform options
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_FAST_MATH")
+if (MKL OR MKL_ROOT)
+  find_mkl()  # sets include/lib directories and sets ${LIBS} needed for linking
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_USE_MKL_ALL")
+endif()
+
 
-# Platform-specific options
+######## Platform-specific options
 if(WIN32)
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX")   # Disable min/max macros in windef.h
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX")   # Disable min/max macros in windef.h
 endif()
 
-# Compiler-specific options
+######## Compiler-specific options
 if(MSVC)
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W1 -DEIGEN_HAS_C99_MATH /MP")   # -Wall produces 20k warnings
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W1 -DEIGEN_HAS_C99_MATH /MP")   # -Wall produces 20k warnings
 else()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -funroll-loops -Wall -std=c++11 -Ofast -g -march=native")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -funroll-loops -Wall -std=c++11 -Ofast -g -march=native")
 endif()
 
 enable_testing()

From 1415bdd81c9b4aa8646a95ab20ae3381fcf05916 Mon Sep 17 00:00:00 2001
From: Cong Duy Vu Hoang <vhoang2@student.unimelb.edu.au>
Date: Thu, 22 Dec 2016 17:57:18 +1100
Subject: [PATCH 3/5] Update to latest dynet

---
 src/attentional.cc   | 36 +++++++++++++++++++++++++-----------
 src/attentional.h    | 10 +++++-----
 src/biattentional.cc |  4 ++--
 3 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/src/attentional.cc b/src/attentional.cc
index 4d5ffcd..0580b8a 100644
--- a/src/attentional.cc
+++ b/src/attentional.cc
@@ -65,6 +65,10 @@ int main(int argc, char** argv) {
         ("layers,l", value<int>()->default_value(LAYERS), "use <num> layers for RNN components")
         ("align,a", value<int>()->default_value(ALIGN_DIM), "use <num> dimensions for alignment projection")
         ("hidden,h", value<int>()->default_value(HIDDEN_DIM), "use <num> dimensions for recurrent hidden states")
+        ("sgd_trainer", value<unsigned>()->default_value(0), "use specific SGD trainer (0: vanilla SGD; 1: momentum SGD; 2: Adagrad; 3: AdaDelta; 4: Adam)")
+        ("lr_eta", value<float>()->default_value(0.01f), "SGD learning rate value (e.g., 0.01 for simple SGD trainer)")
+        ("lr_eta_decay", value<float>()->default_value(2.0f), "SGD learning rate decay value")
+        ("sparse_updates", value<bool>()->default_value(true), "enable/disable sparse update(s) for lookup parameter(s)")
         ("topk,k", value<int>()->default_value(100), "use <num> top kbest entries, used with --kbest")
         ("epochs,e", value<int>()->default_value(50), "maximum number of training epochs")
         ("gru", "use Gated Recurrent Unit (GRU) for recurrent structure; default RNN")
@@ -202,15 +206,25 @@ int main_body(variables_map vm)
 
 	cerr << "Parameters will be written to: " << fname << endl;
 
-	Model model;
-    //bool use_momentum = false;
-    Trainer* sgd = nullptr;
-    //if (use_momentum)
-        //sgd = new MomentumSGDTrainer(&model);
-    //else
-        sgd = new SimpleSGDTrainer(&model);
-	sgd->eta = 0.01f;
-    //sgd = new AdadeltaTrainer(&model);
+   Model model;
+   Trainer* sgd = nullptr; // selected below from --sgd_trainer; stays null only on the error path
+   unsigned sgd_type = vm["sgd_trainer"].as<unsigned>();
+   if (sgd_type == 1)
+       sgd = new MomentumSGDTrainer(model, vm["lr_eta"].as<float>());
+   else if (sgd_type == 2)
+       sgd = new AdagradTrainer(model, vm["lr_eta"].as<float>());
+   else if (sgd_type == 3)
+       sgd = new AdadeltaTrainer(model); // --lr_eta is not passed to this trainer
+   else if (sgd_type == 4)
+       sgd = new AdamTrainer(model, vm["lr_eta"].as<float>());
+   else if (sgd_type == 0)//Vanilla SGD trainer
+       sgd = new SimpleSGDTrainer(model, vm["lr_eta"].as<float>());
+   else
+       { cerr << "Unknown SGD trainer type! (0: vanilla SGD; 1: momentum SGD; 2: Adagrad; 3: AdaDelta; 4: Adam)" << endl; return 1; } // assert("...") on a string literal is always true and would fall through to a null dereference
+   sgd->eta_decay = vm["lr_eta_decay"].as<float>();
+   sgd->sparse_updates_enabled = vm["sparse_updates"].as<bool>();
+   if (!sgd->sparse_updates_enabled)
+      cerr << "Sparse updates for lookup parameter(s) to be disabled!" << endl;
 
    cerr << "%% Using " << flavour << " recurrent units" << endl;
    AttentionalModel<rnn_t> am(&model, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
@@ -300,7 +314,7 @@ void test_decode(Model &model, AM_t &am, string test_file, bool doco, unsigned b
         if (doco)
             source = read_numbered_sentence(line, &sd, num);
         else 
-            source = read_sentence(line, &sd);
+            source = read_sentence(line, sd);
 
 	if (source.front() != kSRC_SOS && source.back() != kSRC_EOS) {
 	    cerr << "Sentence in " << test_file << ":" << lno << " didn't start or end with <s>, </s>\n";
@@ -655,7 +669,7 @@ Corpus read_corpus(const string &filename, bool doco)
         if (doco) 
             read_numbered_sentence_pair(line, &source, &sd, &target, &td, identifiers);
         else
-            read_sentence_pair(line, &source, &sd, &target, &td);
+            read_sentence_pair(line, source, sd, target, td);
         corpus.push_back(SentencePair(source, target, identifiers[0]));
         stoks += source.size();
         ttoks += target.size();
diff --git a/src/attentional.h b/src/attentional.h
index 6e23924..41d7e35 100644
--- a/src/attentional.h
+++ b/src/attentional.h
@@ -130,9 +130,9 @@ AttentionalModel<Builder>::AttentionalModel(dynet::Model* model,
     unsigned align_dim, bool _rnn_src_embeddings, bool _giza_positional, 
     bool _giza_markov, bool _giza_fertility, bool _doc_context,
     bool _global_fertility)
-: builder(layers, (_rnn_src_embeddings) ? 3*hidden_dim : 2*hidden_dim, hidden_dim, model),
-  builder_src_fwd(1, hidden_dim, hidden_dim, model),
-  builder_src_bwd(1, hidden_dim, hidden_dim, model),
+: builder(layers, (_rnn_src_embeddings) ? 3*hidden_dim : 2*hidden_dim, hidden_dim, *model),
+  builder_src_fwd(1, hidden_dim, hidden_dim, *model),
+  builder_src_bwd(1, hidden_dim, hidden_dim, *model),
   rnn_src_embeddings(_rnn_src_embeddings), 
   giza_positional(_giza_positional), giza_markov(_giza_markov), giza_fertility(_giza_fertility),
   doc_context(_doc_context),
@@ -460,13 +460,13 @@ Expression AttentionalModel<Builder>::BuildGraph(const std::vector<int> &source,
             /* log-Normal distribution */
             Expression log_fert = log(i_total_trim);
             Expression delta = log_fert - mu_trim;
-            Expression exponent = cdiv(-cwise_multiply(delta, delta), 2.0f * var_trim);
+            Expression exponent = cdiv(-cmult(delta, delta), 2.0f * var_trim);
             Expression partition = -log_fert - 0.5 * log(2.0f * var_trim * 3.14159265359);
             *fertility = -sum_cols(transpose(partition + exponent));
 #else
             /* Normal distribution */
             Expression delta = i_total_trim - mu_trim;
-            Expression exponent = cdiv(-cwise_multiply(delta, delta), 2.0f * var_trim);
+            Expression exponent = cdiv(-cmult(delta, delta), 2.0f * var_trim);
             Expression partition = -0.5 * log(2.0f * var_trim * 3.14159265359);
             *fertility = -sum_cols(transpose(partition + exponent));
             // note that as this is the value of the normal density, the errors
diff --git a/src/biattentional.cc b/src/biattentional.cc
index b4dc898..48b8360 100644
--- a/src/biattentional.cc
+++ b/src/biattentional.cc
@@ -141,7 +141,7 @@ Corpus read_corpus(const string &filename)
     while(getline(in, line)) {
         ++lc;
         Sentence source, target;
-        read_sentence_pair(line, &source, &sd, &target, &td);
+        read_sentence_pair(line, source, sd, target, td);
         corpus.push_back(SentencePair(source, target));
         stoks += source.size();
         ttoks += target.size();
@@ -253,7 +253,7 @@ int main(int argc, char** argv) {
     double best = 9e+99;
 
     Model model;
-    SimpleSGDTrainer sgd(&model);
+    SimpleSGDTrainer sgd(model);
     BidirAttentionalModel<LSTMBuilder> am(&model, 0.1);
 
     bool add_fer = false;

From 562fa994f8fbbee70208134e51e01a00a6ac7e96 Mon Sep 17 00:00:00 2001
From: Cong Duy Vu Hoang <vhoang2@student.unimelb.edu.au>
Date: Thu, 22 Dec 2016 20:56:10 +1100
Subject: [PATCH 4/5] dynet

---
 dynet | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dynet b/dynet
index 8904224..b9308e4 160000
--- a/dynet
+++ b/dynet
@@ -1 +1 @@
-Subproject commit 8904224f7c5788246035e78fb4abe5b7df6aeba3
+Subproject commit b9308e4af080f5b6eaa15db3b20f6578c52d54e4

From f54f41344843f9ca5c7c6effa0fe77cd597584af Mon Sep 17 00:00:00 2001
From: Cong Duy Vu Hoang <vhoang2@student.unimelb.edu.au>
Date: Tue, 17 Jan 2017 11:17:35 +1100
Subject: [PATCH 5/5] fixed bug for beam search decoding

---
 src/attentional.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/attentional.h b/src/attentional.h
index 41d7e35..8d71a6a 100644
--- a/src/attentional.h
+++ b/src/attentional.h
@@ -329,9 +329,14 @@ Expression AttentionalModel<Builder>::add_input(int trg_tok, int t, ComputationG
     // alignment input 
     Expression i_wah_rep;
     if (t > 0) {
-	//auto i_h_tm1 = builder.final_h().back();
-	auto i_h_tm1 = concatenate(builder.final_h());
+	Expression i_h_tm1; // hidden state that is projected by i_Wa below
+	if (prev_state)
+	    i_h_tm1 = concatenate(builder.get_h(*prev_state));// Read the hidden state at the caller-supplied RNN state; this is required for the beam search decoding implementation (presumably each hypothesis continues from its own state -- confirm against the decoder).
+	else
+	    i_h_tm1 = concatenate(builder.final_h()); // no explicit state given: use the builder's final hidden state
+
 	Expression i_wah = i_Wa * i_h_tm1;
+
 	// want numpy style broadcasting, but have to do this manually
 	i_wah_rep = concatenate_cols(std::vector<Expression>(slen, i_wah));
     }