Merge pull request #1 from trevorcohn/moving_to_dynet
Moving to dynet
trevorcohn committed Jul 21, 2017
2 parents 7fecf03 + f54f413 commit 0049ada
Showing 9 changed files with 204 additions and 142 deletions.
6 changes: 3 additions & 3 deletions .gitmodules
@@ -1,3 +1,3 @@
[submodule "cnn"]
path = cnn
url = https://github.com/clab/cnn.git
[submodule "dynet"]
path = dynet
url = https://github.com/clab/dynet.git
69 changes: 51 additions & 18 deletions CMakeLists.txt
@@ -1,35 +1,68 @@
project(mantis)
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)

set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cnn/cmake)

# CNN uses Eigen which exploits modern CPU architectures. To get the
# best possible performance, the following are recommended:
# 1. use very recent versions of gcc or Clang to build
# 2. use very recent versions of Eigen (ideally the dev version)
# 3. try compiler options like -march=native or other architecture
# flags (the compiler does not always make the best configuration
# decisions without help)
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/dynet/cmake)

function(find_mkl)
set(MKL_ARCH intel64)
find_path(MKL_INCLUDE_DIR mkl.h
PATHS ${MKL_ROOT} ${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_intel_lp64 mkl_intel_thread mkl_core
PATHS ${MKL_ROOT} ${MKL_ROOT}/lib/${MKL_ARCH}
DOC "MKL core library path")

find_library(MKL_COMPILER_LIB NAMES iomp5 libiomp5md
PATHS ${MKL_ROOT} ${MKL_ROOT}/../compiler/lib/${MKL_ARCH} #Windows
${MKL_ROOT}/../compilers_and_libraries/linux/lib/${MKL_ARCH}_lin #Linux
DOC "MKL compiler lib (for threaded MKL)")

if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_COMPILER_LIB)
get_filename_component(MKL_CORE_LIB_DIR ${MKL_CORE_LIB} DIRECTORY)
get_filename_component(MKL_COMPILER_LIB_DIR ${MKL_COMPILER_LIB} DIRECTORY)
get_filename_component(MKL_COMPILER_LIB_FILE ${MKL_COMPILER_LIB} NAME)
message(STATUS "Found MKL\n * include: ${MKL_INCLUDE_DIR},\n * core library dir: ${MKL_CORE_LIB_DIR},\n * compiler library: ${MKL_COMPILER_LIB}")

# Due to a conflict with /MT and /MD, MSVC needs mkl_intel_lp64 linked last, or we can change individual
# projects to use /MT (mkl_intel_lp64 linked with /MT, default MSVC projects use /MD), or we can instead
# link to the DLL versions. For now I'm opting for this solution which seems to work with projects still
# at their default /MD. Linux build requires the mkl_intel_lp64 to be linked first. So...:
if(MSVC)
set(LIBS ${LIBS} mkl_intel_thread mkl_core mkl_intel_lp64 ${MKL_COMPILER_LIB_FILE} PARENT_SCOPE)
else()
set(LIBS ${LIBS} mkl_intel_lp64 mkl_intel_thread mkl_core ${MKL_COMPILER_LIB_FILE} PARENT_SCOPE)
endif()
include_directories(${MKL_INCLUDE_DIR})
link_directories(${MKL_CORE_LIB_DIR} ${MKL_COMPILER_LIB_DIR})
set(MKL_LINK_DIRS ${MKL_CORE_LIB_DIR} ${MKL_COMPILER_LIB_DIR} PARENT_SCOPE) # Keeping this for python build
else()
message(FATAL_ERROR "Failed to find MKL in path: ${MKL_ROOT} (Did you set MKL_ROOT properly?)")
endif()
endfunction()

# Cross-compiler, cross-platform options
######## Cross-compiler, cross-platform options
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_FAST_MATH")
if (MKL OR MKL_ROOT)
find_mkl() # sets include/lib directories and sets ${LIBS} needed for linking
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DEIGEN_USE_MKL_ALL")
endif()


# Platform-specific options
######## Platform-specific options
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX") # Disable min/max macros in windef.h
endif()

# Compiler-specific options
######## Compiler-specific options
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W1 -DEIGEN_HAS_C99_MATH /MP") # -Wall produces 20k warnings
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -funroll-loops -Wall -std=c++11 -Ofast -g -march=native")
endif()

enable_testing()

include_directories(${CMAKE_CURRENT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/cnn)
${PROJECT_SOURCE_DIR}/dynet)

function(find_cudnn)
set(CUDNN_ROOT "" CACHE PATH "CUDNN root path")
@@ -111,9 +144,9 @@ include_directories(${EIGEN3_INCLUDE_DIR})
FIND_PACKAGE(Threads REQUIRED)
set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT})

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cnn/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dynet/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
include_directories(${CMAKE_CURRENT_BINARY_DIR})

add_subdirectory(cnn/cnn)
add_subdirectory(dynet/dynet)
add_subdirectory(src)
enable_testing()
1 change: 0 additions & 1 deletion cnn
Submodule cnn deleted from ec75eb
1 change: 1 addition & 0 deletions dynet
Submodule dynet added at b9308e
21 changes: 18 additions & 3 deletions src/CMakeLists.txt
@@ -2,14 +2,29 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)

foreach(TARGET attentional biattentional)
ADD_EXECUTABLE(${TARGET} ${TARGET}.cc)
target_link_libraries(${TARGET} cnn ${LIBS})
if(UNIX AND NOT APPLE)
target_link_libraries(${TARGET} rt)
endif()
if (WITH_CUDA_BACKEND)
add_dependencies(${TARGET} cnncuda)
target_link_libraries(${TARGET} cnncuda)
set(CUDA_SEPARABLE_COMPILATION ON)
list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_52,code=sm_52;-gencode;arch=compute_52,code=compute_52;-std=c++11;-DVERBOSE;-DEIGEN_USE_GPU;-DHAVE_CUDA")
if(CMAKE_COMPILER_IS_GNUCXX)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
# gcc 4.9 or later versions raise SEGV due to the optimization problem.
# Use -O1 instead for now.
list(APPEND CUDA_NVCC_FLAGS "-O1")
else()
list(APPEND CUDA_NVCC_FLAGS "-O2")
endif()
else()
list(APPEND CUDA_NVCC_FLAGS "-O2")
endif()
add_dependencies(${TARGET} gdynet dynetcuda)
target_link_libraries(${TARGET} gdynet dynetcuda)
CUDA_ADD_CUBLAS_TO_TARGET(${TARGET})
else()
add_dependencies(${TARGET} dynet)
target_link_libraries(${TARGET} dynet ${LIBS})
endif (WITH_CUDA_BACKEND)
endforeach()

84 changes: 47 additions & 37 deletions src/attentional.cc
@@ -10,7 +10,7 @@
#include <boost/program_options/variables_map.hpp>

using namespace std;
using namespace cnn;
using namespace dynet;
using namespace boost::program_options;

unsigned LAYERS = 1; // 2
@@ -19,8 +19,8 @@ unsigned ALIGN_DIM = 32; // 128
unsigned SRC_VOCAB_SIZE = 0;
unsigned TGT_VOCAB_SIZE = 0;

cnn::Dict sd;
cnn::Dict td;
dynet::Dict sd;
dynet::Dict td;
int kSRC_SOS;
int kSRC_EOS;
int kTGT_SOS;
@@ -45,7 +45,7 @@ template <class rnn_t>
int main_body(variables_map vm);

int main(int argc, char** argv) {
cnn::initialize(argc, argv);
dynet::initialize(argc, argv);

// command line processing
variables_map vm;
@@ -65,6 +65,10 @@ int main(int argc, char** argv) {
("layers,l", value<int>()->default_value(LAYERS), "use <num> layers for RNN components")
("align,a", value<int>()->default_value(ALIGN_DIM), "use <num> dimensions for alignment projection")
("hidden,h", value<int>()->default_value(HIDDEN_DIM), "use <num> dimensions for recurrent hidden states")
("sgd_trainer", value<unsigned>()->default_value(0), "use specific SGD trainer (0: vanilla SGD; 1: momentum SGD; 2: Adagrad; 3: AdaDelta; 4: Adam)")
("lr_eta", value<float>()->default_value(0.01f), "SGD learning rate value (e.g., 0.01 for simple SGD trainer)")
("lr_eta_decay", value<float>()->default_value(2.0f), "SGD learning rate decay value")
("sparse_updates", value<bool>()->default_value(true), "enable/disable sparse update(s) for lookup parameter(s)")
("topk,k", value<int>()->default_value(100), "use <num> top kbest entries, used with --kbest")
("epochs,e", value<int>()->default_value(50), "maximum number of training epochs")
("gru", "use Gated Recurrent Unit (GRU) for recurrent structure; default RNN")
@@ -112,8 +116,8 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
bool doco, float coverage, bool display, bool fert);

template <class AM_t> void test_rescore(Model &model, AM_t &am, Corpus &testing, bool doco);
template <class AM_t> void test_decode(Model &model, AM_t &am, std::string test_file, bool doco, int beam);
template <class AM_t> void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k);
template <class AM_t> void test_decode(Model &model, AM_t &am, std::string test_file, bool doco, unsigned beam);
template <class AM_t> void test_kbest_arcs(Model &model, AM_t &am, string test_file, unsigned top_k);
template <class AM_t> void fert_stats(Model &model, AM_t &am, Corpus &devel, bool global_fert);

const Sentence* context(const Corpus &corpus, unsigned i);
@@ -146,8 +150,6 @@ int main_body(variables_map vm)
if (vm.count("lstm")) flavour = "LSTM";
else if (vm.count("gru")) flavour = "GRU";

typedef vector<int> Sentence;
typedef pair<Sentence, Sentence> SentencePair;
Corpus training, devel, testing;
string line;
cerr << "Reading training data from " << vm["train"].as<string>() << "...\n";
@@ -204,15 +206,25 @@ int main_body(variables_map vm)

cerr << "Parameters will be written to: " << fname << endl;

Model model;
//bool use_momentum = false;
Trainer* sgd = nullptr;
//if (use_momentum)
//sgd = new MomentumSGDTrainer(&model);
//else
sgd = new SimpleSGDTrainer(&model);
sgd->eta = 0.01f;
//sgd = new AdadeltaTrainer(&model);
Model model;
Trainer* sgd = nullptr;
unsigned sgd_type = vm["sgd_trainer"].as<unsigned>();
if (sgd_type == 1)
sgd = new MomentumSGDTrainer(model, vm["lr_eta"].as<float>());
else if (sgd_type == 2)
sgd = new AdagradTrainer(model, vm["lr_eta"].as<float>());
else if (sgd_type == 3)
sgd = new AdadeltaTrainer(model);
else if (sgd_type == 4)
sgd = new AdamTrainer(model, vm["lr_eta"].as<float>());
else if (sgd_type == 0)//Vanilla SGD trainer
sgd = new SimpleSGDTrainer(model, vm["lr_eta"].as<float>());
else
assert("Unknown SGD trainer type! (0: vanilla SGD; 1: momentum SGD; 2: Adagrad; 3: AdaDelta; 4: Adam)");
sgd->eta_decay = vm["lr_eta_decay"].as<float>();
sgd->sparse_updates_enabled = vm["sparse_updates"].as<bool>();
if (!sgd->sparse_updates_enabled)
cerr << "Sparse updates for lookup parameter(s) to be disabled!" << endl;

cerr << "%% Using " << flavour << " recurrent units" << endl;
AttentionalModel<rnn_t> am(&model, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
@@ -234,19 +246,19 @@
vm["epochs"].as<int>(), doco, vm["coverage"].as<float>(), vm.count("display"),
fert);
else if (vm.count("kbest"))
test_kbest_arcs(model, am, vm["kbest"].as<string>(), vm["topk"].as<int>());
test_kbest_arcs(model, am, vm["kbest"].as<string>(), vm["topk"].as<unsigned>());
else if (vm.count("test")) {
if (vm.count("rescore"))
test_rescore(model, am, testing, doco);
else // test
test_decode(model, am, vm["test"].as<string>(), doco, vm["beam"].as<int>());
test_decode(model, am, vm["test"].as<string>(), doco, vm["beam"].as<unsigned>());
}
else if (vm.count("fert-stats"))
fert_stats(model, am, devel, vm.count("fertility"));

delete sgd;

//cnn::Free();
//dynet::Free();

return EXIT_SUCCESS;
}
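The trainer selection earlier in main_body() is the main functional change in this file: instead of a hard-coded SimpleSGDTrainer built from a Model*, the DyNet trainer is chosen at runtime and constructed from a Model reference. A minimal sketch of the same dispatch as a standalone helper (the helper itself is hypothetical; the constructors and fields are exactly the ones used in the diff):

```cpp
#include "dynet/model.h"
#include "dynet/training.h"

using namespace dynet;

// Hypothetical helper mirroring the if/else chain in main_body() above.
Trainer* make_trainer(Model& model, unsigned sgd_type, float eta) {
    switch (sgd_type) {
        case 1: return new MomentumSGDTrainer(model, eta); // SGD with momentum
        case 2: return new AdagradTrainer(model, eta);     // adaptive per-parameter rates
        case 3: return new AdadeltaTrainer(model);         // no explicit learning rate
        case 4: return new AdamTrainer(model, eta);        // adaptive moment estimation
        case 0: return new SimpleSGDTrainer(model, eta);   // vanilla SGD
        default: return nullptr;                           // unknown trainer type
    }
}
```

The caller would then set eta_decay and sparse_updates_enabled on the returned trainer, exactly as main_body() does above.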
@@ -264,9 +276,9 @@ void test_rescore(Model &model, AM_t &am, Corpus &testing, bool doco)
tie(ssent, tsent, docid) = testing[i];

ComputationGraph cg;
am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(testing, i) : nullptr);
auto iloss = am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(testing, i) : nullptr);

double loss = as_scalar(cg.forward());
double loss = as_scalar(cg.forward(iloss));
cout << i << " |||";
for (auto &w: ssent)
cout << " " << sd.convert(w);
@@ -286,10 +298,8 @@ }
}

template <class AM_t>
void test_decode(Model &model, AM_t &am, string test_file, bool doco, int beam)
void test_decode(Model &model, AM_t &am, string test_file, bool doco, unsigned beam)
{
double tloss = 0;
int tchars = 0;
int lno = 0;

cerr << "Reading test examples from " << test_file << endl;
Expand All @@ -304,7 +314,7 @@ void test_decode(Model &model, AM_t &am, string test_file, bool doco, int beam)
if (doco)
source = read_numbered_sentence(line, &sd, num);
else
source = read_sentence(line, &sd);
source = read_sentence(line, sd);

if (source.front() != kSRC_SOS && source.back() != kSRC_EOS) {
cerr << "Sentence in " << test_file << ":" << lno << " didn't start or end with <s>, </s>\n";
@@ -339,7 +349,7 @@
}

template <class AM_t>
void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k)
void test_kbest_arcs(Model &model, AM_t &am, string test_file, unsigned top_k)
{
// only suitable for monolingual setting, of predicting a sentence given preceeding sentence
cerr << "Reading test examples from " << test_file << endl;
@@ -369,7 +379,7 @@ void test_kbest_arcs(Model &model, AM_t &am, string test_file, int top_k)
errs.push_back(i_err);
}
Expression i_nerr = sum(errs);
double loss = as_scalar(cg.incremental_forward());
double loss = as_scalar(cg.incremental_forward(i_nerr));

//cout << last_last_id << ":" << last_id << " |||";
//for (auto &w: source) cout << " " << sd.convert(w);
@@ -483,9 +493,9 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
}

bool first = true;
int report = 0;
unsigned report = 0;
unsigned lines = 0;
int epoch = 0;
unsigned epoch = 0;
Sentence ssent, tsent;
int docid;

@@ -497,8 +507,8 @@
tie(ssent, tsent, docid) = devel[i];
ComputationGraph cg;
Expression alignment;
am.BuildGraph(ssent, tsent, cg, &alignment, (doco) ? context(devel, i) : nullptr);
cg.forward();
auto iloss = am.BuildGraph(ssent, tsent, cg, &alignment, (doco) ? context(devel, i) : nullptr);
cg.forward(iloss);

cout << "\n====== SENTENCE " << i << " =========\n";
am.display_ascii(ssent, tsent, cg, alignment, sd, td);
@@ -586,7 +596,7 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
objective = objective + fertility_nll;

// perform forward computation for aggregate objective
cg.forward();
cg.forward(objective);

// grab the parts of the objective
loss += as_scalar(cg.get_value(xent.i));
@@ -595,7 +605,7 @@
if (fert)
loss_fert += as_scalar(cg.get_value(fertility_nll.i));

cg.backward();
cg.backward(objective);
sgd.update();
++lines;
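The hunk above shows the core of the migration inside the training loop: ComputationGraph::forward() and backward() now take the Expression to evaluate rather than implicitly using the most recently added node, and the same pattern recurs in test_rescore, test_kbest_arcs and the dev-loss loop below. A self-contained toy example of the new calling convention (the one-parameter model is purely illustrative; the forward/backward/update calls match those used in this diff):

```cpp
#include "dynet/dynet.h"
#include "dynet/expr.h"
#include "dynet/training.h"

#include <iostream>

using namespace dynet;
// Note: on DyNet versions where the expr namespace has not yet been merged into
// dynet, the expression helpers below live in dynet::expr instead.

int main(int argc, char** argv) {
    dynet::initialize(argc, argv);

    Model model;
    SimpleSGDTrainer sgd(model, 0.1f);
    Parameter p_w = model.add_parameters({1});  // a single scalar parameter

    for (int iter = 0; iter < 10; ++iter) {
        ComputationGraph cg;
        Expression w = parameter(cg, p_w);
        Expression loss = squared_distance(w, input(cg, 3.0f)); // drive w towards 3
        double l = as_scalar(cg.forward(loss)); // forward() now takes the target expression
        cg.backward(loss);                      // and so does backward()
        sgd.update();                           // apply the gradient step
        std::cout << "iter " << iter << " loss " << l << std::endl;
    }
    return 0;
}
```

In train() above, the objective expression plays the role of loss and sgd is whichever trainer --sgd_trainer selected.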

@@ -627,8 +637,8 @@ void train(Model &model, AM_t &am, Corpus &training, Corpus &devel,
for (unsigned i = 0; i < devel.size(); ++i) {
tie(ssent, tsent, docid) = devel[i];
ComputationGraph cg;
am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(devel, i) : nullptr, nullptr, nullptr);
dloss += as_scalar(cg.forward());
auto idloss = am.BuildGraph(ssent, tsent, cg, nullptr, (doco) ? context(devel, i) : nullptr, nullptr, nullptr);
dloss += as_scalar(cg.forward(idloss));
dchars += tsent.size() - 1;
}
if (dloss < best) {
@@ -659,7 +669,7 @@ Corpus read_corpus(const string &filename, bool doco)
if (doco)
read_numbered_sentence_pair(line, &source, &sd, &target, &td, identifiers);
else
read_sentence_pair(line, &source, &sd, &target, &td);
read_sentence_pair(line, source, sd, target, td);
corpus.push_back(SentencePair(source, target, identifiers[0]));
stoks += source.size();
ttoks += target.size();