This repository has been archived by the owner on Apr 24, 2020. It is now read-only.

Parse tokens #8

Merged (1 commit, Jan 10, 2012)
include/pypfp/pypfp.h (4 additions, 0 deletions)

@@ -33,6 +33,7 @@ class pypfp
   boost::shared_ptr<workspace> pworkspace_;

   void init(size_t sentence_length = 45, const std::string & data_dir = "");
+  std::string _parse_tokens(const std::vector<std::string>& words);

 public:

@@ -43,6 +44,9 @@
   pypfp(size_t sentence_length, const std::string & data_dir);

   std::string parse(const std::string & sentence);
+
+  std::string parse_tokens(const boost::python::list& words);
+
 };

 }}} // com::wavii::pfp
src/pypfp/pypfp.cpp (31 additions, 8 deletions)

@@ -68,14 +68,12 @@ void pypfp::init(size_t sentence_length /*= 45*/, const std::string & data_dir /*= ""*/)
   pworkspace_.reset(new workspace(sentence_length, states_.size()));
 }

-std::string pypfp::parse(const std::string & sentence)
+std::string pypfp::_parse_tokens(const std::vector<std::string>& words)
 {
-  // now some words
-  std::vector< std::string > words;
   std::vector< std::pair< state_t, float > > state_weight;
   std::vector< std::vector< state_score_t > > sentence_f;
   node result;
-  tokenizer_.tokenize(sentence, words);

   for (std::vector< std::string >::const_iterator it = words.begin(); it != words.end(); ++it)
   {
     state_weight.clear(); lexicon_.score(*it, std::back_inserter(state_weight));
@@ -92,14 +90,39 @@
   // stitch together the results
   std::ostringstream oss;
   stitch(oss, result, words.begin(), states_);
   return oss.str();
 }

+std::string pypfp::parse_tokens(const boost::python::list& words)
+{
+  std::vector<std::string> words_vec;
+  size_t len = boost::python::len(words);
+
+  // makes a copy of the content. Yes, I could avoid it by iterating, but then
+  // I'd have to use stl_iterator and make parse_tokens templated.
+  // It's not really worth the hassle.
+  for (size_t i = 0; i != len; ++i)
+    words_vec.push_back(boost::python::extract<std::string>(words[i]));
+
+  return _parse_tokens(words_vec);
+}
+
+std::string pypfp::parse(const std::string & sentence)
+{
+  // now some words
+  std::vector< std::string > words;
+  tokenizer_.tokenize(sentence, words);
+  return _parse_tokens(words);
+}
+
 BOOST_PYTHON_MODULE(pfp)
 {
   class_<pypfp, boost::noncopyable>("Parser", init<>())
-    .def(init<size_t>())
-    .def(init<size_t, const std::string &>())
-    .def("parse", &pypfp::parse)
+    .def(init<size_t>(boost::python::args("max_sentence_len")))
+    .def(init<size_t, const std::string &>(boost::python::args("max_sentence_len", "data_dir")))
+    .def("parse", &pypfp::parse, boost::python::args("self", "sentence"),
+      "Will parse the given sentence")
+    .def("parse_tokens", &pypfp::parse_tokens, boost::python::args("self", "tokens"),
+      "Will parse the given tokens list")
+    ;
 }
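
The comment inside parse_tokens mentions an iterator-based alternative. For reference, a minimal sketch of that route (not part of this PR), assuming boost::python::stl_input_iterator from <boost/python/stl_iterator.hpp> and a hypothetical member-template overload of _parse_tokens; it avoids building the intermediate vector, though each element is still extracted to a std::string:

#include <boost/python/list.hpp>
#include <boost/python/stl_iterator.hpp>
#include <string>

// Hypothetical: pypfp.h would declare a member template in place of the
// std::vector overload:
//   template <typename Iterator>
//   std::string _parse_tokens(Iterator begin, Iterator end);

std::string pypfp::parse_tokens(const boost::python::list& words)
{
  // stl_input_iterator extracts elements on demand; a default-constructed
  // iterator marks the end of the sequence, so no words_vec copy is built.
  boost::python::stl_input_iterator<std::string> it(words), end;
  return _parse_tokens(it, end);
}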
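
For completeness, a sketch of exercising the new binding through Boost.Python's embedding API (also not part of this PR; the "data" directory argument is a placeholder). A Python caller would write the equivalent pfp.Parser(45, "data").parse_tokens(["This", "is", "a", "test", "."]).

#include <boost/python.hpp>
#include <iostream>
#include <string>

int main()
{
  namespace py = boost::python;
  Py_Initialize();  // the pfp module must be importable, e.g. on sys.path
  try
  {
    py::object pfp = py::import("pfp");
    // max_sentence_len and data_dir, as exposed by the init<...> overloads above
    py::object parser = pfp.attr("Parser")(45, "data");
    py::list tokens;
    tokens.append("This");
    tokens.append("is");
    tokens.append("a");
    tokens.append("test");
    tokens.append(".");
    // parse_tokens returns the parse tree rendered as a string
    std::string tree = py::extract<std::string>(parser.attr("parse_tokens")(tokens));
    std::cout << tree << std::endl;
  }
  catch (const py::error_already_set&)
  {
    PyErr_Print();  // e.g. TypeError from extract if a token is not a string
  }
  return 0;
}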