This repository has been archived by the owner on Apr 24, 2020. It is now read-only.

Parse tokens #8

Merged (1 commit, Jan 10, 2012)
include/pypfp/pypfp.h (4 additions, 0 deletions)

@@ -33,6 +33,7 @@ class pypfp
   boost::shared_ptr<workspace> pworkspace_;

   void init(size_t sentence_length = 45, const std::string & data_dir = "");
+  std::string _parse_tokens(const std::vector<std::string>& words);

 public:

@@ -43,6 +44,9 @@
   pypfp(size_t sentence_length, const std::string & data_dir);

   std::string parse(const std::string & sentence);
+
+  std::string parse_tokens(const boost::python::list& words);
+
 };

 }}} // com::wavii::pfp
src/pypfp/pypfp.cpp (31 additions, 8 deletions)

@@ -68,14 +68,12 @@ void pypfp::init(size_t sentence_length /*= 45*/, const std::string & data_dir /*= ""*/)
   pworkspace_.reset(new workspace(sentence_length, states_.size()));
 }

-std::string pypfp::parse(const std::string & sentence)
+std::string pypfp::_parse_tokens(const std::vector<std::string>& words)
 {
-  // now some words
-  std::vector< std::string > words;
   std::vector< std::pair< state_t, float > > state_weight;
   std::vector< std::vector< state_score_t > > sentence_f;
   node result;
-  tokenizer_.tokenize(sentence, words);

   for (std::vector< std::string >::const_iterator it = words.begin(); it != words.end(); ++it)
   {
     state_weight.clear(); lexicon_.score(*it, std::back_inserter(state_weight));
@@ -92,14 +90,39 @@
   // stitch together the results
   std::ostringstream oss;
   stitch(oss, result, words.begin(), states_);
   return oss.str();
 }

+std::string pypfp::parse_tokens(const boost::python::list& words)
+{
+  std::vector<std::string> words_vec;
+  size_t len = boost::python::len(words);
+
+  // makes a copy of the content. Yes, I could avoid it by iterating, but then
+  // I'd have to use stl_iterator and make parse_tokens templated.
+  // It's not really worth the hassle.
+  for (size_t i = 0; i != len; ++i)
+    words_vec.push_back(boost::python::extract<std::string>(words[i]));
+
+  return _parse_tokens(words_vec);
+}
+
+std::string pypfp::parse(const std::string & sentence)
+{
+  // now some words
+  std::vector< std::string > words;
+  tokenizer_.tokenize(sentence, words);
+  return _parse_tokens(words);
+}
+
 BOOST_PYTHON_MODULE(pfp)
 {
   class_<pypfp, boost::noncopyable>("Parser", init<>())
-    .def(init<size_t>())
-    .def(init<size_t, const std::string &>())
-    .def("parse", &pypfp::parse)
+    .def(init<size_t>(boost::python::args("max_sentence_len")))
+    .def(init<size_t, const std::string &>(boost::python::args("max_sentence_len", "data_dir")))
+    .def("parse", &pypfp::parse, boost::python::args("self", "sentence"),
+      "Will parse the given sentence")
+    .def("parse_tokens", &pypfp::parse_tokens, boost::python::args("self", "tokens"),
+      "Will parse the given tokens list")
+    ;
 }
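
The comment inside parse_tokens mentions an iterator-based alternative. For reference, a minimal sketch of that route (not part of this PR), assuming boost::python::stl_input_iterator from <boost/python/stl_iterator.hpp> and a hypothetical member-template overload of _parse_tokens; it avoids building the intermediate vector, though each element is still extracted to a std::string:

#include <boost/python/list.hpp>
#include <boost/python/stl_iterator.hpp>
#include <string>

// Hypothetical: pypfp.h would declare a member template in place of the
// std::vector overload:
//   template <typename Iterator>
//   std::string _parse_tokens(Iterator begin, Iterator end);

std::string pypfp::parse_tokens(const boost::python::list& words)
{
  // stl_input_iterator extracts elements on demand; a default-constructed
  // iterator marks the end of the sequence, so no words_vec copy is built.
  boost::python::stl_input_iterator<std::string> it(words), end;
  return _parse_tokens(it, end);
}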
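
For completeness, a sketch of exercising the new binding through Boost.Python's embedding API (also not part of this PR; the "data" directory argument is a placeholder). A Python caller would write the equivalent pfp.Parser(45, "data").parse_tokens(["This", "is", "a", "test", "."]).

#include <boost/python.hpp>
#include <iostream>
#include <string>

int main()
{
  namespace py = boost::python;
  Py_Initialize();  // the pfp module must be importable, e.g. on sys.path
  try
  {
    py::object pfp = py::import("pfp");
    // max_sentence_len and data_dir, as exposed by the init<...> overloads above
    py::object parser = pfp.attr("Parser")(45, "data");
    py::list tokens;
    tokens.append("This");
    tokens.append("is");
    tokens.append("a");
    tokens.append("test");
    tokens.append(".");
    // parse_tokens returns the parse tree rendered as a string
    std::string tree = py::extract<std::string>(parser.attr("parse_tokens")(tokens));
    std::cout << tree << std::endl;
  }
  catch (const py::error_already_set&)
  {
    PyErr_Print();  // e.g. TypeError from extract if a token is not a string
  }
  return 0;
}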