Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Letor updates #324

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion xapian-letor/api/featurelist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ FeatureList::normalise(std::vector<FeatureVector>& fvec) const
std::vector<FeatureVector>
FeatureList::create_feature_vectors(const Xapian::MSet & mset,
const Xapian::Query & letor_query,
const Xapian::Database & letor_db) const
const Xapian::Database& letor_db,
bool flag, double bias) const
{
LOGCALL(API, std::vector<FeatureVector>, "FeatureList::create_feature_vectors", mset | letor_query | letor_db);
if (mset.empty())
Expand All @@ -121,6 +122,7 @@ FeatureList::create_feature_vectors(const Xapian::MSet & mset,
// Append feature values
fvals.insert(fvals.end(), values.begin(), values.end());
}
if (flag) fvals.push_back(bias);
double wt = i.get_weight();
// Weight is added as a feature by default.
fvals.push_back(wt);
Expand Down
3 changes: 2 additions & 1 deletion xapian-letor/include/xapian-letor/featurelist.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ class XAPIAN_VISIBILITY_DEFAULT FeatureList {
std::vector<Xapian::FeatureVector>
create_feature_vectors(const Xapian::MSet & mset,
const Xapian::Query & letor_query,
const Xapian::Database & letor_db) const;
const Xapian::Database& letor_db, bool flag = false,
double bias = 1.0) const;

private:
/// Perform query-level normalisation of FeatureVectors.
Expand Down
6 changes: 5 additions & 1 deletion xapian-letor/include/xapian-letor/ranker.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ prepare_training_file(const std::string & db_path,
const std::string & qrel_file,
Xapian::doccount msetsize,
const std::string & filename,
const Xapian::FeatureList & flist = FeatureList());
const Xapian::FeatureList& flist = FeatureList(),
bool flag = false, double bias = 1.0);

class XAPIAN_VISIBILITY_DEFAULT Ranker : public Xapian::Internal::intrusive_base {
/// Path to Xapian::Database instance to be used.
Expand Down Expand Up @@ -191,6 +192,9 @@ class XAPIAN_VISIBILITY_DEFAULT Ranker : public Xapian::Internal::intrusive_base
const Xapian::FeatureList & flist = Xapian::FeatureList());

protected:
/// Initialize the parameters for neural network with Xavier initialization.
std::vector<double> xavier_initialisation(int feature_cnt);

/// Method to train the model. Overridden in ranker subclass.
virtual void
train(const std::vector<std::vector<FeatureVector>>& training_data) = 0;
Expand Down
7 changes: 2 additions & 5 deletions xapian-letor/ranker/listmle_ranker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,8 @@ ListMLERanker::train(const vector<vector<FeatureVector>>& training_data)
throw InvalidArgumentError("Cannot train: no training data");
int feature_cnt = training_data[0][0].get_fcount();

// Initialize the parameters for neural network
vector<double> new_parameters;
for (int feature_num = 0; feature_num < feature_cnt; ++feature_num) {
new_parameters.push_back(0.0);
}
// Initialize the parameters for neural network with Xavier initialization.
vector<double> new_parameters = xavier_initialisation(feature_cnt);

for (auto& item1 : training_data) {
for (auto& item2 : item1) {
Expand Down
4 changes: 2 additions & 2 deletions xapian-letor/ranker/listnet_ranker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ ListNETRanker::train(const vector<vector<Xapian::FeatureVector>>& training_data)
throw InvalidArgumentError("Cannot train: no training data");
int feature_cnt = training_data[0][0].get_fcount();

// initialize the parameters for neural network
vector<double> new_parameters(feature_cnt, 0.0);
// Initialize the parameters for neural network with Xavier initialization.
vector<double> new_parameters = xavier_initialisation(feature_cnt);

for (auto& item1 : training_data) {
for (auto& item2 : item1) {
Expand Down
54 changes: 31 additions & 23 deletions xapian-letor/ranker/ranker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include <fstream>
#include <iterator>
#include <map>
#include <random>
#include <sstream>
#include <string>
#include <utility>
Expand Down Expand Up @@ -239,6 +240,10 @@ initialise_queryparser(const Xapian::Database & db)
Xapian::QueryParser parser;
parser.add_prefix("title", "S");
parser.add_prefix("subject", "S");
parser.add_prefix("description", "XD");
parser.add_prefix("", "");
parser.add_prefix("", "S");
parser.add_prefix("", "XD");
parser.set_database(db);
parser.set_default_op(Xapian::Query::OP_OR);
parser.set_stemmer(stemmer);
Expand All @@ -249,8 +254,11 @@ initialise_queryparser(const Xapian::Database & db)

void
Xapian::prepare_training_file(const string & db_path, const string & queryfile,
const string & qrel_file, Xapian::doccount msetsize,
const string & filename, const FeatureList & flist)
const string& qrel_file,
Xapian::doccount msetsize,
const string& filename,
const FeatureList& flist,
bool flag, double bias)
{
// Set db
Xapian::Database letor_db(db_path);
Expand Down Expand Up @@ -285,24 +293,17 @@ Xapian::prepare_training_file(const string & db_path, const string & queryfile,
throw Xapian::LetorParseError("Query id should be unique");
}

Xapian::Query query_no_prefix = parser.parse_query(querystr,
parser.FLAG_DEFAULT|
parser.FLAG_SPELLING_CORRECTION);
// query with 'title' field as default prefix "S"
Xapian::Query query_default_prefix = parser.parse_query(querystr,
parser.FLAG_DEFAULT|
parser.FLAG_SPELLING_CORRECTION,
"S");
// Combine queries
Xapian::Query query = Xapian::Query(Xapian::Query::OP_OR, query_no_prefix, query_default_prefix);
Xapian::Query query = parser.parse_query(querystr);

Xapian::Enquire enquire(letor_db);
enquire.set_query(query);
Xapian::MSet mset = enquire.get_mset(0, msetsize);

vector<FeatureVector> fvv_mset = flist.create_feature_vectors(mset,
query,
letor_db);
letor_db,
flag,
bias);
vector<FeatureVector> fvv_qrel;
int k = 0;
for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
Expand Down Expand Up @@ -396,6 +397,22 @@ Ranker::rank(Xapian::MSet & mset, const string & model_key, const Xapian::Featur
mset.sort_by_relevance();
}

vector<double>
Ranker::xavier_initialisation(int feature_cnt)
{
// Construct a trivial random generator engine:
// 469382313 is a random number for which we are getting the best
// performance of letor against standard benchmark datasets.
minstd_rand0 generator(469382313);
normal_distribution<double> distribution(0.0,
sqrt(2.0 / (1 + feature_cnt)));
vector<double> new_parameters;
for (int feature_num = 0; feature_num < feature_cnt; ++feature_num) {
new_parameters.push_back(distribution(generator));
}
return new_parameters;
}

void
Ranker::train_model(const std::string & input_filename, const std::string & model_key)
{
Expand Down Expand Up @@ -451,16 +468,7 @@ Ranker::score(const string & query_file, const string & qrel_file,
string querystr = parsed_query.first;
string qid = parsed_query.second;

Xapian::Query query_no_prefix = parser.parse_query(querystr,
parser.FLAG_DEFAULT|
parser.FLAG_SPELLING_CORRECTION);
// query with 'title' field as default prefix "S"
Xapian::Query query_default_prefix = parser.parse_query(querystr,
parser.FLAG_DEFAULT|
parser.FLAG_SPELLING_CORRECTION,
"S");
// Combine queries
Xapian::Query query = Xapian::Query(Xapian::Query::OP_OR, query_no_prefix, query_default_prefix);
Xapian::Query query = parser.parse_query(querystr);

Xapian::Enquire enquire(letor_db);
enquire.set_query(query);
Expand Down
1 change: 1 addition & 0 deletions xapian-letor/tests/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,5 @@
/ndcg_score_output.txt
/training_output.txt
/training_output1.txt
/training_output_data_bias.txt
/training_output_three_correct.txt
2 changes: 2 additions & 0 deletions xapian-letor/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ EXTRA_DIST +=\
testdata/querythree.txt \
testdata/score_qrel.txt \
testdata/training_data.txt \
testdata/training_data_bias.txt \
testdata/training_data_ndcg.txt \
testdata/training_data_one_document.txt \
testdata/training_data_three_correct.txt \
Expand All @@ -117,6 +118,7 @@ CLEANFILES +=\
ndcg_output_listnet_3.txt \
ndcg_score_output.txt \
ndcg_score_test.txt \
training_output_data_bias.txt \
training_output_data_one_doc.txt \
training_output_empty.txt \
training_output_three_correct.txt \
Expand Down
59 changes: 57 additions & 2 deletions xapian-letor/tests/api_letor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,59 @@ DEFINE_TESTCASE(preparetrainingfileonedb, generated && path && writable)
unlink("training_output_data_one_doc.txt");
}

DEFINE_TESTCASE(preparetrainingfile_with_bias, generated && path && writable)
{
vector<Xapian::Feature*> flist;
flist.push_back(new Xapian::TfFeature());
flist.push_back(new Xapian::TfDoclenFeature());
flist.push_back(new Xapian::IdfFeature());
flist.push_back(new Xapian::CollTfCollLenFeature());
flist.push_back(new Xapian::TfIdfDoclenFeature());
flist.push_back(new Xapian::TfDoclenCollTfCollLenFeature());
string db_path = get_database_path("apitest_listnet_ranker1",
db_index_one_document);
string data_directory = test_driver::get_srcdir() + "/testdata/";
string query = data_directory + "queryone.txt";
string qrel = data_directory + "qrelone.txt";
string training_data = data_directory + "training_data_bias.txt";
unlink("training_output_data_bias.txt");
Xapian::prepare_training_file(db_path, query, qrel, 10,
"training_output_data_bias.txt", flist, true);
TEST(file_exists("training_output_data_bias.txt"));
ifstream if1(training_data);
ifstream if2("training_output_data_bias.txt");
string line1;
string line2;
while (getline(if1, line1)) {
TEST(getline(if2, line2));
istringstream iss1(line1);
istringstream iss2(line2);
string temp1;
string temp2;
int i = 0;
while ((iss1 >> temp1) && (iss2 >> temp2)) {
// The 0th, 1st and 22nd literals taken as input, are strings,
// and can be compared directly, They are: For example(test):
// ("1", "qid:20001" and "#docid=1") at 0th, 1st, and 22nd pos
// respectively. Whereas the other values are doubles which
// would have to tested under TEST_DOUBLE() against precision.
if (i == 0 || i == 1 || i == 22) {
TEST_EQUAL(temp1, temp2);
} else {
size_t t1 = temp1.find_first_of(':');
size_t t2 = temp2.find_first_of(':');
TEST_EQUAL_DOUBLE(stod(temp1.substr(t1 + 1)),
stod(temp2.substr(t2 + 1)));
}
i++;
}
TEST_REL(i, ==, 23);
TEST(!(iss2 >> temp2));
}
TEST(!getline(if2, line2));
unlink("training_output_data_bias.txt");
}

#define TEST_PARSE_EXCEPTION(TESTFILE) TEST_EXCEPTION(Xapian::LetorParseError,\
Xapian::prepare_training_file(db_path,\
data_directory + TESTFILE, qrel, 10,\
Expand Down Expand Up @@ -516,6 +569,7 @@ DEFINE_TESTCASE(preparetrainingfilethree, generated && path)
// ListNet_Ranker check
DEFINE_TESTCASE(listnet_ranker, generated && path && writable)
{
XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
Xapian::ListNETRanker ranker;
TEST_EXCEPTION(Xapian::FileNotFoundError, ranker.train_model(""));
string db_path = get_database_path("db_index_two_documents",
Expand Down Expand Up @@ -666,8 +720,8 @@ DEFINE_TESTCASE(scorer, generated && path && writable)
Xapian::MSet mymset = enquire.get_mset(0, 10);
string data_directory = test_driver::get_srcdir() + "/testdata/";
string query = data_directory + "querythree.txt";
string qrel = data_directory + "qrelthree_correct.txt";
string training_data = data_directory + "training_data_three_correct.txt";
string qrel = data_directory + "score_qrel.txt";
string training_data = data_directory + "training_data_ndcg.txt";
ranker.set_database_path(db_path);
TEST_EQUAL(ranker.get_database_path(), db_path);
ranker.set_query(Xapian::Query("score"));
Expand Down Expand Up @@ -696,6 +750,7 @@ DEFINE_TESTCASE(scorer, generated && path && writable)
// ListMLE_Ranker check
DEFINE_TESTCASE(listmle_ranker, generated && path && writable)
{
XFAIL_FOR_BACKEND("multi", "Testcase fails with multidatabase");
Xapian::ListMLERanker ranker;
TEST_EXCEPTION(Xapian::FileNotFoundError, ranker.train_model(""));
string db_path = get_database_path("db_index_two_documents",
Expand Down
4 changes: 2 additions & 2 deletions xapian-letor/tests/testdata/training_data.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
1 qid:20001 1:1 2:1 3:1 4:0.845488 5:1 6:1 7:-0.0841209 8:-0.0841209 9:-0.168242 10:1 11:1 12:1 13:-0.0129367 14:-0.00119658 15:-0.00155454 16:0.862496 17:1 18:1 19:1 #docid=1
2 qid:20001 1:1 2:1 3:1 4:1 5:0.684948 6:0.692569 7:-0.0841209 8:-0.0841209 9:-0.168242 10:1 11:1 12:1 13:-0.0155709 14:-0.000817232 15:-0.00107417 16:1 17:0.759881 18:0.766323 19:0.937812 #docid=2
1 qid:20001 1:1 2:1 3:1 4:0.845488 5:1 6:1 7:-0.0841209 8:-0.168242 9:-0.252363 10:1 11:1 12:1 13:-0.0129367 14:-0.00199393 15:-0.00233163 16:0.862496 17:1 18:1 19:1 #docid=1
2 qid:20001 1:1 2:1 3:1 4:1 5:0.684726 6:0.692481 7:-0.0841209 8:-0.168242 9:-0.252363 10:1 11:1 12:1 13:-0.0155709 14:-0.00136188 15:-0.00161117 16:1 17:0.75991 18:0.766305 19:0.938816 #docid=2
1 change: 1 addition & 0 deletions xapian-letor/tests/testdata/training_data_bias.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
4 qid:110001 1:1 2:1 3:1 4:1 5:1 6:1 7:-0.155541 8:-0.311083 9:-0.466624 10:1 11:1 12:1 13:-0.0269673 14:-0.00573887 15:-0.0063153 16:1 17:1 18:1 19:1 20:1 #docid=1
4 changes: 2 additions & 2 deletions xapian-letor/tests/testdata/training_data_ndcg.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
3 qid:1 1:0 2:1 3:0.760188 4:0 5:0.925687 6:0.846727 7:0 8:0 9:0 10:1 11:1 12:1 13:0 14:0 15:0 16:0 17:0.948648 18:0.652527 19:0.63916 #docid=1
5 qid:1 1:1 2:0 3:0.673239 4:1 5:0 6:0.749326 7:0 8:0 9:0 10:1 11:1 12:1 13:0 14:0 15:0 16:1 17:0 18:1 19:0.622809 #docid=2
1 qid:20001 1:1 2:1 3:1 4:0.845488 5:1 6:1 7:-0.0841209 8:-0.168242 9:-0.252363 10:1 11:1 12:1 13:-0.0129367 14:-0.00199393 15:-0.00233163 16:0.862496 17:1 18:1 19:1 #docid=1
2 qid:20001 1:1 2:1 3:1 4:1 5:0.684726 6:0.692481 7:-0.0841209 8:-0.168242 9:-0.252363 10:1 11:1 12:1 13:-0.0155709 14:-0.00136188 15:-0.00161117 16:1 17:0.75991 18:0.766305 19:0.938816 #docid=2
2 changes: 1 addition & 1 deletion xapian-letor/tests/testdata/training_data_one_document.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
4 qid:110001 1:1 2:1 3:1 4:1 5:1 6:1 7:-0.155541 8:-0.155541 9:-0.311083 10:1 11:1 12:1 13:-0.0269673 14:-0.00143116 15:-0.00210018 16:1 17:1 18:1 19:1 #docid=1
4 qid:110001 1:1 2:1 3:1 4:1 5:1 6:1 7:-0.155541 8:-0.311083 9:-0.466624 10:1 11:1 12:1 13:-0.0269673 14:-0.00573887 15:-0.0063153 16:1 17:1 18:1 19:1 #docid=1
6 changes: 3 additions & 3 deletions xapian-letor/tests/testdata/training_data_three_correct.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
1 qid:1 1:0.356207 2:1 3:1 4:0.190267 5:1 6:1 7:0 8:0 9:0 10:1 11:1 12:1 13:0 14:0 15:0 16:0.232356 17:1 18:0.934444 19:1 #docid=3
3 qid:1 1:0 2:1 3:0.760188 4:0 5:0.925687 6:0.846727 7:0 8:0 9:0 10:1 11:1 12:1 13:0 14:0 15:0 16:0 17:0.948648 18:0.652527 19:0.63916 #docid=1
5 qid:1 1:1 2:0 3:0.673239 4:1 5:0 6:0.749326 7:0 8:0 9:0 10:1 11:1 12:1 13:0 14:0 15:0 16:1 17:0 18:1 19:0.622809 #docid=2
1 qid:1 1:0.356207 2:0.954294 3:1 4:0.190267 5:0.998006 6:1 7:0 8:0 9:0 10:1 11:1 12:1 13:0 14:0 15:0 16:0.232356 17:0.979061 18:1 19:1 #docid=3
3 qid:1 1:0 2:1 3:0.886478 4:0 5:1 6:0.952881 7:0 8:0 9:0 10:1 11:1 12:1 13:0 14:0 15:0 16:0 17:1 18:0.868281 19:0.784878 #docid=1
5 qid:1 1:1 2:0 3:0.432443 4:1 5:0 6:0.517255 7:0 8:0 9:0 10:1 11:1 12:1 13:0 14:0 15:0 16:1 17:0 18:0.642348 19:0.395108 #docid=2