Support for scoring results! Added Apache license, misc fixes

Commit 04c31e46da01d124c84a56d6576d30be605b9510 (1 parent: 150fd23), committed by T Jake Luciani on May 13, 2010
@@ -67,15 +67,17 @@
public static final String positionVectorKey = "Position";
public static final String offsetVectorKey = "Offsets";
public static final String termFrequencyKey = "Frequencies";
+ public static final String normsKey = "Norms";
- public static final List<Integer> emptyArray = Arrays.asList( new Integer[]{0} );
+ public static final byte[] emptyByteArray = new byte[]{};
+ public static final List<Number> emptyArray = Arrays.asList( new Number[]{0} );
public static final byte delimeterBytes[] = new byte[]{(byte)255,(byte)255,(byte)255,(byte)255};
public static final String delimeter = new String(delimeterBytes);
public static final String finalToken = new String("\ufffe\ufffe");
public static final String documentIdField = delimeter+"KEY"+delimeter;
public static final String documentMetaField = delimeter+"META"+delimeter;
- public static final ColumnPath metaColumnPath = new ColumnPath(CassandraUtils.docColumnFamily).setColumn(documentMetaField.getBytes());
+ public static final ColumnPath metaColumnPath = new ColumnPath(CassandraUtils.docColumnFamily).setColumn(documentMetaField.getBytes());
public static final String hashChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
@@ -168,11 +170,18 @@ public static final int byteArrayToInt(byte[] b) {
return (b[0] << 24) + ((b[1] & 0xFF) << 16) + ((b[2] & 0xFF) << 8) + (b[3] & 0xFF);
}
- public static final byte[] intVectorToByteArray(List<Integer> intVector) {
+ public static final byte[] intVectorToByteArray(List<Number> intVector) {
+
+ if(intVector.size() == 0)
+ return emptyByteArray;
+
+ if(intVector.get(0) instanceof Byte)
+ return new byte[]{intVector.get(0).byteValue()};
+
ByteBuffer buffer = ByteBuffer.allocate(4 * intVector.size());
- for (int i : intVector) {
- buffer.putInt(i);
+ for (Number i : intVector) {
+ buffer.putInt(i.intValue());
}
return buffer.array();
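
A minimal sketch of the encoding this hunk implements, using the patch's own logic (the standalone class and sample values are illustrative): position and offset vectors pack as 4-byte big-endian ints, while a norm list whose first element is a Byte collapses to a single byte.

    import java.nio.ByteBuffer;
    import java.util.Arrays;
    import java.util.List;

    public class VectorCodecSketch {
        //Mirrors the patched intVectorToByteArray: Bytes collapse to one byte,
        //anything else packs as 4-byte big-endian ints.
        static byte[] encode(List<Number> v) {
            if (v.isEmpty()) return new byte[]{};
            if (v.get(0) instanceof Byte) return new byte[]{v.get(0).byteValue()};
            ByteBuffer buf = ByteBuffer.allocate(4 * v.size());
            for (Number n : v) buf.putInt(n.intValue());
            return buf.array();
        }

        public static void main(String[] args) {
            byte[] positions = encode(Arrays.<Number>asList(3, 17, 42)); //12 bytes
            byte[] norm = encode(Arrays.<Number>asList((byte) 124));     //1 byte
            System.out.println(positions.length + " / " + norm.length);
        }
    }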
@@ -243,7 +252,7 @@ public static final UUID readUUID(byte[] bytes) {
}
- public static void addToMutationMap(Map<String,Map<String,List<Mutation>>> mutationMap, String columnFamily, byte[] column, String key, byte[] value, Map<String,List<Integer>> superColumns){
+ public static void addToMutationMap(Map<String,Map<String,List<Mutation>>> mutationMap, String columnFamily, byte[] column, String key, byte[] value, Map<String,List<Number>> superColumns){
@@ -292,7 +301,8 @@ public static void addToMutationMap(Map<String,Map<String,List<Mutation>>> mutat
sc.setName(column);
sc.setColumns(columns);
- for(Map.Entry<String, List<Integer>> e : superColumns.entrySet()){
+ for(Map.Entry<String, List<Number>> e : superColumns.entrySet()){
+
columns.add(new Column(e.getKey().getBytes(), intVectorToByteArray(e.getValue()), System.currentTimeMillis()));
}
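
For orientation, a hedged sketch of the payload addToMutationMap now receives per term and document; the sub-column keys are the real constants from the first hunk, while the frequency and position values are made up:

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class TermPayloadSketch {
        public static void main(String[] args) {
            //One term's per-document payload as the patch assembles it:
            //frequencies/positions as int lists, the norm as a single byte.
            Map<String, List<Number>> termInfo = new HashMap<String, List<Number>>();
            termInfo.put("Frequencies", Arrays.<Number>asList(2));
            termInfo.put("Position", Arrays.<Number>asList(5, 9));
            termInfo.put("Norms", Arrays.<Number>asList((byte) 124));
            //each entry becomes a sub-column of the document's SuperColumn via
            //intVectorToByteArray, so the norm costs one byte per term occurrence
            System.out.println(termInfo.keySet());
        }
    }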
@@ -29,11 +29,13 @@
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.cassandra.thrift.Cassandra;
+import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
+import org.apache.cassandra.thrift.SuperColumn;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
@@ -50,22 +52,20 @@
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
-import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import solandra.SolandraFieldSelector;
-import com.sun.servicetag.UnauthorizedAccessException;
-
public class IndexReader extends org.apache.lucene.index.IndexReader {
- private final static int numDocs = 1000000;
- private final static byte[] norms = new byte[numDocs];
+ private final static int numDocs = 100000;
+ private Map<String,byte[]> fieldNorms = new HashMap<String, byte[]>();
private final static Directory mockDirectory = new RAMDirectory();
static {
- Arrays.fill(norms, DefaultSimilarity.encodeNorm(1.0f));
+ //Arrays.fill(norms, DefaultSimilarity.encodeNorm(1.0f));
try {
new IndexWriter(mockDirectory, new SimpleAnalyzer(), true, MaxFieldLength.LIMITED);
@@ -130,12 +130,12 @@ protected void doCommit() throws IOException {
@Override
protected void doDelete(int arg0) throws CorruptIndexException, IOException {
- // throw new UnsupportedOperationException();
+
}
@Override
protected void doSetNorm(int arg0, String arg1, byte arg2) throws CorruptIndexException, IOException {
- // throw new UnsupportedOperationException();
+
}
@Override
@@ -247,6 +247,12 @@ public Document document(int docNum, FieldSelector selector) throws CorruptIndex
Field field = null;
String fieldName = new String(col.column.name);
+ //In case __META__ slips through
+ if(fieldName.equals(CassandraUtils.documentMetaField)){
+ logger.debug("Filtering out __META__ key");
+ continue;
+ }
+
byte[] value;
if (col.column.value[col.column.value.length - 1] != Byte.MAX_VALUE && col.column.value[col.column.value.length - 1] != Byte.MIN_VALUE) {
@@ -341,14 +347,16 @@ public int maxDoc() {
}
@Override
- public byte[] norms(String term) throws IOException {
- return norms;
+ public byte[] norms(String field) throws IOException {
+ return fieldNorms.get(field);
}
@Override
public void norms(String arg0, byte[] arg1, int arg2) throws IOException {
// TODO Auto-generated method stub
+ throw new RuntimeException("norms(String, byte[], int) not supported");
+
}
@Override
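
The single-byte norms flowing through these methods use Lucene's lossy float-to-byte codec (Similarity.encodeNorm/decodeNorm, static methods in the Lucene 2.9/3.x line this code compiles against); a small round-trip demo:

    import org.apache.lucene.search.Similarity;

    public class NormCodecDemo {
        public static void main(String[] args) {
            byte b = Similarity.encodeNorm(1.0f); //124 in Lucene's 3-bit-mantissa table
            float back = Similarity.decodeNorm(b);
            System.out.println(b + " -> " + back); //lossy but stable round-trip
        }
    }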
@@ -387,11 +395,11 @@ public TermEnum terms(Term term) throws IOException {
return termEnum;
}
- public int addDocument(byte[] docId) {
+ public int addDocument(SuperColumn docInfo, String field) {
String id;
try {
- id = new String(docId, "UTF-8");
+ id = new String(docInfo.name, "UTF-8");
} catch (UnsupportedEncodingException e) {
throw new IllegalStateException("Cant make docId a string");
}
@@ -406,11 +414,49 @@ public int addDocument(byte[] docId) {
docIdToDocIndex.put(id, idx);
docIndexToDocId.put(idx, id);
+
+ Byte norm = null;
+ for(Column c : docInfo.columns){
+ if(Arrays.equals(c.name, CassandraUtils.normsKey.getBytes())){
+ if(c.value.length != 1)
+ throw new IllegalStateException("Norm for field "+field+" must be a single byte");
+
+ norm = c.value[0];
+ }
+ }
+
+ if(norm == null)
+ norm = Similarity.encodeNorm(1.0f);
+
+ byte[] norms = fieldNorms.get(field);
+
+ if(norms == null)
+ norms = new byte[1];
+
+ byte[] _norms = new byte[norms.length+1];
+ System.arraycopy(norms, 0, _norms, 0, norms.length);
+
+ //grow the array by one and store this document's norm in the new last slot
+ _norms[norms.length] = norm;
+
+ fieldNorms.put(field, _norms);
+
}
return idx;
}
-
+
+ public int getDocumentNumber(byte[] docId){
+ String id;
+ try {
+ id = new String(docId, "UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ throw new IllegalStateException("Can't make docId a string");
+ }
+
+ return docIdToDocIndex.get(id);
+ }
+
public String getDocumentId(int docNum) {
return docIndexToDocId.get(docNum);
}
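
One design note on addDocument above: every new document reallocates and copies the whole per-field norms array, which is quadratic across a large result set. A hedged alternative (not what the patch does) would grow the buffer geometrically:

    import java.util.Arrays;

    public class NormsBuffer {
        private byte[] norms = new byte[16];
        private int count = 0;

        //Amortized O(1) append instead of a copy per document.
        void append(byte norm) {
            if (count == norms.length) norms = Arrays.copyOf(norms, norms.length * 2);
            norms[count++] = norm;
        }

        byte[] toArray() { return Arrays.copyOf(norms, count); }
    }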
@@ -104,7 +104,7 @@ public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexExce
}
// collect term information per field
- Map<String, Map<String,List<Integer>>> allTermInformation = new HashMap<String, Map<String,List<Integer>>>();
+ Map<String, Map<String,List<Number>>> allTermInformation = new HashMap<String, Map<String,List<Number>>>();
int lastOffset = 0;
if (position > 0) {
@@ -129,42 +129,47 @@ public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexExce
TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
+ //store the norm for this field per term per document rather than per field.
+ //this costs more writes but makes reads cheaper on the query side
+ int tokensInField = 0;
+
while (tokens.incrementToken()) {
- String term = CassandraUtils.createColumnName(field.name(),termAttribute.term());
+ tokensInField++;
+ String term = CassandraUtils.createColumnName(field.name(),termAttribute.term());
allIndexedTerms.add(term);
//fetch all collected information for this term
- Map<String,List<Integer>> termInfo = allTermInformation.get(term);
+ Map<String,List<Number>> termInfo = allTermInformation.get(term);
if (termInfo == null) {
- termInfo = new HashMap<String,List<Integer>>();
+ termInfo = new HashMap<String,List<Number>>();
allTermInformation.put(term, termInfo);
}
//term frequency
{
- List<Integer> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);
+ List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);
if(termFrequency == null){
- termFrequency = new ArrayList<Integer>();
- termFrequency.add(0);
+ termFrequency = new ArrayList<Number>();
+ termFrequency.add(new Integer(0));
termInfo.put(CassandraUtils.termFrequencyKey, termFrequency);
}
//increment
- termFrequency.set(0, termFrequency.get(0)+1);
+ termFrequency.set(0, termFrequency.get(0).intValue()+1);
}
//position vector
if(field.isStorePositionWithTermVector()){
position += (posIncrAttribute.getPositionIncrement() - 1);
- List<Integer> positionVector = termInfo.get(CassandraUtils.positionVectorKey);
+ List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey);
if(positionVector == null){
- positionVector = new ArrayList<Integer>();
+ positionVector = new ArrayList<Number>();
termInfo.put(CassandraUtils.positionVectorKey, positionVector);
}
@@ -174,9 +179,9 @@ public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexExce
//term offsets
if(field.isStoreOffsetWithTermVector()){
- List<Integer> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
+ List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
if(offsetVector == null){
- offsetVector = new ArrayList<Integer>();
+ offsetVector = new ArrayList<Number>();
termInfo.put(CassandraUtils.offsetVectorKey, offsetVector);
}
@@ -186,13 +191,28 @@ public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexExce
}
}
- for (Map.Entry<String, Map<String,List<Integer>>> term : allTermInformation.entrySet()) {
+ List<Number> bnorm = null;
+ if(!field.getOmitNorms()){
+ bnorm = new ArrayList<Number>();
+ float norm = doc.getBoost();
+ norm *= field.getBoost();
+ norm *= similarity.lengthNorm(field.name(), tokensInField);
+ bnorm.add(Similarity.encodeNorm(norm));
+ }
+
+ for (Map.Entry<String, Map<String,List<Number>>> term : allTermInformation.entrySet()) {
// Terms are stored within a unique key combination
// This is required since cassandra loads all columns
// in a key/column family into memory
String key = indexName + CassandraUtils.delimeter + term.getKey();
+ //Mix the norm for this field in alongside each term:
+ //more writes, but faster reads on the query side.
+ if(!field.getOmitNorms()){
+ term.getValue().put(CassandraUtils.normsKey, bnorm );
+ }
+
CassandraUtils.addToMutationMap(mutationMap, CassandraUtils.termVecColumnFamily, docId.getBytes(), CassandraUtils.hashKey(key), null,term.getValue());
}
}
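
To make the boost arithmetic concrete: with DefaultSimilarity, lengthNorm(field, numTokens) is 1/sqrt(numTokens), so an unboosted document whose 4-token field carries boost 2.0 encodes as 1.0 * 2.0 * 0.5 = 1.0. The field name and numbers below are illustrative:

    import org.apache.lucene.search.DefaultSimilarity;
    import org.apache.lucene.search.Similarity;

    public class BoostNormDemo {
        public static void main(String[] args) {
            Similarity sim = new DefaultSimilarity();
            float norm = 1.0f                        //doc.getBoost()
                       * 2.0f                        //field.getBoost()
                       * sim.lengthNorm("key", 4);   //1/sqrt(4) = 0.5
            System.out.println(Similarity.encodeNorm(norm)); //prints 124
        }
    }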
@@ -204,7 +224,7 @@ public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexExce
String key = indexName + CassandraUtils.delimeter + term;
- Map<String,List<Integer>> termMap = new HashMap<String,List<Integer>>();
+ Map<String,List<Number>> termMap = new HashMap<String,List<Number>>();
termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray);
CassandraUtils.addToMutationMap(mutationMap, CassandraUtils.termVecColumnFamily, docId.getBytes(), CassandraUtils.hashKey(key), null,termMap);
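
For reference, a sketch of the unhashed term row key these mutations target; the delimiter mirrors the constants hunk, while the index name and createColumnName's field/term join are assumptions (hashKey's actual output is internal to CassandraUtils):

    public class TermKeySketch {
        public static void main(String[] args) {
            //Four 0xFF bytes form the delimiter, as in the constants hunk.
            String delimeter = new String(new byte[]{(byte)255,(byte)255,(byte)255,(byte)255});
            String indexName = "bookmarks";              //hypothetical index name
            String term = "key" + delimeter + "example"; //assumed createColumnName layout
            String rowKey = indexName + delimeter + term;
            //the patch then stores under CassandraUtils.hashKey(rowKey)
            System.out.println(rowKey.length());
        }
    }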
@@ -55,7 +55,7 @@ public int doc() {
if (docPosition < 0)
docPosition = 0;
- int docid = indexReader.addDocument(termDocs.get(docPosition).getSuper_column().getName());
+ int docid = indexReader.getDocumentNumber(termDocs.get(docPosition).getSuper_column().getName());
return docid;
}
@@ -65,6 +65,7 @@
private int actualInitSize = -1;
private Term initTerm = null;
private Term chunkBoundryTerm;
+ private String currentField = null;
private int chunkCount = 0;
private final Cassandra.Iface client;
@@ -85,6 +86,8 @@ public boolean skipTo(Term term) throws IOException {
return false;
loadTerms(term);
+
+ currentField = term.field();
return termBuffer.length == 0 ? false : true;
}
@@ -285,7 +288,7 @@ private void loadTerms(Term skipTo) {
Map<Integer, ColumnOrSuperColumn> termDocMap = new HashMap<Integer, ColumnOrSuperColumn>();
for (ColumnOrSuperColumn col : termDocs) {
- int docId = indexReader.addDocument(col.getSuper_column().getName());
+ int docId = indexReader.addDocument(col.getSuper_column(), currentField);
termDocMap.put(docId, col);
docIds[idx++] = docId;
}
@@ -164,6 +164,26 @@ public void testSearch() throws Exception {
assertNotNull(doc.getField("key"));
}
+
+ public void testScore() throws Exception {
+
+ IndexReader indexReader = new IndexReader(indexName, client);
+ IndexSearcher searcher = new IndexSearcher(indexReader);
+
+ QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "key", analyzer);
+ Query q = qp.parse("+key:example");
+
+ TopDocs docs = searcher.search(q, 10);
+
+ assertEquals(2, docs.totalHits);
+
+ Document doc = searcher.doc(docs.scoreDocs[0].doc);
+
+ String fld = doc.getField("key").stringValue();
+ //Highest scoring doc should be the one with higher boost
+ assertEquals("this is another example", fld);
+
+ }
public void testMissingQuery() throws Exception {
