Skip to content
This repository has been archived by the owner on May 1, 2019. It is now read-only.

a few fixes and additions for ZendSearch #8

Closed
wants to merge 8 commits into from
61 changes: 61 additions & 0 deletions library/ZendSearch/Lucene/Analysis/TokenFilter/ShortWordsUtf8.php
Original file line number Original file line Diff line number Diff line change
@@ -0,0 +1,61 @@
<?php
/**
* Zend Framework (http://framework.zend.com/)
*
* @link http://github.com/zendframework/zf2 for the canonical source repository
* @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @package Zend_Search
*/

namespace ZendSearch\Lucene\Analysis\TokenFilter;

use ZendSearch\Lucene\Analysis\Token;
use ZendSearch\Lucene\Exception\ExtensionNotLoadedException;

/**
 * Token filter that discards terms shorter than a configurable number of characters.
 *
 * Term length is measured with mb_strlen() using the UTF-8 encoding, so the
 * mbstring extension must be available.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 */
class ShortWordsUtf8 implements TokenFilterInterface
{
    /**
     * Minimum allowed term length, in characters
     *
     * @var integer
     */
    private $length;

    /**
     * Creates the filter.
     *
     * @param integer $length minimum length a term must have to pass this filter (default 2)
     * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException when mb_strlen() is not available
     */
    public function __construct($length = 2)
    {
        // mb_strlen() comes from the mbstring extension; the filter cannot work without it
        if (!function_exists('mb_strlen')) {
            throw new ExtensionNotLoadedException('Utf8 compatible short words filter needs mbstring extension to be enabled.');
        }

        $this->length = $length;
    }

    /**
     * Passes the token through unchanged, or drops it (by returning null) when its
     * term text is shorter than the configured minimum length.
     *
     * @param \ZendSearch\Lucene\Analysis\Token $srcToken
     * @return \ZendSearch\Lucene\Analysis\Token|null
     */
    public function normalize(Token $srcToken)
    {
        $charCount = mb_strlen($srcToken->getTermText(), 'UTF-8');

        return ($charCount < $this->length) ? null : $srcToken;
    }
}
6 changes: 4 additions & 2 deletions library/ZendSearch/Lucene/Index/SegmentInfo.php
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -1077,8 +1077,9 @@ public function termFreqs(Term $term, $shift = 0, DocsFilter $docsFilter = null)
} }
} else { } else {
$docId += $docDelta/2; $docId += $docDelta/2;
$freq = $frqFile->readVInt();
if (isset($filter[$docId])) { if (isset($filter[$docId])) {
$result[$shift + $docId] = $frqFile->readVInt(); $result[$shift + $docId] = $freq;
$updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
} }
} }
Expand All @@ -1099,8 +1100,9 @@ public function termFreqs(Term $term, $shift = 0, DocsFilter $docsFilter = null)
} }
} else { } else {
$docId += $docDelta/2; $docId += $docDelta/2;
$freq = $frqFile->readVInt();
if (isset($filter[$docId])) { if (isset($filter[$docId])) {
$result[$shift + $docId] = $frqFile->readVInt(); $result[$shift + $docId] = $freq;
$updatedFilterData[$docId] = 1; // 1 is just some constant value, so we don't need additional var dereference here $updatedFilterData[$docId] = 1; // 1 is just some constant value, so we don't need additional var dereference here
} }
} }
Expand Down
4 changes: 2 additions & 2 deletions library/ZendSearch/Lucene/MultiSearcher.php
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ public function count()
$count = 0; $count = 0;


foreach ($this->_indices as $index) { foreach ($this->_indices as $index) {
$count += $this->_indices->count(); $count += $index->count();
} }


return $count; return $count;
Expand Down Expand Up @@ -160,7 +160,7 @@ public function numDocs()
$docs = 0; $docs = 0;


foreach ($this->_indices as $index) { foreach ($this->_indices as $index) {
$docs += $this->_indices->numDocs(); $docs += $index->numDocs();
} }


return $docs; return $docs;
Expand Down
171 changes: 136 additions & 35 deletions library/ZendSearch/Lucene/Search/Query/Phrase.php
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ class Phrase extends AbstractQuery


/** /**
* Term positions (relative positions of terms within the phrase). * Term positions (relative positions of terms within the phrase).
*
* If several terms have the same offset, they will be considered as alternate
* terms for the word at this position, thus making stemming easier (for example).
* NOTE: This feature is supported only with exact search (i.e. slop = 0).
*
* Array of integers * Array of integers
* *
* @var array * @var array
Expand Down Expand Up @@ -143,6 +148,8 @@ public function getSlop()
* Adds a term to the end of the query phrase. * Adds a term to the end of the query phrase.
* The relative position of the term is specified explicitly or the one immediately * The relative position of the term is specified explicitly or the one immediately
* after the last term added. * after the last term added.
* Duplicate offsets can be used to provide several variations for a word (NOT YET
* SUPPORTED IN SLOPPY MODE).
* *
* @param \ZendSearch\Lucene\Index\Term $term * @param \ZendSearch\Lucene\Index\Term $term
* @param integer $position * @param integer $position
Expand Down Expand Up @@ -207,10 +214,59 @@ public function rewrite(Lucene\SearchIndexInterface $index)
*/ */
public function optimize(Lucene\SearchIndexInterface $index) public function optimize(Lucene\SearchIndexInterface $index)
{ {
// Check, that index contains all phrase terms // now look for possible alternate terms at one or more positions
foreach ($this->_terms as $term) { $nbUniqueOffsets = count(array_flip($this->_offsets));
if (!$index->hasTerm($term)) {
return new EmptyResult(); if (count($this->_offsets) > $nbUniqueOffsets) // alts found
{
if ($nbUniqueOffsets == 1)
{
// several terms but all at same offset (for example several stems of a single input word)
$optimizedQuery = new MultiTerm($this->_terms, array_fill(0, count($this->_terms), null));
$optimizedQuery->setBoost($this->getBoost());
return $optimizedQuery->optimize($index);
}

// first, group the query terms according to their offset
$offsetAlts = array_fill_keys($this->_offsets, array());
foreach ($this->_offsets as $termId => $offset)
$offsetAlts[$offset][] = $termId;

// then for each offset, check that the index contains at least one alt
foreach ($offsetAlts as $alts)
{
$check = false;

foreach ($alts as $termId)
{
if ($index->hasTerm($this->_terms[$termId]))
{
$check = true;

// PERFORMANCE NOTE
// we could break here to save hasTerm() calls (costly) but "usually" the time lost here is less than
// what we gain later if we keep processing the alts to unset them if possible (although it heavily
// depends on the index content, stemming efficiency and input queries)
//break;
}
else
{
unset($this->_terms[$termId]);
unset($this->_offsets[$termId]);
}
}

if (!$check)
return new EmptyResult();
}
}
else // only one term per offset
{
// Check, that index contains all phrase terms
foreach ($this->_terms as $term) {
if (!$index->hasTerm($term)) {
return new EmptyResult();
}
} }
} }


Expand Down Expand Up @@ -276,31 +332,61 @@ public function _exactPhraseFreq($docId)
{ {
$freq = 0; $freq = 0;


// Term Id with lowest cardinality // offset with the lowest total (for all alts if any) cardinality
$lowCardTermId = null; $lowCardOffset = 0;
$lowCard = PHP_INT_MAX;


// Calculate $lowCardTermId // group the terms according to their offset, also filtering alts not found in this doc
foreach ($this->_terms as $termId => $term) { $docAlts = array_fill_keys($this->_offsets, array());
if ($lowCardTermId === null || foreach ($this->_offsets as $termId => $offset)
count($this->_termsPositions[$termId][$docId]) < if (isset($this->_termsPositions[$termId][$docId]))
count($this->_termsPositions[$lowCardTermId][$docId]) ) { $docAlts[$offset][] = $termId;
$lowCardTermId = $termId;
}
}


// Walk through positions of the term with lowest cardinality // look for the offset where total cardinality is the lowest
foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) { foreach ($docAlts as $offset => $alts)
// We expect phrase to be found {
$freq++; $card = 0;


// Walk through other terms foreach ($alts as $termId)
foreach ($this->_terms as $termId => $term) { $card += count($this->_termsPositions[$termId][$docId]);
if ($termId != $lowCardTermId) {
$expectedPosition = $lowCardPos + if ($card < $lowCard)
($this->_offsets[$termId] - {
$this->_offsets[$lowCardTermId]); $lowCardOffset = $offset;
$lowCard = $card;
}
}


if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) { // split the term list
$lowCardAlts = $docAlts[$lowCardOffset];
unset($docAlts[$lowCardOffset]);

// Walk through positions of all the alts at the offset with lowest cardinality
foreach ($lowCardAlts as $lowCardTermId)
{
foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos)
{
// We expect phrase to be found
$freq++;

// Walk through other terms
foreach ($docAlts as $offset => $alts)
{
// at least one alt must fulfill each remaining position (other than lowCardPos)
$expectedPosition = $lowCardPos + $offset - $lowCardOffset;
$match = false;

foreach ($alts as $termId)
{
if (in_array($expectedPosition, $this->_termsPositions[$termId][$docId]))
{
$match = true;
break;
}
}

if (!$match)
{
$freq--; // Phrase wasn't found. $freq--; // Phrase wasn't found.
break; break;
} }
Expand Down Expand Up @@ -402,16 +488,32 @@ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
$this->_resVector = array(); $this->_resVector = array();
} }


$offsetDocs = array();

// merge docs ids matching terms at the same offset
foreach ($this->_terms as $termId => $term)
{
$offset = $this->_offsets[$termId];

if (isset($offsetDocs[$offset]))
$offsetDocs[$offset] = array_merge($offsetDocs[$offset], $reader->termDocs($term));
else
$offsetDocs[$offset] = $reader->termDocs($term);

$this->_termsPositions[$termId] = $reader->termPositions($term);
}

$resVectors = array(); $resVectors = array();
$resVectorsSizes = array(); $resVectorsSizes = array();
$resVectorsIds = array(); // is used to prevent arrays comparison $resVectorsIds = array(); // is used to prevent arrays comparison
foreach ($this->_terms as $termId => $term) {
$resVectors[] = array_flip($reader->termDocs($term));
$resVectorsSizes[] = count(end($resVectors));
$resVectorsIds[] = $termId;


$this->_termsPositions[$termId] = $reader->termPositions($term); foreach ($offsetDocs as $offset => $docs)
{
$resVectors[] = array_flip($docs); // also deal with duplicates
$resVectorsSizes[] = count(end($resVectors));
$resVectorsIds[] = $offset;
} }

// sort resvectors in order of subquery cardinality increasing // sort resvectors in order of subquery cardinality increasing
array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC, array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC,
$resVectorsIds, SORT_ASC, SORT_NUMERIC, $resVectorsIds, SORT_ASC, SORT_NUMERIC,
Expand All @@ -422,7 +524,6 @@ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
$this->_resVector = $nextResVector; $this->_resVector = $nextResVector;
} else { } else {
//$this->_resVector = array_intersect_key($this->_resVector, $nextResVector); //$this->_resVector = array_intersect_key($this->_resVector, $nextResVector);

/** /**
* This code is used as workaround for array_intersect_key() slowness problem. * This code is used as workaround for array_intersect_key() slowness problem.
*/ */
Expand All @@ -433,11 +534,11 @@ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
} }
} }
$this->_resVector = $updatedVector; $this->_resVector = $updatedVector;
}


if (count($this->_resVector) == 0) { if (count($this->_resVector) == 0) {
// Empty result set, we don't need to check other terms // Empty result set, we don't need to check other terms
break; break;
}
} }
} }


Expand Down
6 changes: 6 additions & 0 deletions tests/ZendSearch/Lucene/DocumentTest.php
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -176,7 +176,13 @@ public function testHtmlInlineTagsIndexing()
$hits = $index->find('ZendFramework'); $hits = $index->find('ZendFramework');
$this->assertEquals(count($hits), 1); $this->assertEquals(count($hits), 1);


// IMPORTANT : if we want to clean the directory, the instance of Index has to be actually destroyed first,
// so that it releases its file locks. In case of additional indirect references, we need a manual cycle
// of garbage collection to flush the pending objects.
unset($index); unset($index);
unset($hits); // QueryHit instances hold a reference on their owner Index instance
gc_collect_cycles(); // force the destructors to be called right now

$this->_clearDirectory(__DIR__ . '/_index/_files'); $this->_clearDirectory(__DIR__ . '/_index/_files');
} }


Expand Down
Loading