zendframework · luciole75w · Oct 14, 2013 · Oct 14, 2013 · Oct 15, 2013 · Oct 15, 2013
diff --git a/library/ZendSearch/Lucene/Analysis/TokenFilter/ShortWordsUtf8.php b/library/ZendSearch/Lucene/Analysis/TokenFilter/ShortWordsUtf8.php
@@ -0,0 +1,61 @@
+<?php
+/**
+ * Zend Framework (http://framework.zend.com/)
+ *
+ * @link      http://github.com/zendframework/zf2 for the canonical source repository
+ * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
+ * @license   http://framework.zend.com/license/new-bsd New BSD License
+ * @package   Zend_Search
+ */
+
+namespace ZendSearch\Lucene\Analysis\TokenFilter;
+
+use ZendSearch\Lucene\Analysis\Token;
+use ZendSearch\Lucene\Exception\ExtensionNotLoadedException;
+
+/**
+ * Token filter that removes short words (length counted in UTF-8 characters). The minimum length is set in the constructor.
+ *
+ * @category   Zend
+ * @package    Zend_Search_Lucene
+ * @subpackage Analysis
+ */
+class ShortWordsUtf8 implements TokenFilterInterface
+{
+    /**
+     * Minimum allowed term length, in UTF-8 characters; shorter terms are removed by normalize()
+     * @var integer
+     */
+    private $length;
+
+    /**
+     * Constructs a new instance of this filter.
+     *
+     * @param integer $length  minimum allowed length of a term which passes this filter (default 2)
+     * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException if mb_strlen() is not available
+     */
+    public function __construct($length = 2)
+    {
+        $this->length = $length;
+
+        if (!function_exists('mb_strlen')) {
+            // mb_strlen() is missing, i.e. the mbstring extension is disabled or not installed
+            throw new ExtensionNotLoadedException('Utf8 compatible short words filter needs mbstring extension to be enabled.');
+        }
+    }
+
+    /**
+     * Returns the token unchanged, or null (meaning "remove it") if its term text is shorter than the minimum length
+     *
+     * @param \ZendSearch\Lucene\Analysis\Token $srcToken
+     * @return \ZendSearch\Lucene\Analysis\Token|null
+     */
+    public function normalize(Token $srcToken)
+    {
+        if (mb_strlen($srcToken->getTermText(), 'UTF-8') < $this->length) {
+            return null;
+        } else {
+            return $srcToken;
+        }
+    }
+}
diff --git a/library/ZendSearch/Lucene/Index/SegmentInfo.php b/library/ZendSearch/Lucene/Index/SegmentInfo.php
@@ -1077,8 +1077,9 @@ public function termFreqs(Term $term, $shift = 0, DocsFilter $docsFilter = null)
                             }
                         } else {
                             $docId += $docDelta/2;
+                            $freq = $frqFile->readVInt();
                             if (isset($filter[$docId])) {
-                                $result[$shift + $docId] = $frqFile->readVInt();
+                                $result[$shift + $docId] = $freq;
                                 $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
                             }
                         }
@@ -1099,8 +1100,9 @@ public function termFreqs(Term $term, $shift = 0, DocsFilter $docsFilter = null)
                             }
                         } else {
                             $docId += $docDelta/2;
+                            $freq = $frqFile->readVInt();
                             if (isset($filter[$docId])) {
-                                $result[$shift + $docId] = $frqFile->readVInt();
+                                $result[$shift + $docId] = $freq;
                                 $updatedFilterData[$docId] = 1; // 1 is just some constant value, so we don't need additional var dereference here
                             }
                         }

diff --git a/library/ZendSearch/Lucene/MultiSearcher.php b/library/ZendSearch/Lucene/MultiSearcher.php
@@ -132,7 +132,7 @@ public function count()
         $count = 0;
 
         foreach ($this->_indices as $index) {
-            $count += $this->_indices->count();
+            $count += $index->count();
         }
 
         return $count;
@@ -160,7 +160,7 @@ public function numDocs()
         $docs = 0;
 
         foreach ($this->_indices as $index) {
-            $docs += $this->_indices->numDocs();
+            $docs += $index->numDocs();
         }
 
         return $docs;

diff --git a/library/ZendSearch/Lucene/Search/Query/Phrase.php b/library/ZendSearch/Lucene/Search/Query/Phrase.php
@@ -35,6 +35,11 @@ class Phrase extends AbstractQuery
 
     /**
      * Term positions (relative positions of terms within the phrase).
+     *
+     * If several terms have the same offset, they will be considered as alternate
+     * terms for the word at this position, thus making stemming easier (for example).
+     * NOTE: This feature is supported only with exact search (i.e. slop = 0).
+     *
      * Array of integers
      *
      * @var array
@@ -143,6 +148,8 @@ public function getSlop()
      * Adds a term to the end of the query phrase.
      * The relative position of the term is specified explicitly or the one immediately
      * after the last term added.
+     * Duplicate offsets can be used to provide several variations for a word (NOT YET
+     * SUPPORTED IN SLOPPY MODE).
      *
      * @param \ZendSearch\Lucene\Index\Term $term
      * @param integer $position
@@ -207,10 +214,59 @@ public function rewrite(Lucene\SearchIndexInterface $index)
      */
     public function optimize(Lucene\SearchIndexInterface $index)
     {
-        // Check, that index contains all phrase terms
+        // now look for possible alternate terms at one or more positions
-        foreach ($this->_terms as $term) {
+        $nbUniqueOffsets = count(array_flip($this->_offsets));
-            if (!$index->hasTerm($term)) {
+
-                return new EmptyResult();
+        if (count($this->_offsets) > $nbUniqueOffsets) // alts found
+        {
+            if ($nbUniqueOffsets == 1)
+            {
+                // several terms but all at same offset (for example several stems of a single input word)
+                $optimizedQuery = new MultiTerm($this->_terms, array_fill(0, count($this->_terms), null));
+                $optimizedQuery->setBoost($this->getBoost());
+                return $optimizedQuery->optimize($index);
+            }
+
+            // first, group the query terms according to their offset
+            $offsetAlts = array_fill_keys($this->_offsets, array());
+            foreach ($this->_offsets as $termId => $offset)
+                $offsetAlts[$offset][] = $termId;
+
+            // then for each offset, check that the index contains at least one alt
+            foreach ($offsetAlts as $alts)
+            {
+                $check = false;
+
+                foreach ($alts as $termId)
+                {
+                    if ($index->hasTerm($this->_terms[$termId]))
+                    {
+                        $check = true;
+
+                        // PERFORMANCE NOTE
+                        // we could break here to save hasTerm() calls (costly) but "usually" the time lost here is less than
+                        // what we gain later if we keep processing the alts to unset them if possible (although it heavily
+                        // depends on the index content, stemming efficiency and input queries)
+                        //break;
+                    }
+                    else
+                    {
+                        unset($this->_terms[$termId]);
+                        unset($this->_offsets[$termId]);
+                    }
+                }
+
+                if (!$check)
+                    return new EmptyResult();
+            }
+        }
+        else // only one term per offset
+        {
+            // Check, that index contains all phrase terms
+            foreach ($this->_terms as $term) {
+                if (!$index->hasTerm($term)) {
+                    return new EmptyResult();
+                }
             }
         }
 
@@ -276,31 +332,61 @@ public function _exactPhraseFreq($docId)
     {
         $freq = 0;
 
-        // Term Id with lowest cardinality
+        // offset with the lowest total (for all alts if any) cardinality
-        $lowCardTermId = null;
+        $lowCardOffset = 0;
+        $lowCard = PHP_INT_MAX;
 
-        // Calculate $lowCardTermId
+        // group the terms according to their offset, also filtering alts not found in this doc
-        foreach ($this->_terms as $termId => $term) {
+        $docAlts = array_fill_keys($this->_offsets, array());
-            if ($lowCardTermId === null ||
+        foreach ($this->_offsets as $termId => $offset)
-                count($this->_termsPositions[$termId][$docId]) <
+            if (isset($this->_termsPositions[$termId][$docId]))
-                count($this->_termsPositions[$lowCardTermId][$docId]) ) {
+                $docAlts[$offset][] = $termId;
-                    $lowCardTermId = $termId;
-                }
-        }
 
-        // Walk through positions of the term with lowest cardinality
+        // look for the offset where total cardinality is the lowest
-        foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) {
+        foreach ($docAlts as $offset => $alts)
-            // We expect phrase to be found
+        {
-            $freq++;
+            $card = 0;
 
-            // Walk through other terms
+            foreach ($alts as $termId)
-            foreach ($this->_terms as $termId => $term) {
+                $card += count($this->_termsPositions[$termId][$docId]);
-                if ($termId != $lowCardTermId) {
+
-                    $expectedPosition = $lowCardPos +
+            if ($card < $lowCard)
-                                            ($this->_offsets[$termId] -
+            {
-                                             $this->_offsets[$lowCardTermId]);
+                $lowCardOffset = $offset;
+                $lowCard = $card;
+            }
+        }
 
-                    if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
+        // split the term list
+        $lowCardAlts = $docAlts[$lowCardOffset];
+        unset($docAlts[$lowCardOffset]);
+
+        // Walk through positions of all the alts at the offset with lowest cardinality
+        foreach ($lowCardAlts as $lowCardTermId)
+        {
+            foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos)
+            {
+                // We expect phrase to be found
+                $freq++;
+
+                // Walk through other terms
+                foreach ($docAlts as $offset => $alts)
+                {
+                    // at least one alt must fulfill each remaining position (other than lowCardPos)
+                    $expectedPosition = $lowCardPos + $offset - $lowCardOffset;
+                    $match = false;
+
+                    foreach ($alts as $termId)
+                    {
+                        if (in_array($expectedPosition, $this->_termsPositions[$termId][$docId]))
+                        {
+                            $match = true;
+                            break;
+                        }
+                    }
+
+                    if (!$match)
+                    {
                         $freq--;  // Phrase wasn't found.
                         break;
                     }
@@ -402,16 +488,32 @@ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
             $this->_resVector = array();
         }
 
+        $offsetDocs = array();
+
+        // merge docs ids matching terms at the same offset
+        foreach ($this->_terms as $termId => $term)
+        {
+            $offset = $this->_offsets[$termId];
+
+            if (isset($offsetDocs[$offset]))
+                $offsetDocs[$offset] = array_merge($offsetDocs[$offset], $reader->termDocs($term));
+            else
+                $offsetDocs[$offset] = $reader->termDocs($term);
+
+            $this->_termsPositions[$termId] = $reader->termPositions($term);
+        }
+
         $resVectors      = array();
         $resVectorsSizes = array();
         $resVectorsIds   = array(); // is used to prevent arrays comparison
-        foreach ($this->_terms as $termId => $term) {
-            $resVectors[]      = array_flip($reader->termDocs($term));
-            $resVectorsSizes[] = count(end($resVectors));
-            $resVectorsIds[]   = $termId;
 
-            $this->_termsPositions[$termId] = $reader->termPositions($term);
+        foreach ($offsetDocs as $offset => $docs)
+        {
+            $resVectors[]      = array_flip($docs); // also deal with duplicates
+            $resVectorsSizes[] = count(end($resVectors));
+            $resVectorsIds[]   = $offset;
         }
+
         // sort resvectors in order of subquery cardinality increasing
         array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC,
                         $resVectorsIds,   SORT_ASC, SORT_NUMERIC,
@@ -422,7 +524,6 @@ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
                 $this->_resVector = $nextResVector;
             } else {
                 //$this->_resVector = array_intersect_key($this->_resVector, $nextResVector);
-
                 /**
                  * This code is used as workaround for array_intersect_key() slowness problem.
                  */
@@ -433,11 +534,11 @@ public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null)
                     }
                 }
                 $this->_resVector = $updatedVector;
-            }
 
-            if (count($this->_resVector) == 0) {
+                if (count($this->_resVector) == 0) {
-                // Empty result set, we don't need to check other terms
+                    // Empty result set, we don't need to check other terms
-                break;
+                    break;
+                }
             }
         }
 

diff --git a/tests/ZendSearch/Lucene/DocumentTest.php b/tests/ZendSearch/Lucene/DocumentTest.php
@@ -176,7 +176,13 @@ public function testHtmlInlineTagsIndexing()
         $hits = $index->find('ZendFramework');
         $this->assertEquals(count($hits), 1);
 
+        // IMPORTANT: if we want to clean the directory, the Index instance has to be actually destroyed first,
+        // so that it releases its file locks. If there are additional indirect references, a manual garbage
+        // collection cycle is needed to flush the pending objects.
         unset($index);
+        unset($hits); // QueryHit instances hold a reference on their owner Index instance
+        gc_collect_cycles(); // force the destructors to be called right now
+
         $this->_clearDirectory(__DIR__ . '/_index/_files');
     }