Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
new indexURLEntry class 'indexURLEntryNew', to replace old class
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2321 6c8d7289-2bf4-0310-a012-ef5d649a1542
- Loading branch information
orbiter
committed
Jul 23, 2006
1 parent
58df8b7
commit 7e0a130
Showing
1 changed file
with
294 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,294 @@ | ||
// indexURLEntryNew.java | ||
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany | ||
// first published 21.07.2006 on http://www.anomic.de | ||
// | ||
// This is a part of YaCy, a peer-to-peer based web search engine | ||
// | ||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ | ||
// $LastChangedRevision: 1986 $ | ||
// $LastChangedBy: orbiter $ | ||
// | ||
// LICENSE | ||
// | ||
// This program is free software; you can redistribute it and/or modify | ||
// it under the terms of the GNU General Public License as published by | ||
// the Free Software Foundation; either version 2 of the License, or | ||
// (at your option) any later version. | ||
// | ||
// This program is distributed in the hope that it will be useful, | ||
// but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
// GNU General Public License for more details. | ||
// | ||
// You should have received a copy of the GNU General Public License | ||
// along with this program; if not, write to the Free Software | ||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
|
||
|
||
package de.anomic.index; | ||
|
||
import de.anomic.kelondro.kelondroColumn; | ||
import de.anomic.kelondro.kelondroRow; | ||
import de.anomic.kelondro.kelondroRow.Entry; | ||
import de.anomic.plasma.plasmaWordIndex; | ||
|
||
public class indexURLEntryNew implements Cloneable, indexEntry { | ||
|
||
public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{ | ||
new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_none, indexURL.urlHashLength, "urlhash"), | ||
new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, indexURL.urlQualityLength, "quality"), | ||
new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"), | ||
new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"), | ||
new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_none, indexURL.urlLanguageLength, "language"), | ||
new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_none, 1, "doctype"), | ||
new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_none, 1, "localflag"), | ||
new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"), | ||
new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posinphrase"), | ||
new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posofphrase"), | ||
new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "worddistance"), | ||
new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "wordcount"), | ||
new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "phrasecount") | ||
}); | ||
|
||
private static final int col_urlhash = 0; | ||
private static final int col_quality = 1; | ||
private static final int col_lastModified = 2; | ||
private static final int col_hitcount = 3; | ||
private static final int col_language = 4; | ||
private static final int col_doctype = 5; | ||
private static final int col_localflag = 6; | ||
private static final int col_posintext = 7; | ||
private static final int col_posinphrase = 8; | ||
private static final int col_posofphrase = 9; | ||
private static final int col_worddistance = 10; | ||
private static final int col_wordcount = 11; | ||
private static final int col_phrasecount = 12; | ||
|
||
|
||
private kelondroRow.Entry entry; | ||
|
||
public indexURLEntryNew(String urlHash, | ||
int urlLength, // byte-length of complete URL | ||
int urlComps, // number of path components | ||
int titleLength, // length of description/length (longer are better?) | ||
int hitcount, //*how often appears this word in the text | ||
int wordcount, //*total number of words | ||
int phrasecount, //*total number of phrases | ||
int posintext, //*position of word in all words | ||
int posinphrase, //*position of word in its phrase | ||
int posofphrase, //*number of the phrase where word appears | ||
int worddistance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search | ||
int sizeOfPage, // # of bytes of the page | ||
long lastmodified, //*last-modified time of the document where word appears | ||
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short | ||
int quality, //*the entropy value | ||
String language, //*(guessed) language of document | ||
char doctype, //*type of document | ||
int outlinksSame, // outlinks to same domain | ||
int outlinksOther,// outlinks to other domain | ||
boolean local //*flag shows that this index was generated locally; othervise its from a remote peer | ||
) { | ||
|
||
// more needed attributes: | ||
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc | ||
// - boolean: URL attributes | ||
|
||
if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk"; | ||
this.entry.setColString(col_urlhash, urlHash, null); | ||
this.entry.setColLong(col_quality, quality); | ||
this.entry.setColLong(col_lastModified, lastmodified); | ||
this.entry.setColLong(col_hitcount, hitcount); | ||
this.entry.setColString(col_language, language, null); | ||
this.entry.setColByte(col_doctype, (byte) doctype); | ||
this.entry.setColByte(col_localflag, (byte) ((local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL)); | ||
this.entry.setColLong(col_posintext, posintext); | ||
this.entry.setColLong(col_posinphrase, posinphrase); | ||
this.entry.setColLong(col_posofphrase, posofphrase); | ||
this.entry.setColLong(col_worddistance, worddistance); | ||
this.entry.setColLong(col_wordcount, wordcount); | ||
this.entry.setColLong(col_phrasecount, phrasecount); | ||
} | ||
|
||
public indexURLEntryNew(String urlHash, String code) { | ||
// the code is the external form of the row minus the leading urlHash entry | ||
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes()); | ||
} | ||
|
||
public indexURLEntryNew(String external) { | ||
|
||
} | ||
|
||
/* | ||
public indexURLEntryNew(kelondroRow.Entry entry) { | ||
this.entry = entry; | ||
} | ||
*/ | ||
|
||
public indexURLEntryNew(byte[] row) { | ||
this.entry = urlEntryRow.newEntry(row); | ||
} | ||
|
||
public Object clone() { | ||
return new indexURLEntryNew(toEncodedByteArrayForm()); | ||
} | ||
|
||
public String toEncodedStringForm() { | ||
return new String(toEncodedByteArrayForm()); | ||
} | ||
|
||
public byte[] toEncodedByteArrayForm() { | ||
return entry.bytes(); | ||
} | ||
|
||
public String toPropertyForm() { | ||
return entry.toPropertyForm(); | ||
} | ||
|
||
public Entry toKelondroEntry() { | ||
return this.entry; | ||
} | ||
|
||
public String urlHash() { | ||
return this.entry.getColString(col_urlhash, null); | ||
} | ||
|
||
public int quality() { | ||
return (int) this.entry.getColLong(col_quality); | ||
} | ||
|
||
public int virtualAge() { | ||
return plasmaWordIndex.microDateDays(lastModified()); | ||
} | ||
|
||
public long lastModified() { | ||
return (int) this.entry.getColLong(col_lastModified); | ||
} | ||
|
||
public int hitcount() { | ||
return (int) this.entry.getColLong(col_hitcount); | ||
} | ||
|
||
public int posintext() { | ||
return (int) this.entry.getColLong(col_posintext); | ||
} | ||
|
||
public int posinphrase() { | ||
return (int) this.entry.getColLong(col_posinphrase); | ||
} | ||
|
||
public int posofphrase() { | ||
return (int) this.entry.getColLong(col_posofphrase); | ||
} | ||
|
||
public int wordcount() { | ||
return (int) this.entry.getColLong(col_wordcount); | ||
} | ||
|
||
public int phrasecount() { | ||
return (int) this.entry.getColLong(col_phrasecount); | ||
} | ||
|
||
public String getLanguage() { | ||
return this.entry.getColString(col_language, null); | ||
} | ||
|
||
public char getType() { | ||
return (char) this.entry.getColByte(col_doctype); | ||
} | ||
|
||
public boolean isLocal() { | ||
return this.entry.getColByte(col_localflag) == indexEntryAttribute.LT_LOCAL; | ||
} | ||
|
||
public static indexURLEntryNew combineDistance(indexURLEntryNew ie1, indexEntry ie2) { | ||
// returns a modified entry of the first argument | ||
ie1.entry.setColLong(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext())); | ||
ie1.entry.setColLong(col_posintext, Math.min(ie1.posintext(), ie2.posintext())); | ||
ie1.entry.setColLong(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/); | ||
ie1.entry.setColLong(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase())); | ||
ie1.entry.setColLong(col_wordcount, (ie1.wordcount() + ie2.wordcount()) / 2); | ||
return ie1; | ||
} | ||
|
||
public void combineDistance(indexEntry oe) { | ||
combineDistance(this, oe); | ||
} | ||
|
||
public int worddistance() { | ||
return (int) this.entry.getColLong(col_worddistance); | ||
} | ||
|
||
public static final void min(indexURLEntryNew t, indexEntry other) { | ||
if (t.hitcount() > other.hitcount()) t.entry.setColLong(col_hitcount, other.hitcount()); | ||
if (t.wordcount() > other.wordcount()) t.entry.setColLong(col_wordcount, other.wordcount()); | ||
if (t.phrasecount() > other.phrasecount()) t.entry.setColLong(col_phrasecount, other.phrasecount()); | ||
if (t.posintext() > other.posintext()) t.entry.setColLong(col_posintext, other.posintext()); | ||
if (t.posinphrase() > other.posinphrase()) t.entry.setColLong(col_posinphrase, other.posinphrase()); | ||
if (t.posofphrase() > other.posofphrase()) t.entry.setColLong(col_posofphrase, other.posofphrase()); | ||
if (t.worddistance() > other.worddistance()) t.entry.setColLong(col_worddistance, other.worddistance()); | ||
if (t.lastModified() > other.lastModified()) t.entry.setColLong(col_lastModified, other.lastModified()); | ||
if (t.quality() > other.quality()) t.entry.setColLong(col_quality, other.quality()); | ||
} | ||
|
||
public static final void max(indexURLEntryNew t, indexEntry other) { | ||
if (t.hitcount() < other.hitcount()) t.entry.setColLong(col_hitcount, other.hitcount()); | ||
if (t.wordcount() < other.wordcount()) t.entry.setColLong(col_wordcount, other.wordcount()); | ||
if (t.phrasecount() < other.phrasecount()) t.entry.setColLong(col_phrasecount, other.phrasecount()); | ||
if (t.posintext() < other.posintext()) t.entry.setColLong(col_posintext, other.posintext()); | ||
if (t.posinphrase() < other.posinphrase()) t.entry.setColLong(col_posinphrase, other.posinphrase()); | ||
if (t.posofphrase() < other.posofphrase()) t.entry.setColLong(col_posofphrase, other.posofphrase()); | ||
if (t.worddistance() < other.worddistance()) t.entry.setColLong(col_worddistance, other.worddistance()); | ||
if (t.lastModified() < other.lastModified()) t.entry.setColLong(col_lastModified, other.lastModified()); | ||
if (t.quality() < other.quality()) t.entry.setColLong(col_quality, other.quality()); | ||
} | ||
|
||
|
||
public void min(indexEntry other) { | ||
min(this, other); | ||
} | ||
|
||
public void max(indexEntry other) { | ||
max(this, other); | ||
} | ||
|
||
static void normalize(indexURLEntryNew t, indexEntry min, indexEntry max) { | ||
t.entry.setColLong(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount())); | ||
t.entry.setColLong(col_wordcount , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount())); | ||
t.entry.setColLong(col_phrasecount , (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount())); | ||
t.entry.setColLong(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext())); | ||
t.entry.setColLong(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase())); | ||
t.entry.setColLong(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase())); | ||
t.entry.setColLong(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); | ||
t.entry.setColLong(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified())); | ||
t.entry.setColLong(col_quality , (t.quality() == 0) ? 0 : 1 + 255 * (t.quality() - min.quality() ) / (1 + max.quality() - min.quality())); | ||
} | ||
|
||
public void normalize(indexEntry min, indexEntry max) { | ||
normalize(this, min, max); | ||
} | ||
|
||
public indexEntry generateNormalized(indexEntry min, indexEntry max) { | ||
indexURLEntryNew e = (indexURLEntryNew) this.clone(); | ||
e.normalize(min, max); | ||
return e; | ||
} | ||
|
||
public boolean isNewer(indexEntry other) { | ||
if (other == null) return true; | ||
if (this.lastModified() > other.lastModified()) return true; | ||
if (this.lastModified() == other.lastModified()) { | ||
if (this.quality() > other.quality()) return true; | ||
} | ||
return false; | ||
} | ||
|
||
public boolean isOlder(indexEntry other) { | ||
if (other == null) return false; | ||
if (this.lastModified() < ((indexAbstractEntry) other).lastModified()) return true; | ||
if (this.lastModified() == ((indexAbstractEntry) other).lastModified()) { | ||
if (this.quality() < ((indexAbstractEntry) other).quality) return true; | ||
} | ||
return false; | ||
} | ||
|
||
} |