Skip to content

Commit

Permalink
new indexURLEntry class 'indexURLEntryNew', to replace old class
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2321 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jul 23, 2006
1 parent 58df8b7 commit 7e0a130
Showing 1 changed file with 294 additions and 0 deletions.
294 changes: 294 additions & 0 deletions source/de/anomic/index/indexURLEntryNew.java
@@ -0,0 +1,294 @@
// indexURLEntryNew.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 21.07.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA


package de.anomic.index;

import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaWordIndex;

/**
 * Index entry describing one word-to-URL reference, stored in a single
 * {@link kelondroRow.Entry} whose layout is given by {@link #urlEntryRow}.
 * All accessors read from, and all mutators write to, that row entry.
 */
public class indexURLEntryNew implements Cloneable, indexEntry {

    /** Row layout of an entry; the col_* constants below index into this column array. */
    public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
        new kelondroColumn("h", kelondroColumn.celltype_string,   kelondroColumn.encoder_none, indexURL.urlHashLength, "urlhash"),
        new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, indexURL.urlQualityLength, "quality"),
        new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"),
        new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"),
        new kelondroColumn("l", kelondroColumn.celltype_string,   kelondroColumn.encoder_none, indexURL.urlLanguageLength, "language"),
        new kelondroColumn("d", kelondroColumn.celltype_binary,   kelondroColumn.encoder_none, 1, "doctype"),
        new kelondroColumn("f", kelondroColumn.celltype_binary,   kelondroColumn.encoder_none, 1, "localflag"),
        new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"),
        new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posinphrase"),
        new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posofphrase"),
        new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "worddistance"),
        new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "wordcount"),
        new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "phrasecount")
    });

    // column positions; these must stay in the same order as the columns of urlEntryRow
    private static final int col_urlhash      =  0;
    private static final int col_quality      =  1;
    private static final int col_lastModified =  2;
    private static final int col_hitcount     =  3;
    private static final int col_language     =  4;
    private static final int col_doctype      =  5;
    private static final int col_localflag    =  6;
    private static final int col_posintext    =  7;
    private static final int col_posinphrase  =  8;
    private static final int col_posofphrase  =  9;
    private static final int col_worddistance = 10;
    private static final int col_wordcount    = 11;
    private static final int col_phrasecount  = 12;

    /** the row that holds all attribute values of this entry */
    private kelondroRow.Entry entry;

    /**
     * Creates an entry from individual attribute values.
     * Only the parameters marked with '*' are written into the row; the
     * remaining ones are accepted but currently not stored.
     * A language that is null or not of length indexURL.urlLanguageLength is replaced by "uk".
     */
    public indexURLEntryNew(String  urlHash,
                            int     urlLength,    // byte-length of complete URL
                            int     urlComps,     // number of path components
                            int     titleLength,  // length of description/length (longer are better?)
                            int     hitcount,     //*how often appears this word in the text
                            int     wordcount,    //*total number of words
                            int     phrasecount,  //*total number of phrases
                            int     posintext,    //*position of word in all words
                            int     posinphrase,  //*position of word in its phrase
                            int     posofphrase,  //*number of the phrase where word appears
                            int     worddistance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search
                            int     sizeOfPage,   // # of bytes of the page
                            long    lastmodified, //*last-modified time of the document where word appears
                            long    updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
                            int     quality,      //*the entropy value
                            String  language,     //*(guessed) language of document
                            char    doctype,      //*type of document
                            int     outlinksSame, // outlinks to same domain
                            int     outlinksOther,// outlinks to other domain
                            boolean local         //*flag shows that this index was generated locally; otherwise it is from a remote peer
                           ) {

        // more needed attributes:
        // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
        // - boolean: URL attributes

        if ((language == null) || (language.length() != indexURL.urlLanguageLength)) language = "uk";

        // the row must exist before any column can be written; without this
        // allocation the first setCol* call below fails with a NullPointerException
        // NOTE(review): relies on kelondroRow offering a no-argument newEntry() - confirm
        this.entry = urlEntryRow.newEntry();

        this.entry.setColString(col_urlhash, urlHash, null);
        this.entry.setColLong(col_quality, quality);
        // NOTE(review): the lastModified column is declared 3 cells wide while the
        // raw value handed in is stored here - confirm the expected unit/encoding
        this.entry.setColLong(col_lastModified, lastmodified);
        this.entry.setColLong(col_hitcount, hitcount);
        this.entry.setColString(col_language, language, null);
        this.entry.setColByte(col_doctype, (byte) doctype);
        this.entry.setColByte(col_localflag, (byte) ((local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL));
        this.entry.setColLong(col_posintext, posintext);
        this.entry.setColLong(col_posinphrase, posinphrase);
        this.entry.setColLong(col_posofphrase, posofphrase);
        this.entry.setColLong(col_worddistance, worddistance);
        this.entry.setColLong(col_wordcount, wordcount);
        this.entry.setColLong(col_phrasecount, phrasecount);
    }

    /**
     * Creates an entry from a url hash and the remaining encoded row content.
     */
    public indexURLEntryNew(String urlHash, String code) {
        // the code is the external form of the row minus the leading urlHash entry
        this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
    }

    /**
     * Placeholder constructor for an external string form.
     * TODO: not implemented; the argument is ignored and the row entry is left
     * unset, so an object created here fails on any accessor call.
     */
    public indexURLEntryNew(String external) {

    }

    /*
    public indexURLEntryNew(kelondroRow.Entry entry) {
        this.entry = entry;
    }
    */

    /**
     * Creates an entry from the byte form of a complete row.
     */
    public indexURLEntryNew(byte[] row) {
        this.entry = urlEntryRow.newEntry(row);
    }

    /**
     * Returns a new entry built from a copy of this entry's row bytes.
     */
    public Object clone() {
        // copy the bytes first so that the clone cannot share storage with this
        // entry (in case bytes()/newEntry(byte[]) hand the array through uncopied);
        // generateNormalized() depends on the clone being independent
        byte[] source = toEncodedByteArrayForm();
        byte[] copy = new byte[source.length];
        System.arraycopy(source, 0, copy, 0, source.length);
        return new indexURLEntryNew(copy);
    }

    /** Returns the row bytes converted to a String. */
    public String toEncodedStringForm() {
        return new String(toEncodedByteArrayForm());
    }

    /** Returns the bytes of the underlying row entry. */
    public byte[] toEncodedByteArrayForm() {
        return entry.bytes();
    }

    /** Returns the property form as produced by the underlying row entry. */
    public String toPropertyForm() {
        return entry.toPropertyForm();
    }

    /** Returns the underlying row entry itself (not a copy). */
    public Entry toKelondroEntry() {
        return this.entry;
    }

    public String urlHash() {
        return this.entry.getColString(col_urlhash, null);
    }

    public int quality() {
        return (int) this.entry.getColLong(col_quality);
    }

    /** Returns plasmaWordIndex.microDateDays applied to the stored last-modified value. */
    public int virtualAge() {
        return plasmaWordIndex.microDateDays(lastModified());
    }

    /**
     * Returns the stored last-modified value.
     * The value is returned as the full long; it is not narrowed to int.
     */
    public long lastModified() {
        return this.entry.getColLong(col_lastModified);
    }

    public int hitcount() {
        return (int) this.entry.getColLong(col_hitcount);
    }

    public int posintext() {
        return (int) this.entry.getColLong(col_posintext);
    }

    public int posinphrase() {
        return (int) this.entry.getColLong(col_posinphrase);
    }

    public int posofphrase() {
        return (int) this.entry.getColLong(col_posofphrase);
    }

    public int wordcount() {
        return (int) this.entry.getColLong(col_wordcount);
    }

    public int phrasecount() {
        return (int) this.entry.getColLong(col_phrasecount);
    }

    public String getLanguage() {
        return this.entry.getColString(col_language, null);
    }

    public char getType() {
        return (char) this.entry.getColByte(col_doctype);
    }

    public boolean isLocal() {
        return this.entry.getColByte(col_localflag) == indexEntryAttribute.LT_LOCAL;
    }

    /**
     * Merges the positional attributes of ie2 into ie1 and returns ie1.
     * The statement order matters: each value is computed from the attributes
     * as they are before the corresponding column is overwritten.
     */
    public static indexURLEntryNew combineDistance(indexURLEntryNew ie1, indexEntry ie2) {
        // returns a modified entry of the first argument
        ie1.entry.setColLong(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext()));
        ie1.entry.setColLong(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
        // the position inside the phrase stays as it is when both words are in
        // the same phrase; otherwise it is unknown
        ie1.entry.setColLong(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posinphrase() : 0 /*unknown*/);
        ie1.entry.setColLong(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
        ie1.entry.setColLong(col_wordcount, (ie1.wordcount() + ie2.wordcount()) / 2);
        return ie1;
    }

    public void combineDistance(indexEntry oe) {
        combineDistance(this, oe);
    }

    public int worddistance() {
        return (int) this.entry.getColLong(col_worddistance);
    }

    /**
     * Lowers every numeric attribute of t to the value in other wherever the
     * value in other is smaller.
     */
    public static final void min(indexURLEntryNew t, indexEntry other) {
        if (t.hitcount()     > other.hitcount())     t.entry.setColLong(col_hitcount, other.hitcount());
        if (t.wordcount()    > other.wordcount())    t.entry.setColLong(col_wordcount, other.wordcount());
        if (t.phrasecount()  > other.phrasecount())  t.entry.setColLong(col_phrasecount, other.phrasecount());
        if (t.posintext()    > other.posintext())    t.entry.setColLong(col_posintext, other.posintext());
        if (t.posinphrase()  > other.posinphrase())  t.entry.setColLong(col_posinphrase, other.posinphrase());
        if (t.posofphrase()  > other.posofphrase())  t.entry.setColLong(col_posofphrase, other.posofphrase());
        if (t.worddistance() > other.worddistance()) t.entry.setColLong(col_worddistance, other.worddistance());
        if (t.lastModified() > other.lastModified()) t.entry.setColLong(col_lastModified, other.lastModified());
        if (t.quality()      > other.quality())      t.entry.setColLong(col_quality, other.quality());
    }

    /**
     * Raises every numeric attribute of t to the value in other wherever the
     * value in other is larger.
     */
    public static final void max(indexURLEntryNew t, indexEntry other) {
        if (t.hitcount()     < other.hitcount())     t.entry.setColLong(col_hitcount, other.hitcount());
        if (t.wordcount()    < other.wordcount())    t.entry.setColLong(col_wordcount, other.wordcount());
        if (t.phrasecount()  < other.phrasecount())  t.entry.setColLong(col_phrasecount, other.phrasecount());
        if (t.posintext()    < other.posintext())    t.entry.setColLong(col_posintext, other.posintext());
        if (t.posinphrase()  < other.posinphrase())  t.entry.setColLong(col_posinphrase, other.posinphrase());
        if (t.posofphrase()  < other.posofphrase())  t.entry.setColLong(col_posofphrase, other.posofphrase());
        if (t.worddistance() < other.worddistance()) t.entry.setColLong(col_worddistance, other.worddistance());
        if (t.lastModified() < other.lastModified()) t.entry.setColLong(col_lastModified, other.lastModified());
        if (t.quality()      < other.quality())      t.entry.setColLong(col_quality, other.quality());
    }


    public void min(indexEntry other) {
        min(this, other);
    }

    public void max(indexEntry other) {
        max(this, other);
    }

    /**
     * Rescales every numeric attribute of t relative to the given min and max
     * entries: a value of 0 stays 0, any other value v becomes
     * 1 + 255 * (v - min) / (1 + max - min), using integer division.
     */
    static void normalize(indexURLEntryNew t, indexEntry min, indexEntry max) {
        t.entry.setColLong(col_hitcount     , (t.hitcount()     == 0) ? 0 : 1 + 255 * (t.hitcount()     - min.hitcount()    ) / (1 + max.hitcount()     - min.hitcount()));
        t.entry.setColLong(col_wordcount    , (t.wordcount()    == 0) ? 0 : 1 + 255 * (t.wordcount()    - min.wordcount()   ) / (1 + max.wordcount()    - min.wordcount()));
        t.entry.setColLong(col_phrasecount  , (t.phrasecount()  == 0) ? 0 : 1 + 255 * (t.phrasecount()  - min.phrasecount() ) / (1 + max.phrasecount()  - min.phrasecount()));
        t.entry.setColLong(col_posintext    , (t.posintext()    == 0) ? 0 : 1 + 255 * (t.posintext()    - min.posintext()   ) / (1 + max.posintext()    - min.posintext()));
        t.entry.setColLong(col_posinphrase  , (t.posinphrase()  == 0) ? 0 : 1 + 255 * (t.posinphrase()  - min.posinphrase() ) / (1 + max.posinphrase()  - min.posinphrase()));
        t.entry.setColLong(col_posofphrase  , (t.posofphrase()  == 0) ? 0 : 1 + 255 * (t.posofphrase()  - min.posofphrase() ) / (1 + max.posofphrase()  - min.posofphrase()));
        t.entry.setColLong(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance()));
        t.entry.setColLong(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified()));
        t.entry.setColLong(col_quality      , (t.quality()      == 0) ? 0 : 1 + 255 * (t.quality()      - min.quality()     ) / (1 + max.quality()      - min.quality()));
    }

    public void normalize(indexEntry min, indexEntry max) {
        normalize(this, min, max);
    }

    /**
     * Returns a normalized clone of this entry; this entry is meant to stay
     * unchanged.
     */
    public indexEntry generateNormalized(indexEntry min, indexEntry max) {
        indexURLEntryNew e = (indexURLEntryNew) this.clone();
        e.normalize(min, max);
        return e;
    }

    /**
     * Returns true if other is null, if this entry has a larger last-modified
     * value, or if both are equal and this entry has the larger quality.
     */
    public boolean isNewer(indexEntry other) {
        if (other == null) return true;
        if (this.lastModified() > other.lastModified()) return true;
        if (this.lastModified() == other.lastModified()) {
            if (this.quality() > other.quality()) return true;
        }
        return false;
    }

    /**
     * Returns true if this entry has a smaller last-modified value than other,
     * or if both are equal and this entry has the smaller quality.
     * Returns false if other is null.
     */
    public boolean isOlder(indexEntry other) {
        if (other == null) return false;
        // compare through the indexEntry accessors, as isNewer does, so that any
        // indexEntry implementation (including this class) is accepted
        if (this.lastModified() < other.lastModified()) return true;
        if (this.lastModified() == other.lastModified()) {
            if (this.quality() < other.quality()) return true;
        }
        return false;
    }

}

0 comments on commit 7e0a130

Please sign in to comment.