Skip to content

Commit

Permalink
removed overhead by preventing generation of full search results when
Browse files Browse the repository at this point in the history
only the url is requested
  • Loading branch information
Orbiter committed Nov 23, 2012
1 parent a114bb2 commit 4eab3aa
Show file tree
Hide file tree
Showing 11 changed files with 85 additions and 56 deletions.
10 changes: 4 additions & 6 deletions htroot/IndexControlRWIs_p.java
Expand Up @@ -378,10 +378,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
} catch ( final SpaceExceededException e ) {
Log.logException(e);
}
final URIMetadataNode e = segment.fulltext().getMetadata(b);
url = segment.fulltext().getURL(b);
segment.fulltext().remove(b);
if ( e != null ) {
url = e.url();
if ( url != null ) {
pw.println(url.getHost() + "/" + url.getFile());
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
if ( ListManager.listSetContains(
Expand Down Expand Up @@ -413,10 +412,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
} catch ( final SpaceExceededException e ) {
Log.logException(e);
}
final URIMetadataNode e = segment.fulltext().getMetadata(b);
url = segment.fulltext().getURL(b);
segment.fulltext().remove(b);
if ( e != null ) {
url = e.url();
if ( url != null ) {
pw.println(url.getHost() + "/.*");
for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
if ( ListManager.listSetContains(
Expand Down
10 changes: 5 additions & 5 deletions htroot/IndexControlURLs_p.java
Expand Up @@ -175,11 +175,11 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
}

if (post.containsKey("urlhashdelete")) {
final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
if (entry == null) {
final DigestURI url = segment.fulltext().getURL(ASCII.getBytes(urlhash));
if (url == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
urlstring = entry.url().toNormalform(true);
urlstring = url.toNormalform(true);
prop.put("urlstring", "");
sb.urlRemove(segment, urlhash.getBytes());
prop.putHTML("result", "Removed URL " + urlstring);
Expand Down Expand Up @@ -233,9 +233,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea

// generate list
if (post.containsKey("urlhashsimilar")) {
final Iterator<URIMetadataNode> entryIt = new RotateIterator<URIMetadataNode>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final Iterator<DigestURI> entryIt = new RotateIterator<DigestURI>(segment.fulltext().urls(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URIMetadataNode entry;
DigestURI entry;
int i = 0, rows = 0, cols = 0;
prop.put("urlhashsimilar", "1");
while (entryIt.hasNext() && i < 256) {
Expand Down
2 changes: 1 addition & 1 deletion htroot/api/ymarks/add_ymark.java
Expand Up @@ -33,7 +33,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje

if(post.containsKey("urlHash")) {
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
final DigestURI url = sb.index.fulltext().getMetadata(urlHash.getBytes()).url();
final DigestURI url = sb.index.fulltext().getURL(urlHash.getBytes());
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
try {
Expand Down
2 changes: 1 addition & 1 deletion htroot/gsa/searchresult.java
Expand Up @@ -115,7 +115,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
post.put(CommonParams.ROWS, post.remove("num"));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
post.put("defType", "edismax");
post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^200.0"); // a bost query that moves double content to the back
post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves duplicate content to the back
post.put(CommonParams.FL,
YaCySchema.content_type.getSolrFieldName() + ',' +
YaCySchema.id.getSolrFieldName() + ',' +
Expand Down
6 changes: 3 additions & 3 deletions htroot/yacysearch.java
Expand Up @@ -677,12 +677,12 @@ public static serverObjects respond(
return prop;
}
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(bookmarkHash));
if ( urlentry != null ) {
final DigestURI url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash));
if ( url != null ) {
try {
sb.tables.bookmarks.createBookmark(
sb.loader,
urlentry.url(),
url,
YMarkTables.USER_ADMIN,
true,
"searchresult",
Expand Down
2 changes: 1 addition & 1 deletion source/net/yacy/data/ymark/YMarkMetadata.java
Expand Up @@ -82,7 +82,7 @@ public YMarkMetadata(final DigestURI uri, final Segment indexSegment) {
public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) {
this.document = null;
this.indexSegment = indexSegment;
this.uri = this.indexSegment.fulltext().getMetadata(urlHash).url();
this.uri = this.indexSegment.fulltext().getURL(urlHash);
}

public YMarkMetadata(final Document document) {
Expand Down
14 changes: 4 additions & 10 deletions source/net/yacy/search/Switchboard.java
Expand Up @@ -1521,16 +1521,10 @@ public void urlRemove(final Segment segment, final byte[] hash) {
}

public DigestURI getURL(final byte[] urlhash) {
if ( urlhash == null ) {
return null;
}
if ( urlhash.length == 0 ) {
return null;
}
final URIMetadataNode le = this.index.fulltext().getMetadata(urlhash);
if ( le != null ) {
return le.url();
}
if (urlhash == null) return null;
if (urlhash.length == 0) return null;
final DigestURI url = this.index.fulltext().getURL(urlhash);
if (url != null) return url;
return this.crawlQueues.getURL(urlhash);
}

Expand Down
75 changes: 56 additions & 19 deletions source/net/yacy/search/index/Fulltext.java
Expand Up @@ -227,13 +227,26 @@ public Date getLoadDate(final String urlHash) {
Date now = new Date();
return x.after(now) ? now : x;
}

/**
 * Resolve only the URL for a given url hash, querying Solr for just the
 * sku (URL) field instead of generating a full metadata/search result.
 * @param urlHash the url hash to look up; may be null
 * @return the DigestURI stored for this hash, or null if the hash is null,
 *         the Solr lookup fails or finds nothing, or the stored URL string
 *         cannot be parsed
 */
public DigestURI getURL(final byte[] urlHash) {
if (urlHash == null) return null;
SolrDocument doc;
try {
// fetch only the sku field — avoids loading the whole document
doc = this.solr.getById(ASCII.String(urlHash), YaCySchema.sku.getSolrFieldName());
} catch (IOException e) {
// lookup failure is deliberately treated the same as "not found"
return null;
}
if (doc == null) return null;
String x = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
if (x == null) return null;
try {
// re-attach the known hash so it is not recomputed from the URL string
DigestURI uri = new DigestURI(x, urlHash);
return uri;
} catch (MalformedURLException e) {
// stored URL string is unparseable; report as absent
return null;
}
}

/**
* generates an plasmaLURLEntry using the url hash
* if the url cannot be found, this returns null
* @param obrwi
* @return
*/
public URIMetadataNode getMetadata(WordReference wre, long weight) {
if (wre == null) return null; // all time was already wasted in takeRWI to get another element
return getMetadata(wre.urlhash(), wre, weight);
Expand All @@ -243,7 +256,7 @@ public URIMetadataNode getMetadata(final byte[] urlHash) {
if (urlHash == null) return null;
return getMetadata(urlHash, null, 0);
}

private URIMetadataNode getMetadata(final byte[] urlHash, WordReference wre, long weight) {

// get the metadata from Solr
Expand Down Expand Up @@ -519,9 +532,37 @@ public void close() {
true);
}

/**
 * Enumerate all indexed entries as plain DigestURI objects.
 * Each element is resolved lazily through getURL(id), so only the sku
 * field is fetched per entry rather than a full metadata record.
 * @return a CloneableIterator over the stored URLs; next() may yield null
 *         when the underlying id iterator produces a null id
 */
public CloneableIterator<DigestURI> urls() {
// enumerates entry elements
final Iterator<byte[]> ids = iterator();
return new CloneableIterator<DigestURI>() {
@Override
public CloneableIterator<DigestURI> clone(final Object secondHash) {
// NOTE(review): ignores secondHash and returns this same instance,
// so "clones" share iteration state — confirm callers tolerate this
return this;
}
@Override
public final boolean hasNext() {
return ids.hasNext();
}
@Override
public final DigestURI next() {
byte[] id = ids.next();
if (id == null) return null;
// resolve the hash to its URL on demand
return getURL(id);
}
@Override
public final void remove() {
// delegates removal to the underlying id iterator
ids.remove();
}
@Override
public void close() {
}
};
}

public CloneableIterator<URIMetadataNode> entries() {
// enumerates entry elements
final Iterator<byte[]> ids = iterator();
final Iterator<byte[]> ids = iterator();
return new CloneableIterator<URIMetadataNode>() {
@Override
public CloneableIterator<URIMetadataNode> clone(final Object secondHash) {
Expand Down Expand Up @@ -783,15 +824,15 @@ private TreeSet<String> domainNameCollector(int count, final Map<String, URLHash
// collect hashes from all domains

// fetch urls from the database to determine the host in clear text
URIMetadataNode urlref;
DigestURI url;
if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
this.statsDump = new ArrayList<HostStat>();
final TreeSet<String> set = new TreeSet<String>();
for (final URLHashCounter hs: domainSamples.values()) {
if (hs == null) continue;
urlref = this.getMetadata(hs.urlhashb);
if (urlref == null || urlref.url() == null || urlref.url().getHost() == null) continue;
set.add(urlref.url().getHost());
url = this.getURL(hs.urlhashb);
if (url == null || url.getHost() == null) continue;
set.add(url.getHost());
count--;
if (count == 0) break;
}
Expand Down Expand Up @@ -820,16 +861,14 @@ public ScoreMap<String> urlSampleScores(final Map<String, URLHashCounter> domain
*/
public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
URIMetadataNode urlref;

final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
hosthashScore.inc(ASCII.String(e.getValue().urlhashb, 6, 6), e.getValue().count);
}
DigestURI url;
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
urlref = this.getMetadata(e.getValue().urlhashb);
url = urlref.url();
url = this.getURL(e.getValue().urlhashb);
hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey())));
}
return hostMap;
Expand All @@ -841,7 +880,6 @@ public Iterator<HostStat> statistics(int count, final ScoreMap<String> domainSco

// fetch urls from the database to determine the host in clear text
final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
URIMetadataNode urlref;
String urlhash;
count += 10; // make some more to prevent that we have to do this again after deletions too soon.
if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();
Expand All @@ -850,10 +888,9 @@ public Iterator<HostStat> statistics(int count, final ScoreMap<String> domainSco
while (j.hasNext()) {
urlhash = j.next();
if (urlhash == null) continue;
urlref = this.getMetadata(ASCII.getBytes(urlhash));
if (urlref == null || urlref.url() == null || urlref.url().getHost() == null) continue;
url = this.getURL(ASCII.getBytes(urlhash));
if (url == null || url.getHost() == null) continue;
if (this.statsDump == null) return new ArrayList<HostStat>().iterator(); // some other operation has destroyed the object
url = urlref.url();
this.statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash)));
count--;
if (count == 0) break;
Expand Down
10 changes: 4 additions & 6 deletions source/net/yacy/search/index/Segment.java
Expand Up @@ -58,7 +58,6 @@
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
Expand Down Expand Up @@ -235,7 +234,7 @@ protected DigestURI next0() {
return null;
}
if (id == null || id == AbstractSolrConnector.POISON_ID) return null;
DigestURI u = Segment.this.fulltext.getMetadata(ASCII.getBytes(id)).url();
DigestURI u = Segment.this.fulltext.getURL(ASCII.getBytes(id));
if (u.toNormalform(true).startsWith(urlstub)) return u;
}
}
Expand Down Expand Up @@ -508,13 +507,12 @@ public int removeAllUrlReferences(final byte[] urlhash, final LoaderDispatcher l

if (urlhash == null) return 0;
// determine the url string
final URIMetadataNode entry = fulltext().getMetadata(urlhash);
if (entry == null) return 0;
if (entry.url() == null) return 0;
final DigestURI url = fulltext().getURL(urlhash);
if (url == null) return 0;

try {
// parse the resource
final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay));
final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay));
if (document == null) {
// delete just the url entry
fulltext().remove(urlhash);
Expand Down
2 changes: 2 additions & 0 deletions source/net/yacy/search/query/QueryParams.java
Expand Up @@ -429,6 +429,8 @@ public SolrQuery solrQuery() {

// construct query
final SolrQuery params = new SolrQuery();
params.setParam("defType", "edismax");
params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves duplicate content to the back
params.setStart(this.offset);
params.setRows(this.itemsPerPage);
params.setFacet(false);
Expand Down
8 changes: 4 additions & 4 deletions source/net/yacy/search/query/RankingProcess.java
Expand Up @@ -51,7 +51,7 @@
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.Condenser;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
Expand Down Expand Up @@ -411,7 +411,7 @@ public ScoreMap<String> getHostNavigator() {
final ScoreMap<String> result = new ConcurrentScoreMap<String>();

final Iterator<String> domhashs = this.hostHashNavigator.keys(false);
URIMetadataNode row;
DigestURI url;
byte[] urlhash;
String hosthash, hostname;
if ( this.hostHashResolver != null ) {
Expand All @@ -421,8 +421,8 @@ public ScoreMap<String> getHostNavigator() {
continue;
}
urlhash = this.hostHashResolver.get(hosthash);
row = urlhash == null ? null : this.query.getSegment().fulltext().getMetadata(urlhash);
hostname = row == null ? null : row.url().getHost();
url = urlhash == null ? null : this.query.getSegment().fulltext().getURL(urlhash);
hostname = url == null ? null : url.getHost();
if ( hostname != null ) {
result.set(hostname, this.hostHashNavigator.get(hosthash));
}
Expand Down

0 comments on commit 4eab3aa

Please sign in to comment.