From 4eab3aae6037b755bdf185bd65c1bff60fcd1ccb Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 23 Nov 2012 01:35:28 +0100 Subject: [PATCH] removed overhead by preventing generation of full search results when only the url is requested --- htroot/IndexControlRWIs_p.java | 10 +-- htroot/IndexControlURLs_p.java | 10 +-- htroot/api/ymarks/add_ymark.java | 2 +- htroot/gsa/searchresult.java | 2 +- htroot/yacysearch.java | 6 +- source/net/yacy/data/ymark/YMarkMetadata.java | 2 +- source/net/yacy/search/Switchboard.java | 14 +--- source/net/yacy/search/index/Fulltext.java | 75 ++++++++++++++----- source/net/yacy/search/index/Segment.java | 10 +-- source/net/yacy/search/query/QueryParams.java | 2 + .../net/yacy/search/query/RankingProcess.java | 8 +- 11 files changed, 85 insertions(+), 56 deletions(-) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index fe13c6e471..2140091c18 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -378,10 +378,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea } catch ( final SpaceExceededException e ) { Log.logException(e); } - final URIMetadataNode e = segment.fulltext().getMetadata(b); + url = segment.fulltext().getURL(b); segment.fulltext().remove(b); - if ( e != null ) { - url = e.url(); + if ( url != null ) { pw.println(url.getHost() + "/" + url.getFile()); for ( final String supportedBlacklistType : supportedBlacklistTypes ) { if ( ListManager.listSetContains( @@ -413,10 +412,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea } catch ( final SpaceExceededException e ) { Log.logException(e); } - final URIMetadataNode e = segment.fulltext().getMetadata(b); + url = segment.fulltext().getURL(b); segment.fulltext().remove(b); - if ( e != null ) { - url = e.url(); + if ( url != null ) { pw.println(url.getHost() + "/.*"); for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) { if ( ListManager.listSetContains( diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index ddc2bbb710..42a5dc05ea 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -175,11 +175,11 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea } if (post.containsKey("urlhashdelete")) { - final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); - if (entry == null) { + final DigestURI url = segment.fulltext().getURL(ASCII.getBytes(urlhash)); + if (url == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { - urlstring = entry.url().toNormalform(true); + urlstring = url.toNormalform(true); prop.put("urlstring", ""); sb.urlRemove(segment, urlhash.getBytes()); prop.putHTML("result", "Removed URL " + urlstring); @@ -233,9 +233,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea // generate list if (post.containsKey("urlhashsimilar")) { - final Iterator entryIt = new RotateIterator(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount()); + final Iterator entryIt = new RotateIterator(segment.fulltext().urls(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); - URIMetadataNode entry; + DigestURI entry; int i = 0, rows = 0, cols = 0; prop.put("urlhashsimilar", "1"); while (entryIt.hasNext() && i < 256) { diff --git a/htroot/api/ymarks/add_ymark.java b/htroot/api/ymarks/add_ymark.java index 8a0bb8791f..e90e22f605 100644 --- a/htroot/api/ymarks/add_ymark.java +++ b/htroot/api/ymarks/add_ymark.java @@ -33,7 +33,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje if(post.containsKey("urlHash")) { final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING); - final DigestURI url = sb.index.fulltext().getMetadata(urlHash.getBytes()).url(); + final DigestURI url = sb.index.fulltext().getURL(urlHash.getBytes()); final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt()); final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING); try { diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index 18c8bf387f..7ce843ce76 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -115,7 +115,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje post.put(CommonParams.ROWS, post.remove("num")); post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100)); post.put("defType", "edismax"); - post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^200.0"); // a bost query that moves double content to the back + post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a bost query that moves double content to the back post.put(CommonParams.FL, YaCySchema.content_type.getSolrFieldName() + ',' + YaCySchema.id.getSolrFieldName() + ',' + diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 41d64cefa7..dc3f59f20a 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -677,12 +677,12 @@ public static serverObjects respond( return prop; } final String bookmarkHash = post.get("bookmarkref", ""); // urlhash - final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(bookmarkHash)); - if ( urlentry != null ) { + final DigestURI url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash)); + if ( url != null ) { try { sb.tables.bookmarks.createBookmark( sb.loader, - urlentry.url(), + url, YMarkTables.USER_ADMIN, true, "searchresult", diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java index c04aa87d23..8c8f987376 100644 --- a/source/net/yacy/data/ymark/YMarkMetadata.java +++ b/source/net/yacy/data/ymark/YMarkMetadata.java @@ -82,7 +82,7 @@ public YMarkMetadata(final DigestURI uri, final Segment indexSegment) { public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) { this.document = null; this.indexSegment = indexSegment; - this.uri = this.indexSegment.fulltext().getMetadata(urlHash).url(); + this.uri = this.indexSegment.fulltext().getURL(urlHash); } public YMarkMetadata(final Document document) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index e73a899a8f..6b9fb18dc3 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1521,16 +1521,10 @@ public void urlRemove(final Segment segment, final byte[] hash) { } public DigestURI getURL(final byte[] urlhash) { - if ( urlhash == null ) { - return null; - } - if ( urlhash.length == 0 ) { - return null; - } - final URIMetadataNode le = this.index.fulltext().getMetadata(urlhash); - if ( le != null ) { - return le.url(); - } + if (urlhash == null) return null; + if (urlhash.length == 0) return null; + final DigestURI url = this.index.fulltext().getURL(urlhash); + if (url != null) return url; return this.crawlQueues.getURL(urlhash); } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 8fba0db54a..997800ebb3 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -227,13 +227,26 @@ public Date getLoadDate(final String urlHash) { Date now = new Date(); return x.after(now) ? now : x; } + + public DigestURI getURL(final byte[] urlHash) { + if (urlHash == null) return null; + SolrDocument doc; + try { + doc = this.solr.getById(ASCII.String(urlHash), YaCySchema.sku.getSolrFieldName()); + } catch (IOException e) { + return null; + } + if (doc == null) return null; + String x = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()); + if (x == null) return null; + try { + DigestURI uri = new DigestURI(x, urlHash); + return uri; + } catch (MalformedURLException e) { + return null; + } + } - /** - * generates an plasmaLURLEntry using the url hash - * if the url cannot be found, this returns null - * @param obrwi - * @return - */ public URIMetadataNode getMetadata(WordReference wre, long weight) { if (wre == null) return null; // all time was already wasted in takeRWI to get another element return getMetadata(wre.urlhash(), wre, weight); @@ -243,7 +256,7 @@ public URIMetadataNode getMetadata(final byte[] urlHash) { if (urlHash == null) return null; return getMetadata(urlHash, null, 0); } - + private URIMetadataNode getMetadata(final byte[] urlHash, WordReference wre, long weight) { // get the metadata from Solr @@ -519,9 +532,37 @@ public void close() { true); } + public CloneableIterator urls() { + // enumerates entry elements + final Iterator ids = iterator(); + return new CloneableIterator() { + @Override + public CloneableIterator clone(final Object secondHash) { + return this; + } + @Override + public final boolean hasNext() { + return ids.hasNext(); + } + @Override + public final DigestURI next() { + byte[] id = ids.next(); + if (id == null) return null; + return getURL(id); + } + @Override + public final void remove() { + ids.remove(); + } + @Override + public void close() { + } + }; + } + public CloneableIterator entries() { // enumerates entry elements - final Iterator ids = iterator(); + final Iterator ids = iterator(); return new CloneableIterator() { @Override public CloneableIterator clone(final Object secondHash) { @@ -783,15 +824,15 @@ private TreeSet domainNameCollector(int count, final Map domainSamples.size()) count = domainSamples.size(); this.statsDump = new ArrayList(); final TreeSet set = new TreeSet(); for (final URLHashCounter hs: domainSamples.values()) { if (hs == null) continue; - urlref = this.getMetadata(hs.urlhashb); - if (urlref == null || urlref.url() == null || urlref.url().getHost() == null) continue; - set.add(urlref.url().getHost()); + url = this.getURL(hs.urlhashb); + if (url == null || url.getHost() == null) continue; + set.add(url.getHost()); count--; if (count == 0) break; } @@ -820,7 +861,6 @@ public ScoreMap urlSampleScores(final Map domain */ public Map domainHashResolver(final Map domainSamples) { final HashMap hostMap = new HashMap(); - URIMetadataNode urlref; final ScoreMap hosthashScore = new ConcurrentScoreMap(); for (final Map.Entry e: domainSamples.entrySet()) { @@ -828,8 +868,7 @@ public Map domainHashResolver(final Map e: domainSamples.entrySet()) { - urlref = this.getMetadata(e.getValue().urlhashb); - url = urlref.url(); + url = this.getURL(e.getValue().urlhashb); hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey()))); } return hostMap; @@ -841,7 +880,6 @@ public Iterator statistics(int count, final ScoreMap domainSco // fetch urls from the database to determine the host in clear text final Iterator j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first) - URIMetadataNode urlref; String urlhash; count += 10; // make some more to prevent that we have to do this again after deletions too soon. if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size(); @@ -850,10 +888,9 @@ public Iterator statistics(int count, final ScoreMap domainSco while (j.hasNext()) { urlhash = j.next(); if (urlhash == null) continue; - urlref = this.getMetadata(ASCII.getBytes(urlhash)); - if (urlref == null || urlref.url() == null || urlref.url().getHost() == null) continue; + url = this.getURL(ASCII.getBytes(urlhash)); + if (url == null || url.getHost() == null) continue; if (this.statsDump == null) return new ArrayList().iterator(); // some other operation has destroyed the object - url = urlref.url(); this.statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash))); count--; if (count == 0) break; diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 864e3a9019..f6ef24d750 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -58,7 +58,6 @@ import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; @@ -235,7 +234,7 @@ protected DigestURI next0() { return null; } if (id == null || id == AbstractSolrConnector.POISON_ID) return null; - DigestURI u = Segment.this.fulltext.getMetadata(ASCII.getBytes(id)).url(); + DigestURI u = Segment.this.fulltext.getURL(ASCII.getBytes(id)); if (u.toNormalform(true).startsWith(urlstub)) return u; } } @@ -508,13 +507,12 @@ public int removeAllUrlReferences(final byte[] urlhash, final LoaderDispatcher l if (urlhash == null) return 0; // determine the url string - final URIMetadataNode entry = fulltext().getMetadata(urlhash); - if (entry == null) return 0; - if (entry.url() == null) return 0; + final DigestURI url = fulltext().getURL(urlhash); + if (url == null) return 0; try { // parse the resource - final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay)); + final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay)); if (document == null) { // delete just the url entry fulltext().remove(urlhash); diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 4ad3def9df..58449951ef 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -429,6 +429,8 @@ public SolrQuery solrQuery() { // construct query final SolrQuery params = new SolrQuery(); + params.setParam("defType", "edismax"); + params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a bost query that moves double content to the back params.setStart(this.offset); params.setRows(this.itemsPerPage); params.setFacet(false); diff --git a/source/net/yacy/search/query/RankingProcess.java b/source/net/yacy/search/query/RankingProcess.java index ed0d2c1a7f..f762443111 100644 --- a/source/net/yacy/search/query/RankingProcess.java +++ b/source/net/yacy/search/query/RankingProcess.java @@ -51,7 +51,7 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.Condenser; import net.yacy.document.LibraryProvider; -import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -411,7 +411,7 @@ public ScoreMap getHostNavigator() { final ScoreMap result = new ConcurrentScoreMap(); final Iterator domhashs = this.hostHashNavigator.keys(false); - URIMetadataNode row; + DigestURI url; byte[] urlhash; String hosthash, hostname; if ( this.hostHashResolver != null ) { @@ -421,8 +421,8 @@ public ScoreMap getHostNavigator() { continue; } urlhash = this.hostHashResolver.get(hosthash); - row = urlhash == null ? null : this.query.getSegment().fulltext().getMetadata(urlhash); - hostname = row == null ? null : row.url().getHost(); + url = urlhash == null ? null : this.query.getSegment().fulltext().getURL(urlhash); + hostname = url == null ? null : url.getHost(); if ( hostname != null ) { result.set(hostname, this.hostHashNavigator.get(hosthash)); }