Skip to content

Commit

Permalink
Migrated the index export methods from the old metadata to Solr. Now
Browse files Browse the repository at this point in the history
exports are done using Solr queries. Removed superfluous methods and
servlets.
  • Loading branch information
Orbiter committed Jan 24, 2013
1 parent 1768c82 commit 0fe7b6f
Show file tree
Hide file tree
Showing 10 changed files with 95 additions and 317 deletions.
7 changes: 3 additions & 4 deletions htroot/CrawlResults.java
Expand Up @@ -124,10 +124,9 @@ public static serverObjects respond(final RequestHeader header, serverObjects po

if (post.containsKey("deletedomain")) {
final String domain = post.get("domain", null);
final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
if (hashpart != null) {
sb.index.fulltext().deleteDomain(hashpart, null, false);
ResultURLs.deleteDomain(tabletype, domain, hashpart);
if (domain != null) {
sb.index.fulltext().deleteDomainHostname(domain, null, false);
ResultURLs.deleteDomain(tabletype, domain);
}
}

Expand Down
2 changes: 1 addition & 1 deletion htroot/Crawler_p.java
Expand Up @@ -294,7 +294,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 1);
int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
}
Expand Down
10 changes: 1 addition & 9 deletions htroot/IndexControlURLs_p.html
Expand Up @@ -77,7 +77,6 @@ <h2>URL References Administration</h2>
<dt class="TableCellDark">Retrieve by URL-Hash:</dt>
<dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
<input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" class="submitready" style="width:240px;"/>
<input type="submit" name="urlhashsimilar" value="Generate List" class="submitready" style="width:240px;"/>
</dd>
</dl>
</fieldset>
Expand Down Expand Up @@ -132,7 +131,7 @@ <h2>URL References Administration</h2>
<td>
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<div>
<input type="hidden" name="hashpart" value="#[hashpart]#" />
<input type="hidden" name="domain" value="#[domain]#" />
<input type="hidden" name="lines" value="#[lines]#" />
<input type="submit" name="deletedomain" value="delete all" class="submitready" style="width:240px;"/>
</div>
Expand Down Expand Up @@ -206,13 +205,6 @@ <h2>URL References Administration</h2>
<div class="commit">Stored a solr dump to file #[dumpfile]#</div>::
#(/indexdump)#

#(urlhashsimilar)#::<p>Sequential List of URL-Hashes:<br />
#{rows}#
#{cols}#<a href="/IndexControlURLs_p.html?urlhash=#[urlHash]#&amp;urlhashsearch=1" class="tt">#[urlHash]#</a> #{/cols}#<br />
#{/rows}#
</p>
#(/urlhashsimilar)#

#(genUrlProfile)#
::No entry found for URL-hash #[urlhash]#
::<iframe src="/api/yacydoc.html?urlhash=#[urlhash]#" width="100%" height="420" frameborder="0" scrolling="no"></iframe><br />
Expand Down
51 changes: 14 additions & 37 deletions htroot/IndexControlURLs_p.java
Expand Up @@ -30,21 +30,22 @@
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.RotateIterator;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment;
Expand Down Expand Up @@ -236,30 +237,6 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
}
}

// generate list
if (post.containsKey("urlhashsimilar")) {
final Iterator<DigestURI> entryIt = new RotateIterator<DigestURI>(segment.fulltext().urls(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
DigestURI entry;
int i = 0, rows = 0, cols = 0;
prop.put("urlhashsimilar", "1");
while (entryIt.hasNext() && i < 256) {
entry = entryIt.next();
if (entry == null) break;
prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", ASCII.String(entry.hash()));
cols++;
if (cols==8) {
prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
cols = 0;
rows++;
}
i++;
}
prop.put("statistics", 0);
prop.put("urlhashsimilar_rows", rows);
prop.put("result", result.toString());
}

if (post.containsKey("lurlexport")) {
// parse format
int format = 0;
Expand All @@ -279,7 +256,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
final File f = new File(s);
f.getParentFile().mkdirs();
final String filter = post.get("exportfilter", ".*");
final Fulltext.Export running = segment.fulltext().export(f, filter, null, format, dom);
final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom);

prop.put("lurlexport_exportfile", s);
prop.put("lurlexport_urlcount", running.count());
Expand All @@ -301,29 +278,29 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
}

if (post.containsKey("deletedomain")) {
final String hp = post.get("hashpart");
segment.fulltext().deleteDomain(hp, null, false);
final String domain = post.get("domain");
segment.fulltext().deleteDomainHostname(domain, null, false);
// trigger the loading of the table
post.put("statistics", "");
}

if (post.containsKey("statistics")) {
final int count = post.getInt("lines", 100);
Iterator<Fulltext.HostStat> statsiter;
prop.put("statistics_lines", count);
int cnt = 0;
try {
final Fulltext metadata = segment.fulltext();
statsiter = metadata.statistics(count, metadata.urlSampleScores(metadata.domainSampleCollector()));
Map<String, ReversibleScoreMap<String>> scores = metadata.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", count, YaCySchema.host_s.getSolrFieldName());
ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
Iterator<String> statsiter = stats.keys(false);
boolean dark = true;
Fulltext.HostStat hs;
String hostname;
prop.put("statisticslines_domains_" + cnt + "lines", count);
while (statsiter.hasNext() && cnt < count) {
hs = statsiter.next();
hostname = statsiter.next();
prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname + ((hs.port == 80) ? "" : ":" + hs.port));
prop.put("statisticslines_domains_" + cnt + "lines", count);
prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash);
prop.put("statisticslines_domains_" + cnt + "_count", hs.count);
prop.put("statisticslines_domains_" + cnt + "_domain", hostname);
prop.put("statisticslines_domains_" + cnt + "_count", stats.get(hostname));
dark = !dark;
cnt++;
}
Expand Down
9 changes: 0 additions & 9 deletions htroot/IndexControlURLs_p.xml
Expand Up @@ -13,13 +13,4 @@
#(indexdump)#::
<dumpfile>#[dumpfile]#</dumpfile>::
#(/indexdump)#
#(urlhashsimilar)#::
<urls>
#{rows}#
#{cols}#
<urlhash>#[urlHash]#</urlhash>
#{/cols}#
#{/rows}#
</urls>
#(/urlhashsimilar)#
</data>
Empty file removed htroot/YBRFetch_p.html
Empty file.
70 changes: 0 additions & 70 deletions htroot/YBRFetch_p.java

This file was deleted.

11 changes: 1 addition & 10 deletions source/net/yacy/crawler/data/ResultURLs.java
Expand Up @@ -143,17 +143,8 @@ public static Iterator<String> domains(final EventOrigin stack) {
return getDomains(stack).keys(false);
}

public static int deleteDomain(final EventOrigin stack, final String host, final String hosthash) {
public static int deleteDomain(final EventOrigin stack, final String host) {
assert host != null : "host = null";
assert hosthash.length() == 6;
final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
Map.Entry<String, InitExecEntry> w;
String urlhash;
while (i.hasNext()) {
w = i.next();
urlhash = w.getKey();
if (urlhash == null || urlhash.substring(6).equals(hosthash)) i.remove();
}
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).delete(host);
}
Expand Down

0 comments on commit 0fe7b6f

Please sign in to comment.