
Limit the number of initially previewed links in crawl start pages.

This prevents rendering a big and inconvenient scrollbar on resources
containing many links.
If really needed, a preview of all links is still available via a "Show
all links" button.

This doesn't affect the number of links used once the crawl is actually
started, as the list is then loaded again server-side.
luccioman committed Jun 17, 2017
1 parent d2a4a27 commit 0f80c978d67acb6e3ab92e13d3ed0d21d0d8d4ab
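
For reference, a minimal sketch of the client/server contract this commit introduces: request the page info with a maxLinks limit, then check the hasMoreLinks flag to decide whether to offer the "Show all links" button. The endpoint, the actions and maxLinks parameters, the limit of 50, and the hasMoreLinks element all come from this commit; the bare XMLHttpRequest wiring and the function name are illustrative assumptions (the actual page goes through YaCy's existing sndReq/handleResponse helpers).

// Illustrative sketch only: parameter names and the <hasMoreLinks>
// element are from this commit; the XMLHttpRequest wiring is an
// assumption made for this example.
function fetchPageInfo(startUrl, loadAll) {
    var request = new XMLHttpRequest();
    // When maxLinks is omitted, the servlet falls back to
    // Integer.MAX_VALUE, i.e. every link is returned.
    var query = 'api/getpageinfo_p.xml?actions=title,robots'
            + (loadAll ? '' : '&maxLinks=50')
            + '&url=' + encodeURIComponent(startUrl);
    request.open('GET', query, true);
    request.onload = function() {
        var more = request.responseXML.getElementsByTagName('hasMoreLinks');
        var truncated = more.length > 0 && more[0].firstChild != null
                && more[0].firstChild.nodeValue == 'true';
        if (truncated) {
            // The preview was cut off at 50 links: show the "Show all
            // links" button, which re-issues the request with loadAll = true.
        }
    };
    request.send();
}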
@@ -246,6 +246,9 @@ <h2>Expert Crawl Start</h2>
 <dd>
     <input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
     <div id="sitelistURLs"></div>
+    <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+        <span class="glyphicon glyphicon-option-horizontal"></span>
+    </button>
 </dd>
 <dt>From Sitemap</dt>
 <dd>
@@ -47,7 +47,12 @@ <h2>Site Crawling</h2>
 </td>
 </tr><tr>
 <td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
-<td><div id="sitelistURLs"></div></td>
+<td>
+    <div id="sitelistURLs"></div>
+    <button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
+        <span class="glyphicon glyphicon-option-horizontal"></span>
+    </button>
+</td>
 </tr><tr>
 <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
     onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;"/>Sitemap URL</td>
@@ -28,6 +28,7 @@
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
@@ -85,6 +86,7 @@
  * </ul>
  * </li>
  * <li>agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"</li>
+ * <li>maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return</li>
  * </ul>
  * @param env
  *            server environment
@@ -110,6 +112,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 String actions = "title,robots";
 if (post != null && post.containsKey("url")) {
+    final int maxLinks = post.getInt("maxLinks", Integer.MAX_VALUE);
     if (post.containsKey("actions"))
         actions=post.get("actions");
     String url=post.get("url");
@@ -135,7 +138,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 net.yacy.document.Document scraper = null;
 if (u != null) try {
     ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
-    scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
+    scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
 } catch (final IOException e) {
     ConcurrentLog.logException(e);
     // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -145,20 +148,25 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 // put the document title
 prop.putXML("title", scraper.dc_title());
-// put the icons that belongs to the document
+// put the icons that belong to the document
 Set<DigestURL> iconURLs = scraper.getIcons().keySet();
-int i = 0;
+int count = 0;
 for (DigestURL iconURL : iconURLs) {
-    prop.putXML("icons_" + i + "_icon", iconURL.toNormalform(false));
-    prop.put("icons_" + i + "_eol", 1);
-    i++;
+    if(count >= maxLinks) {
+        break;
+    }
+    prop.putXML("icons_" + count + "_icon", iconURL.toNormalform(false));
+    prop.put("icons_" + count + "_eol", 1);
+    count++;
 }
-prop.put("icons_" + (i - 1) + "_eol", 0);
-prop.put("icons", iconURLs.size());
+if(count > 0) {
+    prop.put("icons_" + (count - 1) + "_eol", 0);
+}
+prop.put("icons", count);
 // put keywords
 final Set<String> list = scraper.dc_subject();
-int count = 0;
+count = 0;
 for (final String element: list) {
     if (!element.equals("")) {
         prop.putXML("tags_"+count+"_tag", element);
@@ -177,14 +185,20 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 final StringBuilder links = new StringBuilder(uris.size() * 80);
 final StringBuilder filter = new StringBuilder(uris.size() * 40);
 count = 0;
-for (final DigestURL uri: uris) {
+final Iterator<AnchorURL> urisIt = uris.iterator();
+while (urisIt.hasNext()) {
+    AnchorURL uri = urisIt.next();
     if (uri == null) continue;
+    if(count >= maxLinks) {
+        break;
+    }
     links.append(';').append(uri.toNormalform(true));
     filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
     prop.putXML("links_" + count + "_link", uri.toNormalform(true));
     count++;
 }
 prop.put("links", count);
+prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
 prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
 prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
 }
@@ -200,12 +214,17 @@ public static serverObjects respond(final RequestHeader header, final serverObje
 prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
 prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
-// get the sitemap URL of the domain
+// get the sitemap URL(s) of the domain
 final List<String> sitemaps = robotsEntry == null ? new ArrayList<String>(0) : robotsEntry.getSitemaps();
-for (int i = 0; i < sitemaps.size(); i++) {
-    prop.putXML("sitemaps_" + i + "_sitemap", sitemaps.get(i));
+int count = 0;
+for (String sitemap : sitemaps) {
+    if(count >= maxLinks) {
+        break;
+    }
+    prop.putXML("sitemaps_" + count + "_sitemap", sitemap);
+    count++;
 }
-prop.put("sitemaps", sitemaps.size());
+prop.put("sitemaps", count);
 } catch (final MalformedURLException e) {
     ConcurrentLog.logException(e);
 }
@@ -25,5 +25,6 @@
 <link name="#[link]#" />
 #{/links}#
 </links>
+<hasMoreLinks>#(hasMoreLinks)#false::true#(/hasMoreLinks)#</hasMoreLinks>
 <oai>#[oai]#</oai>
 </pageinfo>
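
A note on the template line added above: in YaCy's servlet template syntax, #(key)#a::b#(/key)# renders `a` when the servlet stored "0" under the key and `b` when it stored "1". So together with the `prop.put("hasMoreLinks", ...)` call in the Java hunk above, a truncated preview renders roughly as:

<hasMoreLinks>true</hasMoreLinks>

and a complete one as <hasMoreLinks>false</hasMoreLinks>, which is what the JavaScript below tests for.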
@@ -87,7 +87,29 @@ function handleResponse(){
     sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
 }
 document.getElementById("sitelistURLs").innerHTML = sitelist;
-if (sitelist) document.getElementById("sitelist").disabled=false;
+var expandButton = document.getElementById("expandSiteListBtn");
+var siteListRadio = document.getElementById("sitelist");
+if (sitelist) {
+    siteListRadio.disabled = false;
+    var hasMoreLinksElement = response.getElementsByTagName("hasMoreLinks");
+    if(hasMoreLinksElement != null && hasMoreLinksElement.length > 0
+            && hasMoreLinksElement[0].firstChild != null && hasMoreLinksElement[0].firstChild.nodeValue == "true") {
+        expandButton.style.visibility = "visible";
+        expandButton.disabled = false;
+    } else {
+        expandButton.style.visibility = "hidden";
+    }
+} else {
+    siteListRadio.disabled = true;
+    siteListRadio.checked = false;
+    var urlModeRadio = document.getElementById("url");
+    if(urlModeRadio != null) {
+        urlModeRadio.checked = true;
+    }
+    if(expandButton != null) {
+        expandButton.style.visibility = "hidden";
+    }
+}
 // clear the ajax image
 document.getElementById("ajax").setAttribute("src", AJAX_OFF);
@@ -96,15 +118,18 @@ function handleResponse(){
 function changed() {
     window.clearTimeout(timeout);
-    timeout=window.setTimeout("loadInfos()", 1500);
+    timeout=window.setTimeout(loadInfos, 1500);
 }
-function loadInfos() {
+/**
+ * @param loadAll {Boolean} when true, load all links, else limit to the first 50
+ */
+function loadInfos(loadAll) {
     // displaying ajax image
     document.getElementById("ajax").setAttribute("src",AJAX_ON);
     var url=document.getElementById("crawlingURL").value;
     if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").checked = true; // since the pdf parser update for page separation, we need to set this
-    sndReq('api/getpageinfo_p.xml?actions=title,robots&url='+url);
+    sndReq('api/getpageinfo_p.xml?actions=title,robots' + (loadAll ? '' : '&maxLinks=50') + '&url='+url);
     document.getElementById("api").innerHTML = "<a href='api/getpageinfo_p.xml?actions=title,robots&url=" + url + "' id='apilink'><img src='env/grafics/api.png' width='60' height='40' alt='API'/></a><span>See the page info about the start url.</span>";
 }
