*) Bookmarks: an Ajax icon is displayed while the title is loading
*) First version of a sitemap parser added
   - currently only autodetection of sitemap files (via the Sitemap directive in robots.txt; see the sketch below) is supported
*) DB-Import restructured
   - pause/resume should work again now


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli committed May 6, 2007
1 parent 269d5ca commit 6f46245
Showing 20 changed files with 749 additions and 88 deletions.
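
The sitemap autodetection mentioned in the commit message works off the optional "Sitemap:" directive in a host's robots.txt (see the robotsParser.getSitemapURL() call added to htroot/xml/util/getpageinfo_p.java below). For orientation, a minimal, self-contained sketch of that lookup could look as follows; the class and method names here are illustrative and are not part of this commit:

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.URL;

    // Illustrative helper, not YaCy's robotsParser: fetches /robots.txt for the
    // host of the given URL and returns the target of a "Sitemap:" directive,
    // or null if the file is unreachable or contains no such directive.
    public class SitemapAutodetectSketch {
        public static URL findSitemapURL(URL page) {
            BufferedReader reader = null;
            try {
                URL robotsTxt = new URL(page.getProtocol(), page.getHost(), page.getPort(), "/robots.txt");
                reader = new BufferedReader(new InputStreamReader(robotsTxt.openStream()));
                String line;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    // the directive name is case-insensitive, e.g. "Sitemap: http://example.net/sitemap.xml"
                    if (line.toLowerCase().startsWith("sitemap:")) {
                        return new URL(line.substring("sitemap:".length()).trim());
                    }
                }
            } catch (Exception e) {
                // treat network or parsing problems as "no sitemap found"
            } finally {
                if (reader != null) try { reader.close(); } catch (Exception e) { /* ignore */ }
            }
            return null;
        }
    }
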
1 change: 1 addition & 0 deletions htroot/Bookmarks.html
@@ -44,6 +44,7 @@ <h2>Bookmarks</h2>
::
<input type="submit" name="add" value="edit" />
#(/edit)#
<img src="/env/grafics/empty.gif" name="ajax" />
</fieldset>
</form>
::
13 changes: 10 additions & 3 deletions htroot/IndexCreate_p.html
@@ -26,22 +26,29 @@ <h2>Index Creation</h2>
<tr valign="top" class="TableCellSummary">
<td>Starting Point:</td>
<td>
<table cellpadding="0" cellspacing="0">
<table cellpadding="0" cellspacing="0">
<tr>
<td><label for="url"><nobr>From URL</nobr></label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<input name="crawlingURL" type="text" size="41" maxlength="256" value="http://" onkeypress="changed()" />
<span id="robotsOK"></span>
<span id="robotsOK"></span>
</td>
</tr>
<tr>
<td><label for="url"><nobr>From Sitemap</nobr></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="true"/></td>
<td>
<input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="true"/>
</td>
</tr>
<tr>
<td><label for="file"><nobr>From File</nobr></label>:</td>
<td><input type="radio" name="crawlingMode" id="file" value="file" /></td>
<td><input type="file" name="crawlingFile" size="28" /></td>
</tr>
<tr>
<td colspan="3" class="commit"><span id="title"><br></span></td>
<td colspan="3" class="commit"><span id="title"><br></span><img src="/env/grafics/empty.gif" name="ajax" /></td>
</tr>
</table>
</td>
12 changes: 7 additions & 5 deletions htroot/IndexImport_p.java
@@ -51,6 +51,7 @@
import java.io.File;
import java.io.PrintStream;
import java.util.Date;
import java.util.HashMap;

import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
@@ -78,12 +79,13 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
String importIndexSecondaryPath = (String) post.get("importIndexSecondaryPath");
String importType = (String) post.get("importType");
String cacheSizeStr = (String) post.get("cacheSize");
int cacheSize = 8*1024*1024;
try {
cacheSize = Integer.valueOf(cacheSizeStr).intValue();
} catch (NumberFormatException e) {}
boolean startImport = true;

HashMap initParams = new HashMap();
initParams.put("plasmaPath",importPlasmaPath);
initParams.put("cacheSize",cacheSizeStr);
initParams.put("preloadTime","100");

// // check if there is an already running thread with the same import path
// Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
// activeCount = plasmaDbImporter.runningJobs.enumerate(importThreads);
@@ -100,7 +102,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
if (importerThread != null) {
importerThread.init(new File(importPlasmaPath), new File(importIndexPrimaryPath), new File(importIndexSecondaryPath), cacheSize, 100);
importerThread.init(initParams);
importerThread.startIt();
}
prop.put("LOCATION","");
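
The restructured import shown above replaces dbImporter.init()'s fixed argument list with a HashMap of string parameters, so the servlet no longer parses cacheSize itself and simply forwards the raw form values. A rough sketch of the consuming side is given below; the actual de.anomic.plasma.dbImport.dbImporter interface is not part of this hunk, so the field and method names are assumptions made purely for illustration:

    import java.util.HashMap;

    // Sketch of an importer reading the HashMap-based init parameters that
    // IndexImport_p.java now hands over; not the actual YaCy implementation.
    class DbImporterInitSketch {
        private String plasmaPath;
        private int cacheSize = 8 * 1024 * 1024;   // default, formerly hard-coded in the servlet
        private long preloadTime = 100;

        public void init(HashMap initParams) {
            this.plasmaPath = (String) initParams.get("plasmaPath");

            // the servlet now forwards the unparsed form value, so parsing
            // (and falling back to the default) happens here instead
            String cacheSizeStr = (String) initParams.get("cacheSize");
            try {
                this.cacheSize = Integer.valueOf(cacheSizeStr).intValue();
            } catch (NumberFormatException e) { /* keep default */ }

            String preloadTimeStr = (String) initParams.get("preloadTime");
            if (preloadTimeStr != null) this.preloadTime = Long.parseLong(preloadTimeStr);
        }
    }

The same init(HashMap) contract is used further down in WatchCrawler_p.java, where the new sitemap importer receives "sitemapURL" and "crawlingProfile" parameters.
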
2 changes: 1 addition & 1 deletion htroot/WatchCrawler_p.html
@@ -9,7 +9,7 @@
<script type="text/javascript" src="/js/WatchCrawler.js"></script></head>
<body id="watchCrawler"> #%env/templates/header.template%#
<h2>Crawler Monitor</h2>
<p> Next update in <span id="nextUpdate" onclick="changeInterval()"></span> seconds.
<p> Next update in <span id="nextUpdate" onclick="changeInterval()"></span> seconds. <img src="/env/grafics/empty.gif" name="ajax" />
</p>
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
42 changes: 40 additions & 2 deletions htroot/WatchCrawler_p.java
@@ -35,6 +35,7 @@
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import de.anomic.data.SitemapParser;
import de.anomic.data.wikiCode;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
@@ -44,6 +45,7 @@
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.dbImport.dbImporter;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -52,6 +54,10 @@
import de.anomic.yacy.yacyNewsRecord;

public class WatchCrawler_p {
public static final String CRAWLING_MODE_URL = "url";
public static final String CRAWLING_MODE_FILE = "file";
public static final String CRAWLING_MODE_SITEMAP = "sitemap";


// this servlet does NOT create the WatchCrawler page content!
// this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
@@ -144,7 +150,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");

String crawlingMode = post.get("crawlingMode","url");
if (crawlingMode.equals("url")) {
if (crawlingMode.equals(CRAWLING_MODE_URL)) {
// getting the crawljob start url
String crawlingStart = post.get("crawlingURL","");
crawlingStart = crawlingStart.trim();
@@ -236,7 +242,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
e.printStackTrace();
}

} else if (crawlingMode.equals("file")) {
} else if (crawlingMode.equals(CRAWLING_MODE_FILE)) {
if (post.containsKey("crawlingFile")) {
// getting the name of the uploaded file
String fileName = (String) post.get("crawlingFile");
@@ -316,6 +322,38 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
e.printStackTrace();
}
}
} else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) {
String sitemapURLStr = null;
try {
// getting the sitemap URL
sitemapURLStr = post.get("sitemapURL","");

// create a new profile
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(
sitemapURLStr, sitemapURLStr, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);

// create a new sitemap importer
dbImporter importerThread = switchboard.dbImportManager.getNewImporter("sitemap");
if (importerThread != null) {
HashMap initParams = new HashMap();
initParams.put("sitemapURL",sitemapURLStr);
initParams.put("crawlingProfile",pe.handle());

importerThread.init(initParams);
importerThread.startIt();
}
} catch (Exception e) {
// mist
prop.put("info", 6);//Error with url
prop.put("info_crawlingStart", sitemapURLStr);
prop.put("info_error", e.getMessage());
e.printStackTrace();
}
}
}
}
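
The new crawling mode above only hands the sitemap URL and the crawl profile handle to a "sitemap" importer; parsing the sitemap file itself is done by the newly added de.anomic.data.SitemapParser, whose source is not among the hunks shown here. As a rough reference for the format it has to handle, a minimal reader for a sitemaps.org <urlset> document might look like this (illustrative only; gzip-compressed sitemaps and robust error handling are omitted):

    import java.net.URL;
    import javax.xml.parsers.DocumentBuilderFactory;
    import org.w3c.dom.Document;
    import org.w3c.dom.NodeList;

    // Minimal sketch of reading <loc> entries from a sitemaps.org <urlset> file;
    // not the SitemapParser added by this commit.
    public class SitemapReaderSketch {
        public static String[] readLocations(URL sitemapURL) throws Exception {
            Document doc = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder()
                    .parse(sitemapURL.openStream());
            NodeList locs = doc.getElementsByTagName("loc");   // one <loc> per <url> entry
            String[] result = new String[locs.getLength()];
            for (int i = 0; i < locs.getLength(); i++) {
                result[i] = locs.item(i).getFirstChild().getNodeValue().trim();
            }
            return result;
        }
    }
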
Binary file added htroot/env/grafics/ajax.gif
9 changes: 9 additions & 0 deletions htroot/js/Bookmarks.js
@@ -1,11 +1,20 @@
AJAX_OFF="/env/grafics/empty.gif";
AJAX_ON="/env/grafics/ajax.gif";

function handleResponse(){
if(http.readyState == 4){
var response = http.responseXML;
title=response.getElementsByTagName("title")[0].firstChild.nodeValue;
document.getElementsByName("title")[0].value=title;

// remove the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
}
}
function loadTitle(){
// displaying ajax image
document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON);

url=document.getElementsByName("url")[0].value;
if(document.getElementsByName("title")[0].value==""){
sndReq('/xml/util/getpageinfo_p.xml?actions=title&url='+url);
28 changes: 24 additions & 4 deletions htroot/js/IndexCreate.js
@@ -1,16 +1,22 @@
AJAX_OFF="/env/grafics/empty.gif";
AJAX_ON="/env/grafics/ajax.gif";
timeout="";
function handleResponse(){
if(http.readyState == 4){
var response = http.responseXML;
title="";
robotsOK="";

// getting the document title
title="";
if(response.getElementsByTagName("title")[0].firstChild!=null){
title=response.getElementsByTagName("title")[0].firstChild.nodeValue;
}
document.getElementById("title").innerHTML=title;

// determine if crawling is allowed by the robots.txt
robotsOK="";
if(response.getElementsByTagName("robots")[0].firstChild!=null){
robotsOK=response.getElementsByTagName("robots")[0].firstChild.nodeValue;
}
document.getElementById("title").innerHTML=title;
robotsOKspan=document.getElementById("robotsOK");
if(robotsOKspan.firstChild){
robotsOKspan.removeChild(robotsOKspan.firstChild);
@@ -31,14 +37,28 @@ function handleResponse(){
}else{
robotsOKspan.appendChild(document.createTextNode(""));
document.getElementById("robotsOK").innerHTML="";
}
}

// getting the sitemap URL contained in the robots.txt
sitemap="";
if(response.getElementsByTagName("sitemap")[0].firstChild!=null){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false;

// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
}
}
function changed(){
window.clearTimeout(timeout);
timeout=window.setTimeout("loadInfos()", 1500);
}
function loadInfos(){
// displaying ajax image
document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON);

url=document.getElementsByName("crawlingURL")[0].value;
sndReq('/xml/util/getpageinfo_p.xml?actions=title,robots&url='+url);
}
9 changes: 8 additions & 1 deletion htroot/xml/util/getpageinfo_p.java
@@ -101,11 +101,18 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
}
if(actions.indexOf("robots")>=0){
try {
if(robotsParser.isDisallowed(new URL(url))){
URL theURL = new URL(url);

// determine if crawling of the current URL is allowed
if(robotsParser.isDisallowed(theURL)){
prop.put("robots-allowed", 0);
}else{
prop.put("robots-allowed", 1);
}

// get the sitemap URL of the domain
URL sitemapURL = robotsParser.getSitemapURL(theURL);
prop.put("sitemap", (sitemapURL==null)?"":sitemapURL.toString());
} catch (MalformedURLException e) {}
}

1 change: 1 addition & 0 deletions htroot/xml/util/getpageinfo_p.xml
@@ -2,4 +2,5 @@
<pageinfo>
<title>#[title]#</title>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<sitemap>#[sitemap]#</sitemap>
</pageinfo>
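
With the added <sitemap> element, a filled-in response of this template for a page whose robots.txt permits crawling and names a sitemap would look roughly like this (the values are made-up examples):

    <pageinfo>
      <title>Example page title</title>
      <robots>1</robots>
      <sitemap>http://example.net/sitemap.xml</sitemap>
    </pageinfo>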
