*) adding experimental support for parsing of bookmark files
theli committed Jul 7, 2005
1 parent f57b60c commit 5c3822d
Showing 2 changed files with 140 additions and 52 deletions.
37 changes: 26 additions & 11 deletions htroot/IndexCreate_p.html
@@ -102,13 +102,26 @@ <h2>Index Creation</h2>
</tr>
-->
<tr valign="top" class="TableCellLight">
<td class=small>Start Point:</td>
<td class=small colspan="2"><input name="crawlingURL" type="text" size="42" maxlength="256" value="http://"></td>
<td class=small><input type="submit" name="crawlingstart" value="Start New Crawl"></td>
<td class=small>Existing start URL's are re-crawled.
<td class="small" rowspan="3">Starting Point:</td>
<td class="small">
<table cellpadding="0" cellspacing="0">
<tr><td class="small">From&nbsp;File:</td>
<td class="small"><input type="radio" name="crawlingMode" value="file"></td>
<td class="small"><input type="file" name="crawlingFile" size="28"></td>
</tr>
<tr><td class="small">From&nbsp;URL:</td>
<td class="small"><input type="radio" name="crawlingMode" value="url" checked="checked"></td>
<td class="small"><input name="crawlingURL" type="text" size="41" maxlength="256" value="http://"></td>
</tr>
</table>
</td>
<td class="small" colspan="3" rowspan="2">Existing start URLs are re-crawled.
Other URLs that have already been visited are sorted out as 'double'.
A complete re-crawl will be available soon.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td class="small" colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl"></td>
</tr>
</form>
</table>
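Note: for the new From File input to reach the servlet, the enclosing <form> (outside this hunk) must be submitted as multipart/form-data; the Java changes below then read the uploaded file's name under "crawlingFile" and its raw bytes under "crawlingFile$file".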
@@ -130,19 +143,21 @@ <h2>Index Creation</h2>


<p>
#(error)#
::
#(error)#<!-- 0 -->
::<!-- 1 -->
Error with profile management. Please stop YaCy, delete the file DATA/PLASMADB/crawlProfiles0.db, and restart.
::
::<!-- 2 -->
Error: #[errmsg]#
::
::<!-- 3 -->
Application not yet initialized. Sorry. Please wait a few seconds and repeat the request.
::
::<!-- 4 -->
<b>ERROR: Crawl filter "#[newcrawlingfilter]#" does not match the crawl root "#[crawlingStart]#".</b> Please try again with a different filter.</p><br>
::
::<!-- 5 -->
Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#<br>
::
::<!-- 6 -->
Error with url input "#[crawlingStart]#": #[error]#
::<!-- 7 -->
Error with file input "#[crawlingStart]#": #[error]#
#(/error)#
<br>
#(info)#
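A note on the #(error)# block above: YaCy's servlet templates select one of the ::-separated alternatives by the integer stored under the block's key, and fill #[name]# placeholders from key_name properties; the <!-- 0 --> through <!-- 7 --> comments added in this commit document those indices. A minimal sketch of the servlet side, assuming that mechanism and using hypothetical values:

    import de.anomic.server.serverObjects;

    serverObjects prop = new serverObjects();
    prop.put("error", 7);                              // selects alternative 7: "Error with file input ..."
    prop.put("error_crawlingStart", "bookmarks.html"); // fills #[crawlingStart]# (hypothetical value)
    prop.put("error_error", "file could not be read"); // fills #[error]# (hypothetical value)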
155 changes: 114 additions & 41 deletions htroot/IndexCreate_p.java
@@ -43,20 +43,30 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT

import java.io.File;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;

@@ -104,49 +104,112 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
boolean xpstopw = ((String) post.get("xpstopw", "")).equals("on");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");

String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;

// check if url is proper
URL crawlingStartURL = null;
try {
crawlingStartURL = new URL(crawlingStart);
} catch (MalformedURLException e) {
crawlingStartURL = null;
}

// check if pattern matches
if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("error", 4); //crawlfilter does not match url
prop.put("error_newcrawlingfilter", newcrawlingfilter);
prop.put("error_crawlingStart", crawlingStart);
} else try {
// stack request
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(crawlingStart);
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.urlPool.noticeURL.remove(urlhash);
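// dispatch on the starting-point mode chosen in the form:
// "url" = crawl a single start URL, "file" = parse an uploaded bookmarks file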
String crawlingMode = post.get("crawlingMode","url");
if (crawlingMode.equals("url")) {
String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;

// stack url
String reasonString = switchboard.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0,
switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw));

if (reasonString == null) {
// liftoff!
prop.put("info", 2);//start msg
prop.put("info_crawlingURL", ((String) post.get("crawlingURL")));
} else {
prop.put("error", 5); //Crawling failed
prop.put("error_crawlingURL", ((String) post.get("crawlingURL")));
prop.put("error_reasonString", reasonString);
// check if url is proper
URL crawlingStartURL = null;
try {
crawlingStartURL = new URL(crawlingStart);
} catch (MalformedURLException e) {
crawlingStartURL = null;
}
} catch (Exception e) {
// something went wrong
prop.put("error", 6);//Error with url
prop.put("error_crawlingStart", crawlingStart);
prop.put("error_error", e.getMessage());
e.printStackTrace();

// check if pattern matches
if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("error", 4); //crawlfilter does not match url
prop.put("error_newcrawlingfilter", newcrawlingfilter);
prop.put("error_crawlingStart", crawlingStart);
} else try {
// stack request
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(crawlingStart);
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.urlPool.noticeURL.remove(urlhash);

// stack url
String reasonString = switchboard.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0,
switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw));

if (reasonString == null) {
// liftoff!
prop.put("info", 2);//start msg
prop.put("info_crawlingURL", ((String) post.get("crawlingURL")));
} else {
prop.put("error", 5); //Crawling failed
prop.put("error_crawlingURL", ((String) post.get("crawlingURL")));
prop.put("error_reasonString", reasonString);
}
} catch (Exception e) {
// something went wrong
prop.put("error", 6);//Error with url
prop.put("error_crawlingStart", crawlingStart);
prop.put("error_error", e.getMessage());
e.printStackTrace();
}

} else if (crawlingMode.equals("file")) {
if (post.containsKey("crawlingFile")) {
// getting the name of the uploaded file
String fileName = (String) post.get("crawlingFile");
try {
File file = new File(fileName);

// getting the content of the bookmark file
byte[] fileContent = (byte[]) post.get("crawlingFile$file");

// parsing the bookmark file and fetching the headline and contained links
htmlFilterContentScraper scraper = new htmlFilterContentScraper(file.toURL());
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.write(fileContent,os);
os.close();

String headline = scraper.getHeadline();
HashMap hyperlinks = (HashMap) scraper.getAnchors();

// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);

// loop through the contained links
Iterator iterator = hyperlinks.entrySet().iterator();
int c = 0;
while (iterator.hasNext()) {
Map.Entry e = (Map.Entry) iterator.next();
String nexturlstring = (String) e.getKey();

// generating a URL object
URL nexturlURL = null;
try {
nexturlURL = new URL(nexturlstring);
} catch (MalformedURLException ex) {
nexturlURL = null;
c++;
continue;
}

// enqueuing the url for crawling
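// (depth 1: these links were found inside the uploaded file, which acts as the crawl root)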
String rejectReason = switchboard.stackCrawl(nexturlstring, null, yacyCore.seedDB.mySeed.hash, (String)e.getValue(), new Date(), 1, profile);

// count the URL on success, otherwise record it in the errorURL list
if (rejectReason == null) {
c++;
} else {
switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}

} catch (Exception e) {
// something went wrong
prop.put("error", 7);//Error with file
prop.put("error_crawlingStart", fileName);
prop.put("error_error", e.getMessage());
e.printStackTrace();
}
}
}
}
}
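For context, the parsing idiom the new file mode relies on, reduced to a standalone sketch. This is an illustration only: it assumes the de.anomic htmlFilter classes behave as they are used in the diff above (the scraper collects the headline and all anchors while the document's bytes are written through the filtering stream), and the file name is hypothetical.

    import java.io.File;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.util.Map;

    import de.anomic.htmlFilter.htmlFilterContentScraper;
    import de.anomic.htmlFilter.htmlFilterOutputStream;

    public class BookmarkScrapeSketch {
        public static void main(String[] args) throws Exception {
            File bookmarkFile = new File("bookmarks.html"); // hypothetical input file

            // the scraper accumulates the headline and anchors as bytes pass through
            htmlFilterContentScraper scraper = new htmlFilterContentScraper(bookmarkFile.toURL());
            OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);

            // pump the file through the filtering stream
            InputStream in = new FileInputStream(bookmarkFile);
            byte[] buffer = new byte[4096];
            int n;
            while ((n = in.read(buffer)) != -1) os.write(buffer, 0, n);
            in.close();
            os.close();

            // URL string -> anchor text, as consumed by the crawl-stacking loop above
            Map links = scraper.getAnchors();
            System.out.println(scraper.getHeadline() + ": " + links.size() + " links");
        }
    }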
