*) adding experimental support for parsing of bookmark files
theli committed Jul 7, 2005
1 parent f57b60c commit 5c3822d
Showing 2 changed files with 140 additions and 52 deletions.
37 changes: 26 additions & 11 deletions htroot/IndexCreate_p.html
@@ -102,13 +102,26 @@ <h2>Index Creation</h2>
</tr>
-->
<tr valign="top" class="TableCellLight">
<td class=small>Start Point:</td>
<td class=small colspan="2"><input name="crawlingURL" type="text" size="42" maxlength="256" value="http://"></td>
<td class=small><input type="submit" name="crawlingstart" value="Start New Crawl"></td>
<td class=small>Existing start URL's are re-crawled.
<td class="small" rowspan="3">Starting Point:</td>
<td class="small">
<table cellpadding="0" cellspacing="0">
<tr><td class="small">From&nbsp;File:</td>
<td class="small"><input type="radio" name="crawlingMode" value="file"></td>
<td class="small"><input type="file" name="crawlingFile" size="28"></td>
</tr>
<tr><td class="small">From&nbsp;URL:</td>
<td class="small"><input type="radio" name="crawlingMode" value="url" checked="checked"></td>
<td class="small"><input name="crawlingURL" type="text" size="41" maxlength="256" value="http://"></td>
</tr>
</table>
</td>
<td class="small" colspan="3" rowspan="2">Existing start URLs are re-crawled.
Other URLs that have already been visited are sorted out as 'double'.
A complete re-crawl will be available soon.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td class="small" colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl"></td>
</tr>
</form>
</table>
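Note: for the new From File input to reach the servlet, the enclosing <form> (outside this hunk) must be submitted as multipart/form-data; the Java changes below then read the uploaded file's name under "crawlingFile" and its raw bytes under "crawlingFile$file".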
@@ -130,19 +143,21 @@ <h2>Index Creation</h2>


<p>
#(error)#
::
#(error)#<!-- 0 -->
::<!-- 1 -->
Error with profile management. Please stop YaCy, delete the file DATA/PLASMADB/crawlProfiles0.db, and restart.
::
::<!-- 2 -->
Error: #[errmsg]#
::
::<!-- 3 -->
Application not yet initialized. Sorry. Please wait a few seconds and repeat the request.
::
::<!-- 4 -->
<b>ERROR: Crawl filter "#[newcrawlingfilter]#" does not match the crawl root "#[crawlingStart]#".</b> Please try again with a different filter.</p><br>
::
::<!-- 5 -->
Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#<br>
::
::<!-- 6 -->
Error with url input "#[crawlingStart]#": #[error]#
::<!-- 7 -->
Error with file input "#[crawlingStart]#": #[error]#
#(/error)#
<br>
#(info)#
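A note on the #(error)# block above: YaCy's servlet templates select one of the ::-separated alternatives by the integer stored under the block's key, and fill #[name]# placeholders from key_name properties; the <!-- 0 --> through <!-- 7 --> comments added in this commit document those indices. A minimal sketch of the servlet side, assuming that mechanism and using hypothetical values:

    import de.anomic.server.serverObjects;

    serverObjects prop = new serverObjects();
    prop.put("error", 7);                              // selects alternative 7: "Error with file input ..."
    prop.put("error_crawlingStart", "bookmarks.html"); // fills #[crawlingStart]# (hypothetical value)
    prop.put("error_error", "file could not be read"); // fills #[error]# (hypothetical value)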
155 changes: 114 additions & 41 deletions htroot/IndexCreate_p.java
@@ -43,20 +43,30 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT

import java.io.File;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;

@@ -104,49 +104,112 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
boolean xpstopw = ((String) post.get("xpstopw", "")).equals("on");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");

String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;

// check if url is proper
URL crawlingStartURL = null;
try {
crawlingStartURL = new URL(crawlingStart);
} catch (MalformedURLException e) {
crawlingStartURL = null;
}

// check if pattern matches
if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("error", 4); //crawlfilter does not match url
prop.put("error_newcrawlingfilter", newcrawlingfilter);
prop.put("error_crawlingStart", crawlingStart);
} else try {
// stack request
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(crawlingStart);
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.urlPool.noticeURL.remove(urlhash);
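// dispatch on the starting-point mode chosen in the form:
// "url" = crawl a single start URL, "file" = parse an uploaded bookmarks file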
String crawlingMode = post.get("crawlingMode","url");
if (crawlingMode.equals("url")) {
String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;

// stack url
String reasonString = switchboard.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0,
switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw));

if (reasonString == null) {
// liftoff!
prop.put("info", 2);//start msg
prop.put("info_crawlingURL", ((String) post.get("crawlingURL")));
} else {
prop.put("error", 5); //Crawling failed
prop.put("error_crawlingURL", ((String) post.get("crawlingURL")));
prop.put("error_reasonString", reasonString);
// check if url is proper
URL crawlingStartURL = null;
try {
crawlingStartURL = new URL(crawlingStart);
} catch (MalformedURLException e) {
crawlingStartURL = null;
}
} catch (Exception e) {
// something went wrong
prop.put("error", 6);//Error with url
prop.put("error_crawlingStart", crawlingStart);
prop.put("error_error", e.getMessage());
e.printStackTrace();

// check if pattern matches
if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("error", 4); //crawlfilter does not match url
prop.put("error_newcrawlingfilter", newcrawlingfilter);
prop.put("error_crawlingStart", crawlingStart);
} else try {
// stack request
// first delete old entry, if exists
String urlhash = plasmaURL.urlHash(crawlingStart);
switchboard.urlPool.loadedURL.remove(urlhash);
switchboard.urlPool.noticeURL.remove(urlhash);

// stack url
String reasonString = switchboard.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0,
switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw));

if (reasonString == null) {
// liftoff!
prop.put("info", 2);//start msg
prop.put("info_crawlingURL", ((String) post.get("crawlingURL")));
} else {
prop.put("error", 5); //Crawling failed
prop.put("error_crawlingURL", ((String) post.get("crawlingURL")));
prop.put("error_reasonString", reasonString);
}
} catch (Exception e) {
// something went wrong
prop.put("error", 6);//Error with url
prop.put("error_crawlingStart", crawlingStart);
prop.put("error_error", e.getMessage());
e.printStackTrace();
}

} else if (crawlingMode.equals("file")) {
if (post.containsKey("crawlingFile")) {
// getting the name of the uploaded file
String fileName = (String) post.get("crawlingFile");
try {
File file = new File(fileName);

// getting the content of the bookmark file
byte[] fileContent = (byte[]) post.get("crawlingFile$file");

// parsing the bookmark file and fetching the headline and contained links
htmlFilterContentScraper scraper = new htmlFilterContentScraper(file.toURL());
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.write(fileContent,os);
os.close();

String headline = scraper.getHeadline();
HashMap hyperlinks = (HashMap) scraper.getAnchors();

// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);

// loop through the contained links
Iterator iterator = hyperlinks.entrySet().iterator();
int c = 0;
while (iterator.hasNext()) {
Map.Entry e = (Map.Entry) iterator.next();
String nexturlstring = (String) e.getKey();

// generating a URL object
URL nexturlURL = null;
try {
nexturlURL = new URL(nexturlstring);
} catch (MalformedURLException ex) {
nexturlURL = null;
c++;
continue;
}

// enqueuing the url for crawling
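// (depth 1: these links were found inside the uploaded file, which acts as the crawl root)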
String rejectReason = switchboard.stackCrawl(nexturlstring, null, yacyCore.seedDB.mySeed.hash, (String)e.getValue(), new Date(), 1, profile);

// count the URL on success, otherwise record it in the errorURL list
if (rejectReason == null) {
c++;
} else {
switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}

} catch (Exception e) {
// something went wrong
prop.put("error", 7);//Error with file
prop.put("error_crawlingStart", fileName);
prop.put("error_error", e.getMessage());
e.printStackTrace();
}
}
}
}
}
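For context, the parsing idiom the new file mode relies on, reduced to a standalone sketch. This is an illustration only: it assumes the de.anomic htmlFilter classes behave as they are used in the diff above (the scraper collects the headline and all anchors while the document's bytes are written through the filtering stream), and the file name is hypothetical.

    import java.io.File;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.util.Map;

    import de.anomic.htmlFilter.htmlFilterContentScraper;
    import de.anomic.htmlFilter.htmlFilterOutputStream;

    public class BookmarkScrapeSketch {
        public static void main(String[] args) throws Exception {
            File bookmarkFile = new File("bookmarks.html"); // hypothetical input file

            // the scraper accumulates the headline and anchors as bytes pass through
            htmlFilterContentScraper scraper = new htmlFilterContentScraper(bookmarkFile.toURL());
            OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);

            // pump the file through the filtering stream
            InputStream in = new FileInputStream(bookmarkFile);
            byte[] buffer = new byte[4096];
            int n;
            while ((n = in.read(buffer)) != -1) os.write(buffer, 0, n);
            in.close();
            os.close();

            // URL string -> anchor text, as consumed by the crawl-stacking loop above
            Map links = scraper.getAnchors();
            System.out.println(scraper.getHeadline() + ": " + links.size() + " links");
        }
    }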
