Signed-off-by: lixia <xautlx@hotmail.com>

xautlx · Aug 7, 2014 · 4e4fd76 · 4e4fd76
1 parent c4b64ba
commit 4e4fd76
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 29 deletions.
diff --git a/conf/nutch-site.xml b/conf/nutch-site.xml
@@ -22,17 +22,6 @@
         </description>
     </property>
 
-    <property>
-        <name>http.redirect.max</name>
-        <value>1</value>
-        <description>The maximum number of redirects the fetcher will follow when
-            trying to fetch a page. If set to
-            negative or 0, fetcher won't immediately
-            follow redirected URLs, instead it will record them for later
-            fetching.
-        </description>
-    </property>
-
     <!-- -->
     <property>
         <name>plugin.folders</name>
@@ -136,15 +125,6 @@
         </description>
     </property>
 
-    <property>
-        <name>indexer.max.content.length</name>
-        <value>0</value>
-        <description>The maximum number of characters of a content that are indexed.
-            Content beyond the limit is
-            truncated. A value of -1 disables this check.
-        </description>
-    </property>
-
     <property>
         <name>fetcher.server.delay</name>
         <value>2.0</value>

diff --git a/src/plugin/parse-s2jh/src/java/lab/s2jh/crawl/parse/S2jhHtmlParseFilter.java b/src/plugin/parse-s2jh/src/java/lab/s2jh/crawl/parse/S2jhHtmlParseFilter.java
@@ -85,7 +85,6 @@ public String setupFilterRegex() {
     @Override
     protected boolean isParseDataFetchLoaded(HtmlPage page) {
         HtmlDivision div = page.getFirstByXPath("//DIV[@id='description']/DIV[@class='content ke-post']");
-        System.out.println("--------------------------------------" + div);
         if (div != null && div.getChildElementCount() > 0) {
             if (LOG.isInfoEnabled()) {
                 LOG.info("Product description content HTML: {}", asString(div));

diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -24,14 +24,14 @@
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.PushbackInputStream;
+import java.lang.reflect.Method;
 import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.net.URL;
 import java.util.List;
 
-import lab.s2jh.crawl.parse.AbstractHtmlParseFilter;
-
 import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.reflect.MethodUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
@@ -262,14 +262,21 @@ private void readPlainContent(URL url) throws IOException {
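                 //Call the JavaScript-execution load-completion checks defined in the parse filters.
                 //If any check returns false, the required data has not loaded yet; the thread keeps waiting until the maximum wait time.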
                 if (htmlParseFilters != null) {
-                    for (HtmlParseFilter htmlParseFilter : htmlParseFilters) {
-                        if (htmlParseFilter instanceof AbstractHtmlParseFilter) {
-                            AbstractHtmlParseFilter filter = (AbstractHtmlParseFilter) htmlParseFilter;
-                            if (filter.isParseDataFetchLoaded(urlStr, page) == false) {
-                                ok = false;
-                                break;
+                    try {
+                        for (HtmlParseFilter htmlParseFilter : htmlParseFilters) {
+                            //Invoke via reflection; so far a direct type cast has been found to cause an exception.
+                            Method isParseDataFetchLoaded = MethodUtils.getAccessibleMethod(htmlParseFilter.getClass(),
+                                    "isParseDataFetchLoaded", String.class, page.getClass());
+                            if (isParseDataFetchLoaded != null) {
+                                Boolean ret = (Boolean) isParseDataFetchLoaded.invoke(htmlParseFilter, urlStr, page);
+                                if (ret == false) {
+                                    ok = false;
+                                    break;
+                                }
                             }
                         }
+                    } catch (Exception e) {
+                        e.printStackTrace();
                     }
                 }
                 if (ok == true) {