Skip to content

Commit

Permalink
Signed-off-by: lixia <xautlx@hotmail.com>
Browse files Browse the repository at this point in the history
  • Loading branch information
xautlx committed Aug 7, 2014
1 parent c4b64ba commit 4e4fd76
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 29 deletions.
20 changes: 0 additions & 20 deletions conf/nutch-site.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,6 @@
</description>
</property>

<property>
<name>http.redirect.max</name>
<value>1</value>
<description>The maximum number of redirects the fetcher will follow when
trying to fetch a page. If set to a negative value or 0, the fetcher won't
immediately follow redirected URLs; instead, it will record them for later
fetching.
</description>
</property>

<!-- -->
<property>
<name>plugin.folders</name>
Expand Down Expand Up @@ -136,15 +125,6 @@
</description>
</property>

<property>
<name>indexer.max.content.length</name>
<value>0</value>
<description>The maximum number of characters of a document's content that are indexed.
Content beyond the limit is truncated. A value of -1 disables this check.
</description>
</property>

<property>
<name>fetcher.server.delay</name>
<value>2.0</value>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ public String setupFilterRegex() {
@Override
protected boolean isParseDataFetchLoaded(HtmlPage page) {
HtmlDivision div = page.getFirstByXPath("//DIV[@id='description']/DIV[@class='content ke-post']");
System.out.println("--------------------------------------" + div);
if (div != null && div.getChildElementCount() > 0) {
if (LOG.isInfoEnabled()) {
LOG.info("Product description content HTML: {}", asString(div));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.lang.reflect.Method;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
import java.util.List;

import lab.s2jh.crawl.parse.AbstractHtmlParseFilter;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.reflect.MethodUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
Expand Down Expand Up @@ -262,14 +262,21 @@ private void readPlainContent(URL url) throws IOException {
//调用解析过滤器中定义的Javascript执行加载完成判断
//任何一个判断返回false标识暂未加载所需数据,线程继续等待直到最大等待时间
if (htmlParseFilters != null) {
for (HtmlParseFilter htmlParseFilter : htmlParseFilters) {
if (htmlParseFilter instanceof AbstractHtmlParseFilter) {
AbstractHtmlParseFilter filter = (AbstractHtmlParseFilter) htmlParseFilter;
if (filter.isParseDataFetchLoaded(urlStr, page) == false) {
ok = false;
break;
try {
for (HtmlParseFilter htmlParseFilter : htmlParseFilters) {
//基于反射调用,目前发现直接基于类型转换会导致异常
Method isParseDataFetchLoaded = MethodUtils.getAccessibleMethod(htmlParseFilter.getClass(),
"isParseDataFetchLoaded", String.class, page.getClass());
if (isParseDataFetchLoaded != null) {
Boolean ret = (Boolean) isParseDataFetchLoaded.invoke(htmlParseFilter, urlStr, page);
if (ret == false) {
ok = false;
break;
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
if (ok == true) {
Expand Down

0 comments on commit 4e4fd76

Please sign in to comment.