Skip to content

Commit

Permalink
增加htmlunit对https的支持
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaowu.lxw committed May 5, 2015
1 parent 51aa838 commit b18cf01
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 120 deletions.
64 changes: 2 additions & 62 deletions .classpath
Original file line number Diff line number Diff line change
@@ -1,74 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry exported="true" kind="lib" path="conf"/>
<classpathentry kind="src" path="src/plugin/parse-html/src/test"/>
<classpathentry kind="src" path="src/plugin/index-s2jh/src/test"/>
<classpathentry kind="src" path="src/plugin/index-s2jh/src/java"/>
<classpathentry kind="src" path="src/plugin/protocol-htmlunit/src/java"/>
<classpathentry kind="src" path="src/plugin/lib-htmlunit/src/java"/>
<classpathentry kind="src" path="src/plugin/indexer-solr/src/java"/>
<classpathentry kind="src" path="src/plugin/parse-s2jh/src/test"/>
<classpathentry kind="src" path="src/plugin/parse-s2jh/src/java"/>
<classpathentry kind="src" path="src/plugin/index-basic/src/test"/>
<classpathentry including="**/*.java" kind="src" path="src/testresources"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-basic/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-prefix/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-pass/src/test"/>
<classpathentry kind="src" output="target/test-classes" path="src/test">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" path="src/plugin/protocol-httpclient/src/test"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-basic/src/test"/>
<classpathentry kind="src" output="target/classes" path="src/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/test"/>
<classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/java"/>
<classpathentry kind="src" path="src/plugin/subcollection/src/test"/>
<classpathentry kind="src" path="src/plugin/language-identifier/src/test"/>
<classpathentry kind="src" path="src/plugin/creativecommons/src/test"/>
<classpathentry kind="src" path="src/plugin/protocol-ftp/src/java"/>
<classpathentry kind="src" path="src/plugin/index-basic/src/java"/>
<classpathentry kind="src" path="src/plugin/index-more/src/test"/>
<classpathentry kind="src" path="src/bin"/>
<classpathentry kind="src" path="src/plugin/index-more/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-domain/src/test"/>
<classpathentry kind="src" path="src/plugin/subcollection/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-pass/src/java"/>
<classpathentry kind="src" path="src/plugin/index-anchor/src/test"/>
<classpathentry kind="src" path="src/plugin/protocol-file/src/test"/>
<classpathentry kind="src" path="src/plugin/urlfilter-validator/src/java"/>
<classpathentry kind="src" path="src/plugin/index-anchor/src/java"/>
<classpathentry kind="src" path="src/plugin/parse-tika/src/test"/>
<classpathentry kind="src" path="src/plugin/microformats-reltag/src/java"/>
<classpathentry kind="src" path="src/plugin/protocol-http/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-regex/src/test"/>
<classpathentry kind="src" path="src/plugin/lib-http/src/test"/>
<classpathentry kind="src" path="src/plugin/protocol-httpclient/src/java"/>
<classpathentry kind="src" path="src/plugin/language-identifier/src/java"/>
<classpathentry kind="src" path="src/plugin/parse-html/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/test"/>
<classpathentry kind="src" path="src/plugin/urlfilter-regex/src/test"/>
<classpathentry kind="src" path="src/plugin/scoring-opic/src/java"/>
<classpathentry kind="src" path="src/plugin/parse-tika/src/java"/>
<classpathentry kind="src" path="src/plugin/scoring-link/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-regex/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-regex/src/java"/>
<classpathentry kind="src" path="src/plugin/lib-regex-filter/src/test"/>
<classpathentry kind="src" path="src/plugin/lib-regex-filter/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-domain/src/java"/>
<classpathentry kind="src" path="src/plugin/tld/src/java"/>
<classpathentry kind="src" path="src/plugin/parse-js/src/java"/>
<classpathentry kind="src" path="src/plugin/protocol-file/src/java"/>
<classpathentry kind="src" path="src/plugin/lib-http/src/java"/>
<classpathentry kind="src" path="src/plugin/creativecommons/src/java"/>
<classpathentry kind="src" path="src/plugin/lib-htmlunit/src/java"/>
<classpathentry kind="src" path="src/plugin/protocol-htmlunit/src/java"/>
<classpathentry kind="con" path="org.apache.ivyde.eclipse.cpcontainer.IVYDE_CONTAINER/?project=apache-nutch-2.1&amp;ivyXmlPath=ivy%2Fivy.xml&amp;confs=*"/>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
Expand Down
2 changes: 1 addition & 1 deletion .project
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>nutch-htmlunit</name>
<name>nutch-htmlunit-git</name>
<comment></comment>
<projects>
</projects>
Expand Down
57 changes: 1 addition & 56 deletions .settings/org.eclipse.wst.common.component
Original file line number Diff line number Diff line change
@@ -1,62 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
<wb-module deploy-name="apache-nutch-1.7">
<wb-resource deploy-path="/" source-path="/src/plugin/parse-html/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/index-basic/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlnormalizer-basic/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-prefix/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlnormalizer-pass/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/protocol-httpclient/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlnormalizer-basic/src/test"/>
<wb-resource deploy-path="/" source-path="/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-automaton/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-suffix/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/subcollection/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/language-identifier/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/creativecommons/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/protocol-ftp/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/index-basic/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/index-more/src/test"/>
<wb-resource deploy-path="/" source-path="/src/bin"/>
<wb-resource deploy-path="/" source-path="/src/plugin/index-more/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-domain/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/subcollection/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlnormalizer-pass/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/index-anchor/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/protocol-file/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-validator/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/index-anchor/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/parse-tika/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/microformats-reltag/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/protocol-http/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlnormalizer-regex/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/lib-http/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/protocol-httpclient/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/language-identifier/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/parse-html/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-automaton/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-suffix/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-regex/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/scoring-opic/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/parse-tika/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/scoring-link/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-regex/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlnormalizer-regex/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/lib-regex-filter/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/lib-regex-filter/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/urlfilter-domain/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/tld/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/parse-js/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/protocol-file/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/lib-http/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/creativecommons/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/parse-s2jh/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/parse-s2jh/src/java"/>
<wb-resource deploy-path="/" source-path="/src/test"/>
<wb-resource deploy-path="/" source-path="/src/testresources"/>
<wb-resource deploy-path="/" source-path="/src/plugin/indexer-solr/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/protocol-htmlunit/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/lib-htmlunit/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/index-s2jh/src/test"/>
<wb-resource deploy-path="/" source-path="/src/plugin/index-s2jh/src/java"/>
<wb-resource deploy-path="/" source-path="/src/plugin/protocol-htmlunit/src/java"/>
</wb-module>
</project-modules>
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public static HtmlPage getHtmlPage(String url, Configuration conf) {
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setAppletEnabled(false);
webClient.getOptions().setThrowExceptionOnScriptError(false);
// SSL support. NOTE(review): as the option name says, this switches the client to "insecure" SSL,
// presumably so https pages with self-signed or otherwise invalid certificates can still be fetched.
// Confirm against the HtmlUnit WebClientOptions docs that relaxed certificate checks are acceptable here.
webClient.getOptions().setUseInsecureSSL(true);
// AJAX support
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
// Use extension version htmlunit cache process
Expand Down
10 changes: 10 additions & 0 deletions src/plugin/protocol-htmlunit/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,15 @@
</implementation>

</extension>

<!-- Registers org.apache.nutch.protocol.htmlunit.Http at the Protocol extension point
     with protocolName "https". -->
<extension id="org.apache.nutch.protocol.https"
           name="HttpsProtocol"
           point="org.apache.nutch.protocol.Protocol">
  <implementation id="org.apache.nutch.protocol.htmlunit.Http"
                  class="org.apache.nutch.protocol.htmlunit.Http">
    <parameter name="protocolName" value="https"/>
  </implementation>
</extension>

</plugin>
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum) throws ProtocolExc
this.orig = url.toString();
this.base = url.toString();

if (!"http".equals(url.getProtocol()))
// Only http and https URLs are handled here; any other scheme is rejected.
// The two checks must be joined with && (neither scheme matches): the previous
// "|| !!" form evaluated to true for every https URL and threw for it.
if (!"http".equals(url.getProtocol()) && !"https".equals(url.getProtocol()))
    throw new HttpException("Not an HTTP url:" + url);

if (Http.LOG.isTraceEnabled()) {
Expand Down

0 comments on commit b18cf01

Please sign in to comment.