Skip to content

Commit

Permalink
🍻 优化获取文章图片的逻辑,解决因lazyload等原因造成的图片获取失败和转换失败的问题。
Browse files Browse the repository at this point in the history
        优化停止爬虫的逻辑
        处理部分图片路径包含参数的问题,如temp.svg?theme=white,过滤掉参数
        删除无用的文件
  • Loading branch information
zhangyd-c committed Mar 11, 2019
1 parent 2ebcbc0 commit 9321e5d
Show file tree
Hide file tree
Showing 16 changed files with 199 additions and 105 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ CopyOnWriteArrayList<VirtualArticle> list = hunter.execute();
16:58:46,565 INFO HunterPrintWriter:38 - [ hunter ] <a href="https://www.imooc.com/article/276553" target="_blank">大神云集——Redis命令实现源码分析</a> -- 慕课网官方_运营中心 -- 2019-01-30 15:21:00
```

**注意**

部分网站没有配置`Keywords`,所以在运行单元测试时如果碰到`Keywords`内容为空,可以忽略。如果是`title`、`content`等内容为空,请检查配置文件中的`xpath`匹配规则是否正确。

更多使用方式请参考文档...

## 配置信息
Expand Down Expand Up @@ -203,7 +207,6 @@ CopyOnWriteArrayList<VirtualArticle> list = hunter.execute();
| proxyList | 代理的列表 | list | - | × | 保留字段,暂时无用 |
| proxyType | 代理的类型 | enum | - | × | 保留字段,暂时无用 |


## 交流

| 微信(备注:`hunter加群`) | 欢迎关注公众号 |
Expand Down
8 changes: 2 additions & 6 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
<jsoup.version>1.10.2</jsoup.version>
<hibernate.validator.version>6.0.9.Final</hibernate.validator.version>
<tomcat.version>8.5.24</tomcat.version>
<log4j.version>1.2.17</log4j.version>
</properties>

<dependencies>
Expand Down Expand Up @@ -82,11 +83,6 @@
<artifactId>webmagic-extension</artifactId>
<version>${webmagic.version}</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-selenium</artifactId>
<version>${webmagic.version}</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
Expand All @@ -112,7 +108,7 @@
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<version>${log4j.version}</version>
</dependency>
</dependencies>

Expand Down
45 changes: 33 additions & 12 deletions src/main/java/me/zhyd/hunter/Hunter.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package me.zhyd.hunter;

import me.zhyd.hunter.enums.ExitWayEnum;
import me.zhyd.hunter.config.HunterConfig;
import me.zhyd.hunter.enums.ExitWayEnum;
import me.zhyd.hunter.exception.HunterException;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
Expand All @@ -25,18 +27,29 @@ public class Hunter extends Spider {
/**
* 唯一的key,一般为用户ID,需要调用方生成
*/
private Object uuid;
private String hunterId;
private volatile long startTime = 0L;

private Hunter(PageProcessor pageProcessor, HunterConfig config, String uuid) {
private Hunter(PageProcessor pageProcessor, HunterConfig config, String hunterId) {
super(pageProcessor);
this.config = config;
this.uuid = uuid;
SPIDER_BUCKET.put(uuid, this);
this.hunterId = hunterId;
SPIDER_BUCKET.put(hunterId, this);
}

/**
 * Static factory for a {@link Hunter}.
 *
 * <p>Delegates to the private constructor, which stores the config and registers the
 * new instance in {@code SPIDER_BUCKET} under {@code hunterId}.
 *
 * @param pageProcessor processor handed to the underlying spider
 * @param config        crawler configuration kept by the new instance
 * @param hunterId      key under which the new instance is registered
 * @return the newly created hunter
 */
public static Hunter create(PageProcessor pageProcessor, HunterConfig config, String hunterId) {
    final Hunter created = new Hunter(pageProcessor, config, hunterId);
    return created;
}

public static Hunter create(PageProcessor pageProcessor, HunterConfig model, String uuid) {
return new Hunter(pageProcessor, model, uuid);
/**
 * Looks up a previously registered hunter by its id.
 *
 * @param hunterId key the hunter was registered under; must not be null or empty
 * @return the hunter stored in {@code SPIDER_BUCKET} for {@code hunterId}
 * @throws HunterException if {@code hunterId} is null/empty, or no hunter is
 *                         registered under it
 */
public static Hunter getHunter(String hunterId) {
    // Reject a missing id up front so the lookup below always has a usable key.
    if (StringUtils.isEmpty(hunterId)) {
        final String reason = "HunterId:[" + hunterId + "]为空,请指定HunterId";
        throw new HunterException(reason);
    }

    final Hunter registered = SPIDER_BUCKET.get(hunterId);
    if (registered != null) {
        return registered;
    }
    throw new HunterException("当前没有正在运行的爬虫!HunterId:[" + hunterId + "]");
}

@Override
Expand All @@ -51,7 +64,9 @@ protected void onSuccess(Request request) {

@Override
public void run() {
startTime = System.currentTimeMillis() + config.getCount() * 1000;
if (ExitWayEnum.DURATION.toString().equals(config.getExitWay())) {
startTime = System.currentTimeMillis() + config.getCount() * 1000;
}
super.run();
}

Expand All @@ -63,13 +78,19 @@ protected void onError(Request request) {
@Override
public void close() {
super.close();
SPIDER_BUCKET.remove(this.uuid);
SPIDER_BUCKET.remove(this.hunterId);
}

@Override
public void stop() {
super.stop();
// this.close();
SPIDER_BUCKET.remove(this.uuid);
Spider.Status status = this.getStatus();
if (status.equals(Spider.Status.Running)) {
super.stop();
SPIDER_BUCKET.remove(this.hunterId);
} else if (status.equals(Spider.Status.Init)) {
throw new HunterException("爬虫正在初始化!HunterId:[" + this.hunterId + "]");
} else {
throw new HunterException("当前没有正在运行的爬虫!HunterId:[" + this.hunterId + "]");
}
}
}
11 changes: 6 additions & 5 deletions src/main/java/me/zhyd/hunter/config/HunterConfigTemplate.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import cn.hutool.core.io.IoUtil;
import com.alibaba.fastjson.JSONObject;
import me.zhyd.hunter.consts.HunterConsts;
import me.zhyd.hunter.exception.HunterException;
import org.apache.commons.lang3.StringUtils;

Expand All @@ -26,20 +27,20 @@ public static String getConfig(String platform) {
if (configTemplate.containsKey(platform)) {
return configTemplate.getString(platform);
}
throw new HunterException("[hunter] 暂不支持该平台[" + platform + "]");
throw new HunterException("暂不支持该平台[" + platform + "]");
}

private void init() {
String configFileName = "/HunterConfig.json";
String configFileName = HunterConsts.CONFIG_FILE_NAME;
String config = null;
try {
InputStream inputStream = this.getClass().getResourceAsStream(configFileName);
if (null == inputStream) {
throw new HunterException("[hunter] 请检查`src/main/resources`下是否存在" + configFileName);
throw new HunterException("请检查`src/main/resources`下是否存在" + configFileName);
}
config = IoUtil.read(inputStream, Charset.forName("UTF-8"));
if (StringUtils.isEmpty(config)) {
throw new HunterException("[hunter] HunterConfig内容为空:" + configFileName);
throw new HunterException("HunterConfig内容为空:" + configFileName);
}
} catch (Exception e) {
e.printStackTrace();
Expand All @@ -48,7 +49,7 @@ private void init() {
try {
configTemplate = JSONObject.parseObject(config);
} catch (Exception e) {
throw new HunterException("[hunter] HunterConfig配置文件格式错误");
throw new HunterException("HunterConfig配置文件格式错误");
}

}
Expand Down
21 changes: 0 additions & 21 deletions src/main/java/me/zhyd/hunter/config/platform/InfoqPlatform.java

This file was deleted.

5 changes: 2 additions & 3 deletions src/main/java/me/zhyd/hunter/config/platform/Platform.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
* @since 1.8
*/
public enum Platform {
CNBLOGS("cnblogs", "cnblogs.com", CnblogsPlatform.class),
CSDN("csdn", "csdn.net", CsdnPlatform.class),
IMOOC("imooc", "imooc.com", ImoocPlatform.class),
ITEYE("iteye", "iteye.com", IteyePlatform.class),
IMOOC("imooc", "imooc.com", ImoocPlatform.class),
CNBLOGS("cnblogs", "cnblogs.com", CnblogsPlatform.class),
JUEJIN("juejin", "juejin.im", JuejinPlatform.class),
V2EX("v2ex", "v2ex.com", V2exPlatform.class),
INFOQ("infoq", "infoq.cn", InfoqPlatform.class),
;

private String platform;
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/me/zhyd/hunter/consts/HunterConsts.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package me.zhyd.hunter.consts;

/**
 * Shared constants for the hunter library.
 *
 * <p>This class only holds static constants, so it is {@code final} and has a private
 * constructor: it can be neither instantiated nor subclassed.
 *
 * @author yadong.zhang (yadong.zhang0415(a)gmail.com)
 * @version 1.0
 * @since 1.8
 */
public final class HunterConsts {

    /** Prefix put in front of hunter messages (for example by {@code HunterException}). */
    public static final String LOG_PREFIX = "[ hunter ] ";

    /** Resource name of the configuration template, as passed to {@code Class#getResourceAsStream}. */
    public static final String CONFIG_FILE_NAME = "/HunterConfig.json";

    private HunterConsts() {
        // Constants holder; never instantiated.
    }
}
12 changes: 1 addition & 11 deletions src/main/java/me/zhyd/hunter/entity/ImageLink.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package me.zhyd.hunter.entity;

import lombok.Builder;
import lombok.Data;
import lombok.Getter;

/**
* @author yadong.zhang (yadong.zhang0415(a)gmail.com)
Expand All @@ -12,17 +10,9 @@
@Data
public class ImageLink {

/**
* 正常img标签的src连接
*/
private String srcLink;
/**
* 当网站采用了懒加载时,originalLink表示真正的连接
*/
private String originalLink;

public ImageLink(String srcLink, String originalLink) {
public ImageLink(String srcLink) {
this.srcLink = srcLink;
this.originalLink = originalLink;
}
}
6 changes: 4 additions & 2 deletions src/main/java/me/zhyd/hunter/exception/HunterException.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package me.zhyd.hunter.exception;

import me.zhyd.hunter.consts.HunterConsts;

/**
* @author yadong.zhang (yadong.zhang0415(a)gmail.com)
* @version 1.0
Expand All @@ -8,10 +10,10 @@
public class HunterException extends RuntimeException {

public HunterException(String message) {
super(message);
super(HunterConsts.LOG_PREFIX + message);
}

public HunterException(String message, Throwable cause) {
super(message, cause);
super(HunterConsts.LOG_PREFIX + message, cause);
}
}
10 changes: 5 additions & 5 deletions src/main/java/me/zhyd/hunter/processor/BlogHunterProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@
*/
public class BlogHunterProcessor extends HunterProcessor {

private BlogHunterProcessor() {
super();
}

public BlogHunterProcessor(String url, boolean convertImage) {
super(url, convertImage);
}

public BlogHunterProcessor(String url, boolean convertImage, HunterPrintWriter writer) {
super(url, convertImage, writer);
}

public BlogHunterProcessor(HunterConfig config) {
super(config);
}
Expand All @@ -55,7 +55,7 @@ public BlogHunterProcessor(HunterConfig config, HunterPrintWriter writer, String
public CopyOnWriteArrayList<VirtualArticle> execute() {
List<String> errors = this.validateModel(config);
if (CollectionUtils.isNotEmpty(errors)) {
writer.print("[hunter] 校验不通过!请依据下方提示,检查输入参数是否正确......");
writer.print("校验不通过!请依据下方提示,检查输入参数是否正确......");
for (String error : errors) {
writer.print(">> " + error);
}
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/me/zhyd/hunter/processor/HunterProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ public abstract class HunterProcessor implements PageProcessor {
this(HunterConfigContext.getHunterConfig(url).setConvertImg(convertImage));
}

/**
 * Builds a processor from a URL, optionally overriding the print writer.
 *
 * <p>The configuration is obtained from {@code HunterConfigContext.getHunterConfig(url)}
 * with its convert-image flag set to {@code convertImage}, then passed to the
 * config-based constructor.
 *
 * @param url          URL used to look up the hunter configuration
 * @param convertImage value applied to the configuration via {@code setConvertImg}
 * @param writer       writer to use; when {@code null} the {@code writer} field is left
 *                     as the delegated constructor set it
 */
HunterProcessor(String url, boolean convertImage, HunterPrintWriter writer) {
    this(HunterConfigContext.getHunterConfig(url).setConvertImg(convertImage));
    if (null == writer) {
        // Nothing to override; keep the existing writer.
        return;
    }
    this.writer = writer;
}

/**
* 程序入口方法
*
Expand Down Expand Up @@ -150,13 +157,14 @@ final void process(ResultItems resultItems, List<VirtualArticle> virtualArticles
virtualArticle.setDescription(CommonUtil.getRealDescription(virtualArticle.getDescription(), virtualArticle.getContent()))
.setKeywords(CommonUtil.getRealKeywords(virtualArticle.getKeywords()));
if (this.config.isConvertImg()) {
virtualArticle.setContent(CommonUtil.formatHtml(virtualArticle.getContent()));
virtualArticle.setImageLinks(CommonUtil.getAllImageLink(virtualArticle.getContent()));
}
if (CollectionUtils.isEmpty(virtualArticle.getTags())) {
virtualArticle.setTags(Collections.singletonList("其他"));
}
virtualArticles.add(virtualArticle);
writer.print(String.format("[ hunter ] <a href=\"%s\" target=\"_blank\">%s</a> -- %s -- %s", virtualArticle.getSource(), title, virtualArticle.getAuthor(), virtualArticle.getReleaseDate()));
writer.print(String.format("<a href=\"%s\" target=\"_blank\">%s</a> -- %s -- %s", virtualArticle.getSource(), title, virtualArticle.getAuthor(), virtualArticle.getReleaseDate()));
}

public HunterConfig getConfig() {
Expand Down
Loading

0 comments on commit 9321e5d

Please sign in to comment.