Skip to content

Commit

Permalink
DynamicGecco稳定版,支持动态改变抓取规则
Browse files Browse the repository at this point in the history
  • Loading branch information
xtuhcy committed Jul 14, 2016
1 parent 9fb7a51 commit 6637501
Show file tree
Hide file tree
Showing 20 changed files with 482 additions and 159 deletions.
6 changes: 0 additions & 6 deletions doc/UnirestDownloader.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,4 @@ public void shutdown() {
e.printStackTrace();
}
}

/**
 * Ad-hoc manual check: downloads one fixed URL with a fresh UnirestDownloader
 * and prints the content of the response to standard output.
 */
public static void main(String[] args) throws Exception {
    final UnirestDownloader downloader = new UnirestDownloader();
    final HttpGetRequest request = new HttpGetRequest("http://temai.tuniu.com/tours/212032167");
    final HttpResponse response = downloader.download(request);
    System.out.println(response.getContent());
}
}
94 changes: 91 additions & 3 deletions src/main/java/com/geccocrawler/gecco/GeccoEngine.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import org.apache.log4j.Logger;

import com.alibaba.fastjson.JSON;
import com.geccocrawler.gecco.dynamic.DynamicGecco;
import com.geccocrawler.gecco.dynamic.GeccoClassLoader;
import com.geccocrawler.gecco.monitor.GeccoJmx;
import com.geccocrawler.gecco.monitor.GeccoMonitor;
import com.geccocrawler.gecco.pipeline.PipelineFactory;
Expand Down Expand Up @@ -74,23 +76,50 @@ private GeccoEngine() {
this.retry = 3;
}

/**
 * Creates a new engine and names it "GeccoEngine".
 * Per the original note, engines that use dynamically configured rules
 * must not be constructed through this method.
 *
 * @return the newly created engine
 */
public static GeccoEngine create() {
    final GeccoEngine engine = new GeccoEngine();
    engine.setName("GeccoEngine");
    return engine;
}

/**
 * Creates an engine for the given classpath without supplying a pipeline factory.
 * Delegates to the two-argument overload, passing {@code null} as the factory.
 *
 * @param classpath forwarded unchanged to the two-argument overload
 * @return the engine produced by the two-argument overload
 */
public static GeccoEngine create(String classpath) {
    final PipelineFactory noPipelineFactory = null;
    return create(classpath, noPipelineFactory);
}

/**
 * Creates an engine whose spider bean factory is built from the given
 * classpath and pipeline factory.
 *
 * @param classpath       value handed to the SpiderBeanFactory constructor; must not be empty
 * @param pipelineFactory value handed to the SpiderBeanFactory constructor
 * @return the configured engine
 * @throws IllegalArgumentException if {@code StringUtils.isEmpty(classpath)} is true
 */
public static GeccoEngine create(String classpath, PipelineFactory pipelineFactory) {
    final boolean classpathMissing = StringUtils.isEmpty(classpath);
    if (classpathMissing) {
        // The classpath is mandatory.
        throw new IllegalArgumentException("classpath cannot be empty");
    }
    final GeccoEngine engine = create();
    final SpiderBeanFactory factory = new SpiderBeanFactory(classpath, pipelineFactory);
    engine.spiderBeanFactory = factory;
    return engine;
}

/**
 * Wraps the URL in an HttpGetRequest and passes it to the request-based overload.
 *
 * @param url URL handed to the HttpGetRequest constructor
 * @return whatever the request-based overload returns
 */
public GeccoEngine start(String url) {
    final HttpGetRequest seed = new HttpGetRequest(url);
    return start(seed);
}

/**
 * Passes each URL, in order, to the single-URL overload.
 *
 * @param urls URLs to pass on one by one
 * @return this engine
 */
public GeccoEngine start(String... urls) {
    for (int i = 0; i < urls.length; i++) {
        start(urls[i]);
    }
    return this;
}

/**
 * Adds one request to this engine's {@code startRequests} collection.
 *
 * @param request the request to add
 * @return this engine
 */
public GeccoEngine start(HttpRequest request) {
    startRequests.add(request);
    return this;
}

/**
 * Adds every request in the list to this engine's start requests by passing
 * each one, in order, to the single-request overload.
 *
 * <p>The caller's list is only read; it is not stored in this engine.
 *
 * @param requests requests to add
 * @return this engine
 */
public GeccoEngine start(List<HttpRequest> requests) {
    // Do not assign the caller's list to startRequests first: the loop below
    // would then add to the very list it is iterating, which is not safe.
    for (HttpRequest request : requests) {
        start(request);
    }
    return this;
}

Expand Down Expand Up @@ -144,7 +173,16 @@ public GeccoEngine spiderBeanFactory(SpiderBeanFactory spiderBeanFactory) {
return this;
}

/**
 * Adds the given spider bean class to the factory returned by
 * {@code getSpiderBeanFactory()}.
 *
 * @param spiderBeanClass class passed to {@code addSpiderBean}
 */
public void register(Class<?> spiderBeanClass) {
    this.getSpiderBeanFactory()
        .addSpiderBean(spiderBeanClass);
}

/**
 * Removes the given spider bean class from the factory returned by
 * {@code getSpiderBeanFactory()}, then passes the class to
 * {@code DynamicGecco.unregister}.
 *
 * @param spiderBeanClass class to remove
 */
public void unregister(Class<?> spiderBeanClass) {
    // Drop it from the factory first, then hand it to DynamicGecco.
    this.getSpiderBeanFactory()
        .removeSpiderBean(spiderBeanClass);
    DynamicGecco.unregister(spiderBeanClass);
}

@Override
public void run() {
if(debug) {
Logger log = LogManager.getLogger("com.geccocrawler.gecco.spider.render");
Expand All @@ -171,7 +209,7 @@ public void run() {
startsJson();
if(startRequests.isEmpty()) {
//startRequests不为空
throw new IllegalArgumentException("startRequests cannot be empty");
//throw new IllegalArgumentException("startRequests cannot be empty");
}
for(HttpRequest startRequest : startRequests) {
scheduler.into(startRequest);
Expand Down Expand Up @@ -255,10 +293,16 @@ public boolean isDebug() {
return debug;
}

public void notifyComplemet() {
/**
 * Called by a spider thread to tell the engine that it has finished;
 * invokes {@code countDown()} on {@code cdl}.
 */
public void notifyComplete() {
    cdl.countDown();
}

/**
* 非循环模式等待线程执行完毕后关闭
*/
public void closeUnitlComplete() {
if(!loop) {
try {
Expand All @@ -273,4 +317,48 @@ public void closeUnitlComplete() {
log.info("close gecco!");
}
}

/**
 * Starts the engine by calling {@code start()} with no arguments and returns
 * this engine so that further calls can be chained.
 *
 * @return this engine
 */
public GeccoEngine engineStart() {
    this.start();
    return this;
}

/**
 * Pauses crawling by calling {@code pause()} on every spider in {@code spiders}.
 */
public void pause() {
    for (Spider worker : this.spiders) {
        worker.pause();
    }
}

/**
 * Resumes crawling by calling {@code restart()} on every spider in {@code spiders}.
 */
public void restart() {
    for (Spider worker : this.spiders) {
        worker.restart();
    }
}

/**
 * Prepares the engine for a rule change: logs at debug level, pauses the
 * spiders, and calls {@code GeccoClassLoader.create()}.
 */
public void beginUpdateRule() {
    final boolean debugLogging = log.isDebugEnabled();
    if (debugLogging) {
        log.debug("begin update rule");
    }
    // Before rules are modified the engine must be paused and the ClassLoader re-created.
    this.pause();
    GeccoClassLoader.create();
}

/**
 * Finishes a rule change: restarts the spiders, then logs at debug level.
 */
public void endUpdateRule() {
    // Once the modification is complete, restart the engine.
    this.restart();
    final boolean debugLogging = log.isDebugEnabled();
    if (debugLogging) {
        log.debug("end update rule");
    }
}
}
29 changes: 25 additions & 4 deletions src/main/java/com/geccocrawler/gecco/dynamic/DynamicBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,33 @@ public interface DynamicBean {
* @param fieldName 已有字段名称
* @return 字段
*/
public DynamicField existField(String fieldName);

/**
 * Ambiguous by name; per the original note it has been replaced by existField.
 *
 * @param fieldName name of the field
 * @return the field
 * @deprecated use {@code existField(String)} instead
 */
@Deprecated
public DynamicField field(String fieldName);

/**
 * Adds a new field; per the original note, if the field already exists the
 * current field is returned.
 *
 * @param fieldName name of the field
 * @param fieldType type of the field
 * @return the field
 */
public DynamicField field(String fieldName, CtClass fieldType);

/**
 * Removes a field.
 *
 * @param fieldName name of the field to remove
 * @return a DynamicBean (presumably this bean, to allow chaining; confirm against the implementation)
 */
public DynamicBean removeField(String fieldName);

/**
* string类型字段
*
Expand Down Expand Up @@ -113,18 +129,23 @@ public interface DynamicBean {
public DynamicField listField(String fieldName, Class<?> memberClass);

/**
 * Registers the loaded bean with the crawler engine.
 * Per the original note, this is mainly for the case where the bean is
 * defined first and the crawler engine comes later.
 *
 * @return a class object (presumably the class of the registered bean; confirm against the implementation)
 */
public Class<?> register();

/**
 * Loads the bean into the class loader.
 * Per the original note, this has been replaced by register.
 *
 * @return a class object (presumably the loaded bean class; confirm against the implementation)
 * @deprecated per the original note, replaced by {@code register()}
 */
@Deprecated
public Class<?> loadClass();

/**
 * Unloads the bean.
 */
public void unloadClass();

}
16 changes: 10 additions & 6 deletions src/main/java/com/geccocrawler/gecco/dynamic/DynamicGecco.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,23 @@

public class DynamicGecco {

public static JavassistDynamicBean htmlBean(String htmlBeanName) {
return new JavassistDynamicBean(htmlBeanName, JavassistDynamicBean.HtmlBean, false);
public static JavassistDynamicBean html(String htmlBeanName) {
return new JavassistDynamicBean(htmlBeanName, JavassistDynamicBean.HtmlBean);
}

public static JavassistDynamicBean jsonBean(String jsonBeanName) {
return new JavassistDynamicBean(jsonBeanName, JavassistDynamicBean.JsonBean, false);
public static JavassistDynamicBean json(String jsonBeanName) {
return new JavassistDynamicBean(jsonBeanName, JavassistDynamicBean.JsonBean);
}

public static JavassistDynamicBean html() {
return new JavassistDynamicBean("com.geccocrawler.gecco.dynamic.HtlmBean"+RandomStringUtils.randomAlphabetic(6)+System.nanoTime(), JavassistDynamicBean.HtmlBean, true);
return new JavassistDynamicBean("com.geccocrawler.gecco.dynamic.HtlmBean"+RandomStringUtils.randomAlphabetic(6)+System.nanoTime(), JavassistDynamicBean.HtmlBean);
}

public static JavassistDynamicBean json() {
return new JavassistDynamicBean("com.geccocrawler.gecco.dynamic.JsonBean"+RandomStringUtils.randomAlphabetic(6)+System.nanoTime(), JavassistDynamicBean.JsonBean, true);
return new JavassistDynamicBean("com.geccocrawler.gecco.dynamic.JsonBean"+RandomStringUtils.randomAlphabetic(6)+System.nanoTime(), JavassistDynamicBean.JsonBean);
}

/**
 * Unloads the bean class with the same name as the given class: wraps
 * {@code clazz.getName()} in a JavassistDynamicBean and calls
 * {@code unloadClass()} on it.
 *
 * @param clazz class whose name identifies the bean to unload; must not be {@code null}
 */
public static void unregister(Class<?> clazz) {
    new JavassistDynamicBean(clazz.getName()).unloadClass();
}
}
4 changes: 2 additions & 2 deletions src/main/java/com/geccocrawler/gecco/dynamic/FieldType.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ public class FieldType {
}
}

public static CtClass type(String className) {
public static CtClass type(Class<?> clazz) {
try {
return ClassPool.getDefault().get(className);
return ClassPool.getDefault().get(clazz.getName());
} catch (NotFoundException e) {
e.printStackTrace();
return voidType;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,14 @@ public class GeccoClassLoader extends ClassLoader {

private static GeccoClassLoader instance;

/**
* 创建一个新的GeccoClassLoader
* @return
*/
public static synchronized GeccoClassLoader create() {
if(instance != null) {
instance.classes.clear();
}
ClassLoader parent = Thread.currentThread().getContextClassLoader();
if(parent != null) {
instance = new GeccoClassLoader(parent);
Expand Down
Loading

0 comments on commit 6637501

Please sign in to comment.