Skip to content

Commit

Permalink
Close #5: the URL is no longer used as a date hint
Browse files (browse the repository at this point in the history)
  • Loading branch information
dj committed May 24, 2018
1 parent 7e63723 commit 100b352
Show file tree
Hide file tree
Showing 3 changed files with 946 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,8 @@
import org.slf4j.LoggerFactory;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.digitalpebble.stormcrawler.Constants.StatusStreamName;
import static lt.tokenmill.crawling.crawler.CrawlerConstants.PARTIAL_ANALYSIS_STATUS;


public class ArticleIndexerBolt extends BaseRichBolt {
Expand Down Expand Up @@ -135,18 +132,11 @@ protected void storeDocument(HttpArticle article, Map<String, Object> fields) th
this.esDocumentOperations.store(article, fields);
}

private Pattern dateInUrl = Pattern.compile(".*(\\d{4}/\\d{2}/\\d{2}).*");
private HttpArticle analyze(String url, String filtered, HttpSource httpSource, String html, Metadata metadata) throws Exception {
String publishedHint = metadata.getFirstValue(CrawlerConstants.META_PUBLISHED);
if (publishedHint == null) {
publishedHint = metadata.getFirstValue(CrawlerConstants.META_FEED_PUBLISHED);
}
if (publishedHint == null) {
Matcher matcher = dateInUrl.matcher(url);
if (matcher.find()) {
publishedHint = matcher.group(1);
}
}
HttpArticle article = ArticleExtractor.extractArticle(html, filtered, httpSource, publishedHint);
String discovered = metadata.getFirstValue(CrawlerConstants.META_DISCOVERED);
article.setDiscovered(DataUtils.parseFromUTC(discovered));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package lt.tokenmill.crawling.parser;

import lt.tokenmill.crawling.data.HttpArticle;
import lt.tokenmill.crawling.data.HttpSource;
import org.junit.Test;

import java.util.Arrays;

import static junit.framework.TestCase.assertEquals;

/**
 * Regression test for publication-date extraction on an Al Jazeera article.
 *
 * <p>The article URL contains the path {@code /2018/05/2000-jewish-...}. A naive
 * {@code yyyy/MM/dd} scan of that URL would read "2018/05/20" out of it, which is not
 * the date this test expects (2018-05-13). The test therefore passes no published hint
 * and configures only a date selector on the source.
 */
public class AljazeeraExtractorTest extends BaseArticleExtractorTest {

    /**
     * Extracts the "aljazeera1" fixture with a {@code null} published hint and checks
     * that the resulting published date is 2018-05-13 (midnight, UTC rendering).
     *
     * @throws Exception if the fixture cannot be loaded or extraction fails
     */
    @Test
    public void testAljazeera1() throws Exception {
        String html = loadArticle("aljazeera1");
        // Note the "/2018/05/2000-" segment: it looks like a date but is not one.
        String url = "https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html";

        // The last argument is the published hint; null means no hint is supplied.
        HttpArticle article = ArticleExtractor.extractArticle(html, url, getSourceConf(), null);

        assertEquals("2018-05-13T00:00:00.000Z", article.getPublished().toInstant().toString());
    }

    /**
     * Builds the source configuration used by this test.
     *
     * @return a new {@link HttpSource} whose only date selector is {@code .article-duration}
     */
    private HttpSource getSourceConf() {
        HttpSource source = new HttpSource();
        source.setDateSelectors(Arrays.asList(".article-duration"));
        return source;
    }

}
Loading

0 comments on commit 100b352

Please sign in to comment.