Close #5: the URL is no longer used as a date hint

tokenmill · May 24, 2018 · 100b352 · 100b352
1 parent 7e63723
commit 100b352
Show file tree

Hide file tree

Showing 3 changed files with 946 additions and 10 deletions.
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/ArticleIndexerBolt.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/ArticleIndexerBolt.java
@@ -30,11 +30,8 @@
 import org.slf4j.LoggerFactory;
 
 import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import static com.digitalpebble.stormcrawler.Constants.StatusStreamName;
-import static lt.tokenmill.crawling.crawler.CrawlerConstants.PARTIAL_ANALYSIS_STATUS;
 
 
 public class ArticleIndexerBolt extends BaseRichBolt {
@@ -135,18 +132,11 @@ protected void storeDocument(HttpArticle article, Map<String, Object> fields) th
         this.esDocumentOperations.store(article, fields);
     }
 
-    private Pattern dateInUrl = Pattern.compile(".*(\\d{4}/\\d{2}/\\d{2}).*");
     private HttpArticle analyze(String url, String filtered, HttpSource httpSource, String html, Metadata metadata) throws Exception {
         String publishedHint = metadata.getFirstValue(CrawlerConstants.META_PUBLISHED);
         if (publishedHint == null) {
             publishedHint = metadata.getFirstValue(CrawlerConstants.META_FEED_PUBLISHED);
         }
-        if (publishedHint == null) {
-            Matcher matcher = dateInUrl.matcher(url);
-            if (matcher.find()) {
-                publishedHint = matcher.group(1);
-            }
-        }
         HttpArticle article = ArticleExtractor.extractArticle(html, filtered, httpSource, publishedHint);
         String discovered = metadata.getFirstValue(CrawlerConstants.META_DISCOVERED);
         article.setDiscovered(DataUtils.parseFromUTC(discovered));

diff --git a/parser/src/test/java/lt/tokenmill/crawling/parser/AljazeeraExtractorTest.java b/parser/src/test/java/lt/tokenmill/crawling/parser/AljazeeraExtractorTest.java
@@ -0,0 +1,27 @@
+package lt.tokenmill.crawling.parser;
+
+import lt.tokenmill.crawling.data.HttpArticle;
+import lt.tokenmill.crawling.data.HttpSource;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+import static junit.framework.TestCase.assertEquals;
+
+public class AljazeeraExtractorTest extends BaseArticleExtractorTest { // Checks published-date extraction for one saved Al Jazeera article.
+
+    @Test
+    public void testFortune2() throws Exception { // NOTE(review): name looks copied from another test; the fixture used here is "aljazeera1" - consider renaming.
+        String html = loadArticle("aljazeera1"); // loadArticle is not defined in this class; presumably inherited from BaseArticleExtractorTest - confirm.
+        String url = "https://www.aljazeera.com/news/2018/05/2000-jewish-settlers-storm-al-aqsa-setting-record-180513161200107.html";
+        HttpArticle article = ArticleExtractor.extractArticle(html, url, getSourceConf(), null); // null: no published-date hint is supplied to the extractor.
+        assertEquals("2018-05-13T00:00:00.000Z", article.getPublished().toInstant().toString()); // Expected published instant: 2018-05-13 at 00:00:00.000 UTC.
+    }
+
+    private HttpSource getSourceConf() { // Builds a source configuration whose only explicit setting is the date selector list.
+        HttpSource source = new HttpSource();
+        source.setDateSelectors(Arrays.asList(".article-duration")); // presumably a CSS class selector for the date element - confirm against the fixture HTML.
+        return source;
+    }
+
+}