Added parsing of dd, dt and article HTML fields. The parsed result is written to special Solr fields which are deactivated by default.
yacy · Apr 12, 2015 · 4cb4f67 · 4cb4f67
1 parent 1395f10
commit 4cb4f67
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 14 deletions.
diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
@@ -314,6 +314,24 @@ images_width_val
 ## number of <li> tags, int
 #licount_i
 
+## all texts in <dt> tags
+#dt_txt
+
+## number of <dt> tags, int
+#dtcount_i
+
+## all texts in <dd> tags
+#dd_txt
+
+## number of <dd> tags, int
+#ddcount_i
+
+## all texts in <article> tags
+#article_txt
+
+## number of <article> tags, int
+#articlecount_i
+
 ## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order
 bold_txt
 

diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -4,10 +4,6 @@
 // first published on http://www.anomic.de
 // Frankfurt, Germany, 2004
 //
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
@@ -114,6 +110,8 @@ public enum TagName {
         u(TagType.pair),
         i(TagType.pair),
         li(TagType.pair),
+        dt(TagType.pair),
+        dd(TagType.pair),
         script(TagType.pair),
         span(TagType.pair),
         div(TagType.pair),
@@ -182,7 +180,7 @@ public String toString() {
     //private String headline;
     private List<String>[] headlines;
     private final ClusteredScoreMap<String> bold, italic, underline;
-    private final List<String> li;
+    private final List<String> li, dt, dd;
     private final CharBuffer content;
     private final EventListenerList htmlFilterEventListeners;
     private double lon, lat;
@@ -242,6 +240,8 @@ public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScrape
         this.italic = new ClusteredScoreMap<String>(false);
         this.underline = new ClusteredScoreMap<String>(false);
         this.li = new ArrayList<String>();
+        this.dt = new ArrayList<String>();
+        this.dd = new ArrayList<String>();
         this.content = new CharBuffer(MAX_DOCSIZE, 1024);
         this.htmlFilterEventListeners = new EventListenerList();
         this.lon = 0.0d;
@@ -591,6 +591,12 @@ public void scrapeTag1(Tag tag) {
         } else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) {
             h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
             if (h.length() > 0) this.li.add(h);
+        } else if ((tag.name.equalsIgnoreCase("dt")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
+            if (h.length() > 0) this.dt.add(h);
+        } else if ((tag.name.equalsIgnoreCase("dd")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
+            if (h.length() > 0) this.dd.add(h);
         } else if (tag.name.equalsIgnoreCase("script")) {
             final String src = tag.opts.getProperty("src", EMPTY_STRING);
             if (src.length() > 0) {
@@ -734,6 +740,14 @@ public String[] getLi() {
         return this.li.toArray(new String[this.li.size()]);
     }
 
+    public String[] getDt() {
+        return this.dt.toArray(new String[this.dt.size()]);
+    }
+
+    public String[] getDd() {
+        return this.dd.toArray(new String[this.dd.size()]);
+    }
+
     public DigestURL[] getFlash() {
         String ext;
         ArrayList<DigestURL> f = new ArrayList<DigestURL>();
@@ -760,22 +774,18 @@ public int breadcrumbCount() {
     }
 
     public String getText() {
-        if (this.articles.size() > 0) {
-            StringBuilder sb = new StringBuilder();
-            for (String al: this.articles) {
-                sb.append(al).append(' ');
-            }
-            if (sb.length() > this.articles.size()) return sb.toString().trim();
-        }
-        this.content.trim();
         try {
-            return this.content.toString();
+            return this.content.trim().toString();
         } catch (final OutOfMemoryError e) {
             ConcurrentLog.logException(e);
             return "";
         }
     }
 
+    public List<String> getArticles() {
+        return this.articles;
+    }
+
     public List<AnchorURL> getAnchors() {
         // returns a url (String) / name (String) relation
         return this.anchors;

diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -647,6 +647,18 @@ public SolrVector yacy2solr(
             final String[] li = html.getLi();
             add(doc, CollectionSchema.licount_i, li.length);
             if (li.length > 0) add(doc, CollectionSchema.li_txt, li);
+
+            final String[] dt = html.getDt(); // scraped <dt> texts
+            add(doc, CollectionSchema.dtcount_i, dt.length);
+            if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt); // index the dt texts, not the li array
+
+            final String[] dd = html.getDd(); // scraped <dd> texts (must not reuse getLi())
+            add(doc, CollectionSchema.ddcount_i, dd.length);
+            if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd); // index the dd texts, not the li array
+
+            final List<String> articles = html.getArticles();
+            add(doc, CollectionSchema.articlecount_i, articles.size());
+            if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles);
 
             // images
             final ArrayList<String> imgprots = new ArrayList<String>(images.size());

diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java
@@ -151,6 +151,12 @@ public enum CollectionSchema implements SchemaDeclaration {
     refresh_s(SolrType.string, true, true, false, false, false, "link from the url property inside the refresh link element"),
     li_txt(SolrType.text_general, true, true, true, false, true, "all texts in <li> tags"),
     licount_i(SolrType.num_integer, true, true, false, false, false, "number of <li> tags"),
+    dt_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dt> tags"),
+    dtcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dt> tags"),
+    dd_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dd> tags"),
+    ddcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dd> tags"),
+    article_txt(SolrType.text_general, true, true, true, false, true, "all texts in <article> tags"),
+    articlecount_i(SolrType.num_integer, true, true, false, false, false, "number of <article> tags"),
     bold_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
     boldcount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <b> or <strong>"),
     italic_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),