Skip to content

Commit

Permalink
Added parsing of dd, dt and article HTML fields. The parsed result is
written to special Solr fields which are deactivated by default.
Browse files Browse the repository at this point in the history
  • Loading branch information
Orbiter committed Apr 12, 2015
1 parent 1395f10 commit 4cb4f67
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 14 deletions.
18 changes: 18 additions & 0 deletions defaults/solr.collection.schema
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,24 @@ images_width_val
## number of <li> tags, int
#licount_i

## all texts in <dt> tags
#dt_txt

## number of <dt> tags, int
#dtcount_i

## all texts in <dd> tags
#dd_txt

## number of <dd> tags, int
#ddcount_i

## all texts in <article> tags
#article_txt

## number of <article> tags, int
#articlecount_i

## all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order
bold_txt

Expand Down
38 changes: 24 additions & 14 deletions source/net/yacy/document/parser/html/ContentScraper.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
Expand Down Expand Up @@ -114,6 +110,8 @@ public enum TagName {
u(TagType.pair),
i(TagType.pair),
li(TagType.pair),
dt(TagType.pair),
dd(TagType.pair),
script(TagType.pair),
span(TagType.pair),
div(TagType.pair),
Expand Down Expand Up @@ -182,7 +180,7 @@ public String toString() {
//private String headline;
private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic, underline;
private final List<String> li;
private final List<String> li, dt, dd;
private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private double lon, lat;
Expand Down Expand Up @@ -242,6 +240,8 @@ public ContentScraper(final DigestURL root, int maxLinks, final VocabularyScrape
this.italic = new ClusteredScoreMap<String>(false);
this.underline = new ClusteredScoreMap<String>(false);
this.li = new ArrayList<String>();
this.dt = new ArrayList<String>();
this.dd = new ArrayList<String>();
this.content = new CharBuffer(MAX_DOCSIZE, 1024);
this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0d;
Expand Down Expand Up @@ -591,6 +591,12 @@ public void scrapeTag1(Tag tag) {
} else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.li.add(h);
} else if ((tag.name.equalsIgnoreCase("dt")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.dt.add(h);
} else if ((tag.name.equalsIgnoreCase("dd")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.dd.add(h);
} else if (tag.name.equalsIgnoreCase("script")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) {
Expand Down Expand Up @@ -734,6 +740,14 @@ public String[] getLi() {
return this.li.toArray(new String[this.li.size()]);
}

/**
 * Returns the texts collected from {@code <dt>} tags while scraping.
 *
 * @return a newly allocated array holding every collected {@code <dt>} text
 *         in collection order; empty if none were collected
 */
public String[] getDt() {
    final String[] texts = new String[this.dt.size()];
    return this.dt.toArray(texts);
}

/**
 * Returns the texts collected from {@code <dd>} tags while scraping.
 *
 * @return a newly allocated array holding every collected {@code <dd>} text
 *         in collection order; empty if none were collected
 */
public String[] getDd() {
    final String[] texts = new String[this.dd.size()];
    return this.dd.toArray(texts);
}

public DigestURL[] getFlash() {
String ext;
ArrayList<DigestURL> f = new ArrayList<DigestURL>();
Expand All @@ -760,22 +774,18 @@ public int breadcrumbCount() {
}

/**
 * Returns the text content gathered by this scraper.
 *
 * NOTE(review): as displayed, this span appears to interleave the removed and the
 * added lines of a diff hunk (the try block holds two consecutive return
 * statements, the second of which would be unreachable). Confirm which lines
 * belong to the current version before relying on the comments below.
 *
 * @return the scraped text; the empty string if an OutOfMemoryError occurs
 *         while the content buffer is converted to a string
 */
public String getText() {
// article branch: if article texts exist, concatenate them, each followed by one blank
if (this.articles.size() > 0) {
StringBuilder sb = new StringBuilder();
for (String al: this.articles) {
sb.append(al).append(' ');
}
// every article contributes exactly one blank, so the builder is longer than the
// article count only if at least one article text is non-empty
if (sb.length() > this.articles.size()) return sb.toString().trim();
}
// fall back to the content buffer, trimmed before conversion
this.content.trim();
try {
return this.content.toString();
return this.content.trim().toString();
} catch (final OutOfMemoryError e) {
// converting the buffer can exhaust the heap; log it and return an empty text instead
ConcurrentLog.logException(e);
return "";
}
}

/**
 * Returns the article texts held by this scraper.
 *
 * @return the scraper's own list of article texts (not a copy)
 */
public List<String> getArticles() {
    final List<String> collected = this.articles;
    return collected;
}

public List<AnchorURL> getAnchors() {
// returns a url (String) / name (String) relation
return this.anchors;
Expand Down
12 changes: 12 additions & 0 deletions source/net/yacy/search/schema/CollectionConfiguration.java
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,18 @@ public SolrVector yacy2solr(
final String[] li = html.getLi();
add(doc, CollectionSchema.licount_i, li.length);
if (li.length > 0) add(doc, CollectionSchema.li_txt, li);

// <dt> texts: the count is always written, the text field only when texts exist
final String[] dt = html.getDt();
add(doc, CollectionSchema.dtcount_i, dt.length);
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt); // was 'li': stored <li> texts in dt_txt

// <dd> texts: the count is always written, the text field only when texts exist
final String[] dd = html.getDd(); // was html.getLi(): counted and stored <li> entries as <dd>
add(doc, CollectionSchema.ddcount_i, dd.length);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd); // was 'li': stored <li> texts in dd_txt

final List<String> articles = html.getArticles();
add(doc, CollectionSchema.articlecount_i, articles.size());
if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles);

// images
final ArrayList<String> imgprots = new ArrayList<String>(images.size());
Expand Down
6 changes: 6 additions & 0 deletions source/net/yacy/search/schema/CollectionSchema.java
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ public enum CollectionSchema implements SchemaDeclaration {
refresh_s(SolrType.string, true, true, false, false, false, "link from the url property inside the refresh link element"),
li_txt(SolrType.text_general, true, true, true, false, true, "all texts in <li> tags"),
licount_i(SolrType.num_integer, true, true, false, false, false, "number of <li> tags"),
dt_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dt> tags"),
dtcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dt> tags"),
dd_txt(SolrType.text_general, true, true, true, false, true, "all texts in <dd> tags"),
ddcount_i(SolrType.num_integer, true, true, false, false, false, "number of <dd> tags"),
article_txt(SolrType.text_general, true, true, true, false, true, "all texts in <article> tags"),
articlecount_i(SolrType.num_integer, true, true, false, false, false, "number of <article> tags"),
bold_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
boldcount_i(SolrType.num_integer, true, true, false, false, false, "total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general, true, true, true, false, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
Expand Down

0 comments on commit 4cb4f67

Please sign in to comment.