Skip to content

Commit

Permalink
added parsing of canonical link element to html parser
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7812 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jul 1, 2011
1 parent b6f09a4 commit bda3eec
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 3 deletions.
3 changes: 3 additions & 0 deletions defaults/solr.keys.list
Expand Up @@ -111,6 +111,9 @@ attr_paths
## host of the url, string
host_s

## url inside the canonical link element, string
canonical_s

## all texts in <li> tags, textgen
attr_li

Expand Down
12 changes: 9 additions & 3 deletions source/de/anomic/search/Switchboard.java
Expand Up @@ -561,15 +561,21 @@ public void run() {
TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));

// prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list");
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list");
if (!solrWorkProfile.exists()) FileUtils.copy(new File("defaults/solr.keys.list"), solrWorkProfile);
final SolrScheme scheme = new SolrScheme(solrWorkProfile);
if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile);
final SolrScheme backupScheme = new SolrScheme(solrBackupProfile);
final SolrScheme workingScheme = new SolrScheme(solrWorkProfile);

// update the working scheme with the backup scheme. This is necessary to include new features.
// new features are always activated by default


// set up the solr interface
final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try {
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
} catch (final IOException e) {
Log.logException(e);
this.solrConnector = null;
Expand Down
3 changes: 3 additions & 0 deletions source/net/yacy/cora/services/federated/solr/SolrScheme.java
Expand Up @@ -181,6 +181,9 @@ public SolrInputDocument yacy2solr(final String id, final ResponseHeader header,
}
addSolr(solrdoc, "htags_i", h);

// canonical tag
if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));

// meta tags
final Map<String, String> metas = html.getMetas();
final String robots = metas.get("robots");
Expand Down
1 change: 1 addition & 0 deletions source/net/yacy/cora/storage/ConfigurationSet.java
Expand Up @@ -72,6 +72,7 @@ public ConfigurationSet(final File file) {
}
}


@Override
public boolean isEmpty() {
// a shortcut to a fast 'true' in case that we initialized the class without a configuration file
Expand Down
10 changes: 10 additions & 0 deletions source/net/yacy/document/parser/html/ContentScraper.java
Expand Up @@ -125,6 +125,7 @@ private Tag(final TagType type) {
private CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private float lon, lat;
private MultiProtocolURI canonical;

/**
* {@link MultiProtocolURI} to the favicon that belongs to the document
Expand Down Expand Up @@ -167,6 +168,7 @@ public ContentScraper(final MultiProtocolURI root) {
this.lon = 0.0f;
this.lat = 0.0f;
this.evaluationScores.match(Element.url, root.toNormalform(false, false));
this.canonical = null;
}

public void scrapeText(final char[] newtext, final String insideTag) {
Expand Down Expand Up @@ -345,6 +347,10 @@ public void scrapeTag0(final String tagname, final Properties tagopts) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
this.images.put(ie.url(), ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
final Properties p = new Properties(); p.put("name", this.title);
this.anchors.put(newLink, p);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
this.rss.put(newLink, linktitle);
} else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
Expand Down Expand Up @@ -599,6 +605,10 @@ public Set<MultiProtocolURI> getScript() {
return this.script;
}

/**
 * get the canonical url of the document
 * @return the target of the rel="canonical" link element that was scraped,
 *         or <code>null</code> if no such element was seen
 */
public MultiProtocolURI getCanonical() {
    return canonical;
}

/**
* get all images
* @return a map of <urlhash, ImageEntry>
Expand Down

0 comments on commit bda3eec

Please sign in to comment.