Skip to content
This repository has been archived by the owner on Nov 9, 2017. It is now read-only.

Add HTML adapter #333

Merged
merged 10 commits into from
Jan 19, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 12 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,18 @@
</exclusions>
</dependency>

<dependency>
<groupId>net.sf.okapi.filters</groupId>
<artifactId>okapi-filter-html</artifactId>
<version>${okapi.version}</version>
<exclusions>
<exclusion>
<groupId>net.sf.okapi.logbind</groupId>
<artifactId>build-log4j</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>net.sf.okapi.filters</groupId>
<artifactId>okapi-filter-idml</artifactId>
Expand Down
4 changes: 4 additions & 0 deletions zanata-war/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,10 @@
<groupId>net.sf.okapi.filters</groupId>
<artifactId>okapi-filter-dtd</artifactId>
</dependency>
<dependency>
<groupId>net.sf.okapi.filters</groupId>
<artifactId>okapi-filter-html</artifactId>
</dependency>
<dependency>
<groupId>net.sf.okapi.filters</groupId>
<artifactId>okapi-filter-idml</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/*
* Copyright 2013, Red Hat, Inc. and individual contributors
* as indicated by the @author tags. See the copyright.txt file in the
* distribution for a full listing of individual contributors.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.zanata.adapter

/**
 * Splits a string into a non-translatable prefix ('pre'), a translatable core
 * ('str') and a non-translatable suffix ('suf').
 *
 * Every stripping step is a closure that takes and returns a map with the keys
 * 'pre', 'str' and 'suf'. The steps only move text from the edges of 'str'
 * into 'pre' or 'suf'; concatenating pre + str + suf is meant to reproduce the
 * input.
 *
 * Implementation note: capture groups are read with matcher.group(n) after a
 * successful matcher.matches(). Groovy's matcher[0] must not be used here: it
 * resets the matcher and iterates with find(), and because the patterns use
 * the (?m) flag, '$' can then match before a line terminator, yielding groups
 * from a shorter match than the one matches() accepted and dropping the
 * remainder of multi-line input.
 *
 * @see #separate(String)
 */
class TranslatableSeparator {

    /**
     * Separates leading and trailing non-translatable portions of a string from
     * the translatable portion.
     *
     * The leading portion maps to 'pre', the trailing non-translatable portion
     * maps to 'suf' and the translatable portion maps to 'str'.
     *
     * @param s the string to split
     * @return a map with the keys 'pre', 'str' and 'suf', all java.lang.String
     */
    static Map<String, String> separate(String s) {

        // '<<' composes right-to-left, so a single pass runs, in order:
        // leading whitespace, trailing whitespace, leading empty tags,
        // trailing empty tag, wrapping tags, leading empty paired tags,
        // trailing empty paired tags.
        def stripAllNonTranslatable = stripTrailingEmptyPairedTags << stripLeadingEmptyPairedTags \
                << stripWrappingTags \
                << stripTrailingEmptyTag << stripLeadingEmptyTags \
                << stripTrailingWhitespace << stripLeadingWhitespace

        // Re-run the whole pass until it no longer changes anything; the maps
        // are compared by content.
        def preStripped = [pre: "", str: s, suf: ""]
        def stripped = stripAllNonTranslatable(preStripped)
        while (preStripped != stripped) {
            preStripped = stripped
            stripped = stripAllNonTranslatable(preStripped)
        }

        // this is given to Java, so all Strings are converted to java.lang.String for compatibility
        [pre: stripped.pre.toString(), str: stripped.str.toString(), suf: stripped.suf.toString()]
    }

    /** Moves any leading whitespace of 'str' to the end of 'pre'. */
    static def stripLeadingWhitespace = {
        def leadingWhitespace = /(?ms)^(\s*)(.*)$/
        def matcher = ( it.str =~ leadingWhitespace )
        if (matcher.matches()) {
            [pre: it.pre + matcher.group(1), str: matcher.group(2), suf: it.suf]
        } else {
            it
        }
    }

    /**
     * Moves any trailing whitespace of 'str' to the front of 'suf'.
     * Does not treat whitespace-only string as having trailing whitespace.
     */
    static def stripTrailingWhitespace = {
        def trailingWhitespace = /(?ms)^(.*[^\s])(\s*)$/
        def matcher = ( it.str =~ trailingWhitespace )
        if (matcher.matches()) {
            [pre: it.pre, str: matcher.group(1), suf: matcher.group(2) + it.suf]
        } else {
            it
        }
    }

    /**
     * Moves a run of leading self-closing tags (e.g. "<br/>") from 'str' to the
     * end of 'pre'.
     */
    static def stripLeadingEmptyTags = {
        // NOTE(review): allows at most one space between '/' and '>', whereas the
        // trailing variant below allows any whitespace - confirm whether intended.
        def leadingStandaloneTags = /(?ms)^((?:<[^\/>]*\/ ?>)*)(.*)$/
        def matcher = ( it.str =~ leadingStandaloneTags )
        if (matcher.matches()) {
            [pre: it.pre + matcher.group(1), str: matcher.group(2), suf: it.suf]
        } else {
            it
        }
    }

    /**
     * Moves a single trailing self-closing tag from 'str' to the front of 'suf'.
     * Further trailing tags are picked up by later passes of separate().
     */
    static def stripTrailingEmptyTag = {
        def trailingStandaloneTag = /(?ms)^(.*)(<[^\/>]*\/\s*>)$/
        def matcher = ( it.str =~ trailingStandaloneTag )
        if (matcher.matches()) {
            [pre: it.pre, str: matcher.group(1), suf: matcher.group(2) + it.suf]
        } else {
            it
        }
    }

    /**
     * When 'str' starts with an opening tag and ends with a closing tag of the
     * same text, moves the opening tag to 'pre' and the closing tag to 'suf'.
     */
    static def stripWrappingTags = {
        // The back-reference \2 requires the closing tag to repeat everything
        // captured between '<' and '>' of the opening tag.
        // NOTE(review): the first and last tag are not checked to be a balanced
        // pair, so "<b>a</b> and <b>c</b>" also matches - confirm whether intended.
        def wrappingTags = /(?ms)^(<([^\/>]*)\s*>)(.*)(<\/\2\s*>)$/
        def matcher = ( it.str =~ wrappingTags )
        if (matcher.matches()) {
            [pre: it.pre + matcher.group(1), str: matcher.group(3), suf: matcher.group(4) + it.suf]
        } else {
            it
        }
    }

    /**
     * Moves a leading opening/closing tag pair from 'str' to the end of 'pre'
     * when the pair encloses nothing translatable.
     */
    static def stripLeadingEmptyPairedTags = {
        def leadingPairedTags = /(?ms)^(<([^\/>]*)\s*>(.*)<\/\2\s*>)(.*)$/
        def matcher = ( it.str =~ leadingPairedTags )
        if (matcher.matches()) {
            def tagContents = matcher.group(3)
            // The pair counts as empty when separating its contents leaves no
            // translatable text.
            def cleanedContents = separate(tagContents).str
            if (cleanedContents.isEmpty()) {
                [pre: it.pre + matcher.group(1), str: matcher.group(4), suf: it.suf]
            } else {
                it
            }
        } else {
            it
        }
    }

    /**
     * Moves a trailing opening/closing tag pair from 'str' to the front of 'suf'
     * when the pair encloses nothing translatable.
     */
    static def stripTrailingEmptyPairedTags = {
        def trailingPairedTags = /(?ms)^(.*?)(<([^\/>]*)\s*>(.*)<\/\3\s*>)$/
        def matcher = ( it.str =~ trailingPairedTags )
        if (matcher.matches()) {
            def tagContents = matcher.group(4)
            // The pair counts as empty when separating its contents leaves no
            // translatable text.
            def cleanedContents = separate(tagContents).str
            if (cleanedContents.isEmpty()) {
                [pre: it.pre, str: matcher.group(1), suf: matcher.group(2) + it.suf]
            } else {
                it
            }
        } else {
            it
        }
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
* href="mailto:damason@redhat.com">damason@redhat.com</a>
* @see DTDFilter
*/
public class DTDAdapter extends GenericOkapiFilterAdapter {
public class DTDAdapter extends OkapiFilterAdapter {
public DTDAdapter() {
super(new DTDFilter(), IdSource.textUnitName);
}
Expand Down
77 changes: 77 additions & 0 deletions zanata-war/src/main/java/org/zanata/adapter/HTMLAdapter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright 2013, Red Hat, Inc. and individual contributors
* as indicated by the @author tags. See the copyright.txt file in the
* distribution for a full listing of individual contributors.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.zanata.adapter;

import java.io.IOException;
import java.net.URL;

import net.sf.okapi.common.IParameters;
import net.sf.okapi.filters.html.HtmlFilter;

import com.google.common.base.Charsets;
import com.google.common.io.Resources;

/**
 * Adapter to handle HTML documents.
 * This adapter outputs HTML files in ASCII encoding by default to force the use
 * of CERs.
 * It uses the Okapi's {@link net.sf.okapi.filters.html.HtmlFilter} class, and
 * specifically its escapeCharacters configuration parameter to make sure all
 * HTML entities get encoded.
 * <p>
 * The default filter configuration is read once, when the class is
 * initialised, from a classpath resource resolved relative to this class.
 */
public class HTMLAdapter extends OkapiFilterAdapter {

    /** Name of the classpath resource holding the default filter configuration. */
    private static final String DEFAULT_CONFIG_RESOURCE =
            "HTMLAdapterDefaultConfiguration.yml";

    /** Contents of the default configuration resource, decoded as UTF-8. */
    private static final String DEFAULT_CONFIG = loadDefaultConfig();

    /**
     * Reads the default configuration resource into a string.
     *
     * @return the resource contents decoded as UTF-8
     * @throws IllegalStateException
     *             if the resource cannot be found on the classpath
     * @throws RuntimeException
     *             if the resource is found but cannot be read
     */
    private static String loadDefaultConfig() {
        URL configURL =
                HTMLAdapter.class.getResource(DEFAULT_CONFIG_RESOURCE);
        // Class.getResource returns null for a missing resource; fail with a
        // message that names the resource instead of a bare null-pointer error.
        if (configURL == null) {
            throw new IllegalStateException(
                    "Default config for HTML adapter not found on classpath: "
                            + DEFAULT_CONFIG_RESOURCE);
        }
        try {
            return Resources.toString(configURL, Charsets.UTF_8);
        } catch (IOException e) {
            throw new RuntimeException(
                    "Failed to load default config for HTML adapter.", e);
        }
    }

    /**
     * Creates an adapter backed by a new {@link HtmlFilter}, using content
     * hashes as the id source.
     */
    public HTMLAdapter() {
        // NOTE(review): the meaning of the two boolean flags is defined by
        // OkapiFilterAdapter, which is not visible here - confirm there.
        super(prepareFilter(), IdSource.contentHash, true, true);
    }

    /**
     * @return a new, unconfigured HTML filter
     */
    private static HtmlFilter prepareFilter() {
        return new HtmlFilter();
    }

    @Override
    protected String getOutputEncoding() {
        // Using ASCII encoding for HTML to force the output of CERs
        return "ascii";
    }

    @Override
    protected void updateParamsWithDefaults(IParameters params) {
        // IParameters has setter methods, but they break the contract in the
        // implementation for HtmlFilter and don't do anything. Have to set all
        // configuration at once rather than change individual settings.
        params.fromString(DEFAULT_CONFIG);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
* @author Sean Flanigan <a
* href="mailto:sflaniga@redhat.com">sflaniga@redhat.com</a>
*/
public class IDMLAdapter extends GenericOkapiFilterAdapter {
public class IDMLAdapter extends OkapiFilterAdapter {
public IDMLAdapter() {
super(prepareFilter(), IdSource.contentHash, true);
}
Expand Down