Skip to content
This repository has been archived by the owner on Nov 9, 2017. It is now read-only.

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
Patrick Huang committed Jun 4, 2015
1 parent 6e027e0 commit 0ab8655
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 82 deletions.
206 changes: 133 additions & 73 deletions zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java
Expand Up @@ -32,16 +32,16 @@
*/
@Slf4j
public class TransMemoryMatcher {
private final SourceHTMLParser upcomingSourceParser;
private static final String EMPTY_STRING = "";
private final HTMLParser tmSourceParser;
private final Map<TextTokenKey, String> tmTokensMap;
private final Map<TextTokenKey, String> upcomingTokensMap;
private final String upcomingSourceContent;
private final HTMLParser upcomingSourceParser;

public TransMemoryMatcher(HTextFlow upcomingSource,
HTextFlow transMemory, HLocale targetLocale) {
// TODO pahuang work on plural
upcomingSourceContent = upcomingSource.getContents().get(0);
String upcomingSourceContent = upcomingSource.getContents().get(0);
String tmSource = transMemory.getContents().get(0);
String tmTarget = transMemory.getTargets().get(targetLocale.getId()).getContents().get(
0);
Expand All @@ -56,17 +56,67 @@ public TransMemoryMatcher(HTextFlow upcomingSource,
tmSourceParser = new SourceHTMLParser(tmMap, tmSource);
HTMLParser tmTargetParser = new TargetHTMLParser(tmMap, tmTarget);

log.debug("=== about to parse TM source ===");
tmSourceParser.parse();
log.debug("=== about to parse TM target ===");
tmTargetParser.parse();

Map<ParentNodes, Map.Entry<String, String>> upcomingSourceMap = Maps.newHashMap();

log.debug("=== about to parse upcoming source ===");
upcomingSourceParser =
new SourceHTMLParser(upcomingSourceMap, upcomingSourceContent);
new HTMLParser(
upcomingSourceMap,
upcomingSourceContent) {

@Override
protected void doWithTextNode(
ParentNodes parentNodes,
TextNode textNode) {
String sourceTokenText = textNode.getWholeText();
String translation = EMPTY_STRING;
// text node text will get updated if it can find a
// match in TM tokens
textNode.text(translation);
if (parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) {
Map.Entry<String, String> previousEntry =
parentNodesToSourceTargetEntryMap.get(parentNodes);
// more than one text token share the same parent nodes, we combine the text together
String combinedText = previousEntry.getKey() +
sourceTokenText;
Map.Entry<String, String> newEntry =
makeMapEntry(combinedText, translation);
parentNodesToSourceTargetEntryMap.put(parentNodes, newEntry);
TextTokenKey key =
new TextTokenKey(parentNodes.size(),
combinedText);
if (tmTokensMap.containsKey(key)) {
translation = tmTokensMap.get(key);
textNode.text(translation);
newEntry.setValue(translation);
}

} else {
Map.Entry<String, String>
sourceToTarget =
makeMapEntry(sourceTokenText, translation);
parentNodesToSourceTargetEntryMap
.put(parentNodes, sourceToTarget);
TextTokenKey key =
new TextTokenKey(parentNodes.size(),
sourceTokenText);
if (tmTokensMap.containsKey(key)) {
translation = tmTokensMap.get(key);
textNode.text(translation);
sourceToTarget.setValue(translation);
}

}
}
};
upcomingSourceParser.parse();

tmTokensMap = toMatchableTextTokensMap(
tmMap);
tmTokensMap = toMatchableTextTokensMap(tmMap);
upcomingTokensMap = toMatchableTextTokensMap(upcomingSourceMap);
}

Expand Down Expand Up @@ -121,7 +171,8 @@ private static Map<TextTokenKey, String> toMatchableTextTokensMap(
Map.Entry<String, String> sourceToTarget = entry.getValue();
String textFlowSource = sourceToTarget
.getKey();
String textFlowTarget = sourceToTarget.getValue() == null ? textFlowSource : sourceToTarget.getValue();
String textFlowTarget = sourceToTarget.getValue() == null ?
EMPTY_STRING : sourceToTarget.getValue();
tmMapBuilder.put(
new TextTokenKey(parentNodesForTextNode.size(),
textFlowSource), textFlowTarget);
Expand All @@ -130,26 +181,7 @@ private static Map<TextTokenKey, String> toMatchableTextTokensMap(
}

public String translationFromTransMemory() {
HTMLParser upcomingSourceTargetParser =
new HTMLParser(this.upcomingSourceParser.parentNodesToSourceTargetEntryMap,
upcomingSourceContent) {

@Override
protected void doWithTextNode(
ParentNodes parentNodes,
TextNode textNode) {
TextTokenKey textTokenKey =
new TextTokenKey(parentNodes.size(), textNode.getWholeText());
if (tmTokensMap.containsKey(textTokenKey) &&
parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) {
parentNodesToSourceTargetEntryMap.get(parentNodes)
.setValue(tmTokensMap.get(textTokenKey));
}
}
};
upcomingSourceTargetParser.parse();

String translationBuildFromTM = upcomingSourceTargetParser.doc.body().html();
String translationBuildFromTM = upcomingSourceParser.doc.body().html();
log.debug("Translation build from given TM is {}", translationBuildFromTM);
return translationBuildFromTM;
}
Expand Down Expand Up @@ -177,16 +209,30 @@ public int size() {
return parentNodes.size();
}

/**
 * Compares this object with another {@code ParentNodes} by their lists of
 * parent nodes.
 * <p>
 * Source tokens may only be joined together when they sit under exactly the
 * same parent nodes, not merely under parents with the same tags, which is
 * why the node list (and not the tag list) is compared here. Looking up
 * matching translation tokens uses a different algorithm.
 * <p>
 * NOTE(review): whether this amounts to an identity comparison depends on
 * how the element type implements {@code equals} - confirm.
 *
 * @param o other object to compare with
 * @return true if {@code o} is a ParentNodes of the same class whose parent
 *         node list equals this one's
 */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    boolean sameType = o != null && getClass() == o.getClass();
    if (!sameType) {
        return false;
    }
    ParentNodes other = (ParentNodes) o;
    return Objects.equals(this.parentNodes, other.parentNodes);
}

@Override
public int hashCode() {
ensureParentTags();
return Objects.equals(parentTags, that.parentTags);
return Objects.hash(parentNodes);
}

public void ensureParentTags() {
private void ensureParentTags() {
if (parentTags == null) {
parentTags = Lists
.transform(parentNodes, new Function<Element, Tag>() {
Expand All @@ -199,14 +245,14 @@ public Tag apply(Element input) {
}
}

@Override
public int hashCode() {
List<Tag> parentTags() {
ensureParentTags();
return Objects.hash(parentTags);
return parentTags;
}

@Override
public String toString() {
ensureParentTags();
return MoreObjects.toStringHelper(this)
.add("parentTags", parentTags)
.toString();
Expand All @@ -225,27 +271,6 @@ public String toString() {
}
}

private static class SourceTokensToTargetTokens {
private final List<Map.Entry<String, String>> tokens;

private SourceTokensToTargetTokens(
List<Map.Entry<String, String>> tokens) {
this.tokens = ImmutableList.copyOf(tokens);
}

private SourceTokensToTargetTokens(List<Map.Entry<String, String>> tokens, String sourceToken, String targetToken) {
this.tokens = ImmutableList
.<Map.Entry<String, String>> builder()
.addAll(tokens)
.add(new AbstractMap.SimpleEntry<>(sourceToken,
targetToken)).build();
}

SourceTokensToTargetTokens add(String sourceToken, String targetToken) {
return new SourceTokensToTargetTokens(tokens, sourceToken, targetToken);
}
}

private abstract static class HTMLParser {
private static final Document.OutputSettings OUTPUT_SETTINGS =
new Document.OutputSettings()
Expand Down Expand Up @@ -310,20 +335,37 @@ public SourceHTMLParser(
@Override
protected void doWithTextNode(ParentNodes parentNodes,
TextNode textNode) {
AbstractMap.SimpleEntry<String, String>
sourceToTarget =
new AbstractMap.SimpleEntry<>(textNode.getWholeText(),
null);
String sourceTokenText = textNode.getWholeText();

if (parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) {
Map.Entry<String, String> previousEntry =
parentNodesToSourceTargetEntryMap.get(parentNodes);

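// More than one text token shares the same parent nodes: the new token
// is appended to the text collected so far and only the combined text
// is stored for these parent nodes.
// TODO: individual tokens are not stored yet, so text tokens that get
// swapped around in translation are not handled.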
Map.Entry<String, String>
newEntry = makeMapEntry(previousEntry.getKey() +
sourceTokenText, EMPTY_STRING);
parentNodesToSourceTargetEntryMap.put(parentNodes, newEntry);
log.debug("appending to parent nodes: {}, old token(s) [{}], new token [{}]",
parentNodes, previousEntry.getKey(), sourceTokenText);

} else {
Map.Entry<String, String>
sourceToTarget =
makeMapEntry(sourceTokenText, EMPTY_STRING);
parentNodesToSourceTargetEntryMap
.put(parentNodes, sourceToTarget);
log.debug("putting to parent nodes: {}, token [{}]",
parentNodes, sourceTokenText);
}
parentNodesToSourceTargetEntryMap
.put(parentNodes, sourceToTarget);
}
}

/**
 * Creates a map entry for the given key and value. The entry is a
 * {@link AbstractMap.SimpleEntry}, so its value can be replaced later via
 * {@code setValue}.
 *
 * @param key entry key
 * @param value initial entry value
 * @return a new entry holding key and value
 */
private static <K, V> Map.Entry<K, V> makeMapEntry(K key, V value) {
    Map.Entry<K, V> entry = new AbstractMap.SimpleEntry<K, V>(key, value);
    return entry;
}

private static class TargetHTMLParser extends HTMLParser {

public TargetHTMLParser(
Expand All @@ -335,20 +377,38 @@ public TargetHTMLParser(
@Override
protected void doWithTextNode(ParentNodes parentNodes,
TextNode textNode) {
Map.Entry<String, String> sourceToTarget =
parentNodesToSourceTargetEntryMap.get(parentNodes);
String wholeText = textNode.getWholeText();
if (sourceToTarget == null) {
log.warn("Can not match translation text token [{}] in source using parent nodes:{}", wholeText,
parentNodes);

throw new IllegalStateException(
"can not match translation text token [" +
wholeText
+ "] in source using parent nodes:"
+ parentNodes);
String targetTokenText = textNode.getWholeText();
Map.Entry<String, String> matchingSourceToken =
findMatchingSourceToken(parentNodes, targetTokenText);
// we may have other translation tokens under same parent nodes
String previousTrans = matchingSourceToken.getValue();
matchingSourceToken.setValue(previousTrans + targetTokenText);
log.debug(
"putting to source token [{}] as translation: existing trans [{}], current trans [{}]",
matchingSourceToken.getKey(), previousTrans,
targetTokenText);
}

private Map.Entry<String, String> findMatchingSourceToken(
ParentNodes parentNodes, String targetTokenText) {
// because we build a new ParentNodes for target, the identity of
// parent nodes will NOT be the same but the tag names will be. We
// have to use tag names to look up matching source tokens
for (Map.Entry<ParentNodes, Map.Entry<String, String>> entry : parentNodesToSourceTargetEntryMap
.entrySet()) {
ParentNodes key = entry.getKey();
if (key.parentTags().equals(parentNodes.parentTags())) {
return entry.getValue();
}
}
sourceToTarget.setValue(wholeText);
log.warn("Can not match translation text token [{}] in source using parent nodes:{}", targetTokenText,
parentNodes);

throw new IllegalStateException(
"can not match translation text token [" +
targetTokenText
+ "] in source using parent nodes:"
+ parentNodes);
}
}
}
Expand Up @@ -15,6 +15,7 @@
import org.zanata.model.HTextFlow;
import org.zanata.model.HTextFlowTarget;
import org.zanata.model.TestFixture;

import com.google.common.base.Charsets;

/**
Expand All @@ -24,28 +25,63 @@
public class TransMemoryMatcherTest {
private HDocument document = null;
private String resId = "abc";
private String transMemorySource =
"Do you know <div><some>you</some> will <strong>never</strong></div> walk alone?<p><i>Yes</i>, I do.</p>";
private String transMemoryTarget =
"<div><some>你</some><strong>永远不会</strong></div>一个人走你知道吗?<p> <i>是</i>, 我知道.";
private HTextFlow transMemory = new HTextFlow(document, resId,
transMemorySource);
private HTextFlow transMemory;
private HLocale targetLocale = TestFixture.setId(1L, new HLocale(new LocaleId("zh")));

/**
 * Resets the trans memory fixture to null before each test.
 */
@Before
public void setUp() {
    this.transMemory = null;
}

public void givenTransMemory(String sourceContent, String targetContent) {
transMemory = new HTextFlow(document, resId,
sourceContent);
HTextFlowTarget transMemoryTranslation =
new HTextFlowTarget(transMemory, targetLocale);
transMemoryTranslation.setContent0(transMemoryTarget);
transMemoryTranslation.setContent0(targetContent);
transMemory.getTargets().put(targetLocale.getId(),
transMemoryTranslation);
}

/**
 * Checks that an upcoming source with the same structure as the trans
 * memory source, but different tag names, is reported as a 100% match and
 * that the translation built from the trans memory keeps the upcoming
 * source's tags.
 */
@Test
public void canMatchSameStructureButDifferentTags() {
    // Given:
    givenTransMemory(
            "Do you know <div><some>you</some> will <strong>never</strong></div> walk alone? <p><i>Yes</i>, I do.</p>",
            "DO YOU KNOW <div><some>YOU</some> WILL <strong>NEVER</strong></div> WALK ALONE? <p><i>YES</i>, I DO.</p>");
    String upcomingSource =
            "Do you know <span><other>you</other> will <bold>never</bold></span> walk alone? <para><o>Yes</o>, I do.</para>";
    HTextFlow upcomingMessage =
            new HTextFlow(document, resId, upcomingSource);
    TransMemoryMatcher matcher =
            new TransMemoryMatcher(upcomingMessage, transMemory,
                    targetLocale);

    // When:
    double similarityPercent = matcher.calculateSimilarityPercent();

    // Then:
    // The description has to be set before the assertion is invoked;
    // set afterwards it would not show up in a failure message.
    Assertions.assertThat(similarityPercent)
            .as("same structure but different tags can be matched as 100%")
            .isEqualTo(100);

    // When:
    String translation = matcher.translationFromTransMemory();

    // Then:
    Assertions
            .assertThat(translation)
            .as("will replace translation from TM with correct tags")
            .isEqualTo(
                    "DO YOU KNOW <span><other>YOU</other> WILL <bold>NEVER</bold></span> WALK ALONE? <para><o>YES</o>, I DO.</para>");
}

@Test
public void canMatchSameStructureButDifferentTagsPlusTranslationSwappedLocation() {
// Given:
String upcomingSource =
"Do you know <span><other>you</other> will <bold>never</bold></span> walk alone?<para><o>Yes</o>, I do.</para>";
"Do you know <span><other>you</other> will <bold>never</bold></span> walk alone? <para><o>Yes</o>, I do.</para>";
HTextFlow upcomingMessage =
new HTextFlow(document, resId, upcomingSource);
TransMemoryMatcher matcher =
Expand All @@ -68,7 +104,7 @@ public void canMatchSameStructureButDifferentTags() {
Assertions
.assertThat(translation)
.isEqualTo(
"<span><other>你<bold>永远不会</bold></span>一个人走你知道吗?<br> <o>是</o>, 我知道.")
"<span><other>你</other><bold>永远不会</bold></span>一个人走你知道吗? <para><o>是</o>, 我知道.</para>")
.as("will replace translation from TM with correct tags");
}

Expand Down

0 comments on commit 0ab8655

Please sign in to comment.