From 0ab865548249e3ec0377cc4fcd5aea430f3349f8 Mon Sep 17 00:00:00 2001 From: Patrick Huang Date: Tue, 2 Jun 2015 15:23:33 +1000 Subject: [PATCH] WIP --- .../org/zanata/search/TransMemoryMatcher.java | 206 +++++++++++------- .../zanata/search/TransMemoryMatcherTest.java | 54 ++++- 2 files changed, 178 insertions(+), 82 deletions(-) diff --git a/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java b/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java index b5a07b2f16..64875e72bb 100644 --- a/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java +++ b/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java @@ -32,16 +32,16 @@ */ @Slf4j public class TransMemoryMatcher { - private final SourceHTMLParser upcomingSourceParser; + private static final String EMPTY_STRING = ""; private final HTMLParser tmSourceParser; private final Map tmTokensMap; private final Map upcomingTokensMap; - private final String upcomingSourceContent; + private final HTMLParser upcomingSourceParser; public TransMemoryMatcher(HTextFlow upcomingSource, HTextFlow transMemory, HLocale targetLocale) { // TODO pahuang work on plural - upcomingSourceContent = upcomingSource.getContents().get(0); + String upcomingSourceContent = upcomingSource.getContents().get(0); String tmSource = transMemory.getContents().get(0); String tmTarget = transMemory.getTargets().get(targetLocale.getId()).getContents().get( 0); @@ -56,17 +56,67 @@ public TransMemoryMatcher(HTextFlow upcomingSource, tmSourceParser = new SourceHTMLParser(tmMap, tmSource); HTMLParser tmTargetParser = new TargetHTMLParser(tmMap, tmTarget); + log.debug("=== about to parse TM source ==="); tmSourceParser.parse(); + log.debug("=== about to parse TM target ==="); tmTargetParser.parse(); Map> upcomingSourceMap = Maps.newHashMap(); + log.debug("=== about to parse upcoming source ==="); upcomingSourceParser = - new SourceHTMLParser(upcomingSourceMap, upcomingSourceContent); + new HTMLParser( + upcomingSourceMap, + upcomingSourceContent) { + + @Override + protected void doWithTextNode( + ParentNodes parentNodes, + TextNode textNode) { + String sourceTokenText = textNode.getWholeText(); + String translation = EMPTY_STRING; + // text node text will get updated if it can find a + // match in TM tokens + textNode.text(translation); + if (parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) { + Map.Entry previousEntry = + parentNodesToSourceTargetEntryMap.get(parentNodes); + // more than one text token share the same parent nodes, we combine the text together + String combinedText = previousEntry.getKey() + + sourceTokenText; + Map.Entry newEntry = + makeMapEntry(combinedText, translation); + parentNodesToSourceTargetEntryMap.put(parentNodes, newEntry); + TextTokenKey key = + new TextTokenKey(parentNodes.size(), + combinedText); + if (tmTokensMap.containsKey(key)) { + translation = tmTokensMap.get(key); + textNode.text(translation); + newEntry.setValue(translation); + } + + } else { + Map.Entry + sourceToTarget = + makeMapEntry(sourceTokenText, translation); + parentNodesToSourceTargetEntryMap + .put(parentNodes, sourceToTarget); + TextTokenKey key = + new TextTokenKey(parentNodes.size(), + sourceTokenText); + if (tmTokensMap.containsKey(key)) { + translation = tmTokensMap.get(key); + textNode.text(translation); + sourceToTarget.setValue(translation); + } + + } + } + }; upcomingSourceParser.parse(); - tmTokensMap = toMatchableTextTokensMap( - tmMap); + tmTokensMap = toMatchableTextTokensMap(tmMap); upcomingTokensMap = toMatchableTextTokensMap(upcomingSourceMap); } @@ -121,7 +171,8 @@ private static Map toMatchableTextTokensMap( Map.Entry sourceToTarget = entry.getValue(); String textFlowSource = sourceToTarget .getKey(); - String textFlowTarget = sourceToTarget.getValue() == null ? textFlowSource : sourceToTarget.getValue(); + String textFlowTarget = sourceToTarget.getValue() == null ? + EMPTY_STRING : sourceToTarget.getValue(); tmMapBuilder.put( new TextTokenKey(parentNodesForTextNode.size(), textFlowSource), textFlowTarget); @@ -130,26 +181,7 @@ private static Map toMatchableTextTokensMap( } public String translationFromTransMemory() { - HTMLParser upcomingSourceTargetParser = - new HTMLParser(this.upcomingSourceParser.parentNodesToSourceTargetEntryMap, - upcomingSourceContent) { - - @Override - protected void doWithTextNode( - ParentNodes parentNodes, - TextNode textNode) { - TextTokenKey textTokenKey = - new TextTokenKey(parentNodes.size(), textNode.getWholeText()); - if (tmTokensMap.containsKey(textTokenKey) && - parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) { - parentNodesToSourceTargetEntryMap.get(parentNodes) - .setValue(tmTokensMap.get(textTokenKey)); - } - } - }; - upcomingSourceTargetParser.parse(); - - String translationBuildFromTM = upcomingSourceTargetParser.doc.body().html(); + String translationBuildFromTM = upcomingSourceParser.doc.body().html(); log.debug("Translation build from given TM is {}", translationBuildFromTM); return translationBuildFromTM; } @@ -177,16 +209,30 @@ public int size() { return parentNodes.size(); } + /** + * Checking whether the parent nodes are identical. When we try to join + * together source tokens, we need to make sure they share exactly the + * same nodes not just tags as parents. When we do look up for matching + * translation tokens, we use different algorithm. + * + * @param o other ParentNodes object + * @return true if parent nodes list are identical in identity + */ @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; ParentNodes that = (ParentNodes) o; + return Objects.equals(parentNodes, that.parentNodes); + } + + @Override + public int hashCode() { ensureParentTags(); - return Objects.equals(parentTags, that.parentTags); + return Objects.hash(parentNodes); } - public void ensureParentTags() { + private void ensureParentTags() { if (parentTags == null) { parentTags = Lists .transform(parentNodes, new Function() { @@ -199,14 +245,14 @@ public Tag apply(Element input) { } } - @Override - public int hashCode() { + List parentTags() { ensureParentTags(); - return Objects.hash(parentTags); + return parentTags; } @Override public String toString() { + ensureParentTags(); return MoreObjects.toStringHelper(this) .add("parentTags", parentTags) .toString(); @@ -225,27 +271,6 @@ public String toString() { } } - private static class SourceTokensToTargetTokens { - private final List> tokens; - - private SourceTokensToTargetTokens( - List> tokens) { - this.tokens = ImmutableList.copyOf(tokens); - } - - private SourceTokensToTargetTokens(List> tokens, String sourceToken, String targetToken) { - this.tokens = ImmutableList - .> builder() - .addAll(tokens) - .add(new AbstractMap.SimpleEntry<>(sourceToken, - targetToken)).build(); - } - - SourceTokensToTargetTokens add(String sourceToken, String targetToken) { - return new SourceTokensToTargetTokens(tokens, sourceToken, targetToken); - } - } - private abstract static class HTMLParser { private static final Document.OutputSettings OUTPUT_SETTINGS = new Document.OutputSettings() @@ -310,20 +335,37 @@ public SourceHTMLParser( @Override protected void doWithTextNode(ParentNodes parentNodes, TextNode textNode) { - AbstractMap.SimpleEntry - sourceToTarget = - new AbstractMap.SimpleEntry<>(textNode.getWholeText(), - null); + String sourceTokenText = textNode.getWholeText(); + if (parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) { Map.Entry previousEntry = parentNodesToSourceTargetEntryMap.get(parentNodes); - + // more than one text token share the same parent nodes, we will + // store individual token as well as combined tokens in case + // text tokens get swapped around in translation + Map.Entry + newEntry = makeMapEntry(previousEntry.getKey() + + sourceTokenText, EMPTY_STRING); + parentNodesToSourceTargetEntryMap.put(parentNodes, newEntry); + log.debug("appending to parent nodes: {}, old token(s) [{}], new token [{}]", + parentNodes, previousEntry.getKey(), sourceTokenText); + + } else { + Map.Entry + sourceToTarget = + makeMapEntry(sourceTokenText, EMPTY_STRING); + parentNodesToSourceTargetEntryMap + .put(parentNodes, sourceToTarget); + log.debug("putting to parent nodes: {}, token [{}]", + parentNodes, sourceTokenText); } - parentNodesToSourceTargetEntryMap - .put(parentNodes, sourceToTarget); } } + private static Map.Entry makeMapEntry(K key, V value) { + return new AbstractMap.SimpleEntry<>(key, value); + } + private static class TargetHTMLParser extends HTMLParser { public TargetHTMLParser( @@ -335,20 +377,38 @@ public TargetHTMLParser( @Override protected void doWithTextNode(ParentNodes parentNodes, TextNode textNode) { - Map.Entry sourceToTarget = - parentNodesToSourceTargetEntryMap.get(parentNodes); - String wholeText = textNode.getWholeText(); - if (sourceToTarget == null) { - log.warn("Can not match translation text token [{}] in source using parent nodes:{}", wholeText, - parentNodes); - - throw new IllegalStateException( - "can not match translation text token [" + - wholeText - + "] in source using parent nodes:" - + parentNodes); + String targetTokenText = textNode.getWholeText(); + Map.Entry matchingSourceToken = + findMatchingSourceToken(parentNodes, targetTokenText); + // we may have other translation tokens under same parent nodes + String previousTrans = matchingSourceToken.getValue(); + matchingSourceToken.setValue(previousTrans + targetTokenText); + log.debug( + "putting to source token [{}] as translation: existing trans [{}], current trans [{}]", + matchingSourceToken.getKey(), previousTrans, + targetTokenText); + } + + private Map.Entry findMatchingSourceToken( + ParentNodes parentNodes, String targetTokenText) { + // because we build a new ParentNodes for target, the identity of + // parent nodes will NOT be the same but the tag names will be. We + // have to use tag names to look up matching source tokens + for (Map.Entry> entry : parentNodesToSourceTargetEntryMap + .entrySet()) { + ParentNodes key = entry.getKey(); + if (key.parentTags().equals(parentNodes.parentTags())) { + return entry.getValue(); + } } - sourceToTarget.setValue(wholeText); + log.warn("Can not match translation text token [{}] in source using parent nodes:{}", targetTokenText, + parentNodes); + + throw new IllegalStateException( + "can not match translation text token [" + + targetTokenText + + "] in source using parent nodes:" + + parentNodes); } } } diff --git a/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java b/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java index de7c169e18..89b470cb1a 100644 --- a/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java +++ b/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java @@ -15,6 +15,7 @@ import org.zanata.model.HTextFlow; import org.zanata.model.HTextFlowTarget; import org.zanata.model.TestFixture; + import com.google.common.base.Charsets; /** @@ -24,28 +25,63 @@ public class TransMemoryMatcherTest { private HDocument document = null; private String resId = "abc"; - private String transMemorySource = - "Do you know
you will never
walk alone?

Yes, I do.

"; - private String transMemoryTarget = - "
永远不会
一个人走你知道吗?

, 我知道."; - private HTextFlow transMemory = new HTextFlow(document, resId, - transMemorySource); + private HTextFlow transMemory; private HLocale targetLocale = TestFixture.setId(1L, new HLocale(new LocaleId("zh"))); @Before public void setUp() { + transMemory = null; + } + + public void givenTransMemory(String sourceContent, String targetContent) { + transMemory = new HTextFlow(document, resId, + sourceContent); HTextFlowTarget transMemoryTranslation = new HTextFlowTarget(transMemory, targetLocale); - transMemoryTranslation.setContent0(transMemoryTarget); + transMemoryTranslation.setContent0(targetContent); transMemory.getTargets().put(targetLocale.getId(), transMemoryTranslation); } @Test public void canMatchSameStructureButDifferentTags() { + // Given: + givenTransMemory( + "Do you know

you will never
walk alone?

Yes, I do.

", + "DO YOU KNOW
YOU WILL NEVER
WALK ALONE?

YES, I DO.

"); + String upcomingSource = + "Do you know you will never walk alone? Yes, I do."; + HTextFlow upcomingMessage = + new HTextFlow(document, resId, upcomingSource); + TransMemoryMatcher matcher = + new TransMemoryMatcher(upcomingMessage, transMemory, + targetLocale); + + + // When: + double similarityPercent = matcher.calculateSimilarityPercent(); + + // Then: + Assertions.assertThat(similarityPercent) + .isEqualTo(100) + .as("same structure but different tags can be matched as 100%"); + + // When: + String translation = matcher.translationFromTransMemory(); + + // Then: + Assertions + .assertThat(translation) + .isEqualTo( + "DO YOU KNOW YOU WILL NEVER WALK ALONE? YES, I DO.") + .as("will replace translation from TM with correct tags"); + } + + @Test + public void canMatchSameStructureButDifferentTagsPlusTranslationSwappedLocation() { // Given: String upcomingSource = - "Do you know you will never walk alone?Yes, I do."; + "Do you know you will never walk alone? Yes, I do."; HTextFlow upcomingMessage = new HTextFlow(document, resId, upcomingSource); TransMemoryMatcher matcher = @@ -68,7 +104,7 @@ public void canMatchSameStructureButDifferentTags() { Assertions .assertThat(translation) .isEqualTo( - "永远不会一个人走你知道吗?
, 我知道.") + "永远不会一个人走你知道吗? , 我知道.") .as("will replace translation from TM with correct tags"); }