diff --git a/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java b/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java index acaa205727..ccbd3c6d54 100644 --- a/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java +++ b/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java @@ -1,8 +1,10 @@ package org.zanata.search; +import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.Objects; import javax.annotation.Nullable; @@ -20,12 +22,14 @@ import org.zanata.model.HLocale; import org.zanata.model.HTextFlow; +import com.beust.jcommander.internal.Maps; import com.google.common.base.Charsets; import com.google.common.base.Function; import com.google.common.base.MoreObjects; import com.google.common.base.Optional; +import com.google.common.base.Predicate; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterators; +import com.google.common.collect.Iterables; import com.google.common.collect.Lists; /** @@ -41,7 +45,7 @@ public class TransMemoryMatcher { private boolean canFullyReuseTM = false; private final Element transMemorySourceRootElement; private final Document upcomingSourceDoc; - private LinkedList upcomingSourceTokens; + private LinkedList upcomingSourceTokens = Lists.newLinkedList(); public TransMemoryMatcher(HTextFlow upcomingSource, HTextFlow transMemory, HLocale targetLocale) { @@ -65,10 +69,10 @@ public TransMemoryMatcher(HTextFlow upcomingSource, if (!transMemoryParser.canFullyMatch) { canFullyReuseTM = false; - return; + // TODO pahuang if we can not fully reuse TM we should quit here and +// return; } log.debug("=== about to parse upcoming source ==="); - upcomingSourceTokens = Lists.newLinkedList(); List upcomingNodes = upcomingSourceDoc.body() .childNodes(); depthFirstTraverseSourceNodes(upcomingNodes, new ParentNodes(), @@ -94,7 +98,7 @@ && matchText(nextSourceToken.optElementAfter, nextTransMemoryToken = nextOrNull(transMemoryTokensIt); } else { log.debug("can not find matching translation for [{}] in TM", nextSourceToken); - nextSourceToken.targetText = ""; + nextSourceToken.targetText = "!UnKnoWn!"; } nextSourceToken = nextOrNull(upcomingSourceTokensIt); } @@ -116,7 +120,7 @@ private static Optional getOptionalElementText(Optional optElem private static void logContext(HTextFlow upcomingSource, HTextFlow transMemory, HLocale targetLocale) { if (log.isDebugEnabled()) { - log.debug("about to match upcoming source: {} to TM: {} -> {}", + log.debug("about to match \nupcoming source: \n * {} to \nTM: \n * {} \n-> {}", upcomingSource.getContents(), transMemory.getContents(), transMemory.getTargets().get(targetLocale.getId()) .getContents()); @@ -130,7 +134,7 @@ private static void depthFirstTraverseSourceNodes(List sourceNodes, if (node instanceof TextNode) { TextNode textNode = (TextNode) node; TextTokenKey textTokenKey = - new TextTokenKey(textNode, parentNodes.parentTags(), + new TextTokenKey(textNode, parentNodes, getOptionalBeforeSiblingElement(textNode), getOptionalAfterSiblingElement(textNode), textNode.siblingIndex(), @@ -180,9 +184,11 @@ public double calculateSimilarityPercent() { public String translationFromTransMemory() { + // TODO pahuang check whether we can fully reuse TM for (TextTokenKey upcomingSourceToken : upcomingSourceTokens) { upcomingSourceToken.node.text(upcomingSourceToken.targetText); } + // TODO pahuang reshuffle tags under same parent nodes to match what's in TM target String translationBuildFromTM = upcomingSourceDoc.outputSettings(OUTPUT_SETTINGS).body().html(); log.debug("Translation build from given TM is {}", translationBuildFromTM); return translationBuildFromTM; @@ -190,16 +196,16 @@ public String translationFromTransMemory() { private static class ParentNodes { - private final List parentNodes; + private final List parentElements; private transient List parentTags; ParentNodes() { - this.parentNodes = ImmutableList.of(); + this.parentElements = ImmutableList.of(); } private ParentNodes(ParentNodes parentNodes, Element elementNode) { - this.parentNodes = - ImmutableList. builder().addAll(parentNodes.parentNodes) + this.parentElements = + ImmutableList. builder().addAll(parentNodes.parentElements) .add(elementNode).build(); } @@ -208,16 +214,17 @@ ParentNodes append(Element currentElementNode) { } public int size() { - return parentNodes.size(); + return parentElements.size(); } /** - * Checking whether the parent nodes are identical. When we try to join - * together source tokens, we need to make sure they share exactly the - * same nodes not just tags as parents. When we do look up for matching - * translation tokens, we use different algorithm. + * Checking whether the parent elements are identical. When we try to + * compare between source tokens, we need to make sure they share + * exactly the same elements not just tags. When we do look up for + * matching translation tokens, we use tags. * - * @param o other ParentNodes object + * @param o + * other ParentNodes object * @return true if parent nodes list are identical in identity */ @Override @@ -225,19 +232,19 @@ public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; ParentNodes that = (ParentNodes) o; - return Objects.equals(parentNodes, that.parentNodes); + return Objects.equals(parentElements, that.parentElements); } @Override public int hashCode() { ensureParentTags(); - return Objects.hash(parentNodes); + return Objects.hash(parentElements); } private void ensureParentTags() { if (parentTags == null) { parentTags = Lists - .transform(parentNodes, new Function() { + .transform(parentElements, new Function() { @Nullable @Override public Tag apply(Element input) { @@ -254,10 +261,7 @@ List parentTags() { @Override public String toString() { - ensureParentTags(); - return MoreObjects.toStringHelper(this) - .add("parentTags", parentTags) - .toString(); + return parentTags().toString(); } } @@ -265,7 +269,7 @@ public String toString() { @EqualsAndHashCode private static class TextTokenKey { private final TextNode node; - private final List parentTags; + private final ParentNodes parentNodes; private final Optional optElementBefore; private final Optional optElementAfter; private final int siblingIndex; @@ -274,20 +278,20 @@ private static class TextTokenKey { @Override public String toString() { - return "(" + parentTags + ")#" + siblingIndex +":" + sourceText; + return "(" + parentNodes + ")#" + siblingIndex +":" + sourceText; } } @RequiredArgsConstructor private static class TranslationToken { - private static final TranslationToken EMPTY = new TranslationToken(Optional.absent(), Optional.absent(), ""); + private final List parentTags; private final Optional optElementBefore; private final Optional optElementAfter; private final String targetText; @Override public String toString() { - return targetText; + return "(" + parentTags + ")" + targetText; } } @@ -296,7 +300,11 @@ private static class TransMemoryHTMLParser { private boolean canFullyMatch = true; private List sourceTextTokens = Lists.newLinkedList(); private List targetTextTokens = Lists.newLinkedList(); - private List leftOverTransTokens = Lists.newLinkedList(); + private List leftOverTransTokens = Collections.emptyList(); + private transient Map, List> parentTagsToTransTokens = + Maps.newHashMap(); + private transient Map lastSourceTextNodeForEachParentNodes = + Maps.newHashMap(); private TransMemoryHTMLParser( Element sourceRootElement, String targetContent) { @@ -307,66 +315,118 @@ private TransMemoryHTMLParser( private void parse(Document targetDoc) { List targetChildNodes = targetDoc.body().childNodes(); - depthFirstTraverseTargetNodes(targetChildNodes); + // TODO pahuang use jsoup built-in traverse method + depthFirstTraverseTargetNodes(targetChildNodes, new ParentNodes()); depthFirstTraverseSourceNodes(sourceRootElement.childNodes(), new ParentNodes(), sourceTextTokens); matchSourceTokensToTargetTokens(); } - private void depthFirstTraverseTargetNodes(List targetChildNodes) { + private void depthFirstTraverseTargetNodes(List targetChildNodes, + ParentNodes parentNodes) { for (Node node : targetChildNodes) { if (node instanceof TextNode) { TextNode textNode = (TextNode) node; TranslationToken translationToken = new TranslationToken( + parentNodes.parentTags(), getOptionalBeforeSiblingElement(textNode), getOptionalAfterSiblingElement(textNode), textNode.getWholeText()); + addToParentTagsMap(parentNodes.parentTags(), + translationToken); targetTextTokens.add(translationToken); } else { Element element = (Element) node; - depthFirstTraverseTargetNodes(element.childNodes()); + depthFirstTraverseTargetNodes(element.childNodes(), + parentNodes.append(element)); } } } + private void addToParentTagsMap( + List tags, TranslationToken translationToken) { + if (parentTagsToTransTokens.containsKey(tags)) { + parentTagsToTransTokens.get(tags).add(translationToken); + } else { + parentTagsToTransTokens.put(tags, Lists.newArrayList(translationToken)); + } + } + private void matchSourceTokensToTargetTokens() { - Iterator sourceTokensIt = sourceTextTokens.iterator(); - Iterator - targetTokensIt = targetTextTokens.iterator(); - TextTokenKey nextSourceToken = sourceTokensIt.next(); - TranslationToken nextTargetToken = targetTokensIt.next(); - - while (nextSourceToken != null) { - if (nextTargetToken != null - && matchTag(nextSourceToken.optElementBefore, - nextTargetToken.optElementBefore) - && matchTag(nextSourceToken.optElementAfter, - nextTargetToken.optElementAfter)) { - log.debug("found matching translation for [{}] -> [{}]", nextSourceToken, nextTargetToken); - nextSourceToken.targetText = nextTargetToken.targetText; - nextTargetToken = nextOrNull(targetTokensIt); + + for (TextTokenKey nextSourceToken : sourceTextTokens) { + lastSourceTextNodeForEachParentNodes.put( + nextSourceToken.parentNodes, nextSourceToken); + List transTokensUnderThisParentTags = + MoreObjects.firstNonNull(parentTagsToTransTokens + .get(nextSourceToken.parentNodes.parentTags()), + Collections. emptyList()); + log.debug("translation tokens under this parent tags [{}]: {}", + nextSourceToken.parentNodes, + transTokensUnderThisParentTags); + + // we match translation token to source token by checking their + // before and after element tag matches and they have same + // parent tags + Optional optMatchTransToken = + Iterables.tryFind(transTokensUnderThisParentTags, + new MatchTMSourceToTargetTextTokenPredicate( + nextSourceToken)); + if (optMatchTransToken.isPresent()) { + TranslationToken translationToken = + optMatchTransToken.get(); + log.debug("found matching translation for [{}] -> [{}]", nextSourceToken, translationToken); + nextSourceToken.targetText = translationToken.targetText; + // remove matched translation + transTokensUnderThisParentTags.remove(translationToken); } else { - log.debug("can not find matching translation for [{}]", nextSourceToken); + log.debug("can not find matching translation for [{}], assuming empty string.", nextSourceToken); nextSourceToken.targetText = ""; } - nextSourceToken = nextOrNull(sourceTokensIt); } - if (targetTokensIt.hasNext()) { - Iterators.addAll(leftOverTransTokens, targetTokensIt); + + leftOverTransTokens = + Lists.newLinkedList(Iterables.concat(parentTagsToTransTokens.values())); + if (leftOverTransTokens.size() > 0) { + // FIXME pahuang for left over trans tokens, assign/append it to the last source token under same parent tags + for (TranslationToken leftOverTransToken : leftOverTransTokens) { + + } log.info("target tokens left: {}", leftOverTransTokens); canFullyMatch = false; } } - private static Optional getTag(Optional elementOptional) { - return elementOptional.isPresent() ? Optional.of( - elementOptional.get().tag()) : Optional.absent(); - } + private static class MatchTMSourceToTargetTextTokenPredicate + implements Predicate { + private final TextTokenKey sourceToken; - private static boolean matchTag(Optional optSourceEle, Optional optTargetEle) { - return (getTag(optSourceEle).equals(getTag(optTargetEle))); - } + public MatchTMSourceToTargetTextTokenPredicate( + TextTokenKey sourceToken) { + this.sourceToken = sourceToken; + } + + @Override + public boolean apply(@Nullable TranslationToken transToken) { + return transToken != null + && + matchTag( + sourceToken.optElementBefore, + transToken.optElementBefore) + && matchTag( + sourceToken.optElementAfter, + transToken.optElementAfter); + } + + private static Optional getTag(Optional elementOptional) { + return elementOptional.isPresent() ? Optional.of( + elementOptional.get().tag()) : Optional.absent(); + } + private static boolean matchTag(Optional optSourceEle, Optional optTargetEle) { + return (getTag(optSourceEle).equals(getTag(optTargetEle))); + } + } } } diff --git a/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java b/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java index b1b80b7e07..d29dd0f0ba 100644 --- a/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java +++ b/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java @@ -27,6 +27,7 @@ public class TransMemoryMatcherTest { private String resId = "abc"; private HTextFlow transMemory; private HLocale targetLocale = TestFixture.setId(1L, new HLocale(new LocaleId("zh"))); + private TransMemoryMatcher matcher; @Before public void setUp() { @@ -51,11 +52,7 @@ public void canMatchSameStructureButDifferentTags() { "你知道吗
永远不会
一个人走?"); String upcomingSource = "Do you know you will never walk alone?"; - HTextFlow upcomingMessage = - new HTextFlow(document, resId, upcomingSource); - TransMemoryMatcher matcher = - new TransMemoryMatcher(upcomingMessage, transMemory, - targetLocale); + matcher = givenUpcomingSourceToMatch(upcomingSource); // When: @@ -77,6 +74,14 @@ public void canMatchSameStructureButDifferentTags() { .as("will replace translation from TM with correct tags"); } + public TransMemoryMatcher givenUpcomingSourceToMatch( + String upcomingSource) { + HTextFlow upcomingMessage = + new HTextFlow(document, resId, upcomingSource); + return new TransMemoryMatcher(upcomingMessage, transMemory, + targetLocale); + } + @Test public void canMatchSameStructureButDifferentTagsPlusTranslationSwappedLocation() { // Given: @@ -85,11 +90,8 @@ public void canMatchSameStructureButDifferentTagsPlusTranslationSwappedLocation( "
永远不会
一个人走你知道吗?"); String upcomingSource = "Do you know you will never walk alone?"; - HTextFlow upcomingMessage = - new HTextFlow(document, resId, upcomingSource); - TransMemoryMatcher matcher = - new TransMemoryMatcher(upcomingMessage, transMemory, - targetLocale); + matcher = + givenUpcomingSourceToMatch(upcomingSource); // When: @@ -111,6 +113,24 @@ public void canMatchSameStructureButDifferentTagsPlusTranslationSwappedLocation( .as("will replace translation from TM with correct tags"); } + @Test + public void moreTestCases() { + // Given: two text tokens are identical in source, e.g. "good" + givenTransMemory( + "How good are you? I am good.", + "吗? 我不错。"); + matcher = givenUpcomingSourceToMatch( + "How good are you? I am good."); + + // When: + String translation = matcher.translationFromTransMemory(); + + // Then: + Assertions.assertThat(translation) + .isEqualTo( + "吗? 我不错。"); + } + @Test public void test() { Document doc = Jsoup.parseBodyFragment(