WIP

zanata · Jun 4, 2015 · 0ab8655 · 0ab8655
1 parent 6e027e0
commit 0ab8655
Show file tree

Hide file tree

Showing 2 changed files with 178 additions and 82 deletions.
diff --git a/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java b/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java
@@ -32,16 +32,16 @@
  */
 @Slf4j
 public class TransMemoryMatcher {
-    private final SourceHTMLParser upcomingSourceParser;
+    private static final String EMPTY_STRING = "";
     private final HTMLParser tmSourceParser;
     private final Map<TextTokenKey, String> tmTokensMap;
     private final Map<TextTokenKey, String> upcomingTokensMap;
-    private final String upcomingSourceContent;
+    private final HTMLParser upcomingSourceParser;
 
     public TransMemoryMatcher(HTextFlow upcomingSource,
             HTextFlow transMemory, HLocale targetLocale) {
         // TODO pahuang work on plural
-        upcomingSourceContent = upcomingSource.getContents().get(0);
+        String upcomingSourceContent = upcomingSource.getContents().get(0);
         String tmSource = transMemory.getContents().get(0);
         String tmTarget = transMemory.getTargets().get(targetLocale.getId()).getContents().get(
                 0);
@@ -56,17 +56,67 @@ public TransMemoryMatcher(HTextFlow upcomingSource,
         tmSourceParser = new SourceHTMLParser(tmMap, tmSource);
         HTMLParser tmTargetParser = new TargetHTMLParser(tmMap, tmTarget);
 
+        log.debug("=== about to parse TM source ===");
         tmSourceParser.parse();
+        log.debug("=== about to parse TM target ===");
         tmTargetParser.parse();
 
         Map<ParentNodes, Map.Entry<String, String>> upcomingSourceMap = Maps.newHashMap();
 
+        log.debug("=== about to parse upcoming source ===");
         upcomingSourceParser =
-                new SourceHTMLParser(upcomingSourceMap, upcomingSourceContent);
+                new HTMLParser(
+                        upcomingSourceMap,
+                        upcomingSourceContent) {
+
+                    @Override
+                    protected void doWithTextNode(
+                            ParentNodes parentNodes,
+                            TextNode textNode) {
+                        String sourceTokenText = textNode.getWholeText();
+                        String translation = EMPTY_STRING;
+                        // text node text will get updated if it can find a
+                        // match in TM tokens
+                        textNode.text(translation);
+                        if (parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) {
+                            Map.Entry<String, String> previousEntry =
+                                    parentNodesToSourceTargetEntryMap.get(parentNodes);
+                            // when more than one text token shares the same parent nodes, we combine their text together
+                            String combinedText = previousEntry.getKey() +
+                                    sourceTokenText;
+                            Map.Entry<String, String> newEntry =
+                                    makeMapEntry(combinedText, translation);
+                            parentNodesToSourceTargetEntryMap.put(parentNodes, newEntry);
+                            TextTokenKey key =
+                                    new TextTokenKey(parentNodes.size(),
+                                            combinedText);
+                            if (tmTokensMap.containsKey(key)) {
+                                translation = tmTokensMap.get(key);
+                                textNode.text(translation);
+                                newEntry.setValue(translation);
+                            }
+
+                        } else {
+                            Map.Entry<String, String>
+                                    sourceToTarget =
+                                    makeMapEntry(sourceTokenText, translation);
+                            parentNodesToSourceTargetEntryMap
+                                    .put(parentNodes, sourceToTarget);
+                            TextTokenKey key =
+                                    new TextTokenKey(parentNodes.size(),
+                                            sourceTokenText);
+                            if (tmTokensMap.containsKey(key)) {
+                                translation = tmTokensMap.get(key);
+                                textNode.text(translation);
+                                sourceToTarget.setValue(translation);
+                            }
+
+                        }
+                    }
+                };
         upcomingSourceParser.parse();
 
-        tmTokensMap = toMatchableTextTokensMap(
-                tmMap);
+        tmTokensMap = toMatchableTextTokensMap(tmMap);
         upcomingTokensMap = toMatchableTextTokensMap(upcomingSourceMap);
     }
 
@@ -121,7 +171,8 @@ private static Map<TextTokenKey, String> toMatchableTextTokensMap(
             Map.Entry<String, String> sourceToTarget = entry.getValue();
             String textFlowSource = sourceToTarget
                     .getKey();
-            String textFlowTarget = sourceToTarget.getValue() == null ? textFlowSource : sourceToTarget.getValue();
+            String textFlowTarget = sourceToTarget.getValue() == null ?
+                    EMPTY_STRING : sourceToTarget.getValue();
             tmMapBuilder.put(
                     new TextTokenKey(parentNodesForTextNode.size(),
                             textFlowSource), textFlowTarget);
@@ -130,26 +181,7 @@ private static Map<TextTokenKey, String> toMatchableTextTokensMap(
     }
 
     public String translationFromTransMemory() {
-        HTMLParser upcomingSourceTargetParser =
-                new HTMLParser(this.upcomingSourceParser.parentNodesToSourceTargetEntryMap,
-                        upcomingSourceContent) {
-
-                    @Override
-                    protected void doWithTextNode(
-                            ParentNodes parentNodes,
-                            TextNode textNode) {
-                        TextTokenKey textTokenKey =
-                                new TextTokenKey(parentNodes.size(), textNode.getWholeText());
-                        if (tmTokensMap.containsKey(textTokenKey) &&
-                                parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) {
-                            parentNodesToSourceTargetEntryMap.get(parentNodes)
-                                    .setValue(tmTokensMap.get(textTokenKey));
-                        }
-                    }
-                };
-        upcomingSourceTargetParser.parse();
-
-        String translationBuildFromTM = upcomingSourceTargetParser.doc.body().html();
+        String translationBuildFromTM = upcomingSourceParser.doc.body().html();
         log.debug("Translation build from given TM is {}", translationBuildFromTM);
         return translationBuildFromTM;
     }
@@ -177,16 +209,30 @@ public int size() {
             return parentNodes.size();
         }
 
+        /**
+         * Checks whether the parent nodes are identical. When we join source
+         * tokens together, we need to make sure they share exactly the same
+         * nodes as parents, not just the same tags. When we look up matching
+         * translation tokens, we use a different algorithm.
+         *
+         * @param o other ParentNodes object
+         * @return true if the parent node lists are identical
+         */
         @Override
         public boolean equals(Object o) {
             if (this == o) return true;
             if (o == null || getClass() != o.getClass()) return false;
             ParentNodes that = (ParentNodes) o;
+            return Objects.equals(parentNodes, that.parentNodes);
+        }
+
+        @Override
+        public int hashCode() {
             ensureParentTags();
-            return Objects.equals(parentTags, that.parentTags);
+            return Objects.hash(parentNodes);
         }
 
-        public void ensureParentTags() {
+        private void ensureParentTags() {
             if (parentTags == null) {
                 parentTags = Lists
                         .transform(parentNodes, new Function<Element, Tag>() {
@@ -199,14 +245,14 @@ public Tag apply(Element input) {
             }
         }
 
-        @Override
-        public int hashCode() {
+        List<Tag> parentTags() {
             ensureParentTags();
-            return Objects.hash(parentTags);
+            return parentTags;
         }
 
         @Override
         public String toString() {
+            ensureParentTags();
             return MoreObjects.toStringHelper(this)
                     .add("parentTags", parentTags)
                     .toString();
@@ -225,27 +271,6 @@ public String toString() {
         }
     }
 
-    private static class SourceTokensToTargetTokens {
-        private final List<Map.Entry<String, String>> tokens;
-
-        private SourceTokensToTargetTokens(
-                List<Map.Entry<String, String>> tokens) {
-            this.tokens = ImmutableList.copyOf(tokens);
-        }
-
-        private SourceTokensToTargetTokens(List<Map.Entry<String, String>> tokens, String sourceToken, String targetToken) {
-            this.tokens = ImmutableList
-                    .<Map.Entry<String, String>> builder()
-                    .addAll(tokens)
-                    .add(new AbstractMap.SimpleEntry<>(sourceToken,
-                            targetToken)).build();
-        }
-
-        SourceTokensToTargetTokens add(String sourceToken, String targetToken) {
-            return new SourceTokensToTargetTokens(tokens, sourceToken, targetToken);
-        }
-    }
-
     private abstract static class HTMLParser {
         private static final Document.OutputSettings OUTPUT_SETTINGS =
                 new Document.OutputSettings()
@@ -310,20 +335,37 @@ public SourceHTMLParser(
         @Override
         protected void doWithTextNode(ParentNodes parentNodes,
                 TextNode textNode) {
-            AbstractMap.SimpleEntry<String, String>
-                    sourceToTarget =
-                    new AbstractMap.SimpleEntry<>(textNode.getWholeText(),
-                            null);
+            String sourceTokenText = textNode.getWholeText();
+
             if (parentNodesToSourceTargetEntryMap.containsKey(parentNodes)) {
                 Map.Entry<String, String> previousEntry =
                         parentNodesToSourceTargetEntryMap.get(parentNodes);
-
+                // when more than one text token shares the same parent nodes,
+                // we will store the individual tokens as well as the combined
+                // tokens in case text tokens get swapped around in translation
+                Map.Entry<String, String>
+                        newEntry = makeMapEntry(previousEntry.getKey() +
+                        sourceTokenText, EMPTY_STRING);
+                parentNodesToSourceTargetEntryMap.put(parentNodes, newEntry);
+                log.debug("appending to parent nodes: {}, old token(s) [{}], new token [{}]",
+                        parentNodes, previousEntry.getKey(), sourceTokenText);
+
+            } else {
+                Map.Entry<String, String>
+                        sourceToTarget =
+                        makeMapEntry(sourceTokenText, EMPTY_STRING);
+                parentNodesToSourceTargetEntryMap
+                        .put(parentNodes, sourceToTarget);
+                log.debug("putting to parent nodes: {}, token [{}]",
+                        parentNodes, sourceTokenText);
             }
-            parentNodesToSourceTargetEntryMap
-                    .put(parentNodes, sourceToTarget);
         }
     }
 
+    private static <K, V> Map.Entry<K, V> makeMapEntry(K key, V value) {
+        return new AbstractMap.SimpleEntry<>(key, value);
+    }
+
     private static class TargetHTMLParser extends HTMLParser {
 
         public TargetHTMLParser(
@@ -335,20 +377,38 @@ public TargetHTMLParser(
         @Override
         protected void doWithTextNode(ParentNodes parentNodes,
                 TextNode textNode) {
-            Map.Entry<String, String> sourceToTarget =
-                    parentNodesToSourceTargetEntryMap.get(parentNodes);
-            String wholeText = textNode.getWholeText();
-            if (sourceToTarget == null) {
-                log.warn("Can not match translation text token [{}] in source using parent nodes:{}", wholeText,
-                        parentNodes);
-
-                throw new IllegalStateException(
-                        "can not match translation text token [" +
-                                wholeText
-                                + "] in source using parent nodes:"
-                                + parentNodes);
+            String targetTokenText = textNode.getWholeText();
+            Map.Entry<String, String> matchingSourceToken =
+                    findMatchingSourceToken(parentNodes, targetTokenText);
+            // we may have other translation tokens under the same parent nodes
+            String previousTrans = matchingSourceToken.getValue();
+            matchingSourceToken.setValue(previousTrans + targetTokenText);
+            log.debug(
+                    "putting to source token [{}] as translation: existing trans [{}], current trans [{}]",
+                    matchingSourceToken.getKey(), previousTrans,
+                    targetTokenText);
+        }
+
+        private Map.Entry<String, String> findMatchingSourceToken(
+                ParentNodes parentNodes, String targetTokenText) {
+            // because we build a new ParentNodes for target, the identity of
+            // parent nodes will NOT be the same but the tag names will be. We
+            // have to use tag names to look up matching source tokens
+            for (Map.Entry<ParentNodes, Map.Entry<String, String>> entry : parentNodesToSourceTargetEntryMap
+                    .entrySet()) {
+                ParentNodes key = entry.getKey();
+                if (key.parentTags().equals(parentNodes.parentTags())) {
+                    return entry.getValue();
+                }
             }
-            sourceToTarget.setValue(wholeText);
+            log.warn("Can not match translation text token [{}] in source using parent nodes:{}", targetTokenText,
+                    parentNodes);
+
+            throw new IllegalStateException(
+                    "can not match translation text token [" +
+                            targetTokenText
+                            + "] in source using parent nodes:"
+                            + parentNodes);
         }
     }
 }
diff --git a/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java b/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java
@@ -15,6 +15,7 @@
 import org.zanata.model.HTextFlow;
 import org.zanata.model.HTextFlowTarget;
 import org.zanata.model.TestFixture;
+
 import com.google.common.base.Charsets;
 
 /**
@@ -24,28 +25,63 @@
 public class TransMemoryMatcherTest {
     private HDocument document = null;
     private String resId = "abc";
-    private String transMemorySource =
-            "Do you know <div><some>you</some> will <strong>never</strong></div> walk alone?<p><i>Yes</i>, I do.</p>";
-    private String transMemoryTarget =
-            "<div><some>你</some><strong>永远不会</strong></div>一个人走你知道吗?<p> <i>是</i>, 我知道.";
-    private HTextFlow transMemory = new HTextFlow(document, resId,
-            transMemorySource);
+    private HTextFlow transMemory;
     private HLocale targetLocale = TestFixture.setId(1L, new HLocale(new LocaleId("zh")));
 
     @Before
     public void setUp() {
+        transMemory = null;
+    }
+
+    public void givenTransMemory(String sourceContent, String targetContent) {
+        transMemory = new HTextFlow(document, resId,
+                sourceContent);
         HTextFlowTarget transMemoryTranslation =
                 new HTextFlowTarget(transMemory, targetLocale);
-        transMemoryTranslation.setContent0(transMemoryTarget);
+        transMemoryTranslation.setContent0(targetContent);
         transMemory.getTargets().put(targetLocale.getId(),
                 transMemoryTranslation);
     }
 
     @Test
     public void canMatchSameStructureButDifferentTags() {
+        // Given:
+        givenTransMemory(
+                "Do you know <div><some>you</some> will <strong>never</strong></div> walk alone? <p><i>Yes</i>, I do.</p>",
+                "DO YOU KNOW <div><some>YOU</some> WILL <strong>NEVER</strong></div> WALK ALONE? <p><i>YES</i>, I DO.</p>");
+        String upcomingSource =
+                "Do you know <span><other>you</other> will <bold>never</bold></span> walk alone? <para><o>Yes</o>, I do.</para>";
+        HTextFlow upcomingMessage =
+                new HTextFlow(document, resId, upcomingSource);
+        TransMemoryMatcher matcher =
+                new TransMemoryMatcher(upcomingMessage, transMemory,
+                        targetLocale);
+
+
+        // When:
+        double similarityPercent = matcher.calculateSimilarityPercent();
+
+        // Then:
+        Assertions.assertThat(similarityPercent)
+                .isEqualTo(100)
+                .as("same structure but different tags can be matched as 100%");
+
+        // When:
+        String translation = matcher.translationFromTransMemory();
+
+        // Then:
+        Assertions
+                .assertThat(translation)
+                .isEqualTo(
+                        "DO YOU KNOW <span><other>YOU</other> WILL <bold>NEVER</bold></span> WALK ALONE? <para><o>YES</o>, I DO.</para>")
+                .as("will replace translation from TM with correct tags");
+    }
+
+    @Test
+    public void canMatchSameStructureButDifferentTagsPlusTranslationSwappedLocation() {
         // Given:
         String upcomingSource =
-                "Do you know <span><other>you</other> will <bold>never</bold></span> walk alone?<para><o>Yes</o>, I do.</para>";
+                "Do you know <span><other>you</other> will <bold>never</bold></span> walk alone? <para><o>Yes</o>, I do.</para>";
         HTextFlow upcomingMessage =
                 new HTextFlow(document, resId, upcomingSource);
         TransMemoryMatcher matcher =
@@ -68,7 +104,7 @@ public void canMatchSameStructureButDifferentTags() {
         Assertions
                 .assertThat(translation)
                 .isEqualTo(
-                        "<span><other>你<bold>永远不会</bold></span>一个人走你知道吗?<br> <o>是</o>, 我知道.")
+                        "<span><other>你</other><bold>永远不会</bold></span>一个人走你知道吗? <para><o>是</o>, 我知道.</para>")
                 .as("will replace translation from TM with correct tags");
     }