From e354dc0990b052f791da5c8d4e9986b2812f420f Mon Sep 17 00:00:00 2001
From: Patrick Huang
Date: Fri, 5 Jun 2015 15:21:03 +1000
Subject: [PATCH] gave up on swapped tags translation
---
.../org/zanata/search/TransMemoryMatcher.java | 147 ++++++++++--------
.../zanata/search/TransMemoryMatcherTest.java | 9 +-
2 files changed, 90 insertions(+), 66 deletions(-)
diff --git a/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java b/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java
index 1dff38d867..d2dff948a3 100644
--- a/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java
+++ b/zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java
@@ -1,18 +1,18 @@
package org.zanata.search;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-
-import javax.annotation.Nullable;
-
+import com.google.common.base.Charsets;
+import com.google.common.base.Function;
+import com.google.common.base.MoreObjects;
+import com.google.common.base.Optional;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Predicate;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import lombok.EqualsAndHashCode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
-
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -22,17 +22,14 @@
import org.zanata.model.HLocale;
import org.zanata.model.HTextFlow;
-import com.beust.jcommander.internal.Maps;
-import com.google.common.base.Charsets;
-import com.google.common.base.Function;
-import com.google.common.base.MoreObjects;
-import com.google.common.base.Optional;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
/**
* @author Patrick Huang
@@ -46,10 +43,8 @@ public class TransMemoryMatcher {
.prettyPrint(false);
private boolean canFullyReuseTM = false;
private final Element transMemorySourceRootElement;
- private final Element transMemoryTargetRootElement;
private final Document upcomingSourceDoc;
- private LinkedList upcomingSourceTokens = Lists.newLinkedList();
- private List transMemoryTextTokens = Lists.newLinkedList();
+ private final String upcomingSourceTextOnly;
public TransMemoryMatcher(HTextFlow upcomingSource,
HTextFlow transMemory, HLocale targetLocale) {
@@ -66,12 +61,16 @@ public TransMemoryMatcher(HTextFlow upcomingSource,
// jsoup will append a
to the end if it sees a standalone
// jsoup will ignore
upcomingSourceDoc = Jsoup.parseBodyFragment(upcomingSourceContent);
+ upcomingSourceTextOnly = upcomingSourceDoc.body().text();
+
transMemorySourceRootElement = Jsoup.parseBodyFragment(tmSource).body();
- transMemoryTargetRootElement = Jsoup.parseBodyFragment(tmTarget).body();
+ Element transMemoryTargetRootElement =
+ Jsoup.parseBodyFragment(tmTarget).body();
TransMemoryHTMLParser transMemoryParser =
- new TransMemoryHTMLParser(transMemorySourceRootElement, transMemoryTargetRootElement);
+ new TransMemoryHTMLParser(transMemorySourceRootElement,
+ transMemoryTargetRootElement);
if (!transMemoryParser.perfectMatch && !transMemoryParser.heuristicMatch) {
log.info(
@@ -80,11 +79,12 @@ public TransMemoryMatcher(HTextFlow upcomingSource,
canFullyReuseTM = false;
return;
}
- transMemoryTextTokens = transMemoryParser.textTokens;
+ List transMemoryTextTokens = transMemoryParser.textTokens;
log.debug("=== about to parse upcoming source ===");
List upcomingNodes = upcomingSourceDoc.body()
.childNodes();
+ LinkedList upcomingSourceTokens = Lists.newLinkedList();
depthFirstTraverseSourceNodes(upcomingNodes, new ParentNodes(),
upcomingSourceTokens);
@@ -96,11 +96,9 @@ public TransMemoryMatcher(HTextFlow upcomingSource,
TextTokenKey nextTransMemoryToken = transMemoryTokensIt.next();
while (nextSourceToken != null) {
- if (nextTransMemoryToken != null
- && matchText(nextSourceToken.optElementBefore,
- nextTransMemoryToken.optElementBefore)
- && matchText(nextSourceToken.optElementAfter,
- nextTransMemoryToken.optElementAfter)) {
+ if (upcomingSourceTokenMatchesTMSourceToken(
+ nextSourceToken,
+ nextTransMemoryToken)) {
log.debug("found source token for [{}] in TM [{}]", nextSourceToken, nextTransMemoryToken);
// we can't update text on the node here because it will affect matchText(nextSourceToken.optElementBefore, nextTransMemoryToken.optElementBefore) above
@@ -115,6 +113,46 @@ && matchText(nextSourceToken.optElementAfter,
canFullyReuseTM = !upcomingSourceTokensIt.hasNext() &&
!transMemoryTokensIt.hasNext();
+
+ // apply matched tokens
+ if (canFullyReuseTM) {
+ StringBuilder generatedTargetTextOnly = new StringBuilder();
+ for (TextTokenKey upcomingSourceToken : upcomingSourceTokens) {
+ upcomingSourceToken.sourceNode.text(upcomingSourceToken.targetText);
+ generatedTargetTextOnly.append(upcomingSourceToken.targetText);
+ }
+ String tmTargetTextOnly =
+ transMemoryParser.tmTargetTextOnly.toString();
+ String translationBuildFromTM = upcomingSourceDoc.outputSettings(OUTPUT_SETTINGS).body().html();
+ log.debug("Translation build from given TM is:{}", translationBuildFromTM);
+ // do a final comparison to see if the generated translation matches
+ // TM translation (in case some tags has changed locations,
+ // I don't have a way to fix it.
+ log.debug(
+ "comparing TM target text only to generated target text:\nTM :{}\nGEN:{}",
+ tmTargetTextOnly, generatedTargetTextOnly);
+ canFullyReuseTM = generatedTargetTextOnly.toString().equals(
+ tmTargetTextOnly);
+ }
+
+ }
+
+ /**
+ * We compare the token's before and after element text.
+ *
+ * @param upcomingSourceToken text token in upcoming source
+ * @param transMemoryToken text token in trans memory
+ * @return true if two tokens' before element and after element matches in text
+ */
+ private static boolean upcomingSourceTokenMatchesTMSourceToken(
+ TextTokenKey upcomingSourceToken,
+ TextTokenKey transMemoryToken) {
+ return transMemoryToken != null
+ && upcomingSourceToken != null
+ && matchText(upcomingSourceToken.optElementBefore,
+ transMemoryToken.optElementBefore)
+ && matchText(upcomingSourceToken.optElementAfter,
+ transMemoryToken.optElementAfter);
}
private static boolean matchText(Optional optElement,
@@ -177,49 +215,28 @@ private static T nextOrNull(Iterator iterator) {
public double calculateSimilarityPercent() {
double similarity =
LevenshteinTokenUtil.getSimilarity(
- upcomingSourceDoc.body().text(),
+ upcomingSourceTextOnly,
transMemorySourceRootElement.text());
double similarityPercent = similarity * 100;
if (similarityPercent < 99.99) {
// TODO pahuang here we could still try to put in reasonable effort (i.e. try to match as much text token as possible)
return similarityPercent;
- }
-
- if (canFullyReuseTM) {
+ } else if (canFullyReuseTM) {
+ // only return 100 if we can fully reuse TM.
return 100;
+ } else {
+ // text only matches 100% but we can not fully reuse TM translation
+ return 99;
}
- return similarityPercent;
}
public String translationFromTransMemory() {
Preconditions.checkState(canFullyReuseTM, "do not know how to apply translation memory to this source! Similarity must be 100%.");
- for (TextTokenKey upcomingSourceToken : upcomingSourceTokens) {
- upcomingSourceToken.sourceNode.text(upcomingSourceToken.targetText);
- }
- // TODO pahuang reshuffle tags under same parent nodes to match what's in TM target
- for (TextTokenKey tmToken : transMemoryTextTokens) {
-
- }
- breadthFirstTraverse(upcomingSourceDoc.body(), upcomingSourceDoc.body().childNodes());
- String translationBuildFromTM = upcomingSourceDoc.outputSettings(OUTPUT_SETTINGS).body().html();
- log.debug("Translation build from given TM is {}", translationBuildFromTM);
- return translationBuildFromTM;
+ return upcomingSourceDoc.outputSettings(OUTPUT_SETTINGS).body().html();
}
- private void breadthFirstTraverse(Element parent, List upcomingNodes) {
- List textNodes = parent.textNodes();
-// Iterables.filter(transMemoryTextTokens, new Predicate() {
-// @Override
-// public boolean apply(@Nullable TextTokenKey input) {
-// return input != null && input.sourceNode
-// }
-// })
-
- }
-
-
private static class ParentNodes {
private final List parentElements;
private transient List parentTags;
@@ -292,7 +309,7 @@ public String toString() {
@RequiredArgsConstructor
@EqualsAndHashCode
- private static class TextTokenKey {
+ private static class TextTokenKey implements Comparable {
private final TextNode sourceNode;
private final ParentNodes parentNodes;
private final Optional optElementBefore;
@@ -301,11 +318,17 @@ private static class TextTokenKey {
private final String sourceText;
private TranslationToken matchedTranslationToken;
private String targetText = "";
+ public int appearanceOrder;
@Override
public String toString() {
return "(" + parentNodes + ")#" + siblingIndex +":" + sourceText;
}
+
+ @Override
+ public int compareTo(@Nonnull TextTokenKey o) {
+ return appearanceOrder - o.appearanceOrder;
+ }
}
@RequiredArgsConstructor
@@ -326,6 +349,7 @@ private static class TransMemoryHTMLParser {
private boolean perfectMatch = true;
private boolean heuristicMatch = true;
private List textTokens = Lists.newLinkedList();
+ private StringBuilder tmTargetTextOnly = new StringBuilder();
private transient Map, List> parentTagsToTransTokens =
Maps.newHashMap();
@@ -351,6 +375,7 @@ private void depthFirstTraverseTargetNodes(List targetChildNodes,
for (Node node : targetChildNodes) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
+ tmTargetTextOnly.append(textNode.getWholeText());
TranslationToken translationToken = new TranslationToken(
parentNodes.parentTags(),
getOptionalBeforeSiblingElement(textNode),
diff --git a/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java b/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java
index 38875d73cd..905cef470b 100644
--- a/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java
+++ b/zanata-war/src/test/java/org/zanata/search/TransMemoryMatcherTest.java
@@ -177,12 +177,11 @@ public void tagsSwappedLocation() {
"How good are you? I am good.");
// When:
- String translation = matcher.translationFromTransMemory();
+ double similarityPercent = matcher.calculateSimilarityPercent();
- // Then:
- Assertions.assertThat(translation)
- .isEqualTo(
- "你好吗? 我不错。");
+ // Then: we cannot handle element swapped location
+ Assertions.assertThat(similarityPercent)
+ .isEqualTo(99);
}
@Test