Skip to content
This repository has been archived by the owner on Nov 9, 2017. It is now read-only.

Commit

Permalink
gave up on swapped tags translation
Browse files Browse the repository at this point in the history
  • Loading branch information
Patrick Huang committed Jun 5, 2015
1 parent 5b4b4b0 commit e354dc0
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 66 deletions.
147 changes: 86 additions & 61 deletions zanata-war/src/main/java/org/zanata/search/TransMemoryMatcher.java
@@ -1,18 +1,18 @@
package org.zanata.search;

import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import javax.annotation.Nullable;

import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.MoreObjects;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import lombok.EqualsAndHashCode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
Expand All @@ -22,17 +22,14 @@
import org.zanata.model.HLocale;
import org.zanata.model.HTextFlow;

import com.beust.jcommander.internal.Maps;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.MoreObjects;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;

/**
* @author Patrick Huang
Expand All @@ -46,10 +43,8 @@ public class TransMemoryMatcher {
.prettyPrint(false);
private boolean canFullyReuseTM = false;
private final Element transMemorySourceRootElement;
private final Element transMemoryTargetRootElement;
private final Document upcomingSourceDoc;
private LinkedList<TextTokenKey> upcomingSourceTokens = Lists.newLinkedList();
private List<TextTokenKey> transMemoryTextTokens = Lists.newLinkedList();
private final String upcomingSourceTextOnly;

public TransMemoryMatcher(HTextFlow upcomingSource,
HTextFlow transMemory, HLocale targetLocale) {
Expand All @@ -66,12 +61,16 @@ public TransMemoryMatcher(HTextFlow upcomingSource,
// jsoup will append a </p> to the end if it sees a standalone <p>
// jsoup will ignore <br>
upcomingSourceDoc = Jsoup.parseBodyFragment(upcomingSourceContent);
upcomingSourceTextOnly = upcomingSourceDoc.body().text();

transMemorySourceRootElement = Jsoup.parseBodyFragment(tmSource).body();
transMemoryTargetRootElement = Jsoup.parseBodyFragment(tmTarget).body();
Element transMemoryTargetRootElement =
Jsoup.parseBodyFragment(tmTarget).body();


TransMemoryHTMLParser transMemoryParser =
new TransMemoryHTMLParser(transMemorySourceRootElement, transMemoryTargetRootElement);
new TransMemoryHTMLParser(transMemorySourceRootElement,
transMemoryTargetRootElement);

if (!transMemoryParser.perfectMatch && !transMemoryParser.heuristicMatch) {
log.info(
Expand All @@ -80,11 +79,12 @@ public TransMemoryMatcher(HTextFlow upcomingSource,
canFullyReuseTM = false;
return;
}
transMemoryTextTokens = transMemoryParser.textTokens;
List<TextTokenKey> transMemoryTextTokens = transMemoryParser.textTokens;

log.debug("=== about to parse upcoming source ===");
List<Node> upcomingNodes = upcomingSourceDoc.body()
.childNodes();
LinkedList<TextTokenKey> upcomingSourceTokens = Lists.newLinkedList();
depthFirstTraverseSourceNodes(upcomingNodes, new ParentNodes(),
upcomingSourceTokens);

Expand All @@ -96,11 +96,9 @@ public TransMemoryMatcher(HTextFlow upcomingSource,
TextTokenKey nextTransMemoryToken = transMemoryTokensIt.next();

while (nextSourceToken != null) {
if (nextTransMemoryToken != null
&& matchText(nextSourceToken.optElementBefore,
nextTransMemoryToken.optElementBefore)
&& matchText(nextSourceToken.optElementAfter,
nextTransMemoryToken.optElementAfter)) {
if (upcomingSourceTokenMatchesTMSourceToken(
nextSourceToken,
nextTransMemoryToken)) {
log.debug("found source token for [{}] in TM [{}]", nextSourceToken, nextTransMemoryToken);

// we can't update text on the node here because it will affect matchText(nextSourceToken.optElementBefore, nextTransMemoryToken.optElementBefore) above
Expand All @@ -115,6 +113,46 @@ && matchText(nextSourceToken.optElementAfter,

canFullyReuseTM = !upcomingSourceTokensIt.hasNext() &&
!transMemoryTokensIt.hasNext();

// apply matched tokens
if (canFullyReuseTM) {
StringBuilder generatedTargetTextOnly = new StringBuilder();
for (TextTokenKey upcomingSourceToken : upcomingSourceTokens) {
upcomingSourceToken.sourceNode.text(upcomingSourceToken.targetText);
generatedTargetTextOnly.append(upcomingSourceToken.targetText);
}
String tmTargetTextOnly =
transMemoryParser.tmTargetTextOnly.toString();
String translationBuildFromTM = upcomingSourceDoc.outputSettings(OUTPUT_SETTINGS).body().html();
log.debug("Translation build from given TM is:{}", translationBuildFromTM);
// do a final comparison to see if the generated translation matches
// the TM translation (in case some tags have changed locations,
// which we have no way to fix).
log.debug(
"comparing TM target text only to generated target text:\nTM :{}\nGEN:{}",
tmTargetTextOnly, generatedTargetTextOnly);
canFullyReuseTM = generatedTargetTextOnly.toString().equals(
tmTargetTextOnly);
}

}

/**
 * Decides whether a text token from the upcoming source lines up with a
 * text token from the translation memory source. Both tokens must be
 * present, and the elements before and after them must each be accepted by
 * {@code matchText}.
 *
 * @param upcomingSourceToken text token in upcoming source
 * @param transMemoryToken text token in trans memory
 * @return true if both tokens are non-null and their before and after
 *         elements both match
 */
private static boolean upcomingSourceTokenMatchesTMSourceToken(
        TextTokenKey upcomingSourceToken,
        TextTokenKey transMemoryToken) {
    // A missing token on either side can never be a match.
    if (transMemoryToken == null || upcomingSourceToken == null) {
        return false;
    }
    boolean beforeMatches = matchText(
            upcomingSourceToken.optElementBefore,
            transMemoryToken.optElementBefore);
    // The "after" elements are only compared once the "before" ones agree.
    return beforeMatches
            && matchText(upcomingSourceToken.optElementAfter,
                    transMemoryToken.optElementAfter);
}

private static boolean matchText(Optional<Element> optElement,
Expand Down Expand Up @@ -177,49 +215,28 @@ private static <T> T nextOrNull(Iterator<T> iterator) {
public double calculateSimilarityPercent() {
double similarity =
LevenshteinTokenUtil.getSimilarity(
upcomingSourceDoc.body().text(),
upcomingSourceTextOnly,
transMemorySourceRootElement.text());
double similarityPercent = similarity * 100;
if (similarityPercent < 99.99) {
// TODO pahuang here we could still try to put in reasonable effort (i.e. try to match as much text token as possible)
return similarityPercent;
}

if (canFullyReuseTM) {
} else if (canFullyReuseTM) {
// only return 100 if we can fully reuse TM.
return 100;
} else {
// text only matches 100% but we can not fully reuse TM translation
return 99;
}
return similarityPercent;

}


public String translationFromTransMemory() {
Preconditions.checkState(canFullyReuseTM, "do not know how to apply translation memory to this source! Similarity must be 100%.");
for (TextTokenKey upcomingSourceToken : upcomingSourceTokens) {
upcomingSourceToken.sourceNode.text(upcomingSourceToken.targetText);
}
// TODO pahuang reshuffle tags under same parent nodes to match what's in TM target
for (TextTokenKey tmToken : transMemoryTextTokens) {

}
breadthFirstTraverse(upcomingSourceDoc.body(), upcomingSourceDoc.body().childNodes());
String translationBuildFromTM = upcomingSourceDoc.outputSettings(OUTPUT_SETTINGS).body().html();
log.debug("Translation build from given TM is {}", translationBuildFromTM);
return translationBuildFromTM;
return upcomingSourceDoc.outputSettings(OUTPUT_SETTINGS).body().html();
}

private void breadthFirstTraverse(Element parent, List<Node> upcomingNodes) {
List<TextNode> textNodes = parent.textNodes();
// Iterables.filter(transMemoryTextTokens, new Predicate<TextTokenKey>() {
// @Override
// public boolean apply(@Nullable TextTokenKey input) {
// return input != null && input.sourceNode
// }
// })

}


private static class ParentNodes {
private final List<Element> parentElements;
private transient List<Tag> parentTags;
Expand Down Expand Up @@ -292,7 +309,7 @@ public String toString() {

@RequiredArgsConstructor
@EqualsAndHashCode
private static class TextTokenKey {
private static class TextTokenKey implements Comparable<TextTokenKey> {
private final TextNode sourceNode;
private final ParentNodes parentNodes;
private final Optional<Element> optElementBefore;
Expand All @@ -301,11 +318,17 @@ private static class TextTokenKey {
private final String sourceText;
private TranslationToken matchedTranslationToken;
private String targetText = "";
public int appearanceOrder;

/**
 * Renders this token as {@code (parentNodes)#siblingIndex:sourceText}.
 */
@Override
public String toString() {
    StringBuilder description = new StringBuilder();
    description.append("(").append(parentNodes).append(")#");
    description.append(siblingIndex);
    description.append(":").append(sourceText);
    return description.toString();
}

/**
 * Orders tokens by ascending {@code appearanceOrder}.
 *
 * @param o the token to compare against; must not be null
 * @return a negative value, zero, or a positive value when this token's
 *         {@code appearanceOrder} is less than, equal to, or greater than
 *         that of {@code o}
 */
@Override
public int compareTo(@Nonnull TextTokenKey o) {
    // Integer.compare instead of subtraction: the difference of two ints
    // can overflow and flip the sign of the result.
    return Integer.compare(appearanceOrder, o.appearanceOrder);
}
}

@RequiredArgsConstructor
Expand All @@ -326,6 +349,7 @@ private static class TransMemoryHTMLParser {
private boolean perfectMatch = true;
private boolean heuristicMatch = true;
private List<TextTokenKey> textTokens = Lists.newLinkedList();
private StringBuilder tmTargetTextOnly = new StringBuilder();

private transient Map<List<Tag>, List<TranslationToken>> parentTagsToTransTokens =
Maps.newHashMap();
Expand All @@ -351,6 +375,7 @@ private void depthFirstTraverseTargetNodes(List<Node> targetChildNodes,
for (Node node : targetChildNodes) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
tmTargetTextOnly.append(textNode.getWholeText());
TranslationToken translationToken = new TranslationToken(
parentNodes.parentTags(),
getOptionalBeforeSiblingElement(textNode),
Expand Down
Expand Up @@ -177,12 +177,11 @@ public void tagsSwappedLocation() {
"How <other>good</other> are <bold>you</bold>? I am <other>good</other>.");

// When:
String translation = matcher.translationFromTransMemory();
double similarityPercent = matcher.calculateSimilarityPercent();

// Then:
Assertions.assertThat(translation)
.isEqualTo(
"<bold>你</bold><other>好</other>吗? 我<other>不错<other>。");
// Then: we cannot handle element swapped location
Assertions.assertThat(similarityPercent)
.isEqualTo(99);
}

@Test
Expand Down

0 comments on commit e354dc0

Please sign in to comment.