Skip to content
This repository has been archived by the owner on Jul 30, 2020. It is now read-only.

Commit

Permalink
debug: langlinks reader
Browse files Browse the repository at this point in the history
  • Loading branch information
mschwarzer committed Apr 19, 2017
1 parent 1688775 commit 4f0ce6c
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 6 deletions.
17 changes: 12 additions & 5 deletions src/main/java/org/wikipedia/citolytics/multilang/MultiLang.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.util.Collector;

import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -58,12 +59,18 @@ public void flatMap(String s, Collector<LangLinkTuple> out) throws Exception {

if (cols.length == 3) {
// Match VALUES (csv split)
out.collect(new LangLinkTuple(
Integer.parseInt(cols[0]),
cols[1].replaceAll("^'|'$", "").replace("\\'", "'"),
cols[2].replaceAll("^'|'$", "").replace("\\'", "'")
));
try {
int pageId = Integer.parseInt(cols[0]);


out.collect(new LangLinkTuple(
pageId,
cols[1].replaceAll("^'|'$", "").replace("\\'", "'"),
cols[2].replaceAll("^'|'$", "").replace("\\'", "'")
));
} catch (NumberFormatException e) {
throw new Exception("Cannot parse pageID: " + cols[0] + " // cols: " + Arrays.toString(cols));
}
} else {
// Match INSERT INTO statement
Pattern p2 = Pattern.compile("([0-9]+),'(.*?)','(.*?)'");
Expand Down
2 changes: 1 addition & 1 deletion support/flink-jobs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ The following jobs perform pre-processing or analysis tasks.
--input $WIKI_DUMP \
--output $SEEALSO_PATH
With multi-language translation
With multi-language translation (See [cirrussearch.md](cirrussearch.md) for lang-links extraction)

$FLINK_HOME/bin/flink run -c org.wikipedia.citolytics.seealso.SeeAlsoExtrator -p $PARALLELISM $JAR \
--input $WIKI_DUMP \
Expand Down

0 comments on commit 4f0ce6c

Please sign in to comment.