diff --git a/common/client/src/main/java/zingg/common/client/ZFrame.java b/common/client/src/main/java/zingg/common/client/ZFrame.java index d7c2c97ad..6ffa25a5a 100644 --- a/common/client/src/main/java/zingg/common/client/ZFrame.java +++ b/common/client/src/main/java/zingg/common/client/ZFrame.java @@ -170,5 +170,10 @@ public interface ZFrame { public ZFrame groupByCount(String groupByCol1, String groupByCol2, String countColName); - + public ZFrame intersect(ZFrame other); + + public C substr(C col, int startPos, int len); + + public C gt(C column1, C column2); + } diff --git a/common/core/pom.xml b/common/core/pom.xml index 1f03d694c..40d61e0c4 100644 --- a/common/core/pom.xml +++ b/common/core/pom.xml @@ -29,5 +29,39 @@ httpclient 4.5.14 + + org.junit.jupiter + junit-jupiter-engine + 5.8.1 + test + + + org.junit.jupiter + junit-jupiter-api + 5.8.1 + test + + + org.junit.jupiter + junit-jupiter-params + 5.8.1 + test + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.3.2 + + + + test-jar + + + + + + diff --git a/common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java b/common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java new file mode 100644 index 000000000..8addea3f8 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/ExecutorTester.java @@ -0,0 +1,24 @@ +package zingg.common.core.executor; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZinggClientException; + +public abstract class ExecutorTester { + + public static final Log LOG = LogFactory.getLog(ExecutorTester.class); + + public ZinggBase executor; + + public ExecutorTester(ZinggBase executor) { + this.executor = executor; + } + + public void execute() throws ZinggClientException { + executor.execute(); + } + + public abstract void validateResults() throws ZinggClientException; + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java 
b/common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java new file mode 100644 index 000000000..ecdba92f4 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/JunitLabeller.java @@ -0,0 +1,60 @@ +package zingg.common.core.executor; + +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.options.ZinggOptions; +import zingg.common.client.util.ColName; +import zingg.common.client.util.ColValues; +import zingg.common.core.context.Context; + +public class JunitLabeller extends Labeller { + + private static final long serialVersionUID = 1L; + + public JunitLabeller(Context context) { + setZinggOption(ZinggOptions.LABEL); + setContext(context); + } + + @Override + public ZFrame processRecordsCli(ZFrame lines) + throws ZinggClientException { + + // now get a list of all those rows which have same cluster and match due to fname => mark match + ZFrame lines2 = getDSUtil().getPrefixedColumnsDS(lines); + + // construct AND condition + C clusterCond = getJoinCondForCol(lines, lines2, ColName.CLUSTER_COLUMN,true); + C fnameCond = getJoinCondForCol(lines, lines2, "FNAME",true); + C idCond = getJoinCondForCol(lines, lines2, "ID",false); + C filterCond = lines2.and(lines2.and(clusterCond,idCond),fnameCond); + + ZFrame filtered = lines.joinOnCol(lines2, filterCond).cache(); + + ZFrame matches = filtered.select(ColName.CLUSTER_COLUMN).distinct().withColumn(ColName.MATCH_FLAG_COL, ColValues.IS_MATCH_PREDICTION).cache(); + + ZFrame nonMatches = lines.select(ColName.CLUSTER_COLUMN).except(matches.select(ColName.CLUSTER_COLUMN)).distinct().withColumn(ColName.MATCH_FLAG_COL, ColValues.IS_NOT_A_MATCH_PREDICTION).cache(); + + ZFrame all = matches.unionAll(nonMatches); + + ZFrame linesMatched = lines; + linesMatched = linesMatched.drop(ColName.MATCH_FLAG_COL); + linesMatched = linesMatched.joinOnCol(all, ColName.CLUSTER_COLUMN); + linesMatched = linesMatched.select(lines.columns()); // make same 
order + + return linesMatched; + } + + private C getJoinCondForCol(ZFrame df1, ZFrame dfToJoin,String colName, boolean equal) { + C column = df1.col(colName); + C columnWithPrefix = dfToJoin.col(ColName.COL_PREFIX + colName); + C equalTo = df1.equalTo(column,columnWithPrefix); + if (equal) { + return equalTo; + } else { + return df1.not(equalTo); + } + } + + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/LabellerTester.java b/common/core/src/test/java/zingg/common/core/executor/LabellerTester.java new file mode 100644 index 000000000..d522a26b6 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/LabellerTester.java @@ -0,0 +1,36 @@ +package zingg.common.core.executor; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.util.ColName; + +public class LabellerTester extends ExecutorTester { + + public static final Log LOG = LogFactory.getLog(LabellerTester.class); + + public LabellerTester(Labeller executor) { + super(executor); + } + + @Override + public void validateResults() throws ZinggClientException { + // check that marked data has more than 1 match row and more than 1 unmatch row + ZFrame dfMarked = executor.getContext().getPipeUtil(). 
+ read(false, false, executor.getContext().getPipeUtil().getTrainingDataMarkedPipe(executor.getArgs())); + + C matchCond = dfMarked.equalTo(ColName.MATCH_FLAG_COL, 1); + C notMatchCond = dfMarked.equalTo(ColName.MATCH_FLAG_COL, 0); + + long matchCount = dfMarked.filter(matchCond).count(); + assertTrue(matchCount > 1); + long unmatchCount = dfMarked.filter(notMatchCond).count(); + assertTrue(unmatchCount > 1); + LOG.info("matchCount : "+ matchCount + ", unmatchCount : " + unmatchCount); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java b/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java new file mode 100644 index 000000000..24500fe3f --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/MatcherTester.java @@ -0,0 +1,78 @@ +package zingg.common.core.executor; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.util.ColName; + +public class MatcherTester extends ExecutorTester { + + public static final Log LOG = LogFactory.getLog(MatcherTester.class); + + public MatcherTester(Matcher executor) { + super(executor); + } + + @Override + public void validateResults() throws ZinggClientException { + assessAccuracy(); + } + + public String getClusterColName() { + return ColName.CLUSTER_COLUMN; + } + + protected void assessAccuracy() throws ZinggClientException { + ZFrame df = getOutputData(); + + df = df.withColumn("fnameId",df.concat(df.col("fname"), df.col("id"))); + df = df.select("fnameId", getClusterColName()); + df = df.withColumn("dupeFnameId",df.substr(df.col("fnameId"),0,8)).cache(); + ZFrame df1 = df.withColumnRenamed("fnameId", "fnameId1").withColumnRenamed("dupeFnameId", "dupeFnameId1") + .withColumnRenamed(getClusterColName(), getClusterColName() + "1").cache(); + + 
+ ZFrame gold = joinAndFilter("dupeFnameId", df, df1).cache(); + ZFrame result = joinAndFilter(getClusterColName(), df, df1).cache(); + + ZFrame fn = gold.except(result); + ZFrame tp = gold.intersect(result); + ZFrame fp = result.except(gold); + + long fnCount = fn.count(); + long tpCount = tp.count(); + long fpCount = fp.count(); + double score1 = tpCount*1.0d/(tpCount+fpCount); + double score2 = tpCount*1.0d/(tpCount+fnCount); + + LOG.info("False negative " + fnCount); + LOG.info("True positive " + tpCount); + LOG.info("False positive " + fpCount); + LOG.info("precision " + score1); + LOG.info("recall " + tpCount + " denom " + (tpCount+fnCount) + " overall " + score2); + + System.out.println("precision score1 " + score1); + + System.out.println("recall score2 " + score2); + + assertTrue(0.8 <= score1); + assertTrue(0.8 <= score2); + } + + public ZFrame getOutputData() throws ZinggClientException { + ZFrame output = executor.getContext().getPipeUtil().read(false, false, executor.getArgs().getOutput()[0]); + return output; + } + + protected ZFrame joinAndFilter(String colName, ZFrame df, ZFrame df1){ + C col1 = df.col(colName); + C col2 = df1.col(colName+"1"); + ZFrame joined = df.joinOnCol(df1, df.equalTo(col1, col2)); + return joined.filter(joined.gt(joined.col("fnameId"), joined.col("fnameId1"))); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java new file mode 100644 index 000000000..424eec10c --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/TestExecutorsGeneric.java @@ -0,0 +1,104 @@ +package zingg.common.core.executor; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.junit.jupiter.api.Test; + +import zingg.common.client.ArgumentsUtil; +import zingg.common.client.IArguments; 
+import zingg.common.client.ZinggClientException; + +public abstract class TestExecutorsGeneric { + + public static final Log LOG = LogFactory.getLog(TestExecutorsGeneric.class); + + protected IArguments args; + + + protected S session; + + public TestExecutorsGeneric() { + + } + + public TestExecutorsGeneric(S s) throws ZinggClientException, IOException { + init(s); + } + + public void init(S s) throws ZinggClientException, IOException { + this.session = s; + // set up args + setupArgs(); + } + + public String setupArgs() throws ZinggClientException, IOException { + String configFile = getClass().getClassLoader().getResource(getConfigFile()).getFile(); + args = new ArgumentsUtil().createArgumentsFromJSON( + configFile, + "findTrainingData"); + return configFile; + } + + public abstract String getConfigFile(); + + + @Test + public void testExecutors() throws ZinggClientException { + List> executorTesterList = new ArrayList>(); + + TrainingDataFinder trainingDataFinder = getTrainingDataFinder(); + trainingDataFinder.init(args); + TrainingDataFinderTester tdft = new TrainingDataFinderTester(trainingDataFinder); + executorTesterList.add(tdft); + + Labeller labeller = getLabeller(); + labeller.init(args); + LabellerTester lt = new LabellerTester(labeller); + executorTesterList.add(lt); + + // training and labelling needed twice to get sufficient data + TrainingDataFinder trainingDataFinder2 = getTrainingDataFinder(); + trainingDataFinder2.init(args); + TrainingDataFinderTester tdft2 = new TrainingDataFinderTester(trainingDataFinder2); + executorTesterList.add(tdft2); + + Labeller labeller2 = getLabeller(); + labeller2.init(args); + LabellerTester lt2 = new LabellerTester(labeller2); + executorTesterList.add(lt2); + + Trainer trainer = getTrainer(); + trainer.init(args); + TrainerTester tt = new TrainerTester(trainer); + executorTesterList.add(tt); + + Matcher matcher = getMatcher(); + matcher.init(args); + MatcherTester mt = new MatcherTester(matcher); + 
executorTesterList.add(mt); + + testExecutors(executorTesterList); + } + + + public void testExecutors(List> executorTesterList) throws ZinggClientException { + for (ExecutorTester executorTester : executorTesterList) { + executorTester.execute(); + executorTester.validateResults(); + } + } + + public abstract void tearDown(); + + protected abstract TrainingDataFinder getTrainingDataFinder() throws ZinggClientException; + + protected abstract Labeller getLabeller() throws ZinggClientException; + + protected abstract Trainer getTrainer() throws ZinggClientException; + + protected abstract Matcher getMatcher() throws ZinggClientException; + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java b/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java new file mode 100644 index 000000000..76d15e708 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/TrainerTester.java @@ -0,0 +1,19 @@ +package zingg.common.core.executor; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class TrainerTester extends ExecutorTester { + + public static final Log LOG = LogFactory.getLog(TrainerTester.class); + + public TrainerTester(Trainer executor) { + super(executor); + } + + @Override + public void validateResults() { + LOG.info("train successful"); + } + +} diff --git a/common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java b/common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java new file mode 100644 index 000000000..945be8ed0 --- /dev/null +++ b/common/core/src/test/java/zingg/common/core/executor/TrainingDataFinderTester.java @@ -0,0 +1,29 @@ +package zingg.common.core.executor; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import zingg.common.client.ZFrame; +import 
zingg.common.client.ZinggClientException; + +public class TrainingDataFinderTester extends ExecutorTester { + + public static final Log LOG = LogFactory.getLog(TrainingDataFinderTester.class); + + public TrainingDataFinderTester(TrainingDataFinder executor) { + super(executor); + } + + @Override + public void validateResults() throws ZinggClientException { + // check that unmarked data has more than 10 rows + ZFrame df = executor.getContext().getPipeUtil().read(false, false, executor.getContext().getPipeUtil().getTrainingDataUnmarkedPipe(executor.getArgs())); + + long trainingDataCount = df.count(); + assertTrue(trainingDataCount > 10); + LOG.info("trainingDataCount : "+ trainingDataCount); + } + +} diff --git a/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java b/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java index a3e04b4b0..a519cfe1f 100644 --- a/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java +++ b/common/infra/src/main/java/zingg/common/infra/util/PojoToArrayConverter.java @@ -1,4 +1,4 @@ -package zingg.common.infraForTest.util; +package zingg.common.infra.util; import java.lang.reflect.*; import java.security.NoSuchAlgorithmException; diff --git a/pom.xml b/pom.xml index 955a17ffd..cd421cc5f 100644 --- a/pom.xml +++ b/pom.xml @@ -247,7 +247,6 @@ - - + diff --git a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java index 6de857dcf..ce20ea9a5 100644 --- a/spark/client/src/main/java/zingg/spark/client/SparkFrame.java +++ b/spark/client/src/main/java/zingg/spark/client/SparkFrame.java @@ -448,7 +448,20 @@ public ZFrame, Row, Column> groupByCount(String groupByCol1, String } + @Override + public ZFrame, Row, Column> intersect(ZFrame, Row, Column> other) { + return new SparkFrame(df.intersect(other.df())); + } + @Override + public Column substr(Column col, int startPos, int len) { + return 
col.substr(startPos, len); + } + + @Override + public Column gt(Column column1, Column column2) { + return column1.gt(column2); + } } diff --git a/spark/pom.xml b/spark/pom.xml index 2ea784073..3f7444b54 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -11,6 +11,7 @@ core client + spark-test diff --git a/spark/spark-test/pom.xml b/spark/spark-test/pom.xml new file mode 100644 index 000000000..54c2255b9 --- /dev/null +++ b/spark/spark-test/pom.xml @@ -0,0 +1,75 @@ + + 4.0.0 + + zingg + zingg-spark + ${zingg.version} + + zingg-spark-test + jar + + + zingg + zingg-spark-core + ${zingg.version} + + + zingg + zingg-spark-client + ${zingg.version} + + + zingg + zingg-common-core + ${zingg.version} + + + zingg + zingg-common-client + ${zingg.version} + + + zingg + zingg-common-core + tests + test-jar + ${zingg.version} + test + + + org.junit.jupiter + junit-jupiter-engine + 5.8.1 + test + + + org.junit.jupiter + junit-jupiter-api + 5.8.1 + test + + + org.junit.jupiter + junit-jupiter-params + 5.8.1 + test + + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.3.2 + + + + test-jar + + + + + + + diff --git a/spark/spark-test/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java new file mode 100644 index 000000000..ba1ed9372 --- /dev/null +++ b/spark/spark-test/src/test/java/zingg/spark/core/executor/JunitSparkLabeller.java @@ -0,0 +1,44 @@ +package zingg.spark.core.executor; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; + +import zingg.common.client.IArguments; +import zingg.common.client.ZFrame; +import zingg.common.client.ZinggClientException; +import zingg.common.client.options.ZinggOptions; +import zingg.common.core.executor.JunitLabeller; +import zingg.spark.core.context.ZinggSparkContext; + +public class 
JunitSparkLabeller extends SparkLabeller { + + private static final long serialVersionUID = 1L; + + JunitLabeller,Row,Column,DataType> junitLabeller; + + public JunitSparkLabeller() { + this(new ZinggSparkContext()); + } + + public JunitSparkLabeller(ZinggSparkContext sparkContext) { + setZinggOption(ZinggOptions.LABEL); + setContext(sparkContext); + junitLabeller = new JunitLabeller,Row,Column,DataType>(sparkContext); + } + + @Override + public void setArgs(IArguments args) { + super.setArgs(args); + junitLabeller.setArgs(args); + } + + @Override + public ZFrame,Row,Column> processRecordsCli(ZFrame,Row,Column> lines) + throws ZinggClientException { + return junitLabeller.processRecordsCli(lines); + } +} + diff --git a/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java b/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java new file mode 100644 index 000000000..8128fdc1a --- /dev/null +++ b/spark/spark-test/src/test/java/zingg/spark/core/executor/TestSparkExecutors.java @@ -0,0 +1,87 @@ +package zingg.spark.core.executor; + +import java.io.File; +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataType; +import org.junit.jupiter.api.AfterEach; + +import zingg.common.client.ZinggClientException; +import zingg.common.core.executor.Labeller; +import zingg.common.core.executor.TestExecutorsGeneric; +import zingg.spark.core.context.ZinggSparkContext; + +public class TestSparkExecutors extends TestExecutorsGeneric,Row,Column,DataType> { + protected static final String CONFIG_FILE = "zingg/spark/core/executor/configSparkIntTest.json"; + + protected static final String TEST_DATA_FILE = "zingg/spark/core/executor/test.csv"; + + public static final Log LOG = 
LogFactory.getLog(TestSparkExecutors.class); + + protected ZinggSparkContext ctx; + + + public TestSparkExecutors() throws IOException, ZinggClientException { + SparkSession spark = SparkSession + .builder() + .master("local[*]") + .appName("Zingg" + "Junit") + .getOrCreate(); + this.ctx = new ZinggSparkContext(); + this.ctx.setSession(spark); + this.ctx.setUtils(); + init(spark); + } + + @Override + public String getConfigFile() { + return CONFIG_FILE; + } + + @Override + protected SparkTrainingDataFinder getTrainingDataFinder() throws ZinggClientException { + SparkTrainingDataFinder stdf = new SparkTrainingDataFinder(ctx); + return stdf; + } + @Override + protected Labeller,Row,Column,DataType> getLabeller() throws ZinggClientException { + JunitSparkLabeller jlbl = new JunitSparkLabeller(ctx); + return jlbl; + } + @Override + protected SparkTrainer getTrainer() throws ZinggClientException { + SparkTrainer st = new SparkTrainer(ctx); + return st; + } + @Override + protected SparkMatcher getMatcher() throws ZinggClientException { + SparkMatcher sm = new SparkMatcher(ctx); + return sm; + } + + + @Override + public String setupArgs() throws ZinggClientException, IOException { + String configFile = super.setupArgs(); + String testFile = getClass().getClassLoader().getResource(TEST_DATA_FILE).getFile(); + // correct the location of test data + args.getData()[0].setProp("location", testFile); + return configFile; + } + + @Override + @AfterEach + public void tearDown() { + // just rename, would be removed automatically as it's in /tmp + File dir = new File(args.getZinggDir()); + File newDir = new File(dir.getParent() + "/zingg_junit_" + System.currentTimeMillis()); + dir.renameTo(newDir); + } + +} diff --git a/spark/spark-test/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json b/spark/spark-test/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json new file mode 100644 index 000000000..0ef68c004 --- /dev/null +++ 
b/spark/spark-test/src/test/resources/zingg/spark/core/executor/configSparkIntTest.json @@ -0,0 +1,95 @@ +{ + "fieldDefinition":[ + { + "fieldName" : "id", + "matchType" : "dont_use", + "fields" : "id", + "dataType": "string" + }, + { + "fieldName" : "fname", + "matchType" : "fuzzy", + "fields" : "fname", + "dataType": "string" + }, + { + "fieldName" : "lname", + "matchType" : "fuzzy", + "fields" : "lname", + "dataType": "string" + }, + { + "fieldName" : "stNo", + "matchType": "fuzzy", + "fields" : "stNo", + "dataType": "string" + }, + { + "fieldName" : "add1", + "matchType": "fuzzy", + "fields" : "add1", + "dataType": "string" + }, + { + "fieldName" : "add2", + "matchType": "fuzzy", + "fields" : "add2", + "dataType": "string" + }, + { + "fieldName" : "city", + "matchType": "fuzzy", + "fields" : "city", + "dataType": "string" + }, + { + "fieldName" : "areacode", + "matchType": "fuzzy", + "fields" : "areacode", + "dataType": "string" + }, + { + "fieldName" : "state", + "matchType": "fuzzy", + "fields" : "state", + "dataType": "string" + }, + { + "fieldName" : "dob", + "matchType": "fuzzy", + "fields" : "dob", + "dataType": "string" + }, + { + "fieldName" : "ssn", + "matchType": "fuzzy", + "fields" : "ssn", + "dataType": "string" + } + ], + "output" : [{ + "name":"output", + "format":"csv", + "props": { + "location": "/tmp/junit_integration_spark/zinggOutput", + "delimiter": ",", + "header":true + } + }], + "data" : [{ + "name":"test", + "format":"csv", + "props": { + "location": "test.csv", + "delimiter": ",", + "header":false + }, + "schema": "id string, fname string, lname string, stNo string, add1 string, add2 string, city string, areacode string, state string, dob string, ssn string" + } + ], + "labelDataSampleSize" : 0.5, + "numPartitions":4, + "modelId": "junit_integration_spark", + "zinggDir": "/tmp/junit_integration_spark" + +} diff --git a/spark/spark-test/src/test/resources/zingg/spark/core/executor/test.csv 
b/spark/spark-test/src/test/resources/zingg/spark/core/executor/test.csv new file mode 100644 index 000000000..4473ef4c2 --- /dev/null +++ b/spark/spark-test/src/test/resources/zingg/spark/core/executor/test.csv @@ -0,0 +1,63 @@ +rec-1020-org, blake, ryan,4, starling place, berkeley vlge, marsden,5412, nsw,19271027,2402765 +rec-1021-dup-0, thomas, georgze,1, mcmanus place, , north turarmurra,3130, sa,19630225,5460534 +rec-1021-org, thomas, george,1, mcmanus place, stoney creek, north turramurra,3130, sa,19630225,5460534 +rec-1022-dup-1, Érik, Guay,840, mountview, fowles treet, burlei gh heads,2803, sa,19830807,2932837 +rec-1022-dup-2, Érik, Guay,840, fowles street, moun tvjiew, burleigh heads,2830, ss, ,2932837 +rec-1022-dup-3, jackson, christo,840, fowles street, mou ntveiw, burleig heads,2830, sa,19830807,2932837 +rec-1022-dup-4, jackson, eglinton,840, fowles street, mountv iew, burleigh heads,2830, sa,19830807,2932837 +rec-1022-org, jackson, eglinton,840, fowles street, mountview, burleigh heads,2830, sa,19830807,2932837 +rec-1023-org, gianni, matson,701, willis street, boonooloo, clifton,3101, vic,19410111,2540080 +rec-1024-org, takeisha, freeborn,6, suttor street, the groves street, wentworth falls,4615, vic,19620206,8111362 +rec-1025-org, emiily, britten,8, kitchener street, hilltop hostel rowethorpe, lake heights,2463, qld,19491021,9588775 +rec-1026-dup-0, xani, green, , phill ip avenue, , armidale,5108, nsw,19390410,9201057 +rec-1026-dup-1, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201857 +rec-1026-org, xani, green,2, phillip avenue, abbey green, armidale,5108, nsw,19390410,9201057 +rec-1027-org, nathan, smallacombe,20, guthridge crescent, red cross units, sandy bay,6056, sa,19241223,7522263 +rec-1028-dup-0, , ,24, , woorinyan, riverwood,3749, qld,19180205,9341716 +rec-1028-dup-1, , eglinton,24, curriecrescent, woorinyan, riverwood,3749, qld,19180205,1909717 +rec-1028-org, , eglinton,24, currie crescent, woorinyan, 
riverwood,3749, qld,19180205,9341716 +rec-1029-dup-0, kylee, stepehndon,81, rose scott circuit, cordobak anor, ashfield,4226, vic,19461101,4783085 +rec-1029-dup-1, sachin, stephenson,81, rose scott circuit, cordoba manor, ashfi eld,4226, vic,19461101,4783085 +rec-1029-dup-2, annalise, stephenson,81, rose scott circuit, cordoba manor, ashfoeld,4226, vic,19461101,4783085 +rec-1029-dup-3, kykee, turale,81, rose scott circuit, , ashfield,4226, vic,19461101,4783085 +rec-1029-dup-4, kylee, stephenson,81, cordoba manor, rose scott circuit, ashfield,4226, vic,19461101,4783085 +rec-1029-org, kylee, stephenson,81, rose scott circuit, cordoba manor, ashfield,4226, vic,19461101,4783085 +rec-103-dup-0, benjamin, koerbin,15, wybel anah, violet grover place, mill park,2446, nsw,19210210,3808808 +rec-103-org, briony, koerbin,146, violet grover place, wybelanah, mill park,2446, nsw,19210210,3808808 +rec-1030-org, emma, crossman,53, mcdowall place, kellhaven, tara,5608, vic,19391027,3561186 +rec-1031-org, samantha, sabieray,68, quandong street, wattle brae, gorokan,4019, wa,19590807,2863290 +rec-1032-dup-0, brooklyn, naar-cafentas,210, duffy street, tourist psrk, berwick,2481, nsw, ,3624304 +rec-1032-org, brooklyn, naar-cafentas,210, duffy street, tourist park, berwick,2481, nsw,19840802,3624304 +rec-1033-dup-0, keziah, painter,18, ainsli e avenue, sec 1, torquay,3205, vic,19191031,7801066 +rec-1033-org, keziah, painter,18, ainslie avenue, sec 1, torquay,3205, vic,19191031,7801066 +rec-1034-dup-0, erin, maynard,24, , wariala, little river,2777, vic,19970430,7429462 +rec-1034-dup-1, erin, maynard,51, wilshire street, warialda, little irver,2777, vic,19970430,1815999 +rec-1034-dup-2, hayley, maynard,14, wilshire street, , little river,2777, vic,19970430,7429462 +rec-1034-org, erin, maynard,14, wilshire street, warialda, little river,2777, vic,19970430,7429462 +rec-1035-dup-0, jaiden, rollins,48, tulgeywood, rossarden street, balwyn north,2224, nt,19280722,7626396 +rec-1035-dup-1, 
jaiden, rollins,95, rossarden street, tulgewyood, balwyn north,2224, nt,19280722,7626396 +rec-1035-dup-2, jaiden, rolilns,48, swinden street, tulgeywood, balwyn north,2224, nt,19280722,7626396 +rec-1035-dup-3, jaiden, rolli ns,48, tulgeywomod, rossarden street, balwyn north,2224, nf,19280722,7626396 +rec-1035-org, jaiden, rollins,48, rossarden street, tulgeywood, balwyn north,2224, nt,19280722,7626396 +rec-1036-dup-0, , held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651 +rec-1036-dup-1, sarsha, held,42, lampard circuit, , golden bay,2447, vic,19510806,3710651 +rec-1036-org, amber, held,24, lampard circuit, emerald garden, golden bay,2447, vic,19510806,3710651 +rec-1037-org, connor, beckwith,10, heard street, , mill park,5031, nsw,19081103,2209091 +rec-1038-org, danny, campbell,95, totterdell street, moama, shellharbour,2209, vic,19951105,9554924 +rec-1039-dup-0, angus, roas,62, gormansto crescent, mlc centre, kiruwah,3350, sa,19250817,2655081 +rec-1039-org, angus, rosa,62, gormanston crescent, mlc centre, kirwan,3350, sa,19250817,2655081 +rec-104-dup-0, benjaminl, carbone,18, arthella, wattle s treet, orange,3550, vic,19050820,3677127 +rec-104-org, benjamin, carbone,18, wattle street, arthella, orange,3550, vic,19050820,3677127 +rec-1040-dup-0, matilda, mestrov, , housecicuit, retirement village, taringa,3820, qld,19801119,2536135 +rec-1040-dup-1, matilda, mestrv,5, house circuit, retirement village, taringa,3802, qld,19801119,2563135 +rec-1040-dup-2, matilda, mestrov,5, house circuit, retiremen tvillage, taringa,3820, ,19801119,2563135 +rec-1040-org, matilda, mestrov,5, house circuit, retirement village, taringa,3820, qld,19801119,2563135 +rec-1041-dup-0, tyler, frojd, , burramurra avenue, kmart p plaza, san rmeo,3670, sa,19800916,7812219 +rec-1041-org, tyler, froud,8, burramurra avenue, kmart p plaza, san remo,3670, sa,19800916,7812219 +rec-1042-dup-0, kiandra, ,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205 
+rec-1042-dup-1, kiandra, cowle,2, gatliff place, rustenubr g sth, girgarre,3995, qld,19801125,3328205 +rec-1042-org, kiandra, cowle,2, gatliff place, rustenburg sth, girgarre,3995, qld,19801125,3328205 +rec-1043-org, giorgia, frahn,62, handasyde street, ramano estate locn 1, tallebudgera,4506, vic,19670206,9724789 +rec-1044-dup-0, nicole, shadbolt,46, schlich s treet, simpson army barracks, toowoomba,3000, wa,19030926,8190756 +rec-1044-dup-2, nicole, carbone,46, schlich street, simpson arm ybarracks, toowong,3000, wa,19030926,8190756 +rec-1044-org, nicole, carbone,46, schlich street, simpson army barracks, toowoomba,3000, wa,19030926,8190756