Commit
Merge branch 'zinggAI:main' into zDocumenter
Showing 13 changed files with 491 additions and 15 deletions.
@@ -0,0 +1,115 @@
import zingg.client._;
import java.util.ArrayList;

//setting silent mode for Argument creation only
:silent

//build the arguments for zingg
val args = new Arguments();
//set field definitions
val fname = new FieldDefinition();
fname.setFieldName("fname");
fname.setDataType("\"string\"");
fname.setMatchType(MatchType.FUZZY);
fname.setFields("fname");

val lname = new FieldDefinition();
lname.setFieldName("lname");
lname.setDataType("\"string\"");
lname.setMatchType(MatchType.FUZZY);
lname.setFields("lname");

val stNo = new FieldDefinition();
stNo.setFieldName("stNo");
stNo.setDataType("\"string\"");
stNo.setMatchType(MatchType.FUZZY);
stNo.setFields("stNo");

val add1 = new FieldDefinition();
add1.setFieldName("add1");
add1.setDataType("\"string\"");
add1.setMatchType(MatchType.FUZZY);
add1.setFields("add1");

val add2 = new FieldDefinition();
add2.setFieldName("add2");
add2.setDataType("\"string\"");
add2.setMatchType(MatchType.FUZZY);
add2.setFields("add2");

val city = new FieldDefinition();
city.setFieldName("city");
city.setDataType("\"string\"");
city.setMatchType(MatchType.FUZZY);
city.setFields("city");

val areacode = new FieldDefinition();
areacode.setFieldName("areacode");
areacode.setDataType("\"string\"");
areacode.setMatchType(MatchType.FUZZY);
areacode.setFields("areacode");

val state = new FieldDefinition();
state.setFieldName("state");
state.setDataType("\"string\"");
state.setMatchType(MatchType.FUZZY);
state.setFields("state");

val dob = new FieldDefinition();
dob.setFieldName("dob");
dob.setDataType("\"string\"");
dob.setMatchType(MatchType.FUZZY);
dob.setFields("dob");

val ssn = new FieldDefinition();
ssn.setFieldName("ssn");
ssn.setDataType("\"string\"");
ssn.setMatchType(MatchType.FUZZY);
ssn.setFields("ssn");
//toggle silent mode back off now that the field definitions are built
:silent

val fieldDef = new ArrayList[FieldDefinition]();
fieldDef.add(fname);
fieldDef.add(lname);
fieldDef.add(stNo);
fieldDef.add(add1);
fieldDef.add(add2);
fieldDef.add(city);
fieldDef.add(areacode);
fieldDef.add(state);
fieldDef.add(dob);
fieldDef.add(ssn);

args.setFieldDefinition(fieldDef);
//set the model id and the zingg dir
args.setModelId("100");
args.setZinggDir("models");
args.setNumPartitions(4);
args.setLabelDataSampleSize(0.5f);

//read the dataset and set it up as the input pipe in 'args'
//the read below is not needed if you already have an in-memory DataFrame;
//in that case, pass that DataFrame in place of df
val df = spark.read.option("header", "true").csv("examples/febrl/test.csv")
import zingg.client.pipe.InMemoryPipe;
import java.util.HashMap

val inputPipe = new InMemoryPipe(df);
inputPipe.setProps(new HashMap[String, String]());
val pipes = Array[zingg.client.pipe.Pipe](inputPipe);
args.setData(pipes);

//set the output pipe in 'args'
val outputPipe = new InMemoryPipe();
//outputPipe.setProps(new HashMap[String, String]());
val outputPipes = Array[zingg.client.pipe.Pipe](outputPipe);
args.setOutput(outputPipes);

val options = new ClientOptions("--phase", "match", "--conf", "dummy", "--license", "dummy", "--email", "xxx@yyy.com");

//Zingg execution for the given phase
val client = new Client(args, options);
client.init();
client.execute();
//the matched output is available via outputPipe.getRecords
outputPipe.getRecords().show()
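The script above relies on spark-shell REPL features (the :silent directives and the implicit spark session), so it is meant to be loaded into a spark-shell started with the Zingg client jar on the classpath rather than compiled. A minimal sketch of one way to run it; the jar path and script file name below are illustrative assumptions, not part of this commit:

// start spark-shell with the Zingg jar on the classpath (path is illustrative):
//   spark-shell --jars /path/to/zingg-client.jar
// then, inside the shell, load the script (file name is an assumption):
:load examples/febrl/FebrlExampleInMemory.scala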
@@ -0,0 +1,32 @@
package zingg.client.pipe;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

/** A Pipe that holds its records in memory as a Spark Dataset, with no external source or sink. */
public class InMemoryPipe extends Pipe {
	Dataset<Row> dataset;

	public InMemoryPipe(Dataset<Row> ds) {
		dataset = ds;
	}

	public InMemoryPipe() {
	}

	public InMemoryPipe(Pipe p) {
		clone(p);
	}

	public Dataset<Row> getRecords() {
		return dataset;
	}

	public void setRecords(Dataset<Row> ds) {
		dataset = ds;
	}

	@Override
	public Format getFormat() {
		return Format.INMEMORY;
	}
}
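For context, a minimal round trip through the new pipe from spark-shell, reusing the df read in the example script above (a sketch, not part of this commit):

val cache = new zingg.client.pipe.InMemoryPipe()
cache.setRecords(df)        // hold an existing Dataset[Row] in memory
cache.getRecords().count()  // read it back; no external source or sink involved
cache.getFormat()           // Format.INMEMORY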
@@ -0,0 +1,134 @@
package zingg.util;

import static org.apache.spark.sql.functions.col;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import zingg.BaseSparkTest;
import zingg.client.Arguments;
import zingg.client.FieldDefinition;
import zingg.client.MatchType;
import zingg.client.util.ColName;

public class TestDSUtil extends BaseSparkTest {
	public static final Log LOG = LogFactory.getLog(TestDSUtil.class);

	@Test
	public void testGetFieldDefColumnsWhenShowConciseIsTrue() {
		FieldDefinition def1 = new FieldDefinition();
		def1.setFieldName("field_fuzzy");
		def1.setDataType("\"string\"");
		def1.setMatchType(MatchType.FUZZY);
		def1.setFields("field_fuzzy");

		FieldDefinition def2 = new FieldDefinition();
		def2.setFieldName("field_match_type_DONT_USE");
		def2.setDataType("\"string\"");
		def2.setMatchType(MatchType.DONT_USE);
		def2.setFields("field_match_type_DONT_USE");

		FieldDefinition def3 = new FieldDefinition();
		def3.setFieldName("field_str_DONTspaceUSE");
		def3.setDataType("\"string\"");
		def3.setMatchType(MatchType.getMatchType("DONT USE"));
		def3.setFields("field_str_DONTspaceUSE");

		List<FieldDefinition> fieldDef = new ArrayList<FieldDefinition>();
		fieldDef.add(def1);
		fieldDef.add(def2);
		fieldDef.add(def3);
		Arguments args = null;
		try {
			args = new Arguments();
			args.setFieldDefinition(fieldDef);
		} catch (Throwable e) {
			e.printStackTrace();
		}
		StructType schema = DataTypes.createStructType(new StructField[] {
			DataTypes.createStructField(def1.getFieldName(), def1.getDataType(), false),
			DataTypes.createStructField(def2.getFieldName(), def2.getDataType(), false),
			DataTypes.createStructField(def3.getFieldName(), def3.getDataType(), false),
			DataTypes.createStructField(ColName.SOURCE_COL, DataTypes.StringType, false)
		});
		List<Row> list = Arrays.asList(RowFactory.create("1", "first", "one", "Junit"), RowFactory.create("2", "second", "two", "Junit"),
			RowFactory.create("3", "third", "three", "Junit"), RowFactory.create("4", "forth", "Four", "Junit"));
		Dataset<Row> ds = spark.createDataFrame(list, schema);

		List<String> expectedColumns = new ArrayList<String>();
		expectedColumns.add("field_fuzzy");
		expectedColumns.add(ColName.SOURCE_COL);
		List<Column> colList = DSUtil.getFieldDefColumns(ds, args, false, true);
		assertTrue(expectedColumns.size() == colList.size());
		for (int i = 0; i < expectedColumns.size(); i++) {
			assertTrue(expectedColumns.get(i).equals(colList.get(i).toString()));
		}
	}

	@Test
	public void testGetFieldDefColumnsWhenShowConciseIsFalse() {
		FieldDefinition def1 = new FieldDefinition();
		def1.setFieldName("field_fuzzy");
		def1.setDataType("\"string\"");
		def1.setMatchType(MatchType.FUZZY);
		def1.setFields("field_fuzzy");

		FieldDefinition def2 = new FieldDefinition();
		def2.setFieldName("field_match_type_DONT_USE");
		def2.setDataType("\"string\"");
		def2.setMatchType(MatchType.DONT_USE);
		def2.setFields("field_match_type_DONT_USE");

		FieldDefinition def3 = new FieldDefinition();
		def3.setFieldName("field_str_DONTspaceUSE");
		def3.setDataType("\"string\"");
		def3.setMatchType(MatchType.getMatchType("DONT USE"));
		def3.setFields("field_str_DONTspaceUSE");

		List<FieldDefinition> fieldDef = new ArrayList<FieldDefinition>();
		fieldDef.add(def1);
		fieldDef.add(def2);
		fieldDef.add(def3);
		Arguments args = null;
		try {
			args = new Arguments();
			args.setFieldDefinition(fieldDef);
		} catch (Throwable e) {
			e.printStackTrace();
		}
		StructType schema = DataTypes.createStructType(new StructField[] {
			DataTypes.createStructField(def1.getFieldName(), def1.getDataType(), false),
			DataTypes.createStructField(def2.getFieldName(), def2.getDataType(), false),
			DataTypes.createStructField(def3.getFieldName(), def3.getDataType(), false),
			DataTypes.createStructField(ColName.SOURCE_COL, DataTypes.StringType, false)
		});
		List<Row> list = Arrays.asList(RowFactory.create("1", "first", "one", "Junit"), RowFactory.create("2", "second", "two", "Junit"),
			RowFactory.create("3", "third", "three", "Junit"), RowFactory.create("4", "forth", "Four", "Junit"));
		Dataset<Row> ds = spark.createDataFrame(list, schema);

		List<Column> colListTest2 = DSUtil.getFieldDefColumns(ds, args, false, false);
		List<String> expectedColumnsTest2 = new ArrayList<String>();
		expectedColumnsTest2.add("field_fuzzy");
		expectedColumnsTest2.add("field_match_type_DONT_USE");
		expectedColumnsTest2.add("field_str_DONTspaceUSE");
		expectedColumnsTest2.add(ColName.SOURCE_COL);

		assertTrue(expectedColumnsTest2.size() == colListTest2.size());
		for (int i = 0; i < expectedColumnsTest2.size(); i++) {
			assertTrue(expectedColumnsTest2.get(i).contains(colListTest2.get(i).toString()));
		}
	}
}
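Read together, the two tests pin down the behaviour exercised here: with the concise flag set to true, DSUtil.getFieldDefColumns drops the DONT_USE field definitions and returns only the usable fields plus the source column, in definition order; with the flag set to false, every defined field plus the source column is returned.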