diff --git a/client/src/main/java/zingg/client/ClientOptions.java b/client/src/main/java/zingg/client/ClientOptions.java
index ea9f08722..9ffcd8a79 100644
--- a/client/src/main/java/zingg/client/ClientOptions.java
+++ b/client/src/main/java/zingg/client/ClientOptions.java
@@ -28,6 +28,7 @@ public class ClientOptions {
     public static final String MODEL_ID = "--modelId";
     public static final String COLLECT_METRICS = "--collectMetrics";
     public static final String SHOW_CONCISE = "--showConcise";
+    public static final String LOCATION = "--location";
 
     // Options that do not take arguments.
     public static final String HELP = "--help";
@@ -57,7 +58,8 @@ public class ClientOptions {
         optionMaster.put(MODEL_ID, new Option(MODEL_ID, true, "model identifier, can be a number ", false, false));
         optionMaster.put(COLLECT_METRICS, new Option(COLLECT_METRICS, true, "collect analytics, true/false ", false, false));
         optionMaster.put(SHOW_CONCISE, new Option(SHOW_CONCISE, true, "Display only fields that are used to make model, true/false ", false, false));
-
+        optionMaster.put(LOCATION, new Option(LOCATION, true, "directory in which exportedData.csv is written ", false, false));
+
         //no args
         optionMaster.put(HELP,new Option(HELP, false, "print usage information", true, false));
         optionMaster.put(HELP1,new Option(HELP1, false, " print usage information", true, false));
diff --git a/client/src/main/java/zingg/client/ZinggOptions.java b/client/src/main/java/zingg/client/ZinggOptions.java
index 31545d27a..30c6afd2e 100644
--- a/client/src/main/java/zingg/client/ZinggOptions.java
+++ b/client/src/main/java/zingg/client/ZinggOptions.java
@@ -14,9 +14,10 @@ public enum ZinggOptions {
     LINK("link"),
     GENERATE_DOCS("generateDocs"),
     UPDATE_LABEL("updateLabel"),
+    FIND_AND_LABEL("findAndLabel"),
     ASSESS_MODEL("assessModel"),
     PEEK_MODEL("peekModel"),
-    FIND_AND_LABEL("findAndLabel");
+    EXPORT_MODEL("exportModel");
 
     private String value;
 
diff --git a/python/phases/assessModel.py b/python/phases/assessModel.py
index c49043e53..1dfcd8968 100644
--- a/python/phases/assessModel.py
+++ b/python/phases/assessModel.py
@@ -13,7 +13,7 @@ def main():
 
     #excluding argv[0] that is nothing but the current executable file
     options = ClientOptions(sys.argv[1:])
-    options.setPhase("label")
+    options.setPhase("peekModel")
     arguments = Arguments.createArgumentsFromJSON(options.getConf(), options.getPhase())
     client = Zingg(arguments, options)
     client.init()
diff --git a/python/phases/exportModel.py b/python/phases/exportModel.py
new file mode 100644
index 000000000..f4c78749e
--- /dev/null
+++ b/python/phases/exportModel.py
@@ -0,0 +1,60 @@
+from zingg import *
+import argparse
+import sys
+import os
+
+logging.basicConfig(level=logging.INFO)
+LOG = logging.getLogger("zingg.exportModel")
+
+def main():
+    """Export the marked records of a model as a CSV file.
+
+    The --location option is mandatory; the process exits with status 1 without it.
+    """
+    # Parse the command line once, excluding argv[0] (the script itself).
+    options = ClientOptions(sys.argv[1:])
+
+    # checking for mandatory option --location for this phase
+    if not options.hasLocation():
+        LOG.error("--location argument is mandatory for this phase")
+        LOG.info("--location is the directory in which exportedData.csv is written")
+        # A non-zero status reports the failure to the calling process.
+        sys.exit(1)
+
+    LOG.info("Phase ExportModel starts")
+
+    options.setPhase("peekModel")
+    arguments = Arguments.createArgumentsFromJSON(options.getConf(), options.getPhase())
+    client = Zingg(arguments, options)
+    client.init()
+
+    pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords())
+    labelledData = spark.createDataFrame(pMarkedDF)
+    location = options.getLocation()
+
+    export_data(labelledData, location)
+
+    LOG.info("Phase ExportModel ends")
+
+def export_data(labelledData, location):
+    """Write the training columns of labelledData to <location>/exportedData.csv.
+
+    labelledData: Spark DataFrame holding the labelled records.
+    location: directory that receives exportedData.csv.
+    """
+    baseCols = ['z_cluster', 'z_zid', 'z_prediction', 'z_score', 'z_source', 'z_isMatch']
+    sourceDataColumns = [c for c in labelledData.columns if c not in baseCols]
+    additionalTrainingColumns = ['z_cluster','z_isMatch']
+    trainingSampleColumns = [*additionalTrainingColumns, *sourceDataColumns]
+    trainingSamples = labelledData.select(trainingSampleColumns)
+
+    # Preview the rows and column names that are about to be exported
+    trainingSamples.show()
+    print(trainingSampleColumns)
+
+    # Exporting the labelled data as CSV
+    trainingSamples.toPandas().to_csv(os.path.join(location,r'exportedData.csv'))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/phases/zingg.py b/python/phases/zingg.py
index 6ff8cb3dd..5713a4caf 100644
--- a/python/phases/zingg.py
+++ b/python/phases/zingg.py
@@ -104,6 +104,7 @@ class ClientOptions:
     CONF = sc._jvm.zingg.client.ClientOptions.CONF
     LICENSE = sc._jvm.zingg.client.ClientOptions.LICENSE
     EMAIL = sc._jvm.zingg.client.ClientOptions.EMAIL
+    LOCATION = sc._jvm.zingg.client.ClientOptions.LOCATION
 
     def __init__(self, arguments):
         self.co = sc._jvm.zingg.client.ClientOptions(arguments)
@@ -118,9 +119,13 @@ def getPhase(self):
         return self.co.get(ClientOptions.PHASE).getValue()
     def setPhase(self, newValue):
         return self.co.get(ClientOptions.PHASE).setValue(newValue)
-
     def getConf(self):
         return self.co.get(ClientOptions.CONF).getValue()
+    def hasLocation(self):
+        # None from the JVM-side lookup means no --location entry is present.
+        return self.co.get(ClientOptions.LOCATION) is not None
+    def getLocation(self):
+        return self.co.get(ClientOptions.LOCATION).getValue()
 
 class FieldDefinition:
     def __init__(self, name, dataType, *matchType):
diff --git a/scripts/zingg.sh b/scripts/zingg.sh
index 1dc775988..c8929e116 100755
--- a/scripts/zingg.sh
+++ b/scripts/zingg.sh
@@ -5,6 +5,7 @@ EMAIL=xxx@yyy.com
 LICENSE="test"
 # Dictionary of phases written in python
 declare -A PYTHON_PHASES=(["assessModel"]="python/phases/assessModel.py" \
+                        ["exportModel"]="python/phases/exportModel.py" \
 )
 
 if [[ -z "${ZINGG_EXTRA_JARS}" ]]; then