Skip to content

Commit

Permalink
Merge pull request #325 from RavirajBaraiya/exportModel
Browse files Browse the repository at this point in the history
new python phase exportModel
  • Loading branch information
sonalgoyal committed Jun 10, 2022
2 parents d3c5eaf + c64fa1a commit c695950
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 4 deletions.
4 changes: 3 additions & 1 deletion client/src/main/java/zingg/client/ClientOptions.java
Expand Up @@ -28,6 +28,7 @@ public class ClientOptions {
public static final String MODEL_ID = "--modelId";
public static final String COLLECT_METRICS = "--collectMetrics";
public static final String SHOW_CONCISE = "--showConcise";
public static final String LOCATION = "--location";

// Options that do not take arguments.
public static final String HELP = "--help";
Expand Down Expand Up @@ -57,7 +58,8 @@ public class ClientOptions {
optionMaster.put(MODEL_ID, new Option(MODEL_ID, true, "model identifier, can be a number ", false, false));
optionMaster.put(COLLECT_METRICS, new Option(COLLECT_METRICS, true, "collect analytics, true/false ", false, false));
optionMaster.put(SHOW_CONCISE, new Option(SHOW_CONCISE, true, "Display only fields that are used to make model, true/false ", false, false));

optionMaster.put(LOCATION, new Option(LOCATION, true, "location of CSV file for exported data ", false, false));

//no args
optionMaster.put(HELP,new Option(HELP, false, "print usage information", true, false));
optionMaster.put(HELP1,new Option(HELP1, false, " print usage information", true, false));
Expand Down
3 changes: 2 additions & 1 deletion client/src/main/java/zingg/client/ZinggOptions.java
Expand Up @@ -14,9 +14,10 @@ public enum ZinggOptions {
LINK("link"),
GENERATE_DOCS("generateDocs"),
UPDATE_LABEL("updateLabel"),
FIND_AND_LABEL("findAndLabel"),
ASSESS_MODEL("assessModel"),
PEEK_MODEL("peekModel"),
FIND_AND_LABEL("findAndLabel");
EXPORT_MODEL("exportModel");

private String value;

Expand Down
2 changes: 1 addition & 1 deletion python/phases/assessModel.py
Expand Up @@ -13,7 +13,7 @@ def main():

#excluding argv[0] that is nothing but the current executable file
options = ClientOptions(sys.argv[1:])
options.setPhase("label")
options.setPhase("peekModel")
arguments = Arguments.createArgumentsFromJSON(options.getConf(), options.getPhase())
client = Zingg(arguments, options)
client.init()
Expand Down
52 changes: 52 additions & 0 deletions python/phases/exportModel.py
@@ -0,0 +1,52 @@
from zingg import *
import argparse
import sys
import os

# Configure logging at INFO level and create the logger used by this phase.
# NOTE(review): `logging` is not imported explicitly in this file; presumably
# it is brought into scope by `from zingg import *` - confirm.
logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger("zingg.exportModel")

def main():
    """Run the exportModel phase.

    Everything after ``sys.argv[0]`` is handed to ``ClientOptions``. The
    ``--location`` option is mandatory for this phase: when it is missing an
    error is logged and the process exits with a non-zero status. Otherwise
    the marked records are fetched from the Zingg client, converted to a
    Spark DataFrame and passed to ``export_data`` together with the location.
    """
    # Excluding argv[0], which is nothing but the current executable file.
    # The options are parsed once and reused for the check and the run.
    options = ClientOptions(sys.argv[1:])

    # Checking for the mandatory option --location for this phase.
    if not options.hasLocation():
        LOG.error("--location argument is mandatory for this phase")
        LOG.info("--location is location of CSV file for exported data")
        # Exit with a failure status so calling scripts can detect the error;
        # a bare sys.exit() would report success (status 0).
        sys.exit(1)

    LOG.info("Phase ExportModel starts")

    # NOTE(review): the phase is set to "peekModel" rather than "exportModel";
    # presumably this reuses that phase's client setup - confirm.
    options.setPhase("peekModel")
    arguments = Arguments.createArgumentsFromJSON(options.getConf(), options.getPhase())
    client = Zingg(arguments, options)
    client.init()

    pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords())
    labelledData = spark.createDataFrame(pMarkedDF)
    location = options.getLocation()

    export_data(labelledData, location)

    LOG.info("Phase ExportModel ends")

def export_data(labelledData, location):
    """Write the training columns of ``labelledData`` to ``exportedData.csv``.

    The exported columns are ``z_cluster`` and ``z_isMatch`` followed by every
    column of ``labelledData`` that is not one of the ``z_``-prefixed columns
    listed in ``baseCols``, in their original order. A preview of the selected
    data and the list of exported column names are printed before the file is
    written.

    Args:
        labelledData: DataFrame providing ``columns`` and ``select``; the
            selected result must provide ``show`` and ``toPandas`` (``main``
            passes a Spark DataFrame).
        location: Directory path; the file is written to
            ``os.path.join(location, 'exportedData.csv')``.
    """
    baseCols = ['z_cluster', 'z_zid', 'z_prediction', 'z_score', 'z_source', 'z_isMatch']
    sourceDataColumns = [c for c in labelledData.columns if c not in baseCols]
    additionalTrainingColumns = ['z_cluster', 'z_isMatch']
    trainingSampleColumns = [*additionalTrainingColumns, *sourceDataColumns]
    trainingSamples = labelledData.select(trainingSampleColumns)

    # Show a preview of the rows and the exported column names.
    trainingSamples.show()
    print(trainingSampleColumns)

    # Exporting the labelled data as CSV
    trainingSamples.toPandas().to_csv(os.path.join(location, r'exportedData.csv'))


# Script entry point: run the phase only when this file is executed directly.
if __name__ == "__main__":
main()
9 changes: 8 additions & 1 deletion python/phases/zingg.py
Expand Up @@ -104,6 +104,7 @@ class ClientOptions:
CONF = sc._jvm.zingg.client.ClientOptions.CONF
LICENSE = sc._jvm.zingg.client.ClientOptions.LICENSE
EMAIL = sc._jvm.zingg.client.ClientOptions.EMAIL
LOCATION = sc._jvm.zingg.client.ClientOptions.LOCATION

def __init__(self, arguments):
self.co = sc._jvm.zingg.client.ClientOptions(arguments)
Expand All @@ -118,9 +119,15 @@ def getPhase(self):
return self.co.get(ClientOptions.PHASE).getValue()
def setPhase(self, newValue):
    """Set the PHASE option of the wrapped options object to ``newValue``.

    Returns whatever the option's ``setValue`` call returns.
    """
    phase_option = self.co.get(ClientOptions.PHASE)
    return phase_option.setValue(newValue)

def getConf(self):
    """Return the value of the CONF option from the wrapped options object."""
    conf_option = self.co.get(ClientOptions.CONF)
    return conf_option.getValue()
def hasLocation(self):
    """Return True when the LOCATION option is present, False otherwise.

    The option counts as present when looking it up on the wrapped options
    object yields something other than ``None``.
    """
    # Identity comparison is the idiomatic None test and yields the boolean
    # directly, replacing the `== None` check with explicit True/False branches.
    return self.co.get(ClientOptions.LOCATION) is not None
def getLocation(self):
    """Return the value of the LOCATION option from the wrapped options object.

    The lookup result is used without a None check, so call ``hasLocation``
    first when the option may be absent.
    """
    location_option = self.co.get(ClientOptions.LOCATION)
    return location_option.getValue()

class FieldDefinition:
def __init__(self, name, dataType, *matchType):
Expand Down
1 change: 1 addition & 0 deletions scripts/zingg.sh
Expand Up @@ -5,6 +5,7 @@ EMAIL=xxx@yyy.com
LICENSE="test"
# Dictionary of phases written in python
declare -A PYTHON_PHASES=(["assessModel"]="python/phases/assessModel.py" \
["exportModel"]="python/phases/exportModel.py" \
)

if [[ -z "${ZINGG_EXTRA_JARS}" ]]; then
Expand Down

0 comments on commit c695950

Please sign in to comment.