Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new python phase exportModel #325

Merged
merged 1 commit into from Jun 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion client/src/main/java/zingg/client/ClientOptions.java
Expand Up @@ -28,6 +28,7 @@ public class ClientOptions {
public static final String MODEL_ID = "--modelId";
public static final String COLLECT_METRICS = "--collectMetrics";
public static final String SHOW_CONCISE = "--showConcise";
public static final String LOCATION = "--location";

// Options that do not take arguments.
public static final String HELP = "--help";
Expand Down Expand Up @@ -57,7 +58,8 @@ public class ClientOptions {
optionMaster.put(MODEL_ID, new Option(MODEL_ID, true, "model identifier, can be a number ", false, false));
optionMaster.put(COLLECT_METRICS, new Option(COLLECT_METRICS, true, "collect analytics, true/false ", false, false));
optionMaster.put(SHOW_CONCISE, new Option(SHOW_CONCISE, true, "Display only fields that are used to make model, true/false ", false, false));

optionMaster.put(LOCATION, new Option(LOCATION, true, "location of CSV file for exported data ", false, false));

//no args
optionMaster.put(HELP,new Option(HELP, false, "print usage information", true, false));
optionMaster.put(HELP1,new Option(HELP1, false, " print usage information", true, false));
Expand Down
3 changes: 2 additions & 1 deletion client/src/main/java/zingg/client/ZinggOptions.java
Expand Up @@ -14,9 +14,10 @@ public enum ZinggOptions {
LINK("link"),
GENERATE_DOCS("generateDocs"),
UPDATE_LABEL("updateLabel"),
FIND_AND_LABEL("findAndLabel"),
ASSESS_MODEL("assessModel"),
PEEK_MODEL("peekModel"),
FIND_AND_LABEL("findAndLabel");
EXPORT_MODEL("exportModel");

private String value;

Expand Down
2 changes: 1 addition & 1 deletion python/phases/assessModel.py
Expand Up @@ -13,7 +13,7 @@ def main():

#excluding argv[0] that is nothing but the current executable file
options = ClientOptions(sys.argv[1:])
options.setPhase("label")
options.setPhase("peekModel")
arguments = Arguments.createArgumentsFromJSON(options.getConf(), options.getPhase())
client = Zingg(arguments, options)
client.init()
Expand Down
52 changes: 52 additions & 0 deletions python/phases/exportModel.py
@@ -0,0 +1,52 @@
from zingg import *
import argparse
import sys
import os

# Configure root logging at INFO level and create the logger used by this phase.
# NOTE(review): `logging` is not imported explicitly in this file; presumably it
# is brought into scope by `from zingg import *` — confirm.
logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger("zingg.exportModel")

def main():
    """Run the exportModel phase: export the marked records as a CSV file.

    Parses the command-line options (everything after ``argv[0]``), checks
    that ``--location`` was supplied, fetches the marked records through the
    Zingg client and passes them to ``export_data``.

    Exits the process with status 1 when ``--location`` is missing.
    """
    # Parse the options once; argv[0] is only the current executable file.
    options = ClientOptions(sys.argv[1:])

    # Checking for the mandatory option --location for this phase.
    if not options.hasLocation():
        LOG.error("--location argument is mandatory for this phase")
        LOG.info("--location is location of CSV file for exported data")
        # A bare sys.exit() reports success (status 0) to the calling shell;
        # a missing mandatory argument must be reported as a failure.
        sys.exit(1)

    LOG.info("Phase ExportModel starts")

    # NOTE(review): the phase is set to "peekModel", not "exportModel" —
    # presumably so the client only loads the model data; confirm.
    options.setPhase("peekModel")
    arguments = Arguments.createArgumentsFromJSON(options.getConf(), options.getPhase())
    client = Zingg(arguments, options)
    client.init()

    # Marked records come back as a pandas DataFrame and are converted to a
    # Spark DataFrame before export.
    pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords())
    labelledData = spark.createDataFrame(pMarkedDF)
    location = options.getLocation()

    export_data(labelledData, location)

    LOG.info("Phase ExportModel ends")

def export_data(labelledData, location):
    """Write the training-sample columns of ``labelledData`` to a CSV file.

    The exported columns are ``z_cluster`` and ``z_isMatch`` followed by every
    column of ``labelledData`` that is not one of the Zingg bookkeeping
    columns listed in ``baseCols``. The selected data is shown, the column
    list is printed, and the result is written to
    ``<location>/exportedData.csv``.

    Args:
        labelledData: DataFrame offering ``columns`` and ``select`` whose
            selection result offers ``show`` and ``toPandas``.
        location: Path that ``exportedData.csv`` is joined onto.
    """
    # Bookkeeping columns that are not part of the source data; a set gives
    # constant-time membership tests in the filter below.
    baseCols = {'z_cluster', 'z_zid', 'z_prediction', 'z_score', 'z_source', 'z_isMatch'}
    sourceDataColumns = [c for c in labelledData.columns if c not in baseCols]
    trainingSampleColumns = ['z_cluster', 'z_isMatch', *sourceDataColumns]
    trainingSamples = labelledData.select(trainingSampleColumns)

    # Display what is about to be exported.
    trainingSamples.show()
    print(trainingSampleColumns)

    # Exporting the labelled data as CSV.
    # NOTE(review): to_csv is called with its defaults, so the pandas index is
    # written as an extra first column — confirm that is intended.
    trainingSamples.toPandas().to_csv(os.path.join(location, 'exportedData.csv'))


# Script entry point: call main() only when this file is executed directly.
if __name__ == "__main__":
main()
9 changes: 8 additions & 1 deletion python/phases/zingg.py
Expand Up @@ -104,6 +104,7 @@ class ClientOptions:
CONF = sc._jvm.zingg.client.ClientOptions.CONF
LICENSE = sc._jvm.zingg.client.ClientOptions.LICENSE
EMAIL = sc._jvm.zingg.client.ClientOptions.EMAIL
LOCATION = sc._jvm.zingg.client.ClientOptions.LOCATION

def __init__(self, arguments):
self.co = sc._jvm.zingg.client.ClientOptions(arguments)
Expand All @@ -118,9 +119,15 @@ def getPhase(self):
return self.co.get(ClientOptions.PHASE).getValue()
def setPhase(self, newValue):
    """Set the PHASE option to ``newValue`` and return the result of the setter call."""
    phaseOption = self.co.get(ClientOptions.PHASE)
    return phaseOption.setValue(newValue)

def getConf(self):
    """Return the value stored in the CONF option."""
    confOption = self.co.get(ClientOptions.CONF)
    return confOption.getValue()
def hasLocation(self):
    """Return True when the LOCATION option is present, False when the lookup yields None."""
    # Identity test instead of `== None`: it cannot be redirected through the
    # looked-up object's own equality method.
    return self.co.get(ClientOptions.LOCATION) is not None
def getLocation(self):
    """Return the value stored in the LOCATION option."""
    locationOption = self.co.get(ClientOptions.LOCATION)
    return locationOption.getValue()

class FieldDefinition:
def __init__(self, name, dataType, *matchType):
Expand Down
1 change: 1 addition & 0 deletions scripts/zingg.sh
Expand Up @@ -5,6 +5,7 @@ EMAIL=xxx@yyy.com
LICENSE="test"
# Dictionary of phases written in python
declare -A PYTHON_PHASES=(["assessModel"]="python/phases/assessModel.py" \
["exportModel"]="python/phases/exportModel.py" \
)

if [[ -z "${ZINGG_EXTRA_JARS}" ]]; then
Expand Down