Skip to content

Commit

Permalink
inmemory pipe #441
Browse files Browse the repository at this point in the history
  • Loading branch information
sonalgoyal committed Mar 29, 2023
1 parent 04006b9 commit dc9f906
Show file tree
Hide file tree
Showing 63 changed files with 64 additions and 78 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public class Pipe<D,R,C> implements Serializable{ // St:StructType, Sv:SaveMode
String preprocessors;
Map<String, String> props = new HashMap<String, String>();
int id;
ZFrame<D, R, C> dataset;
protected ZFrame<D, R, C> dataset;
String schema;
String mode;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ private ZFrame<D,R,C> read(DFReader<D,R,C> reader, Pipe<D,R,C> p, boolean addSo
LOG.warn("Reading " + p);
try {

if (p.getFormat() == Pipe.FORMAT_INMEMORY) {
if (p.getFormat().equals(Pipe.FORMAT_INMEMORY)) {
input = p.getDataset(); //.df();
}
else {
Expand Down Expand Up @@ -194,7 +194,7 @@ public void write(ZFrame<D,R,C> toWriteOrig, Arguments args,

LOG.warn("Writing output " + p);

if (p.getFormat() == Pipe.FORMAT_INMEMORY) {
if (p.getFormat().equals(Pipe.FORMAT_INMEMORY)) {
p.setDataset(toWriteOrig);
return;
}
Expand Down
26 changes: 4 additions & 22 deletions examples/amazon-google/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,16 @@
"header":false
}
}],
"data" : [{
"data" :[{
"name":"aws",
"format":"csv",
"props": {
"location": "examples/amazon-google/Amazon.csv",
"delimiter": ",",
"header":true
},
"schema":
"{\"type\" : \"struct\",
\"fields\" : [
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
{\"name\":\"title\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"description\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"manufacturer\",\"type\":\"string\",\"nullable\":true} ,
{\"name\":\"price\", \"type\":\"double\", \"nullable\":true}
]
}"
},
"schema": "id string, title string, description string, manufacturer string, price double"
},
{
"name":"google",
"format":"csv",
Expand All @@ -66,16 +57,7 @@
"delimiter": ",",
"header":true
},
"schema":
"{\"type\" : \"struct\",
\"fields\" : [
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
{\"name\":\"title\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"description\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"manufacturer\",\"type\":\"string\",\"nullable\":true} ,
{\"name\":\"price\", \"type\":\"double\", \"nullable\":true}
]
}"
"schema": "id string, title string, description string, manufacturer string, price double"
}
],
"labelDataSampleSize" : 0.4,
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1658797719173,"sparkVersion":"3.1.2","uid":"pipeline_46ab3b798dc1","paramMap":{"stageUids":["vecAssembler_cb42757e866f","poly_1c7b648c7b13","logreg_a2f6b7295985"]},"defaultParamMap":{}}
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1680013839842,"sparkVersion":"3.1.2","uid":"pipeline_6bdb624d34ef","paramMap":{"stageUids":["vecAssembler_d00f2c39b11a","poly_443766300674","logreg_0079fab6ffc2"]},"defaultParamMap":{}}
Binary file not shown.

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1680013839937,"sparkVersion":"3.1.2","uid":"vecAssembler_d00f2c39b11a","paramMap":{"outputCol":"z_featurevector","inputCols":["z_sim0","z_sim1","z_sim2","z_sim3","z_sim4"]},"defaultParamMap":{"handleInvalid":"error","outputCol":"vecAssembler_d00f2c39b11a__output"}}
Binary file not shown.

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.PolynomialExpansion","timestamp":1680013840111,"sparkVersion":"3.1.2","uid":"poly_443766300674","paramMap":{"outputCol":"z_feature","degree":3,"inputCol":"z_featurevector"},"defaultParamMap":{"outputCol":"poly_443766300674__output","degree":2}}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.classification.LogisticRegressionModel","timestamp":1680013840199,"sparkVersion":"3.1.2","uid":"logreg_0079fab6ffc2","paramMap":{"predictionCol":"z_prediction","labelCol":"z_isMatch","fitIntercept":true,"regParam":1.0,"maxIter":100,"featuresCol":"z_feature","threshold":0.4,"probabilityCol":"z_probability"},"defaultParamMap":{"family":"auto","predictionCol":"prediction","labelCol":"label","standardization":true,"fitIntercept":true,"regParam":0.0,"aggregationDepth":2,"maxBlockSizeInMB":0.0,"rawPredictionCol":"rawPrediction","elasticNetParam":0.0,"maxIter":100,"featuresCol":"features","threshold":0.5,"tol":1.0E-6,"probabilityCol":"probability"}}
Binary file not shown.
Binary file not shown.

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"class":"org.apache.spark.ml.Pipeline","timestamp":1658797718738,"sparkVersion":"3.1.2","uid":"pipeline_46ab3b798dc1","paramMap":{"stageUids":["vecAssembler_cb42757e866f","poly_1c7b648c7b13","logreg_a2f6b7295985"]},"defaultParamMap":{}}
{"class":"org.apache.spark.ml.Pipeline","timestamp":1680013839351,"sparkVersion":"3.1.2","uid":"pipeline_6bdb624d34ef","paramMap":{"stageUids":["vecAssembler_d00f2c39b11a","poly_443766300674","logreg_0079fab6ffc2"]},"defaultParamMap":{}}
Binary file not shown.

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1680013839542,"sparkVersion":"3.1.2","uid":"vecAssembler_d00f2c39b11a","paramMap":{"outputCol":"z_featurevector","inputCols":["z_sim0","z_sim1","z_sim2","z_sim3","z_sim4"]},"defaultParamMap":{"handleInvalid":"error","outputCol":"vecAssembler_d00f2c39b11a__output"}}
Binary file not shown.

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.PolynomialExpansion","timestamp":1680013839648,"sparkVersion":"3.1.2","uid":"poly_443766300674","paramMap":{"outputCol":"z_feature","degree":3,"inputCol":"z_featurevector"},"defaultParamMap":{"outputCol":"poly_443766300674__output","degree":2}}
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.classification.LogisticRegression","timestamp":1680013839756,"sparkVersion":"3.1.2","uid":"logreg_0079fab6ffc2","paramMap":{"predictionCol":"z_prediction","labelCol":"z_isMatch","fitIntercept":true,"featuresCol":"z_feature","maxIter":100,"probabilityCol":"z_probability"},"defaultParamMap":{"family":"auto","predictionCol":"prediction","labelCol":"label","standardization":true,"fitIntercept":true,"regParam":0.0,"aggregationDepth":2,"maxBlockSizeInMB":0.0,"rawPredictionCol":"rawPrediction","elasticNetParam":0.0,"maxIter":100,"featuresCol":"features","threshold":0.5,"tol":1.0E-6,"probabilityCol":"probability"}}
Binary file not shown.

This file was deleted.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"class":"org.apache.spark.ml.evaluation.BinaryClassificationEvaluator","timestamp":1658797718627,"sparkVersion":"3.1.2","uid":"binEval_29fc509a0ee2","paramMap":{"labelCol":"z_isMatch"},"defaultParamMap":{"rawPredictionCol":"rawPrediction","numBins":1000,"metricName":"areaUnderROC","labelCol":"label"}}
{"class":"org.apache.spark.ml.evaluation.BinaryClassificationEvaluator","timestamp":1680013839161,"sparkVersion":"3.1.2","uid":"binEval_37a0d9e27f7b","paramMap":{"labelCol":"z_isMatch"},"defaultParamMap":{"metricName":"areaUnderROC","numBins":1000,"rawPredictionCol":"rawPrediction","labelCol":"label"}}
Binary file modified models/103/model/classifier/best.model/metadata/.part-00000.crc
Binary file not shown.
2 changes: 1 addition & 1 deletion models/103/model/classifier/best.model/metadata/part-00000
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"class":"org.apache.spark.ml.tuning.CrossValidatorModel","timestamp":1658797718446,"sparkVersion":"3.1.2","uid":"cv_21102e46a666","paramMap":{"numFolds":2,"seed":-1191137437,"foldCol":"","estimatorParamMaps":[[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_a2f6b7295985
","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_a2f6b7295985","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_a2f6b7295985","name":"threshold","value":"0.55","isJson":"true"}]]},"defaultParamMap":{"numFolds":3,"seed":-1191137437,"foldCol":""},"avgMetrics":[0.6234817813765183,0.6234817813765183,0.6234817813765183,0.6234817813765183,0.6060222672064778,0.6060222672064778,0.6060222672064778,0.6060222672064778,0.6629554655870447,0.6629554655870447,0.6629554655870447,0.6629554655870447,0.7687246963562754,0.7687246963562754,0.7687246963562754,0.7687246963562754,0.7820091093117408,0.7820091093117408,0.7820091093117408,0.782009109311
7408],"persistSubModels":false}
{"class":"org.apache.spark.ml.tuning.CrossValidatorModel","timestamp":1680013838806,"sparkVersion":"3.1.2","uid":"cv_07757d959e12","paramMap":{"foldCol":"","numFolds":2,"seed":-1191137437,"estimatorParamMaps":[[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name"
:"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_0079fab6ffc2","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_0079fab6ffc2","name":"threshold","value":"0.55","isJson":"true"}]]},"defaultParamMap":{"foldCol":"","numFolds":3,"seed":-1191137437},"avgMetrics":[0.582995951417004,0.6077935222672064,0.6620698380566802,0.7555668016194332,0.768092105263158,0.582995951417004,0.6077935222672064,0.6620698380566802,0.7555668016194332,0.768092105263158,0.582995951417004,0.6077935222672064,0.6620698380566802,0.7555668016194332,0.768092105263158,0.582995951417004,0.6077935222672064,0.6620698380566802,0.7555668016194332,0.768092105263158],"p
ersistSubModels":false}
4 changes: 2 additions & 2 deletions python/phases/assessModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def main():

#exec(open(sys.argv[2] + ".py"))

pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords())
pUnmarkedDF = client.getPandasDfFromDs(client.getUnmarkedRecords())
pMarkedDF = getPandasDfFromDs(client.getMarkedRecords())
pUnmarkedDF = getPandasDfFromDs(client.getUnmarkedRecords())

total_marked = pMarkedDF.shape[0]
total_unmarked = pUnmarkedDF.shape[0]
Expand Down
2 changes: 1 addition & 1 deletion python/phases/exportModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def main():
client = ZinggWithSpark(arguments, options)
client.init()

pMarkedDF = client.getPandasDfFromDs(client.getMarkedRecords())
pMarkedDF = getPandasDfFromDs(client.getMarkedRecords())
labelledData = spark.createDataFrame(pMarkedDF)
location = options.getLocation()

Expand Down
42 changes: 21 additions & 21 deletions python/zingg/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,26 @@
ColName = jvm.zingg.common.client.util.ColName
MatchType = jvm.zingg.common.client.MatchType

def getDfFromDs(data):
    """Wrap a dataset handle in a ``DataFrame``.

    Calls ``data.df()`` and hands the result, together with the
    ``sqlContext`` name from the enclosing module scope, to the
    ``DataFrame`` constructor.

    :param data: dataset object exposing a ``df()`` method
    :type data: DataSet
    :return: ``DataFrame`` built from ``data.df()`` and ``sqlContext``
    :rtype: DataFrame
    """
    underlying = data.df()
    return DataFrame(underlying, sqlContext)

def getPandasDfFromDs(data):
    """Materialise a dataset as a pandas ``DataFrame``.

    The dataset is first converted with :func:`getDfFromDs`; its rows are
    then fetched with ``collect()`` and loaded into pandas under the same
    column names.

    :param data: dataset object accepted by :func:`getDfFromDs`
    :type data: DataSet
    :return: pandas dataframe holding every collected row
    :rtype: pandas.DataFrame
    """
    frame = getDfFromDs(data)
    # NOTE(review): collect() presumably pulls every row into this process --
    # confirm this is acceptable for large inputs.
    rows = frame.collect()
    return pd.DataFrame(rows, columns=frame.columns)


class Zingg:
Expand Down Expand Up @@ -141,27 +161,7 @@ def getUnsureMarkedRecordsStat(self):
"""
return self.client.getUnsureMarkedRecordsStat(self.getMarkedRecords())

def getDfFromDs(self, data):
    """Wrap a dataset handle in a ``DataFrame``.

    Calls ``data.df()`` and hands the result, together with the
    ``sqlContext`` name from the enclosing module scope, to the
    ``DataFrame`` constructor. No instance state is read.

    :param data: dataset object exposing a ``df()`` method
    :type data: DataSet
    :return: ``DataFrame`` built from ``data.df()`` and ``sqlContext``
    :rtype: DataFrame
    """
    underlying = data.df()
    return DataFrame(underlying, sqlContext)

def getPandasDfFromDs(self, data):
    """Materialise a dataset as a pandas ``DataFrame``.

    The dataset is first converted with ``self.getDfFromDs``; its rows are
    then fetched with ``collect()`` and loaded into pandas under the same
    column names.

    :param data: dataset object accepted by ``self.getDfFromDs``
    :type data: DataSet
    :return: pandas dataframe holding every collected row
    :rtype: pandas.DataFrame
    """
    frame = self.getDfFromDs(data)
    # NOTE(review): collect() presumably pulls every row into this process --
    # confirm this is acceptable for large inputs.
    rows = frame.collect()
    return pd.DataFrame(rows, columns=frame.columns)



class ZinggWithSpark(Zingg):

Expand Down

0 comments on commit dc9f906

Please sign in to comment.