zinggAI · sonalgoyal · Feb 13, 2024 · Feb 5, 2024 · Feb 7, 2024
diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonClassProcessor.java
@@ -36,9 +36,12 @@ public boolean process(Set<? extends TypeElement> annotations, RoundEnvironment
             if (element.getKind() == ElementKind.CLASS) {
                 TypeElement classElement = (TypeElement) element;
                 PackageElement packageElement = (PackageElement) classElement.getEnclosingElement();
+                String packageName = packageElement.getQualifiedName().toString();
                 List<String> methodNames = new ArrayList<>();
+
+                String outputDirectory = determineOutputDirectory(packageName);
 
-                try (FileWriter fileWriter = new FileWriter("python/zingg"+ File.separator + element.getSimpleName() + "Generated.py")) {
+                try (FileWriter fileWriter = new FileWriter(outputDirectory + File.separator + element.getSimpleName() + "Generated.py")) {
                     generateImportsAndDeclarations(element, fileWriter);
 
                     fileWriter.write("class " + element.getSimpleName() + ":\n");
@@ -68,6 +71,18 @@ Map<String, List<String>> getClassMethodsMap() {
         return classMethodsMap;
     }
 
+    /** Picks the output directory for generated Python from the Java package name. */
+    private String determineOutputDirectory(String packageName) {
+        if (!packageName.contains("enterprise")) {
+            return "python/zingg"; // non-enterprise packages use the default directory
+        }
+        // The first matching module wins, checked as common, snowflake, then spark.
+        return packageName.contains("common") ? "common/python"
+                : packageName.contains("snowflake") ? "snowflake/python"
+                : packageName.contains("spark") ? "spark/python"
+                : "python/zingg";
+    }
+
     private void generateImportsAndDeclarations(Element element, FileWriter fileWriter) throws IOException {
         fileWriter.write("from zingg.otherThanGenerated import *\n");
         if (element.getSimpleName().contentEquals("Pipe")) {
@@ -79,6 +94,13 @@ private void generateImportsAndDeclarations(Element element, FileWriter fileWrit
             fileWriter.write("JStructType = getJVM().org.apache.spark.sql.types.StructType\n");
             fileWriter.write("\n");
         }
+
+        String javadoc = processingEnv.getElementUtils().getDocComment(element);
+        if (javadoc != null) {
+            fileWriter.write("'''\n");
+            fileWriter.write(javadoc.trim());
+            fileWriter.write("\n'''\n");
+        }
     }
 
     private void generateClassInitializationCode(TypeElement classElement, Element element, FileWriter fileWriter) throws IOException {
@@ -87,9 +109,16 @@ private void generateClassInitializationCode(TypeElement classElement, Element e
             fileWriter.write("        self." + element.getSimpleName().toString().toLowerCase() + ".setName(name)\n");
             fileWriter.write("        self." + element.getSimpleName().toString().toLowerCase() + ".setFormat(format)\n");
         }
+        else if (element.getSimpleName().contentEquals("EPipe")) {
+            fileWriter.write("        self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()\n");
+            fileWriter.write("        self." + element.getSimpleName().toString().toLowerCase() + ".setPassthroughExpr(passthroughExpr)\n");
+        }
         else if (element.getSimpleName().contentEquals("Arguments")) {
             fileWriter.write("        self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n");
         }
+        else if (element.getSimpleName().contentEquals("EArguments")) {
+            fileWriter.write("        self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n");
+        }
         else if (element.getSimpleName().contentEquals("FieldDefinition")) {
             fileWriter.write("        self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.FieldDefinition()\n");
             fileWriter.write("        self." + element.getSimpleName().toString().toLowerCase() + ".setFieldName(name)\n");

diff --git a/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java b/common/py/src/main/java/zingg/common/py/processors/PythonMethodProcessor.java
@@ -36,6 +36,14 @@ public boolean process(Set<? extends TypeElement> annotations, RoundEnvironment
 
                     if (methodNames.contains(methodElement.getSimpleName().toString())) {
                         try (FileWriter fileWriter = new FileWriter("python/zingg" + File.separator + className + "Generated.py", true)) {
+
+                            String javadoc = processingEnv.getElementUtils().getDocComment(methodElement);
+                            if (javadoc != null) {
+                                fileWriter.write("    '''\n");
+                                fileWriter.write(javadoc.trim());
+                                fileWriter.write("\n    '''\n");
+                            }
+
                             fileWriter.write("    def " + methodElement.getSimpleName() + "(self" + generateMethodSignature(methodElement) + "):\n");
                             generateMethodReturn(methodElement, fileWriter);
                             generateFieldAssignment(methodElement, fileWriter);

diff --git a/python/zingg/ArgumentsGenerated.py b/python/zingg/ArgumentsGenerated.py
@@ -1,38 +1,163 @@
 from zingg.otherThanGenerated import *
+'''
+This class helps supply match arguments to Zingg. There are 3 basic steps
+ in any match process.
+ <ul>
+ <li>Defining - specifying information about data location, fields and our
+ notion of similarity.
+ <li>Training - making Zingg learn the matching rules
+ <li>Matching - Running the models on entire dataset
+ </ul>
+ <p>
+ There is another step, creating labeled data, which can be used to create
+ training data if none is present. Let us cover them in greater detail through
+ an example.
+ <p>
+ We have some positive and negative labeled examples from which we want
+ Zingg to learn. These are saved in
+ <p>
+ /path/to/training/data/positive.csv and
+ <p>
+ /path/to/training/data/negative.csv
+ <p>
+ Our actual data has colA,colB,colC,colD,colE with comma as the delimiter and
+ is saved at
+ <p>
+ /path/to/match/data.csv.
+ <p>
+ We want to match on colB and colD only, one of which is String and other is
+ int
+ <p>
+ Our program would look like
+
+ <pre>
+ {
+ 	&#064;code
+ 	Arguments args = new Arguments();
+ 	args.setDelimiter(&quot;,&quot;);
+ 	args.setPositiveTrainingSamples(&quot;/path/to/training/data/positive.csv&quot;);
+ 	args.setNegativeTrainingSamples(&quot;/path/to/training/data/negative.csv&quot;);
+
+ 	FieldDefinition colB = new FieldDefinition(1, FieldClass.STRING,
+ 			FieldType.WORD);
+ 	FieldDefinition colD = new FieldDefinition(3, FieldClass.INTEGER,
+ 			FieldType.NUMERIC);
+
+ 	List&lt;FieldDefinition&gt; fields = new ArrayList&lt;FieldDefinition&gt;();
+ 	fields.add(colB);
+ 	fields.add(colD);
+ 	args.setFieldDefinition(fields);
+
+ 	args.setMatchData(&quot;/path/to/match/data.csv&quot;);
+
+ 	args.setZinggDir(&quot;/path/to/models&quot;);
+ 	args.setOutputDir(&quot;/path/to/match/output&quot;);
+
+ 	Client client = new Client(args, &quot;local&quot;);
+ 	client.train();
+ 	client.run();
+ }
+ </pre>
+'''
 class Arguments:
     def __init__(self):
         self.arguments = getJVM().zingg.common.client.Arguments()
 
     def setNumPartitions(self, numPartitions):
         self.arguments.setNumPartitions(numPartitions)
 
+    '''
+Set the fraction of data to be used from complete data set to be used for
+ seeding the labelled data Labelling is costly and we want a fast
+ approximate way of looking at a small sample of the records and
+ identifying expected matches and non matches
+
+ @param labelDataSampleSize
+            - float between 0 and 1 denoting portion of dataset to use in
+            generating seed samples
+ @throws ZinggClientException
+    '''
     def setLabelDataSampleSize(self, labelDataSampleSize):
         self.arguments.setLabelDataSampleSize(labelDataSampleSize)
 
+    '''
+Location for internal Zingg use.
+
+ @return the path for internal Zingg usage
+
+	public Pipe[] getZinggInternal() {
+		return zinggInternal;
+	}
+
+	/**
+ Set the location for Zingg to save its internal computations and
+ models. Please set it to a place where the program has write access.
+
+ @param zinggDir
+            path to the Zingg directory
+
+	public void setZinggInternal(Pipe[] zinggDir) {
+		this.zinggInternal = zinggDir;
+	}
+    '''
     def getModelId(self):
         return self.arguments.getModelId()
 
     def setModelId(self, modelId):
         self.arguments.setModelId(modelId)
 
+    '''
+Set the output directory where the match result will be saved
+
+ @param outputDir
+            where the match result is saved
+ @throws ZinggClientException
+    '''
     def setOutput(self, outputDir):
         self.arguments.setOutput(outputDir)
 
+    '''
+Set the location for Zingg to save its internal computations and
+ models. Please set it to a place where the program has write access.
+
+ @param zinggDir
+            path to the Zingg directory
+    '''
     def setZinggDir(self, zinggDir):
         self.arguments.setZinggDir(zinggDir)
 
+    '''
+Location for internal Zingg use.
+
+ @return the path for internal Zingg usage
+    '''
     def getZinggBaseModelDir(self):
         return self.arguments.getZinggBaseModelDir()
 
     def getZinggModelDir(self):
         return self.arguments.getZinggModelDir()
 
+    '''
+Location for internal Zingg use.
+
+ @return the path for internal Zingg usage
+    '''
     def getZinggBaseTrainingDataDir(self):
         return self.arguments.getZinggBaseTrainingDataDir()
 
+    '''
+Location for internal Zingg use.
+
+ @return the path for internal Zingg usage
+    '''
     def getZinggTrainingDataUnmarkedDir(self):
         return self.arguments.getZinggTrainingDataUnmarkedDir()
 
+    '''
+Location for internal Zingg use.
+
+ @return the path for internal Zingg usage
+    '''
     def getZinggTrainingDataMarkedDir(self):
         return self.arguments.getZinggTrainingDataMarkedDir()
 

diff --git a/python/zingg/FieldDefinitionGenerated.py b/python/zingg/FieldDefinitionGenerated.py
@@ -1,4 +1,10 @@
 from zingg.otherThanGenerated import *
+'''
+This class defines each field that we use in matching We can use this to
+ configure the properties of each field we use for matching in Zingg.
+
+ @author sgoyal
+'''
 class FieldDefinition:
     def __init__(self, name, dataType, *matchType):
         self.fielddefinition = getJVM().zingg.common.client.FieldDefinition()
@@ -13,6 +19,13 @@ def getFieldDefinition(self):
     def setFields(self, fields):
         self.fielddefinition.setFields(fields)
 
+    '''
+Set the field type which defines the kind of matching we want to do
+
+ @see MatchType
+ @param type
+            the type to set
+    '''
     def setMatchType(self, type):
         self.fielddefinition.setMatchType(type)
 

diff --git a/python/zingg/PipeGenerated.py b/python/zingg/PipeGenerated.py
@@ -6,6 +6,11 @@
 FilePipe = getJVM().zingg.common.client.pipe.FilePipe
 JStructType = getJVM().org.apache.spark.sql.types.StructType
 
+'''
+Actual pipe def in the args. One pipe can be used at multiple places with different tables, locations, queries etc
+
+ @author sgoyal
+'''
 class Pipe:
     def __init__(self, name, format):
         self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe()