Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

javadoc support for generated python code #776

Merged
merged 2 commits into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ public boolean process(Set<? extends TypeElement> annotations, RoundEnvironment
if (element.getKind() == ElementKind.CLASS) {
TypeElement classElement = (TypeElement) element;
PackageElement packageElement = (PackageElement) classElement.getEnclosingElement();
String packageName = packageElement.getQualifiedName().toString();
List<String> methodNames = new ArrayList<>();

String outputDirectory = determineOutputDirectory(packageName);

try (FileWriter fileWriter = new FileWriter("python/zingg"+ File.separator + element.getSimpleName() + "Generated.py")) {
try (FileWriter fileWriter = new FileWriter(outputDirectory + File.separator + element.getSimpleName() + "Generated.py")) {
generateImportsAndDeclarations(element, fileWriter);

fileWriter.write("class " + element.getSimpleName() + ":\n");
Expand Down Expand Up @@ -68,6 +71,18 @@ Map<String, List<String>> getClassMethodsMap() {
return classMethodsMap;
}

/**
 * Chooses the directory that receives the generated Python file for a class.
 *
 * <p>Only package names containing {@code "enterprise"} are routed to a
 * module-specific directory. For those, the name is checked for
 * {@code "common"}, {@code "snowflake"} and {@code "spark"} in that order and
 * the first match wins. Every other package name falls back to
 * {@code "python/zingg"}.
 *
 * @param packageName name of the package that declares the class being processed
 * @return the output directory path for the generated file
 */
private String determineOutputDirectory(String packageName) {
    // Non-enterprise packages always use the default location.
    if (!packageName.contains("enterprise")) {
        return "python/zingg";
    }

    // Enterprise packages: first matching module keyword decides the directory.
    if (packageName.contains("common")) {
        return "common/python";
    }
    if (packageName.contains("snowflake")) {
        return "snowflake/python";
    }
    if (packageName.contains("spark")) {
        return "spark/python";
    }

    // Enterprise package without a recognised module keyword.
    return "python/zingg";
}

private void generateImportsAndDeclarations(Element element, FileWriter fileWriter) throws IOException {
fileWriter.write("from zingg.otherThanGenerated import *\n");
if (element.getSimpleName().contentEquals("Pipe")) {
Expand All @@ -79,6 +94,13 @@ private void generateImportsAndDeclarations(Element element, FileWriter fileWrit
fileWriter.write("JStructType = getJVM().org.apache.spark.sql.types.StructType\n");
fileWriter.write("\n");
}

String javadoc = processingEnv.getElementUtils().getDocComment(element);
if (javadoc != null) {
fileWriter.write("'''\n");
fileWriter.write(javadoc.trim());
fileWriter.write("\n'''\n");
}
}

private void generateClassInitializationCode(TypeElement classElement, Element element, FileWriter fileWriter) throws IOException {
Expand All @@ -87,9 +109,16 @@ private void generateClassInitializationCode(TypeElement classElement, Element e
fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setName(name)\n");
fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFormat(format)\n");
}
else if (element.getSimpleName().contentEquals("EPipe")) {
fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.spark.client.pipe.SparkPipe()\n");
fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setPassthroughExpr(passthroughExpr)\n");
}
else if (element.getSimpleName().contentEquals("Arguments")) {
fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n");
}
else if (element.getSimpleName().contentEquals("EArguments")) {
fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.Arguments()\n");
}
else if (element.getSimpleName().contentEquals("FieldDefinition")) {
fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + " = getJVM().zingg.common.client.FieldDefinition()\n");
fileWriter.write(" self." + element.getSimpleName().toString().toLowerCase() + ".setFieldName(name)\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ public boolean process(Set<? extends TypeElement> annotations, RoundEnvironment

if (methodNames.contains(methodElement.getSimpleName().toString())) {
try (FileWriter fileWriter = new FileWriter("python/zingg" + File.separator + className + "Generated.py", true)) {

String javadoc = processingEnv.getElementUtils().getDocComment(methodElement);
if (javadoc != null) {
fileWriter.write(" '''\n");
fileWriter.write(javadoc.trim());
fileWriter.write("\n '''\n");
}

fileWriter.write(" def " + methodElement.getSimpleName() + "(self" + generateMethodSignature(methodElement) + "):\n");
generateMethodReturn(methodElement, fileWriter);
generateFieldAssignment(methodElement, fileWriter);
Expand Down
125 changes: 125 additions & 0 deletions python/zingg/ArgumentsGenerated.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,163 @@
from zingg.otherThanGenerated import *
'''
This class helps supply match arguments to Zingg. There are 3 basic steps
in any match process.
<ul>
<li>Defining - specifying information about data location, fields and our
notion of similarity.
<li>Training - making Zingg learn the matching rules
<li>Matching - Running the models on entire dataset
</ul>
<p>
There is another step, creating labeled data, which can be used to create
training data if none is present. Let us cover them in greater detail through
an example.
<p>
We have some positive and negative labeled examples from which we want
Zingg to learn. These are saved in
<p>
/path/to/training/data/positive.csv and
<p>
/path/to/training/data/negative.csv
<p>
Our actual data has colA,colB,colC,colD,colE with comma as the delimiter and
is saved at
<p>
/path/to/match/data.csv.
<p>
We want to match on colB and colD only, one of which is String and other is
int
<p>
Our program would look like

<pre>
{
&#064;code
Arguments args = new Arguments();
args.setDelimiter(&quot;,&quot;);
args.setPositiveTrainingSamples(&quot;/path/to/training/data/positive.csv&quot;);
args.setNegativeTrainingSamples(&quot;/path/to/training/data/negative.csv&quot;);

FieldDefinition colB = new FieldDefinition(1, FieldClass.STRING,
FieldType.WORD);
FieldDefinition colD = new FieldDefinition(3, FieldClass.INTEGER,
FieldType.NUMERIC);

List&lt;FieldDefinition&gt; fields = new ArrayList&lt;FieldDefinition&gt;();
fields.add(colB);
fields.add(colD);
args.setFieldDefinition(fields);

args.setMatchData(&quot;/path/to/match/data.csv&quot;);

args.setZinggDir(&quot;/path/to/models&quot;);
args.setOutputDir(&quot;/path/to/match/output&quot;);

Client client = new Client(args, &quot;local&quot;);
client.train();
client.run();
}
</pre>
'''
class Arguments:
def __init__(self):
self.arguments = getJVM().zingg.common.client.Arguments()

def setNumPartitions(self, numPartitions):
self.arguments.setNumPartitions(numPartitions)

'''
Set the fraction of the complete data set to be used for
seeding the labelled data. Labelling is costly, and we want a fast,
approximate way of looking at a small sample of the records and
identifying expected matches and non-matches.

@param labelDataSampleSize
- float between 0 and 1 denoting portion of dataset to use in
generating seed samples
@throws ZinggClientException
'''
def setLabelDataSampleSize(self, labelDataSampleSize):
self.arguments.setLabelDataSampleSize(labelDataSampleSize)

'''
Location for internal Zingg use.

@return the path for internal Zingg usage

public Pipe[] getZinggInternal() {
return zinggInternal;
}

/**
Set the location for Zingg to save its internal computations and
models. Please set it to a place where the program has write access.

@param zinggDir
path to the Zingg directory

public void setZinggInternal(Pipe[] zinggDir) {
this.zinggInternal = zinggDir;
}
'''
def getModelId(self):
return self.arguments.getModelId()

def setModelId(self, modelId):
self.arguments.setModelId(modelId)

'''
Set the output directory where the match result will be saved

@param outputDir
where the match result is saved
@throws ZinggClientException
'''
def setOutput(self, outputDir):
self.arguments.setOutput(outputDir)

'''
Set the location for Zingg to save its internal computations and
models. Please set it to a place where the program has write access.

@param zinggDir
path to the Zingg directory
'''
def setZinggDir(self, zinggDir):
self.arguments.setZinggDir(zinggDir)

'''
Location for internal Zingg use.

@return the path for internal Zingg usage
'''
def getZinggBaseModelDir(self):
return self.arguments.getZinggBaseModelDir()

def getZinggModelDir(self):
return self.arguments.getZinggModelDir()

'''
Location for internal Zingg use.

@return the path for internal Zingg usage
'''
def getZinggBaseTrainingDataDir(self):
return self.arguments.getZinggBaseTrainingDataDir()

'''
Location for internal Zingg use.

@return the path for internal Zingg usage
'''
def getZinggTrainingDataUnmarkedDir(self):
return self.arguments.getZinggTrainingDataUnmarkedDir()

'''
Location for internal Zingg use.

@return the path for internal Zingg usage
'''
def getZinggTrainingDataMarkedDir(self):
return self.arguments.getZinggTrainingDataMarkedDir()

Expand Down
13 changes: 13 additions & 0 deletions python/zingg/FieldDefinitionGenerated.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from zingg.otherThanGenerated import *
'''
This class defines each field that we use in matching. We can use this to
configure the properties of each field we use for matching in Zingg.

@author sgoyal
'''
class FieldDefinition:
def __init__(self, name, dataType, *matchType):
self.fielddefinition = getJVM().zingg.common.client.FieldDefinition()
Expand All @@ -13,6 +19,13 @@ def getFieldDefinition(self):
def setFields(self, fields):
self.fielddefinition.setFields(fields)

'''
Set the field type which defines the kind of matching we want to do

@see MatchType
@param type
the type to set
'''
def setMatchType(self, type):
self.fielddefinition.setMatchType(type)

Expand Down
5 changes: 5 additions & 0 deletions python/zingg/PipeGenerated.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
FilePipe = getJVM().zingg.common.client.pipe.FilePipe
JStructType = getJVM().org.apache.spark.sql.types.StructType

'''
Actual pipe definition in the args. One pipe can be used at multiple places with different tables, locations, queries, etc.

@author sgoyal
'''
class Pipe:
def __init__(self, name, format):
self.pipe = getJVM().zingg.spark.client.pipe.SparkPipe()
Expand Down
Loading