
Merge pull request #270 from navinrathore/generateDocs263
GenerateDocs becomes independent of Data
sonalgoyal committed May 18, 2022
2 parents 20d22a0 + b084e74 commit 61c9bc7
Showing 14 changed files with 304 additions and 100 deletions.
8 changes: 6 additions & 2 deletions client/src/main/java/zingg/client/Arguments.java
@@ -459,7 +459,7 @@ public static void checkNullBlankEmpty(String field, String fieldName) throws Zi
}

public static void checkNullBlankEmpty(Pipe[] field, String fieldName) throws ZinggClientException {
		if (field == null || field.length == 0) {
throw new ZinggClientException("Missing value for " + fieldName + ". Trying to set " + field);
}
}
@@ -517,10 +517,14 @@ public String getZinggDocDir() {
}

@JsonIgnore
public String getZinggDocFile() {
public String getZinggModelDocFile() {
return zinggDir + "/" + modelId + "/model.html";
}

@JsonIgnore
public String getZinggDataDocFile() {
return zinggDir + "/" + modelId + "/data.html";
}

/**
* Location for internal Zingg use.
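The split of getZinggDocFile() into getZinggModelDocFile() and getZinggDataDocFile() means model and data documentation now land in separate files under the model directory. A minimal sketch of the resulting paths, using hypothetical values for zinggDir and modelId:

// Sketch only: zinggDir and modelId below are hypothetical example values, not taken from the commit.
public class DocPathsSketch {
    public static void main(String[] args) {
        String zinggDir = "/tmp/zingg";
        String modelId = "100";

        // Mirrors getZinggModelDocFile() and getZinggDataDocFile() in Arguments.java
        String modelDocFile = zinggDir + "/" + modelId + "/model.html";
        String dataDocFile = zinggDir + "/" + modelId + "/data.html";

        System.out.println(modelDocFile); // /tmp/zingg/100/model.html
        System.out.println(dataDocFile);  // /tmp/zingg/100/data.html
    }
}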
13 changes: 8 additions & 5 deletions core/src/main/java/zingg/Documenter.java
@@ -6,7 +6,8 @@
import zingg.client.ZinggClientException;
import zingg.client.ZinggOptions;
import zingg.documenter.ModelDocumenter;
import zingg.documenter.ColumnDocumenter;
import zingg.documenter.DataDocumenter;
import zingg.documenter.ModelColDocumenter;

public class Documenter extends ZinggBase {

@@ -20,12 +21,14 @@ public Documenter() {
public void execute() throws ZinggClientException {
try {
LOG.info("Documenter starts");
// Marked records details
//Documentation out of model
ModelDocumenter modelDoc = new ModelDocumenter(spark, args);
modelDoc.process();
// Stop Words generation
ColumnDocumenter columnsDoc = new ColumnDocumenter(spark, args);
columnsDoc.process();

//Documentation/profiling of data
DataDocumenter dataDoc = new DataDocumenter(spark, args);
dataDoc.process();

LOG.info("Documenter finishes");
} catch (Exception e) {
e.printStackTrace();
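With this change, model documentation and data profiling run as two independent passes, so either can complete without the other's input being present. A minimal sketch of that flow, assuming spark and args are already initialized the way ZinggBase hands them to Documenter:

import org.apache.spark.sql.SparkSession;

import zingg.client.Arguments;
import zingg.client.ZinggClientException;
import zingg.documenter.DataDocumenter;
import zingg.documenter.ModelDocumenter;

// Sketch only: not the committed execute(); spark and args are assumed to be
// provided by the caller, as ZinggBase does for Documenter.
public class DocFlowSketch {
    public static void generateDocs(SparkSession spark, Arguments args) throws ZinggClientException {
        // Model documentation: model.html plus per-column pages built from marked records.
        new ModelDocumenter(spark, args).process();

        // Data documentation: data.html plus stop-word pages; logs and skips if no input data is read.
        new DataDocumenter(spark, args).process();
    }
}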
72 changes: 0 additions & 72 deletions core/src/main/java/zingg/documenter/ColumnDocumenter.java

This file was deleted.

39 changes: 39 additions & 0 deletions core/src/main/java/zingg/documenter/DataColDocumenter.java
@@ -0,0 +1,39 @@
package zingg.documenter;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructField;

import zingg.client.Arguments;
import zingg.client.ZinggClientException;

public class DataColDocumenter extends DocumenterBase {
protected static String name = "zingg.DataColDocumenter";
public static final Log LOG = LogFactory.getLog(DataColDocumenter.class);
StopWordsDocumenter stopWordsDoc;

public DataColDocumenter(SparkSession spark, Arguments args) {
super(spark, args);
stopWordsDoc = new StopWordsDocumenter(spark, args);
}

public void process(Dataset<Row> data) throws ZinggClientException {
createStopWordsDocuments(data);
}

private void createStopWordsDocuments(Dataset<Row> data) throws ZinggClientException {
if (!data.isEmpty()) {
String columnsDir = args.getZinggDocDir();
checkAndCreateDir(columnsDir);

for (StructField field: data.schema().fields()) {
stopWordsDoc.createStopWordsDocument(data, field.name(), columnsDir);
}
} else {
LOG.info("No Stop Words document generated");
}
}
}
76 changes: 76 additions & 0 deletions core/src/main/java/zingg/documenter/DataDocumenter.java
@@ -0,0 +1,76 @@
package zingg.documenter;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructField;

import zingg.client.Arguments;
import zingg.client.ZinggClientException;
import zingg.util.PipeUtil;

public class DataDocumenter extends DocumenterBase {
protected static String name = "zingg.DataDocumenter";
protected static String TEMPLATE_TITLE = "Data Documentation";
private final String DATA_DOC_TEMPLATE = "dataDocTemplate.ftlh";

public static final Log LOG = LogFactory.getLog(DataDocumenter.class);
private DataColDocumenter dataColDoc;
private Dataset<Row> data;

public DataDocumenter(SparkSession spark, Arguments args) {
super(spark, args);
data = spark.emptyDataFrame();
dataColDoc = new DataColDocumenter(spark, args);
}

public void process() throws ZinggClientException {
try {
LOG.info("Data document generation starts");

try {
data = PipeUtil.read(spark, false, false, args.getData());
LOG.info("Read input data : " + data.count());
} catch (ZinggClientException e) {
LOG.warn("No data has been found");
}
if (!data.isEmpty()) {
createDataDocument();
dataColDoc.process(data);
} else {
LOG.info("No data document generated");
}
LOG.info("Data document generation finishes");
} catch (Exception e) {
e.printStackTrace();
throw new ZinggClientException(e.getMessage());
}
}

private void createDataDocument() throws ZinggClientException {
if (!data.isEmpty()) {
Map<String, Object> root = new HashMap<String, Object>();
root.put(TemplateFields.TITLE, TEMPLATE_TITLE);
root.put(TemplateFields.MODEL_ID, args.getModelId());

List<String[]> list = new ArrayList<String[]> ();
for (StructField field: data.schema().fields()) {
String[] row = new String [3];
row[0] = field.name();
row[1] = field.dataType().toString();
row[2] = field.nullable()? "true": "false";
list.add(row);
}
root.put(TemplateFields.DATA_FIELDS_LIST, list);

writeDocument(DATA_DOC_TEMPLATE, root, args.getZinggDataDocFile());
}
}
}
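createDataDocument() hands the template one row per input column: name, Spark data type, and nullability. A standalone sketch of that mapping on a hypothetical input file, with printing standing in for the dataDocTemplate.ftlh rendering:

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructField;

// Sketch only: the CSV path is a hypothetical example.
public class DataFieldsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("data-fields-sketch")
                .master("local[*]")
                .getOrCreate();

        Dataset<Row> data = spark.read()
                .option("header", "true")
                .csv("examples/febrl/test.csv"); // hypothetical input

        // Same (name, type, nullable) triples that DataDocumenter puts into TemplateFields.DATA_FIELDS_LIST
        List<String[]> fields = new ArrayList<>();
        for (StructField field : data.schema().fields()) {
            fields.add(new String[] {
                    field.name(),
                    field.dataType().toString(),
                    field.nullable() ? "true" : "false"
            });
        }
        fields.forEach(row -> System.out.println(String.join(" | ", row)));

        spark.stop();
    }
}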
8 changes: 0 additions & 8 deletions core/src/main/java/zingg/documenter/DocumenterBase.java
@@ -3,8 +3,6 @@
import java.io.File;
import java.io.FileWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.spark.sql.SparkSession;
@@ -22,8 +20,6 @@ class DocumenterBase {
protected SparkSession spark;
protected Arguments args;

private List<String> zColList = Arrays.asList(ColName.CLUSTER_COLUMN, ColName.ID_COL, ColName.PREDICTION_COL, ColName.SCORE_COL, ColName.MATCH_FLAG_COL, ColName.SOURCE_COL);

public DocumenterBase(SparkSession spark, Arguments args) {
this.spark = spark;
this.args = args;
@@ -79,10 +75,6 @@ protected void checkAndCreateDir(String dirName) {
}
}

protected List<String> getZColumnList() {
return zColList;
}

public boolean isZColumn(String colName) {
return colName.startsWith(ColName.COL_PREFIX);
}
55 changes: 55 additions & 0 deletions core/src/main/java/zingg/documenter/ModelColDocumenter.java
@@ -0,0 +1,55 @@
package zingg.documenter;

import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructField;

import zingg.client.Arguments;
import zingg.client.ZinggClientException;

public class ModelColDocumenter extends DocumenterBase {
protected static String name = "zingg.ModelColDocumenter";
public static final Log LOG = LogFactory.getLog(ModelColDocumenter.class);

private final String COLUMN_DOC_TEMPLATE = "columnDocTemplate.ftlh";
private final String Z_COLUMN_TEMPLATE = "zColumnTemplate.ftlh";

public ModelColDocumenter(SparkSession spark, Arguments args) {
super(spark, args);
}

public void process(Dataset<Row> data) throws ZinggClientException {
createColumnDocuments(data);
}

private void createColumnDocuments(Dataset<Row> data) throws ZinggClientException {
LOG.info("Column Documents generation starts");
if (!data.isEmpty()) {
String columnsDir = args.getZinggDocDir();
checkAndCreateDir(columnsDir);
for (StructField field: data.schema().fields()) {
prepareAndWriteColumnDocument(field.name(), columnsDir);
}
}
LOG.info("Column Documents generation finishes");
}

private void prepareAndWriteColumnDocument(String fieldName, String columnsDir) throws ZinggClientException {
Map<String, Object> root = new HashMap<String, Object>();
root.put(TemplateFields.TITLE, fieldName);
root.put(TemplateFields.MODEL_ID, args.getModelId());

String filenameHTML = columnsDir + fieldName + ".html";
if (isZColumn(fieldName)) {
writeDocument(Z_COLUMN_TEMPLATE, root, filenameHTML);
} else {
writeDocument(COLUMN_DOC_TEMPLATE, root, filenameHTML);
}
}
}
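ModelColDocumenter picks a template per column: Zingg's internal columns (detected by isZColumn, i.e. a ColName.COL_PREFIX prefix) get zColumnTemplate.ftlh, all other columns get columnDocTemplate.ftlh, and each column is written to its own HTML file under the doc directory. A small sketch of that dispatch, with hypothetical column names and a hard-coded prefix standing in for ColName.COL_PREFIX:

// Sketch only: "z_" stands in for ColName.COL_PREFIX, and the column names and
// doc directory are hypothetical examples.
public class ColumnTemplateDispatchSketch {
    static final String COL_PREFIX = "z_";

    static String templateFor(String columnName) {
        return columnName.startsWith(COL_PREFIX) ? "zColumnTemplate.ftlh" : "columnDocTemplate.ftlh";
    }

    public static void main(String[] args) {
        String columnsDir = "/tmp/zingg/100/docs/";
        for (String column : new String[] { "fname", "lname", "z_cluster" }) {
            // Same filename composition as prepareAndWriteColumnDocument()
            String filenameHTML = columnsDir + column + ".html";
            System.out.println(column + " -> " + templateFor(column) + " -> " + filenameHTML);
        }
    }
}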
23 changes: 15 additions & 8 deletions core/src/main/java/zingg/documenter/ModelDocumenter.java
@@ -24,28 +24,36 @@ public class ModelDocumenter extends DocumenterBase {
public static final Log LOG = LogFactory.getLog(ModelDocumenter.class);

private final String MODEL_TEMPLATE = "model.ftlh";
ModelColDocumenter modelColDoc;
private Dataset<Row> markedRecords;

public ModelDocumenter(SparkSession spark, Arguments args) {
super(spark, args);
markedRecords = spark.emptyDataFrame();
modelColDoc = new ModelColDocumenter(spark, args);
}

public void process() throws ZinggClientException {
createModelDocument();
modelColDoc.process(markedRecords);
}

private void createModelDocument() throws ZinggClientException {
try {
LOG.info("Model document generation starts");

Dataset<Row> markedRecords = spark.emptyDataFrame();
try {
markedRecords = PipeUtil.read(spark, false, false, PipeUtil.getTrainingDataMarkedPipe(args));
} catch (Exception e) {
} catch (ZinggClientException e) {
LOG.warn("No marked record has been found");
}

markedRecords = markedRecords.cache();
/* Create a data-model */
Map<String, Object> root = new HashMap<String, Object>();
root.put(TemplateFields.MODEL_ID, args.getModelId());
if(!markedRecords.isEmpty()) {
markedRecords = markedRecords.cache();

if (!markedRecords.isEmpty()) {
root.put(TemplateFields.CLUSTERS, markedRecords.collectAsList());
root.put(TemplateFields.NUM_COLUMNS, markedRecords.columns().length);
root.put(TemplateFields.COLUMNS, markedRecords.columns());
@@ -55,18 +63,17 @@ public void process() throws ZinggClientException {
markedRecords.schema().fieldIndex(ColName.CLUSTER_COLUMN));
} else {
// fields required to generate basic document
List<String> list = args.getFieldDefinition().stream().map(fd -> fd.getFieldName())
List<String> columnList = args.getFieldDefinition().stream().map(fd -> fd.getFieldName())
.collect(Collectors.toList());
List<String> columnList = new ArrayList<String>(getZColumnList());
columnList.addAll(list);
root.put(TemplateFields.NUM_COLUMNS, columnList.size());
root.put(TemplateFields.COLUMNS, columnList.toArray());
root.put(TemplateFields.CLUSTERS, Collections.emptyList());
root.put(TemplateFields.ISMATCH_COLUMN_INDEX, 0);
root.put(TemplateFields.CLUSTER_COLUMN_INDEX, 1);
}
checkAndCreateDir(args.getZinggDocDir());
writeDocument(MODEL_TEMPLATE, root, args.getZinggDocFile());
writeDocument(MODEL_TEMPLATE, root, args.getZinggModelDocFile());

LOG.info("Model document generation finishes");
} catch (Exception e) {
e.printStackTrace();
