# Conversion Utility for BE Documentation

1. Convert the HTML docs of BE functions to a JSON array
2. Convert the JSON array to JSONL training data for fine-tuning

In [None]:
!pip install html-to-json

In [None]:
import html_to_json
import json

# convert a function doc into JSON object
def _interleaveInline(texts, inlines):
  """Merge text fragments with the inline snippets that sat between them.

  The pieces are alternated as text, inline, text, inline, ... and joined with
  single spaces. When there is exactly one inline per gap this is the same
  string that `" %s ".join(texts) % tuple(inlines)` would give, but it does not
  break on a literal "%" in the text or on a count mismatch.

  NOTE(review): on a count mismatch the surplus inlines are appended at the
  end; their true position in the page is not known here, so treat that
  ordering as best-effort.
  """
  pieces = []
  for i, text in enumerate(texts):
    pieces.append(text)
    if i < len(inlines):
      pieces.append(inlines[i])
  # inlines that had no text fragment to follow
  pieces.extend(inlines[len(texts):])
  return " ".join(pieces)


def _tableRows(table):
  """Turn the data rows of a parsed table into a list of dicts.

  Rows without "td" cells are skipped. Each cell is stored under the first
  entry of its "class" attribute; the value is the text of its first "code"
  child when present, otherwise the cell's own "_value". Cells with neither
  are left out of the row dict.
  """
  rows = []
  for r in table["tr"]:
    if "td" not in r:
      continue
    item = {}
    for d in r["td"]:
      name = d["_attributes"]["class"][0]
      if "code" in d:
        item[name] = d["code"][0]["_value"]
      elif "_value" in d:
        item[name] = d["_value"]
    rows.append(item)
  return rows


def readHtmlFile(path):
  """Parse one function doc page and return its content as a dict.

  The page is converted with html_to_json and read from the second "div" of
  the body. The result holds:
    - "Function": the concatenated text of the spans in the first "h1"
    - one entry per section that has an "h2"; the key is the heading text and
      the value comes from the section's first inner "div":
        * "code" child      -> its text (multiple fragments joined by newlines)
        * "table" child     -> list of row dicts (see _tableRows)
        * several fragments -> joined by newlines, or interleaved with the
                               "tt" snippets when there are any
        * single fragment   -> that text
        * anything else     -> the raw parsed node, with a notice printed

  Args:
    path: path of the HTML file to read.

  Returns:
    dict mapping attribute names to the extracted values.

  Raises:
    KeyError / IndexError: if the page does not have the expected layout.
  """
  # `with` releases the handle even when conversion raises
  with open(path, "r") as htmlFile:
    jsonContent = html_to_json.convert(htmlFile.read())
  doc = jsonContent["html"][0]["body"][0]["div"][1]

  result = {}
  # function name
  result["Function"] = "".join(s["_value"] for s in doc["h1"][0]["span"])

  for x in doc["div"]:
    if "h2" not in x:
      continue

    # attribute name
    elem = x["h2"][0]["_value"]

    # attribute value
    c = x["div"][0]
    if "code" in c:
      code = c["code"][0]
      if "_value" in code:
        result[elem] = code["_value"]
      elif "_values" in code:
        result[elem] = "\n".join(code["_values"])
    elif "table" in c:
      result[elem] = _tableRows(c["table"][0])
    elif "_values" in c:
      if "tt" in c:
        # put the <tt> snippets back between the surrounding text fragments
        result[elem] = _interleaveInline(c["_values"], [t["_value"] for t in c["tt"]])
      else:
        result[elem] = "\n".join(c["_values"])
    elif "_value" in c:
      result[elem] = c["_value"]
    else:
      # complex values, such as Lock function description
      result[elem] = c
      print("Unknown data format of %s in %s: %s" % (elem, path, c))
  return result

In [None]:
import glob

# Convert all function docs under a folder into one JSON array file.
# The output file is named after the last path component, e.g. "Standard.json".
path = "/Users/yxu/tibco/be63/be/6.3/docs/functions/Standard"
parr = path.split("/")

# File names that are skipped (presumably navigation pages, not function docs)
skipped = ("index.html", "category-summary.html")

results = []
for f in glob.iglob(path + "/**/*.html", recursive=True):
  farr = f.split("/")
  if farr[-1] not in skipped:
    print(f)
    results.append(json.dumps(readHtmlFile(f)))

# Mode "w" so that re-running the cell replaces the file; appending would leave
# two arrays back to back, which is not valid JSON. Building the array from the
# collected items also yields "[]" (not a lone "]") when nothing matched.
with open(parr[-1] + ".json", "w") as out:
  out.write("[" + ",".join(results) + "]")

In [None]:
# Manual check: convert a single HTML file and print the resulting JSON object

# Alternative sample inputs (uncomment one of these instead of the active path):
# path = "/Users/yxu/tibco/be63/be/6.3/docs/functions/Standard/File/fileClose.html"
# path = "/Users/yxu/tibco/be63/be/6.3/docs/functions/Standard/Cluster/DataGrid/EvictCache.html"
path = "/Users/yxu/tibco/be63/be/6.3/docs/functions/CEP_Query/Query/Datagrid/Aggregate/groupMinList.html"
result = readHtmlFile(path)
print(json.dumps(result))

In [None]:
import json

# Load the JSON array of function docs
inFile = 'Standard.json'
with open(inFile, 'r') as file:
  data = json.load(file)

# Write jsonl training data: one chat-style example per function, where the
# user message is the function description and the assistant message is the
# package-qualified signature.
# Mode "w" so that re-running the cell does not duplicate every example, and
# `with` so the file is flushed and closed even if a record is malformed.
with open(inFile + "l", "w") as out:
  for func in data:
    # everything before the last "." of the function name
    package = func['Function'].rsplit(".", 1)[0]

    signature = func['Signature'].strip()
    name = signature
    sig = ''
    # Look only at the part before "(" so that spaces inside the argument
    # list do not count. If it has more than one token, the first one
    # (presumably the return type) is moved in front of the package.
    tokens = signature.split("(", 1)[0].split()
    if len(tokens) > 1:
      sig = tokens[0]
      # split on any run of whitespace so repeated spaces or tabs do not
      # leave stray blanks in front of the name
      name = signature.split(None, 1)[1]
    fullName = "{0} {1}.{2}".format(sig, package, name).strip()

    msg = {
      "messages": [{
          "role": "system",
          "content": "BE Assistant"
        },
        {
          "role": "user",
          "content": func.get('Description')
        },
        {
          "role": "assistant",
          "content": fullName
        }]
    }
    out.write(json.dumps(msg))
    out.write('\n')