## Build usable data for HScript Expressions

### Goal structure
```yaml
foo:
  help: <html>foo</html>
  args: float
  argnames: bar
  return: vector
```
It's much simpler than VEX functions structure, since there is no
such thing as function signatures exist in HScript, and all functions
are properly documented.

In [1]:
import re
import bs4
import json
import pickle
import random
import requests
import collections
import os.path as op
from pprint import pprint

### Parse list of expression strings into data.

#### Resulting structure

```yaml
foo:
  args: float float float
  argnames: bar baz qux
  return: vector
```

Instead of generating it from scratch, I have a complete list
of expression functions. It's easy to keep it up to date by
diffing with list printed by Texport. HScript is barely changing.

In [2]:
expression_strings = '''float abs(float number)
float acos(float number)
vector angvel(vector rot1, vector rot2, float time)
float arclen(string surface_node, float prim_num, float ustart, float ustop)
string arg(string line, float argNum)
float argc(string line)
float asin(float number)
float atan(float number)
float atan2(float y, float x)
float atof(string source)
float bbox(string surface_node, float type)
float bezier()
float boneangle(string bone1, string bone2)
float ceil(float number)
float centroid(string surface_node, float type)
float ch(string channel)
float chexist(string channel_name)
float chf(string channel, float frame)
string chgroup(string group_name)
float chop(string channel)
float chopcf(string CHOP, float channel_index, float frame)
float chopci(string CHOP, float channel_index, float index)
float chopct(string CHOP, float channel_index, float time)
float chope(string CHOP)
float chopf(string channel, float frame)
float chopi(string channel, float index)
float chopl(string CHOP)
float chopn(string CHOP)
float chopr(string CHOP)
float chops(string CHOP)
string chopstr(string channel)
float chopt(string channel, float time)
float chramp(string ramp_path, float position, float component_index)
float chrampf(string ramp_path, float position, float component_index, float frame)
float chrampt(string ramp_path, float position, float component_index, float time)
string chs(string channel)
string chsop(string path)
string chsoplist(string path) 
string chsraw(string channel)
float cht(string channel, float time)
float clamp(float value, float minimum, float maximum)
float clamptosphere(float x, float y, float z, float min_radius, float max_radius, string constant_type)
float constant()
string cophasmeta(string compositing_node, string metadata_name)
float copmeta(string compositing_node, string metadata_name, float index) 
string copmetas(string compositing_node, string metadata_name)
float cos(float number)
float cosh(float number)
vector cross(vector v1, vector v2)
float cubic()
float curvature(string surface_node, float prim_num, float u, float v)
float cycle(float f1, float f2)
float cycleoffset(float f1, float f2)
float cycleoffsett(float t1, float t2)
float cyclet(float t1, float t2)
float deg(float radians)
float degree(string surface_node, float prim_num, float du_or_dv)
float detail(string surface_node, string attrib_name, float attrib_index)
float detailattribsize(string surface_node, string attribute)
float detailattribtype(string surface_node, string attribute)
string details(string surface_node, string attribute)
string detailsmap(string surface_node, string attribute, float index)
float detailsnummap(string surface_node, string attribute)
float determinant(matrix mat)
matrix dihedral(vector v0, vector v1)
float distance(float x1, float y1, float z1, float x2, float y2, float z2)
string dopallfields(string dop, string objectSpec, string subDataName, string recordType)
string dopcontextgeo(string name, float index)
float dopcountslices(string dop, string objectFilter, string subDataName)
float dopfield(string dop, string objectSpec, string subDataName, string recordType, float recordNum, string fieldName)
string dopfieldname(string dop, string objectSpec, string subDataName, string recordType, float fieldNum)
string dopfields(string dop, string objectSpec, string subDataName, string recordType, float recordNum, string fieldName)
string dopfieldtype(string dop, string objectSpec, string subDataName, string recordType, float fieldNum)
float dopframe(string dop)
float dopframetost(string dop, float simulationframe)
float dopgrouphasobject(string dop, string objectSpec, string group)
string dopgrouplist(string dop)
float dophasfield(string dop, string objectSpec, string subDataName, string recordType, float recordNum, string fieldName)
float dophassubdata(string dop, string objectSpec, string subDataName)
string dopnodeobjs(string dop)
float dopnumfields(string dop, string objectSpec, string subDataName, string recordType)
float dopnumobjects(string dop, string objectFilter)
float dopnumrecords(string dop, string objectFilter, string subDataName, string recordType)
float dopnumrecordtypes(string dop, string objectSpec, string subDataName)
float dopnumsubdata(string dop, string objectSpec, string subDataName)
string dopobjectlist(string dop, string objectSpec, float listNames)
float dopobjectsareaffectors(string dop, string objectSpec, string affectors)
string dopobjscreatedby(string dop)
float dopoption(string dop, string objectSpec, string subDataName, string fieldName)
string dopoptions(string dop, string objectSpec, string subDataName, string fieldName)
string doprecordtypename(string dop, string objectSpec, string subDataName, float recordTypeNum)
string dopsolvedopnet()
float dopsolvenewobject(float object_index)
float dopsolvenumnewobjects()
float dopsolvenumobjects()
float dopsolveobject(float object_index)
float dopsolvetimestep()
float dopsttoframe(string dop, float simulationtime)
float dopsttot(string dop, float simulationtime)
string dopsubdataname(string dop, string objectSpec, string subDataName, float subDataNum)
float doptime(string dop)
matrix doptransform(string dop, string objectSpec, string subDataName)
float dopttost(string dop, float globaltime)
vector dopvelatpos(string dop, string objectSpec, float posx, float posy, float posz, float usevolumevelocity, float usepointvelocity)
float dot(vector v0, vector v1)
float ease()
float easein()
float easeinp(float number)
float easeout()
float easeoutp(float number)
float easep(float number)
string edgegrouplist(string surface_node)
string edgegroupmask(string surface_node, string pattern)
float eval(string expression)
string evals(string expression)
string execute(string command)
string executeb(string command)
string executee(string command)
float exp(float number)
float explodematrix(matrix mat, string trs, string xyz, string component)
float explodematrixp(matrix mat, vector p, string trs, string xyz, string component)
string findfile(string filename)
string findfiles(string filename, string separator)
float fit(float num, float oldmin, float oldmax, float newmin, float newmax)
float fit01(float num, float newmin, float newmax)
float fit10(float num, float newmin, float newmax)
float fit11(float num, float newmin, float newmax)
float floor(float number)
float frac(float number)
string ftoa(float number)
string groupbyval(string surface_node, float class, string attribute, float id)
string groupbyvals(string surface_node, float class, string attribute, string id)
float hasdetailattrib(string surface_node, string attribute)
float haspoint(string group_name, string surface_node, float point_num)
float haspointattrib(string surface_node, string attribute)
float hasprim(string group_name, string surface_node, float prim_num)
float hasprimattrib(string surface_node, string attribute)
float hasvertexattrib(string surface_node, string attribute)
float hextoint(string value)
float hsv(float red, float green, float blue, string component)
float ic(float input_index, float channel_index, float index)
float ice(float input_index)
float icl(float input_index)
float icmax(float input_index, float channel_index)
float icmin(float input_index, float channel_index)
float icn(float input_index)
float icr(float input_index)
float ics(float input_index)
matrix identity(float size)
float if(float expression, float true_value, float false_value)
string ifs(float expression, string true_value, string false_value)
float imgbounds(string foo, string bar, string baz, float qux)
float index(string source, string pattern)
float instancepoint()
float int(float number)
string inttohex(float value)
matrix invert(matrix mat)
float iprquery(string query, string pane, float x, float y)
string iprquerys(string query, string pane, float x, float y)
float isclosed(string surface_node, float prim_num)
float iscollided(string surface_node, float pointnumber)
float ishvariable(string variable_name)
float isspline(string surface_node, float prim_num)
float isstuck(string surface_node, float pointnumber)
float isvariable(string variable_name)
float iswrapu(string surface_node, float prim_num)
float iswrapv(string surface_node, float prim_num)
float length(float x, float y, float z)
float linear()
string listbyval(string surface_node, float class, string attribute, float id)
string listbyvals(string surface_node, float class, string attribute, string id)
float lock(float float)
float log(float number)
float log10(float number)
float match()
float matchin()
float matchout()
matrix matrix(string pattern)
vector matrixtoquat(matrix m)
float max(float value1, float value2)
float mcols(matrix mat)
float metaweight(string surface_node, float x, float y, float z)
float min(float value1, float value2)
float mindist(string surface_node, float point_num, string surface_node, float prim_num, float return_type)
matrix mlookat(vector v1, vector v2)
matrix mlookatup(vector v1, vector v2, vector upv)
matrix mobjlookat(string base_node, string target_node, vector upv)
float modblend(float val1, float val2, float length, float weight)
matrix morient(vector zaxis, vector yaxis)
string mousepane()
string mousepath()
float mrows(matrix mat)
matrix mzero(matrix mat)
float nearpoint(string surface_node, float x, float y, float z)
float noise(float X, float Y, float Z)
float normal(string surface_node, float prim_num, float u, float v, float index)
vector normalize(vector v)
float npoints(string surface_node)
float npointsgroup(string surface_node, string group_name)
float nprims(string name)
float nprimsgroup(string surface_node, string group_name)
float nuniquevals(string surface_node, float class, string attribute)
string objkinoverride()
string objlightmask(string geometry, string options)
vector objlookat(string base_node, string target_node, vector upv)
matrix objpretransform(string object_name)
float oc(float output_channel_index, float index)
float oldrand(float value)
string opblist(string bundle_name)
string opcreator(string name)
float opdigits(string name)
float opexist(string op_name)
string opflag(string network, string flag)
string opfullpath(string relpath)
string opfullpathfrom(string node, string basenode)
float opid(string name)
string opinput(string name, float index)
string opinputpath(string name, float index)
float opisloading()
float opisquitting()
string oplightmask(string geometry)
string opname(string name)
float opnchildren(string name)
float opninputs(string name)
float opnoutputs(string name)
string opoutput(string name, float index)
string opoutputpath(string name, float index)
string oppinput(string name, float index)
string oppwd()
string oppwf()
string oprelativepath(string srcpath, string destpath)
string opselect(string network)
string opselectpath(string network)
string opselectrecurse(string network, float flag) 
string opselectrecurse(string network, float flag)
string opstreamname(string nodepath)
string opsubpath(string node)
matrix optransform(string object_name)
string optype(string name)
string optypeinfo(string name, string pattern)
float origin(string obj1, string obj2, string constant_type)
float originoffset(string obj1, vector pos1, string obj2, vector pos2, string constant_type)
string padzero(float size, float value)
float param(string token, float value)
float parmisstring(string parameter_name)
float pic(string copname, float U, float V, float color_type)
float picni(string copname, float U, float V, float color_type)
string pluralize(string s)
float point(string surface_node, float point_number, string attribute, float index)
float pointattribsize(string surface_node, string attribute)
float pointattribtype(string surface_node, string attribute)
float pointavg(string surface_node, string attribute, float index)
float pointdist(string surface_node, float point_num, string surface_node, float prim_num, float return_type)
string pointgrouplist(string surface_node)
string pointgroupmask(string surface_node, string pattern)
string pointlist(string surface_node, string group_name)
string pointneighbours(string surface_node, float point_num, float num_shared_prims)
string pointpattern(string surface_node, string pattern)
string points(string surface_node, float point_number, string attribute)
string pointsmap(string surface_node, string attribute, float index)
float pointsnummap(string surface_node, string attribute)
string popcontextgeo(float index)
float popevent(string event_name)
float popeventtime(string event_name)
float poppoint(float point_number, string attribute, float index)
float poppointid(float particle_id, string attribute, float index)
float poppointnum(float particle_id)
string poppoints(float point_number, string attribute)
string poppointsid(float particle_id, string attribute)
float pow(float base, float exponent)
float prim(string surface_node, float prim_num, string attrib_name, float attrib_index)
float primattribsize(string surface_node, string attribute)
float primattribtype(string surface_node, string attribute)
float primdist(string surface_node, float prim1_num, string surface_node, float prim2_num, float return_type)
float primduv(string surface_node, float prim_num, string attrib_name, float attrib_index, float u, float v, float du, float dv)
string primgrouplist(string surface_node)
string primgroupmask(string surface_node, string pattern)
string primlist(string surface_node, string group_name)
string primneighbours(string surface_node, float prim_num, float num_shared_pts)
string prims(string surface_node, float primitive_number, string attribute)
string primsmap(string surface_node, string attribute, float index)
float primsnummap(string surface_node, string attribute)
float primuv(string surface_node, float prim_num, string attrib_name, float attrib_index, float u, float v)
float print(string label, float expression)
float property(string foo, float bar)
float propertyf(string foo, float bar, float baz)
string propertys(string foo, string bar)
string propertysop(string foo, string bar)
string propertysraw(string foo, string bar)
float propertyt(string foo, float bar, float baz)
float pulse(float value, float start, float end)
float pythonexprf(string expression)
string pythonexprs(string expression)
float qlinear()
matrix quattomatrix(vector q)
float quintic()
float rad(float number)
float rand(float value)
float raw()
float realuv(string surface_node, float prim_num, float uv_unit, float D_U|D_V)
float repeat(float f1, float f2)
float repeatt(float t1, float t2)
float res(string compositing_node, float res_type)
float rgb(float hue, float saturation, float value, string component)
float rindex(string source, string pattern)
float rint(float number)
matrix rotate(float angle, string axis)
matrix rotaxis(float angle, vector axisv)
float round(float number)
string run(string command)
string runb(string command)
string rune(string command)
matrix scale(float sx, float sy, float sz)
float seqanim(string compositing_node)
float seqend(string compositing_node)
float seqlength(string compositing_node)
float seqstart(string compositing_node)
string shopstring(string shop_path, string render_type)
float sign(float value)
float sin(float number)
float sinh(float number)
float smooth(float value, float minimum, float maximum)
float snoise(float X, float Y, float Z)
float spknot(string surface_node, float prim_num, float knot_index, float du_or_dv)
float spline()
float sqrt(float number)
float stamp(string scope, string token, float value)
string stamps(string stamp_op_path, string token, string value)
float strcasecmp(string s1, string s2)
float strcasematch(string pattern, string s)
string strcat(string s1, string s2)
float strcmp(string s1, string s2)
string strdup(float count, string s2)
string stripmatrix(string mat)
float strlen(string s)
float strmatch(string pattern, string s)
string strreplace(string s, string old, string new)
float sturb(float X, float Y, float Z, float depth)
string substr(string s, float start, float length)
float surflen(string surface_node, float prim_num, float ustart, float vstart, float ustop, float vstop)
string system(string command_string)
float systemES(string command_string)
string systemRAW(string command_string)
float tan(float number)
float tanh(float number)
float tex(string filename, float U, float V, string color_type)
float texni(string diskfile, float U, float V, string color_type)
string tolower(string s)
string toupper(string s)
matrix translate(float tx, float ty, float tz)
matrix transpose(matrix mat)
float trunc(float number)
float turb(float X, float Y, float Z, float depth)
float uniqueval(string surface_node, float class, string attribute, float index)
string uniquevals(string surface_node, float class, string attribute, float index)
float unituv(string surface_node, float prim_num, float uv_real, float D_U|D_V)
float uvdist(string surface_node, float prim1_num, float u1, float v1, string surface_node, float prim2_num, float u2, float v2)
float vangle(vector v0, vector v1)
vector vector(string pattern)
vector vector3(float x, float y, float z)
vector vector4(float x, float y, float z, float w)
float vertex(string surface_node, float primitive_number, float vertex_number, string attribute, float index)
float vertexattribsize(string surface_node, string attribute)
float vertexattribtype(string surface_node, string attribute)
string vertexs(string surface_node, float primitive_number, float vertex_number, string attribute)
string vertexsmap(string surface_node, string attribute, float index)
float vertexsnummap(string surface_node, string attribute)
float vlength(vector vec)
float vlength2(vector vec)
float vmatch()
float vmatchin()
float vmatchout()
float volumeaverage(string surface_node, float prim_id)
float volumegradient(string surface_node, float prim_id, float x, float y, float z, float axis)
float volumeindex(string surface_node, float prim_id, float ix, float iy, float iz)
float volumeindextopos(string surface_node, float prim_id, float ix, float iy, float iz, float axis)
float volumemax(string surface_node, float prim_id)
float volumemin(string surface_node, float prim_id)
float volumepostoindex(string surface_node, float prim_id, float x, float y, float z, float axis)
float volumeres(string surface_node, float prim_id, float axis)
float volumesample(string surface_node, float prim_id, float x, float y, float z)
float volumevoxeldiameter(string surface_node, float prim_id)
vector vorigin(string obj1, string obj2)
string vpname(string viewer, float viewport_quadrant_number)
vector vrorigin(string obj1, string obj2)
vector vscale(vector vec, float scale)
vector vset(float size, float value)
float vsize(vector vec)
vector vtorigin(string obj1, string obj2)
float wrap(float value, float minimum, float maximum)
float xyzdist(float x, float y, float z, string surface_node, float prim_num, float return_type)'''.split('\n')

expressions = {}

for e in expression_strings:
    match = re.match(r'(\w+) (\w+)\((.*)\)', e)
    r, f, args = match.group(1, 2, 3)
    expressions[f] = {'return': r}
    if args:
        args, argnames = zip(*[pair.split() for pair in args.split(',')])
        expressions[f]['args'], expressions[f]['argnames'] = args, argnames

def test():
    print(len(expressions))
    f = random.choice(list(expressions.keys()))
    print(f)
    pprint(expressions[f])
    
test()

390
hasvertexattrib
{'argnames': ('surface_node', 'attribute'),
 'args': ('string', 'string'),
 'return': 'float'}


### Generate simplified HTML pages from data available in Houdini help documentation

#### Resulting data
```yaml
foo:
  help: <html>foo</html>
```

In [3]:
# Page retrieving will take 15 minutes to complete if not cached.
index = 'http://localhost:48626/expressions/'

def ret(url):
    return requests.get(url).content

index_page = bs4.BeautifulSoup(ret(index), 'html.parser')

if op.exists('expressions.p'):
    with open('expressions.p', 'rb') as f:
        pages = pickle.load(f)
else:
    pages = {f : None
             for f in sorted({a.text for a in index_page.find_all('a', class_='expression')})
             if f in expressions}

    for i, f in enumerate(pages):
        pages[f] = ret(index + f)
        if i>0 and (i%25==0 or i==len(pages)-1):
            print('Loaded %d pages.' % i)
            
    with open('expressions.p', 'wb') as f:
        pickle.dump(pages, f)

Loaded 25 pages.
Loaded 50 pages.
Loaded 75 pages.
Loaded 100 pages.
Loaded 125 pages.
Loaded 150 pages.
Loaded 175 pages.
Loaded 200 pages.
Loaded 225 pages.
Loaded 250 pages.
Loaded 275 pages.
Loaded 300 pages.
Loaded 325 pages.
Loaded 350 pages.
Loaded 375 pages.
Loaded 387 pages.


In [4]:
def make_helpcard(f):
    global expressions, pages
    index = 'https://www.sidefx.com/docs/houdini/expressions/'
    s = bs4.BeautifulSoup(pages[f], 'html.parser')
    nonmeta, = s.main.select('#content')
    nonmeta['id'] = 'helpcard'

    # Strip redundant tags which does not work good inside ST3 popups.
    for span in nonmeta.find_all('span', class_='line'):
        span.wrap(s.new_tag('code', **{'class': 'line'}))
        span.unwrap()
    for var in nonmeta.find_all('var'):
        var.wrap(s.new_tag('code'))
        var.unwrap()
    for div in nonmeta.find_all('div', class_='def'):
        div.wrap(s.new_tag('p'))
        div.unwrap()
    for div in nonmeta.find_all('div', class_='content'):
        if not div.text.strip():
            div.unwrap()
    for div in nonmeta.find_all('div', class_='clear'):
        div.unwrap()
    for code in nonmeta.find_all('code', class_='codehilite'):
        code.unwrap()
    for p in nonmeta.find_all('p', class_='label'):
        p.unwrap()
    for tag in nonmeta.find_all(['pre', 'dl', 'span', 'section']):
        tag.unwrap()

    # Append related section.
    related = s.main.select('#postmeta')[0].select('a.Exp')
    if related:
        h = s.new_tag('h2')
        h.string = 'See also'
        nonmeta.append(h)

        for a in related:
            title = a.get('title')
            text = a.text
            a = s.new_tag('a', href=index+a['href'])
            a.string = text

            p = s.new_tag('p')
            p.append(a)

            if title:
                div = s.new_tag('div', **{'class': 'related-summary'})
                div.string = title
                p.append(div)

            nonmeta.append(p)

    for table in nonmeta.find_all('table'):
        table.decompose()
        
    # Destroy default documentation signatures.
    # TODO: Remove small indents in ST3 caused by properly formatted HTML code by removing '\n' strings.
    for c in nonmeta.children:
        if c != '\n':
            if (c.name == 'ul' and 'bullets' in c['class']) \
                    or (c.name == 'div' and 'usage_group' in c['class']):
                c.decompose()
            break

    # Make URLs full.
    for a in nonmeta.find_all('a'):
        a['href'] = index + a['href']

    # Images still not work from HTTP for me. Replace with urls.
    for img in nonmeta.find_all('img'):
        img_link = s.new_tag('a', href=index + img['src'])
        img_link.string = '[Image \U0001f517]'
        img.wrap(img_link)
        img.unwrap()

    # Fix not working <pre> tag with non-breaking spaces.
    for code in nonmeta.find_all('code', class_='line'):
        newtext = code.text.lstrip(' ')
        code.string = '\xa0' * (len(code.text)-len(newtext)) + newtext

    # Add summary paragraph.
    summary = s.main.select('.summary')
    if not summary:
        summary = s.new_tag('p', **{'class': 'summary'})
    else:
        summary = summary[0]
    nonmeta.insert(0, summary)
        
    # Insert function signature from read list.
    code = s.new_tag('code')
    summary.insert_after(code)
    args = ''
    if 'args' in expressions[f]:
        atypes = expressions[f]['args']
        anames = expressions[f]['argnames']
        args = ', '.join(' '.join(x) for x in zip(atypes, anames))
    code.string = '{} {}({})'.format(expressions[f]['return'], f, args)
    code.wrap(s.new_tag('p'))

    # Insert title and online documentation hyperlink.
    title = s.new_tag('a', href=index+f)
    title.string = f
    heading = s.new_tag('h1')
    heading.append(title)
    summary.insert_before(heading)

    # print(nonmeta.prettify())
    return str(nonmeta)

# # For quick testing.
# with open('../commands/helpcards.json', 'w') as f:
#     helpcards = {
#         'hscript': {
#             'chope' : make_helpcard('point'),
#             'opinput' : make_helpcard('opinput'),
#         }
#     }
#     json.dump(helpcards, f, indent=4, sort_keys=True)

helpcards = {}
for i, f in enumerate(sorted(pages)):
    try:
        helpcard = make_helpcard(f)
        helpcards[f] = {'help': helpcard}
    except:
        print(f)
        raise
    if i>0 and (i%25==0 or i==len(pages)-1):
        print('Finished %d helpcards.' % i)

Finished 25 helpcards.
Finished 50 helpcards.
Finished 75 helpcards.
Finished 100 helpcards.
Finished 125 helpcards.
Finished 150 helpcards.
Finished 175 helpcards.
Finished 200 helpcards.
Finished 225 helpcards.
Finished 250 helpcards.
Finished 275 helpcards.
Finished 300 helpcards.
Finished 325 helpcards.
Finished 350 helpcards.
Finished 375 helpcards.
Finished 387 helpcards.


### Merge data into same object

#### Input structures

`expressions`
```yaml
foo:
  args: float
  argnames: bar
  return: vector
```

`helpcards`
```yaml
foo:
  help: <html>foo</html>
```

#### Resulting structure

`merged`
```yaml
foo:
  help: <html>foo</html>
  args: float
  argnames: bar
  return: vector
```

In [5]:
# Dump generated content on disk.
merged = {}
for f in expressions:
    merged[f] = {'return': expressions[f]['return']}
    if 'args' in expressions[f]:
        merged[f]['args'] = expressions[f]['args']
        merged[f]['argnames'] = expressions[f]['argnames']
        
    if f in helpcards:
        merged[f]['help'] = helpcards[f]['help']

with open('expressions.json', 'w') as f:
    json.dump(merged, f, indent=4, sort_keys=True)