## Build usable data for HScript Expressions

### Goal structure
```yaml
foo:
  help: <html>foo</html>
  args: float
  argnames: bar
  return: vector
```
It's much simpler than the structure used for VEX functions, since
there is no such thing as function signatures in HScript, and all
functions are properly documented.

In [1]:
import re
import bs4
import json
import pickle
import random
import requests
import subprocess
import collections
import os.path as op
from pprint import pprint

### Parse list of expression strings into data.

#### Resulting structure

```yaml
foo:
  args: float float float
  argnames: bar baz qux
  return: vector
```

Instead of generating it from scratch, I keep a complete list
of expression functions. It's easy to keep it up to date by
diffing it against the list printed in the Textport. HScript barely changes.

In [2]:
# Load the hand-maintained list of expression signatures.
with open('expressions.cmd') as f:
    expressions_content = f.read()

# Ask the current Houdini installation which expression functions it knows
# (the "exhelp" command) and print the names that appear on only one side.
proc = subprocess.run(
    ['hbatch', '-q', '-c', 'exhelp'],
    stdout=subprocess.PIPE,
    universal_newlines=True,
)
exhelp = set(proc.stdout.split())
# The second word of every line in the file is the function name.
listed = {
    m.group(1)
    for m in re.finditer(r'^\w+ (\w+)\b.*$', expressions_content, flags=re.M)
}
print(exhelp.difference(listed))
print(listed.difference(exhelp))

# Maps a function name to {'return': type} plus, for functions that take
# arguments, 'args' (tuple of types) and 'argnames' (tuple of names).
expressions = {}

# One signature per line: "<return type> <name>(<type> <argname>, ...)".
signature_re = re.compile(r'(\w+) (\w+)\((.*)\)')

for e in expressions_content.splitlines():
    if not e.strip():
        # A trailing newline or an empty separator line is not a signature.
        continue
    match = signature_re.match(e)
    if match is None:
        # Fail with the offending line instead of an opaque AttributeError.
        raise ValueError('Malformed expression signature: %r' % e)
    r, f, args = match.group(1, 2, 3)
    expressions[f] = {'return': r}
    if args:
        # "type name, type name" -> ('type', 'type'), ('name', 'name')
        args, argnames = zip(*[pair.split() for pair in args.split(',')])
        expressions[f]['args'], expressions[f]['argnames'] = args, argnames

def test():
    """Print how many expressions were parsed and show one picked at random."""
    print(len(expressions))
    name = random.choice(list(expressions))
    print(name)
    pprint(expressions[name])


test()

set()
{'iprquery', 'iprquerys'}
390
details
{'argnames': ('surface_node', 'attribute'),
 'args': ('string', 'string'),
 'return': 'string'}


### Generate simplified HTML pages from data available in Houdini help documentation

#### Resulting data
```yaml
foo:
  help: <html>foo</html>
```

In [3]:
# Retrieving the pages takes about 15 minutes to complete when they are not
# cached yet.
# Start the help server with the "hhelp serve" command before running this.
# Reference: http://www.sidefx.com/docs/houdini/help/central
# Base URL of the expression pages; a function name is appended to it.
index = 'http://localhost:8080/expressions/'

def ret(url, timeout=60):
    """Return the raw response body of an HTTP GET request to ``url``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive server cannot block the run forever.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never mistaken for documentation.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content

if op.exists('expressions.p'):
    # NOTE: unpickling can execute arbitrary code; only reuse a cache file
    # that was written by this notebook.
    with open('expressions.p', 'rb') as f:
        pages = pickle.load(f)
else:
    # The server is contacted only when there is no cache, so a cached run
    # does not need the help server at all.
    index_page = bs4.BeautifulSoup(ret(index), 'html.parser')

    # Keep the functions that are linked from the index page and also have a
    # parsed signature.
    pages = {f : None
             for f in sorted({a.text for a in index_page.find_all('a', class_='expression')})
             if f in expressions}

    # Count from 1 so the message reports how many pages are really loaded.
    for i, f in enumerate(pages, start=1):
        pages[f] = ret(index + f)
        if i % 25 == 0 or i == len(pages):
            print('Loaded %d pages.' % i)

    with open('expressions.p', 'wb') as f:
        pickle.dump(pages, f)

In [4]:
def make_helpcard(f):
    """Build the simplified HTML help card for the expression function ``f``.

    The cached documentation page ``pages[f]`` is parsed, its ``#content``
    element is renamed to ``#helpcard`` and simplified in place: tags that do
    not work well inside ST3 popups are unwrapped or replaced, tables and the
    stock signature block are removed, and relative links are prefixed with
    the online documentation URL. A heading, the summary, the signature taken
    from ``expressions[f]`` and a "See also" section are then added.

    Args:
        f: Name of the expression function; must be a key of both the
            module-level ``pages`` and ``expressions`` dicts.

    Returns:
        The resulting ``#helpcard`` element serialized with ``str()``.

    Raises:
        KeyError: If ``f`` is missing from ``pages`` or ``expressions``.
        ValueError: If ``<main>`` does not contain exactly one ``#content``
            element.
    """
    index = 'https://www.sidefx.com/docs/houdini/expressions/'
    s = bs4.BeautifulSoup(pages[f], 'html.parser')
    nonmeta, = s.main.select('#content')
    nonmeta['id'] = 'helpcard'

    # Strip redundant tags which do not work well inside ST3 popups.
    for span in nonmeta.find_all('span', class_='line'):
        span.wrap(s.new_tag('code', **{'class': 'line'}))
        span.unwrap()
    for var in nonmeta.find_all('var'):
        var.wrap(s.new_tag('code'))
        var.unwrap()
    for div in nonmeta.find_all('div', class_='def'):
        div.wrap(s.new_tag('p'))
        div.unwrap()
    for div in nonmeta.find_all('div', class_='content'):
        if not div.text.strip():
            div.unwrap()
    for div in nonmeta.find_all('div', class_='clear'):
        div.unwrap()
    for code in nonmeta.find_all('code', class_='codehilite'):
        code.unwrap()
    for p in nonmeta.find_all('p', class_='label'):
        p.unwrap()
    for tag in nonmeta.find_all(['pre', 'dl', 'span', 'section']):
        tag.unwrap()

    # Append related section.
    related = s.main.select('#postmeta')[0].select('a.Exp')
    if related:
        h = s.new_tag('h2')
        h.string = 'See also'
        nonmeta.append(h)

        for a in related:
            title = a.get('title')
            # These links are created with their full URL right away.
            link = s.new_tag('a', href=index+a['href'])
            link.string = a.text

            p = s.new_tag('p')
            p.append(link)

            if title:
                div = s.new_tag('div', **{'class': 'related-summary'})
                div.string = title
                p.append(div)

            nonmeta.append(p)

    for table in nonmeta.find_all('table'):
        table.decompose()

    # Destroy default documentation signatures.
    # TODO: Remove small indents in ST3 caused by properly formatted HTML code by removing '\n' strings.
    # Only the first child that is not a bare newline is inspected.
    for c in nonmeta.children:
        if c != '\n':
            # .get() keeps a <ul>/<div> without a class attribute from
            # raising KeyError.
            if (c.name == 'ul' and 'bullets' in c.get('class', [])) \
                    or (c.name == 'div' and 'usage_group' in c.get('class', [])):
                c.decompose()
            break

    # Make URLs full.
    for a in nonmeta.find_all('a'):
        href = a.get('href')
        # Anchors without a target have nothing to expand. The related links
        # appended above and external links are already absolute; prefixing
        # them again would produce a broken address.
        if href is None or href.startswith(('http://', 'https://')):
            continue
        a['href'] = index + href

    # Images loaded over HTTP did not display for the author; link to them
    # instead.
    for img in nonmeta.find_all('img'):
        img_link = s.new_tag('a', href=index + img['src'])
        img_link.string = '[Image \U0001f517]'
        img.wrap(img_link)
        img.unwrap()

    # Fix not working <pre> tag with non-breaking spaces: leading spaces of
    # every code line are replaced one for one.
    for code in nonmeta.find_all('code', class_='line'):
        newtext = code.text.lstrip(' ')
        code.string = '\xa0' * (len(code.text)-len(newtext)) + newtext

    # Add summary paragraph (an empty one when the page has none).
    summary = s.main.select('.summary')
    if not summary:
        summary = s.new_tag('p', **{'class': 'summary'})
    else:
        summary = summary[0]
    nonmeta.insert(0, summary)

    # Insert function signature from read list.
    code = s.new_tag('code')
    summary.insert_after(code)
    args = ''
    if 'args' in expressions[f]:
        atypes = expressions[f]['args']
        anames = expressions[f]['argnames']
        args = ', '.join(' '.join(x) for x in zip(atypes, anames))
    code.string = '{} {}({})'.format(expressions[f]['return'], f, args)
    code.wrap(s.new_tag('p'))

    # Insert title and online documentation hyperlink.
    title = s.new_tag('a', href=index+f)
    title.string = f
    heading = s.new_tag('h1')
    heading.append(title)
    summary.insert_before(heading)

    # print(nonmeta.prettify())
    return str(nonmeta)

# # For quick testing.
# with open('../commands/helpcards.json', 'w') as f:
#     helpcards = {
#         'hscript': {
#             'chope' : make_helpcard('point'),
#             'opinput' : make_helpcard('opinput'),
#         }
#     }
#     json.dump(helpcards, f, indent=4, sort_keys=True)

# Maps a function name to {'help': <help card HTML string>}.
helpcards = {}
# Count from 1 so the progress message matches the number of finished cards.
for i, f in enumerate(sorted(pages), start=1):
    try:
        helpcard = make_helpcard(f)
        helpcards[f] = {'help': helpcard}
    except Exception:
        # Show which function broke the conversion, then let the error through.
        print(f)
        raise
    if i % 25 == 0 or i == len(pages):
        print('Finished %d helpcards.' % i)

Finished 25 helpcards.
Finished 50 helpcards.
Finished 75 helpcards.
Finished 100 helpcards.
Finished 125 helpcards.
Finished 150 helpcards.
Finished 175 helpcards.
Finished 200 helpcards.
Finished 225 helpcards.
Finished 250 helpcards.
Finished 275 helpcards.
Finished 300 helpcards.
Finished 325 helpcards.
Finished 350 helpcards.
Finished 375 helpcards.
Finished 387 helpcards.


### Merge data into same object

#### Input structures

`expressions`
```yaml
foo:
  args: float
  argnames: bar
  return: vector
```

`helpcards`
```yaml
foo:
  help: <html>foo</html>
```

#### Resulting structure

`merged`
```yaml
foo:
  help: <html>foo</html>
  args: float
  argnames: bar
  return: vector
```

In [5]:
# Combine signatures and help cards per function, then write the result to disk.
merged = {}
for name, signature in expressions.items():
    entry = {'return': signature['return']}
    # Functions without arguments carry neither 'args' nor 'argnames'.
    if 'args' in signature:
        entry['args'] = signature['args']
        entry['argnames'] = signature['argnames']

    # Not every listed function has a help card.
    if name in helpcards:
        entry['help'] = helpcards[name]['help']

    merged[name] = entry

with open('expressions.json', 'w') as f:
    json.dump(merged, f, indent=4, sort_keys=True)