### Book5 - parsing question sheets to json
```
TODO
 [ ] handle end character of question attribute
 [ ] best way to view markdown in vscode? 
    - extension? that hotloads?
    - react type live server?
```

In [1]:
import json
import yaml

In [9]:
fn = '../wordle-qa-1/alpha/basic.md'
md_schema_fn = '../wordle-qa-1/alpha/md-schema.yaml'

with open(fn, 'r') as f:
    text = f.readlines()

with open(md_schema_fn, 'r') as f:
    md_schema = yaml.safe_load(f)

In [31]:
def parse(
    text: list, 
    d_md: dict, 
    check_name: bool = False
) -> list:

    parsed_markers = []
    for i, line in enumerate(text):
        for obj, md_header in d_md.items():
            marker = "#" * md_header + " "
            if line.startswith(marker):
                data = line.replace(marker, "").strip()
                if check_name and (data.lower() != obj): continue
                parsed_markers.append((i, obj, data))

    parsed_markers.append((len(text) + 1, None, None)) # include last line

    doc_markers = [
        {
            'start': parsed_markers[i][0] + 1, 
            'end': parsed_markers[i+1][0] - 1, 
            'obj_type': parsed_markers[i][1],
            'obj_name': parsed_markers[i][2]
        }
        for i in range(len(parsed_markers) - 1)
    ]

    return doc_markers

In [42]:
def extract_sections(
    text: list,
    parsed_markers: list,
    compress: bool = False,
) -> list:
    sections = []
    for section in parsed_markers:
        section_text = text[section['start']: section['end']]
        if compress:
            section_text = "".join(section_text)
        sections.append({
            'type': section['obj_type'],
            'name': section['obj_name'],
            'text': section_text
        })
    return sections

In [43]:
md_schema

{'sheet': {'md_header': 1,
  'children': {'md_header': 4, 'options': ['meta', 'templates']}},
 'question': {'md_header': 2,
  'children': {'md_header': 4,
   'options': ['meta', 'answer', 'templates', 'question']}}}

In [44]:
# parse to major sections
d_md = {obj: md_schema[obj]['md_header'] for obj in md_schema}
parsed_markers = parse(text, d_md, check_name=False)
major_sections = extract_sections(text, parsed_markers)

# parse sub-sections
output = []
for section in major_sections:
    
    section_type = section['type']
    d_md = {subsection_name: md_schema[section_type]['children']['md_header']
            for subsection_name in md_schema[section_type]['children']['options']
    }
    
    parsed_markers = parse(section['text'], d_md, check_name=True)
    
    sub_sections = extract_sections(section['text'], parsed_markers, compress=True)

    # TODO - handle meta k/v parsing
    # TODO - handle end character on question text parsing

    output.append({
        'type': section_type,
        'name': section['name'],
        'sub_sections': sub_sections
    })


print(json.dumps(output, indent=2))


[
  {
    "type": "sheet",
    "name": "Basic Wordle Questions",
    "sub_sections": [
      {
        "type": "meta",
        "name": "meta",
        "text": "This is the area\n"
      }
    ]
  },
  {
    "type": "question",
    "name": "Explain-State-1",
    "sub_sections": [
      {
        "type": "meta",
        "name": "meta",
        "text": " - name: Explain-State-1\n - description: given a wordle game state, explain what we know about the game\n - answer type: free-response, multi-question\n - answer suggested length: 300\n - question assets: plain-text-board, python-nyt-board\n - prompt style: QA\n"
      },
      {
        "type": "question",
        "name": "question",
        "text": "Below is the state of a wordle game:\n\nA B O U T\nS A L E S\nF L A M E\n\u25a1 \u25a1 \u25a1 \u25a1 \u25a1\n\u25a1 \u25a1 \u25a1 \u25a1 \u25a1\n\u25a1 \u25a1 \u25a1 \u25a1 \u25a1\n\n[[['absent', 'a'], ['present', 'b'], ['correct', 'o'], ['absent', 'u'], ['absent', 't']], [['present', 's'], 

### Using `markdown` and `BeautifulSoup` to extract text from HTML

In [None]:
import markdown
from bs4 import BeautifulSoup

In [None]:
html_content = markdown.markdown(markdown_content)
soup = BeautifulSoup(html_content, 'html.parser')

In [4]:
print(html_content)

<h1>Basic Wordle Questions</h1>
<p>Written 11.18.23
These are to get a baseline on LLM capabilities.</p>
<h2>Metadata</h2>
<h3>Explain-State-1</h3>
<h4>meta</h4>
<ul>
<li>name: Explain-State-1</li>
<li>description: given a wordle game state, explain what we know about the game</li>
<li>answer type: free response, multi-question</li>
<li>answer suggested length: 300</li>
<li>question assets: plain-text-board, python-nyt-board</li>
<li>prompt style: QA</li>
</ul>
<h5>question</h5>
<p>Below is the state of a wordle game:</p>
<p>A B O U T
S A L E S
F L A M E
□ □ □ □ □
□ □ □ □ □
□ □ □ □ □</p>
<p>[[['absent', 'a'], ['present', 'b'], ['correct', 'o'], ['absent', 'u'], ['absent', 't']], [['present', 's'], ['absent', 'a'], ['absent', 'l'], ['absent', 'e'], ['correct', 's']], [['absent', 'f'], ['absent', 'l'], ['absent', 'a'], ['absent', 'm'], ['absent', 'e']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']

In [15]:
children = list(soup.children)

In [18]:
all_children = soup.children

In [17]:
children.

[<h1>Basic Wordle Questions</h1>,
 '\n',
 <p>Written 11.18.23
 These are to get a baseline on LLM capabilities.</p>]

In [12]:
for elem in :
    print(elem.name)

h1
None
p
None
h2
None
h3
None
h4
None
ul
None
h5
None
p
None
p
None
p
None
p
None
p
None
h3
None
h4
None
ul
None
h5
None
p
None
p
None
p
None
p
None
p
None
h3
None
h4
None
ul
None
h5
None
p
None
p
None
p
None
p
None
h3
None
h4
None
ul
None
h5
None
p
None
p
None
p
None
p
None
p


In [5]:
def extract_text_between_headings(soup):
    data = {}
    current_h3 = None
    current_h4_h5 = None
    text_buffer = ''

    for elem in soup.children:
        if elem.name == 'h3':
            if current_h3:
                if current_h4_h5:
                    data[current_h3][current_h4_h5] = text_buffer
                else:
                    data[current_h3] = text_buffer
            current_h3 = elem.text.strip()
            current_h4_h5 = None
            text_buffer = ''
            data[current_h3] = {}
        elif elem.name in ['h4', 'h5']:
            if current_h4_h5 and text_buffer:
                data[current_h3][current_h4_h5] = text_buffer
            current_h4_h5 = elem.text.strip()
            text_buffer = ''
        elif elem.name == 'p':
            text_buffer += elem.text.strip() + '\n'

    # Add the last section
    if current_h3:
        if current_h4_h5:
            data[current_h3][current_h4_h5] = text_buffer
        elif text_buffer:
            data[current_h3] = text_buffer

    return data


In [6]:

# Extract text between headings
extracted_data = extract_text_between_headings(soup)


In [10]:
import json
print(json.dumps(extracted_data, indent=2))

{
  "Explain-State-1": {
    "question": "Below is the state of a wordle game:\nA B O U T\nS A L E S\nF L A M E\n\u25a1 \u25a1 \u25a1 \u25a1 \u25a1\n\u25a1 \u25a1 \u25a1 \u25a1 \u25a1\n\u25a1 \u25a1 \u25a1 \u25a1 \u25a1\n[[['absent', 'a'], ['present', 'b'], ['correct', 'o'], ['absent', 'u'], ['absent', 't']], [['present', 's'], ['absent', 'a'], ['absent', 'l'], ['absent', 'e'], ['correct', 's']], [['absent', 'f'], ['absent', 'l'], ['absent', 'a'], ['absent', 'm'], ['absent', 'e']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']]]\nQ: What do we know about the game? Has the player won yet? What is the current turn? what words have been tried?\nA:\n"
  },
  "Better-Guess-1a": {
    "question": "Below is the state of a wordle game:\nA B O U T\nS A L E S\nF L A M E\n\u25a1 \u25a1 \u25a1 \u25a1 \u25a1\n\u25a1 \u

In [11]:

# Print or process the extracted data
for key, value in extracted_data.items():
    print(f'{key}: {value}')

Explain-State-1: {'question': "Below is the state of a wordle game:\nA B O U T\nS A L E S\nF L A M E\n□ □ □ □ □\n□ □ □ □ □\n□ □ □ □ □\n[[['absent', 'a'], ['present', 'b'], ['correct', 'o'], ['absent', 'u'], ['absent', 't']], [['present', 's'], ['absent', 'a'], ['absent', 'l'], ['absent', 'e'], ['correct', 's']], [['absent', 'f'], ['absent', 'l'], ['absent', 'a'], ['absent', 'm'], ['absent', 'e']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']]]\nQ: What do we know about the game? Has the player won yet? What is the current turn? what words have been tried?\nA:\n"}
Better-Guess-1a: {'question': 'Below is the state of a wordle game:\nA B O U T\nS A L E S\nF L A M E\n□ □ □ □ □\n□ □ □ □ □\n□ □ □ □ □\n[[[\'absent\', \'a\'], [\'present\', \'b\'], [\'correct\', \'o\'], [\'absent\', \'u\'], [\'absent\', \'t\']], [

In [42]:
def extract_questions_and_metadata(soup):
    questions = []
    # for header in soup.find_all(['h3', 'h4', 'h5', 'h6']):
    for header in soup.find_all(['h3']):
        question_data = {'title': header.text.strip()}
        next_node = header.next_sibling
        print(header)
        print(next_node.text)
        print('------')
        # while next_node and next_node.name != header.name:
        #     print(next_node.name)
        #     next_node = next_node.next_sibling

extract_questions_and_metadata(soup)


<h3>Explain-State-1</h3>


------
<h3>Better-Guess-1a</h3>


------
<h3>Better-Guess-1b</h3>


------
<h3>Better-Guess-2</h3>


------


In [None]:
def extract_questions_and_metadata(soup):
    questions = []
    # for header in soup.find_all(['h3', 'h4', 'h5', 'h6']):
    for header in soup.find_all(['h3']):
        question_data = {'title': header.text.strip()}
        next_node = header.next_sibling
        print(header)
        print(next_node.text)
        print('------')
        # while next_node and next_node.name != header.name:
        #     print(next_node.name)
        #     next_node = next_node.next_sibling

extract_questions_and_metadata(soup)


<h3>Explain-State-1</h3>


------
<h3>Better-Guess-1a</h3>


------
<h3>Better-Guess-1b</h3>


------
<h3>Better-Guess-2</h3>


------


In [32]:
def extract_questions_and_metadata(soup):
    questions = []
    for header in soup.find_all(['h3', 'h4', 'h5', 'h6']):
        question_data = {'title': header.text.strip()}
        next_node = header.next_sibling

        while next_node and next_node.name != header.name:
            if next_node.name == 'ul':
                metadata = [li.text.strip() for li in next_node.find_all('li')]
                question_data['metadata'] = metadata
            if next_node.name == 'p': 
                question = [p.text.strip() for p in next_node.find_all('p')]
                question_data['question'] = "\n".join(question)
            next_node = next_node.next_sibling

        questions.append(question_data)
    return questions


In [34]:
extracted_data = extract_questions_and_metadata(soup)


In [35]:
extracted_data

[{'title': 'Explain-State-1',
  'metadata': ['name: Explain-State-1',
   'description: given a wordle game state, explain what we know about the game',
   'answer type: free response, multi-question',
   'answer suggested length: 300',
   'question assets: plain-text-board, python-nyt-board',
   'prompt style: QA'],
  'question': ''},
 {'title': 'meta',
  'metadata': ['name: Explain-State-1',
   'description: given a wordle game state, explain what we know about the game',
   'answer type: free response, multi-question',
   'answer suggested length: 300',
   'question assets: plain-text-board, python-nyt-board',
   'prompt style: QA'],
  'question': ''},
 {'title': 'question',
  'question': '',
  'metadata': ['name: Better-Guess-1a',
   'description: given a wordle game state, make the best guess from choices supplied.',
   'answer type: multiple-choice',
   'answer suggested length: 5',
   'question assets: plain-text-board, python-nyt-board',
   'prompt style: QA, implicit-multiple-c

In [27]:
def extract_questions_and_metadata(soup):
    questions = []
    for header in soup.find_all(['h3', 'h4', 'h5', 'h6']):
        if header.text.strip().lower() == 'question':
            question_data = {'title': header.text.strip()}
            question_text = ''
            next_node = header.next_sibling

            # Traverse siblings of the header to gather question text
            while next_node and (next_node.name != 'h3' and next_node.name != 'h4' and next_node.name != 'h5' and next_node.name != 'h6'):
                if next_node.name == 'p':
                    if next_node.text.strip() == 'A:':
                        break
                    question_text += next_node.text.strip() + '\n'
                next_node = next_node.next_sibling

            question_data['question'] = question_text
            questions.append(question_data)
    return questions

In [28]:
extracted_data = extract_questions_and_metadata(soup)

In [29]:
extracted_data

[{'title': 'question',
  'question': "Below is the state of a wordle game:\nA B O U T\nS A L E S\nF L A M E\n□ □ □ □ □\n□ □ □ □ □\n□ □ □ □ □\n[[['absent', 'a'], ['present', 'b'], ['correct', 'o'], ['absent', 'u'], ['absent', 't']], [['present', 's'], ['absent', 'a'], ['absent', 'l'], ['absent', 'e'], ['correct', 's']], [['absent', 'f'], ['absent', 'l'], ['absent', 'a'], ['absent', 'm'], ['absent', 'e']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']], [['empty', ''], ['empty', ''], ['empty', ''], ['empty', ''], ['empty', '']]]\nQ: What do we know about the game? Has the player won yet? What is the current turn? what words have been tried?\n"},
 {'title': 'question',
  'question': 'Below is the state of a wordle game:\nA B O U T\nS A L E S\nF L A M E\n□ □ □ □ □\n□ □ □ □ □\n□ □ □ □ □\n[[[\'absent\', \'a\'], [\'present\', \'b\'], [\'correct\', \'o\'], [\'absent\', \'u\'], [\'absent\',

In [19]:
soup

<h1>Basic Wordle Questions</h1>
<p>Written 11.18.23
These are to get a baseline on LLM capabilities.</p>
<h2>Metadata</h2>
<p>... (rest of your markdown text)</p>

In [14]:
soup.find_all(['h3', 'h4', 'h5', 'h6'])

[]

In [13]:
extracted_data

[]

In [None]:

# Print or process the extracted data
for data in extracted_data:
    print(data)
