# Line Parsing Prototyping Notebook

This notebook is scratch work for developing the line-parsing functions. Each function developed here will be moved into `parse.py` once it is complete.

## Import statements

In [None]:
import pandas as pd                    # Data handling
import numpy as np                     # Math utilities
import string                          # String utilities
import json                            # JSON handling
import re                              # Regular expressions
from bs4 import BeautifulSoup as soup  # BeautifulSoup object

## Global variables

In [None]:
# Path template for script files; '{}' is filled in with a file name via str.format
SCRIPTS = './scripts/{}'

## Testing data import

In [None]:
class Script:
    """Hold the raw text of one script loaded from a JSON file.

    Attributes:
        data: String form of the last ``<pre>`` element found in the HTML
            stored under the JSON file's ``'script'`` key.
        df: DataFrame, created empty here.
    """

    def __init__(self, file):
        """Load ``file`` via the ``SCRIPTS`` path template and extract its text.

        Args:
            file: File name substituted into the ``SCRIPTS`` template.

        Raises:
            IndexError: If the script HTML contains no ``<pre>`` element.
        """
        # Explicit encoding so decoding does not depend on the platform locale
        with open(SCRIPTS.format(file), encoding='utf-8') as f:
            data = json.load(f)

        # Keep only the last <pre> element of the parsed HTML, as a string
        self.data = str(soup(data['script'], "lxml").find_all("pre")[-1])
        self.df = pd.DataFrame()

In [None]:
# Load the sample script used for testing in the cells below
aliens = Script('Aliens.json')

In [None]:
# Display the extracted <pre> element as one raw string
aliens.data

In [None]:
# Break the script text on newlines; one Series entry per script line
lines = pd.Series(
    aliens.data.split('\n'),
    dtype=str,
)

In [None]:
# Display the Series of script lines
lines

## Component function definitions
* **setup_metrics** = creates dictionary for line metric recording
* **setup_data** = creates dictionary for line data recording
* **update_metric** = updates line metrics based on character data
* **update_data** = updates line data based on character data

In [None]:
# L1 is not shown in this notebook; presumably a project-local module.
# The cells below use its Line class and its B_OPEN / B_CLOSE patterns.
import L1

# Create a fresh Line object and display its instance attributes
test = L1.Line()
vars(test)

In [None]:
# Display the line that is used as the test sample below
lines[28]

In [None]:
# Set line sample for testing: an (index, text) pair, the same shape that
# enumerate(lines) would yield
sample = (28, lines[28])

In [None]:
# Stand in for one pass of `for i, x in enumerate(lines)` using the sample
i, x = sample

# Record the raw text and its character count
test.data['raw'] = x
test.metrics['length'] = len(x)

# The line id is the index plus 0.1; the same id is stored as the 'L1' link
test.metrics['id'] = i + 0.1
test.links['L1'] = test.metrics['id']

In [None]:
# Build the 'fmt' string: the raw line with the known HTML tags removed
tags = ['<pre>', '</pre>', '<b>', '</b>']
temp = x
for tag in tags:
    # Only strip tags that occur in the original line
    if tag not in x:
        continue
    temp = temp.replace(tag, '')
test.data['fmt'] = temp

In [None]:
# Bold checking: record whether the line opens and/or closes a bold tag and
# which (start, end) character span of the raw line is bolded.
bold = test.metrics['bold']

# Match each pattern once and reuse the result
open_match = L1.B_OPEN.match(x)
close_match = L1.B_CLOSE.match(x)
# NOTE(review): if B_CLOSE is a compiled `re` pattern, `match` anchors at the
# start of the string, so `close_match.start()` is always 0 -- confirm whether
# `search` was intended for the close tag.

if open_match is not None:  # Open tag check
    bold['open'] = True
    bold['bolded'] = (open_match.end(), len(x))
    bold['has'] = True

if close_match is not None:  # Close tag check
    bold['close'] = True
    if bold['bolded'] is None:  # No open tag on this line; span starts at 0
        bold['bolded'] = (0, close_match.start())
        if bold['bolded'][1] - bold['bolded'][0] != 0:
            bold['has'] = True
    else:
        # 'bolded' is a tuple and cannot be assigned into; rebuild it with
        # the new end position instead
        bold['bolded'] = (bold['bolded'][0], close_match.start())

# Set remaining metrics
span = bold['bolded']
# A line with no bold tag leaves 'bolded' unset; count zero bolded characters
bold['num'] = span[1] - span[0] if span is not None else 0
# Guard against empty lines, whose length is 0
bold['pct'] = bold['num'] / test.metrics['length'] if test.metrics['length'] else 0.0
bold['p80'] = bold['num'] / 80.

In [None]:
# Char loop parsing sequence: containers and state for the per-character pass
temp_map = ''   # Map string container
temp_pnc = {}   # Punctuation container (dict)
temp_rgx = r''  # Regex string container
temp_spc = []   # Spacing list container
prev_c = None   # Stores previous character for comparison
html = False    # Flag to track if in html tag

for c in x:
    # TODO: per-character parsing is not written yet; `pass` keeps the cell
    # syntactically valid until the loop body is filled in
    pass

In [None]:
# Match any single ASCII punctuation character. Built from string.punctuation
# so that ':' and ';' are covered; the hand-written ranges skipped them by
# starting the third range at '<'.
punc = re.compile('[' + re.escape(string.punctuation) + ']')
# Show every punctuation character found in the sample line
print(punc.findall(x))

## Parsing routine testing
1. Create lines from raw text
2. Feed lines into line parsing loop -> `for i,x in enumerate(lines)`
    3. Initiate `metrics` dictionary values -> `setup_metrics()`
    4. Initiate `data` dictionary values -> `setup_data()`
    5. Check if string is empty -> `check_empty()`
    6. Check if string is bolded -> `check_bold()`
    7. Pass chars to char parsing loop
        8. Record bold status -> `log_bold()`
        9. Use helper function to check char type -> `check_char()`
            10. Record space info -> `log_space()`
            11. Record punc info -> `log_punc()`
            12. Record num info -> `log_num()`
            13. Record char info -> `log_char()`
                14. Record upper info -> `log_upper()`
                15. Record lower info -> `log_lower()`
    16. Update `data` as necessary
17. Aggregate metrics in `df`
18. Extract metadata info and add to object