In [80]:
from lxml import etree as et
from pathlib import Path
import re

# Module-level lxml HTML parser, created with default options.
parser = et.HTMLParser()
# NOTE(review): an explicit `encoding="utf-8"` parser argument is left
# disabled below — confirm whether it should be passed to HTMLParser().
#encoding="utf-8"

import logging as log
# Configure the root logger so that messages from DEBUG level upwards are shown.
log.basicConfig(level=log.DEBUG)

In [54]:
# All *.html files directly inside ./season.  glob() yields matches in no
# guaranteed order, so sort them to make processing (and the order of the
# rows written later) reproducible across runs and machines.
htmls = sorted(Path('season').glob('*.html'))

In [None]:
def _clean_text(text, from_enc, to_enc='utf-8'):
    text = text.encode(from_enc, errors='ignore').decode(to_enc, errors='ignore')
    return re.sub('\s+', ' ', text)

# Lower-cased paragraph texts that carry no transcript content; a paragraph
# whose text matches one of these exactly is skipped.
ignore_lines = {
    "opening credits",
    "commercial break",
    "closing credits",
    "end",
    "ending credits",
    "the end",
}


def extract_transcript(html: Path):
    """Yield one ``(line_id, tag, text)`` record per paragraph of a transcript page.

    Every ``<p>`` element of the document except the first one (treated as
    metadata) is flattened to text and classified by how it starts:

    * ``[`` ...            -> tag ``'<scene>'``, brackets removed
    * ``(`` or ``<`` ...   -> tag ``'<event>'``, delimiters removed
    * ``Speaker: words``   -> tag is the speaker name, text is what follows
    * anything else        -> tag ``'<error>'`` and a warning is logged

    Paragraphs without any text, and paragraphs whose lower-cased text is in
    ``ignore_lines``, are skipped; they still consume a sequence number.

    Args:
        html: Path of the HTML file to parse.

    Yields:
        Tuples ``(line_id, tag, text)``.  ``line_id`` is the file name without
        its suffix, a dash, and the 1-based paragraph number zero-padded to
        three digits.  ``text`` has been passed through ``_clean_text``.

    Raises:
        ValueError: If the document contains no ``<p>`` element at all.
    """
    episode_id = html.stem  # file name without the trailing ".html"
    doc = et.parse(str(html), parser)
    items = doc.getroot().xpath('//p')
    if not items:
        raise ValueError(f"Cant parse {html}")

    # Loop-invariant, so read it once.
    # NOTE(review): docinfo.encoding is presumably None when the document
    # declares no encoding; fall back to UTF-8 so _clean_text always receives
    # a usable codec name — confirm against the real input files.
    encoding = doc.docinfo.encoding or 'utf-8'

    # "Speaker: words".  DOTALL lets the second group run across newlines
    # inside the paragraph; without it everything after the first line break
    # would be silently dropped.
    speaker_re = re.compile(r"^([A-Za-z\.,\(\)' ]+:)(.*)", re.DOTALL)

    # items[0] is skipped because the first paragraph is metadata.
    for count, it in enumerate(items[1:], start=1):
        pieces = [p.strip() for p in it.xpath('.//text()')]  # strip white spaces
        pieces = [p for p in pieces if p]  # skip empty pieces

        if not pieces:
            log.warning("Empty para: %s; count=%s", episode_id, count)
            continue

        text = " ".join(pieces)
        if text.lower() in ignore_lines:
            continue  # it is okay to skip them

        if text[0] == '[':  # scene description
            text = text.replace("[", "", 1).replace("]", "")
            tag = '<scene>'
        elif text[0] in ("(", "<"):  # event
            text = text.replace("(", "", 1).replace(")", "").replace("<", "", 1).replace(">", "")
            tag = '<event>'
        elif pieces[0][-1] == ':':
            # The speaker name sits in its own text node, e.g. "<b>Name:</b>".
            tag = pieces[0][:-1]
            text = " ".join(pieces[1:])
        else:
            matched = speaker_re.match(text)
            if matched:
                tag, text = matched.groups()
                tag = tag.replace(":", "").strip()
            else:
                log.warning("Dont know how to parse: %s", pieces)
                tag = "<error>"

        # A tab or line break inside the tag would break a tab-separated
        # row built from this record, so flatten them to spaces.
        tag = re.sub(r'[\t\r\n]+', ' ', tag)
        text = _clean_text(text, from_enc=encoding)
        yield f'{episode_id}-{count:03d}', tag, text

# Write every record of every parsed file as one tab-separated row:
# line id, tag, text.
with open("transcripts.tsv", 'w', encoding='utf-8', errors='ignore') as out:
    for html in htmls:
        for line_id, tag, text in extract_transcript(html):
            out.write(f'{line_id}\t{tag}\t{text}\n')