In [156]:
from lxml import etree as et
from pathlib import Path
import re

parser = et.HTMLParser()
#encoding="utf-8"

import logging as log
log.basicConfig(level=log.DEBUG)

In [167]:
def _clean_text(text, from_enc, to_enc='utf-8'):
    text = text.encode(from_enc, errors='ignore').decode(to_enc, errors='ignore')
    return re.sub('\s+', ' ', text)

ignore_lines = set(
    ["opening credits", "commercial break", "closing credits",
     "end", "ending credits", "the end", "opening titles", "transcriber"])

def _can_ignore(text):
    """
    True if the given text be ignored, False otherwise
    """
    text = text.lower()
    for ig in ignore_lines:
        if text.startswith(ig):
            return True
    return False

def extract_transcript(html: Path):
    """
    Extracts transcripts from a HTML page
    """
    id = html.name.replace('.html', '')
    doc = et.parse(str(html), parser)
    items = doc.getroot().xpath('//p')
    if not items:
        raise Exception(f"Cant parse {html}")
    del items[0]     # remove the first one, since it is metadata

    count = 0
    for it in items:
        count += 1
        pieces = it.xpath('.//text()')
        pieces = [p.strip() for p in pieces] # strip white spaces
        pieces = [p for p in pieces if p]  # skip empty pieces
        
        if not pieces:
            log.warning(f"Empty para: {id}; count={count}")
            continue

        text = " ".join(pieces)
        text = _clean_text(text, from_enc=doc.docinfo.encoding)
        if _can_ignore(text):
            continue # its okay to skip them 

        if '[' == text[0]:
            text = text.replace("[", "", 1).replace("]", "").replace("Scene:", "").strip()
            tag = '<scene>'
        elif text[0] in ("(", "<"): # event
            text = text.replace("(", "", 1).replace(")", "").replace("<", "", 1).replace(">", "")
            tag = '<event>'
        else:
            matched = re.match("^([A-Za-z0-1\.,\(\)'#\- ]+:)(.*)", text)
            if matched:
                tag, text = matched.groups()
                tag = tag.replace(":", "").strip()
                tag = tag.title()
            else:
                log.warning(f"Dont know how to parse: {pieces}")                
                tag = "<error>"
        
        yield f'{id}-{(count):03d}', tag, text

In [None]:
htmls = list(Path('season').glob('*.html'))
htmls = sorted(htmls)
print(f"found {len(htmls)} files")
with open("friends.transcripts.all.tsv", 'w', encoding='utf-8', errors='ignore') as out:
    for html in htmls:
        chat = extract_transcript(html)
        for msg in chat:
            out.write('%s\t%s\t%s\n' % msg)

In [143]:
p = Path('season/fixedformat/0224.html')
for i, x in enumerate(extract_transcript(p)):
    print(x)

('0224-001', '<scene>', 'Scene: Central Perk, the whole gang is there as Joey enters')
('0224-002', 'RACHEL', " Hey Joey, how'd the audition go?")
('0224-003', 'JOEY', " Incredible! I met the director this time and you'll never believe who it was.")
('0224-004', 'ALL', ' Who?')
('0224-005', 'JOEY', " All right. I'll give you one hint. Warren Beatty.")
('0224-006', 'ALL', ' Wow!')
('0224-007', 'JOEY', " Yeah, there's just one thing that might be kind've a problem. See, I, uh, had to kiss this guy.")
('0224-008', 'CHANDLER', " 'Cause he was just so darn cute.")
('0224-009', 'JOEY', " No, as part of the audition. See, I'm up for this part of this guy, who the main guy kisses.")
('0224-010', 'ROSS', " Well, hey. You're an actor, I say you just suck it up and do it. (Rachel looks at him in disbelief) Or you just do it.")
('0224-011', 'JOEY', " I did do it, I'm a professional.")
('0224-012', 'MONICA', " Then what's the problem?")
