In [222]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
from pprint import pprint

In [2]:
DATA_FILE = "challenge-hamlet/hamlet.txt"

In [265]:
RE_SCENE_DELIMITER = r"(\tHAMLET\n+ACT \w+\n+SCENE \w+\t.+\n+)"
RE_LINE_DELIMITER = r"(\n[A-Z ]+\t)"

RE_ACT_NUM = r"(?:\tHAMLET\n+ACT )([A-Z]+)(?:\n)"
RE_SCENE_NUM = r"(?:\tHAMLET\n+ACT [A-Z]+\n+SCENE )([A-Z]+)(\t.+\n+)"
RE_SCENE_BODY = r"(?:\tHAMLET\n+ACT [A-Z]+\n+SCENE [A-Z]+\t.+\n+)((.*\n)+)"
RE_CHAR_NAME = r"(?:\n)([A-Z ]+)(?:\t)"
RE_LINE_BODY = r"(?:\n[A-Z ]+\t)((.*\n?)+)"

In [325]:
def read_file_to_str(file_path: str) -> str:
    f = open(file_path, 'r')
    txt = f.read()
    f.close()
    return txt

def get_scenes(book: str) -> pd.DataFrame:
    cast_n_scenes = custom_regex_split(full_book, RE_SCENE_DELIMITER)
    scenes = cast_n_scenes[1:]  # ignore first item which is cast
    scenes_df = scenes_list_to_dataframe(scenes)
    return scenes_df

def custom_regex_split(input_text: str, re_delimiter: str) -> list:
    """split input_text by regex delimiter
    """
    splits = re.split(re_delimiter, input_text)

    items = []
    current_delimiter = ""
    current_item = ""
    for i, text in enumerate(splits):
        if i==0:
            current_item = text
        else:
            is_delimiter = re.match(re_delimiter, text)
            if text == '\nMARCELLUS\t':
                print(text)
                print(is_delimiter)
            if is_delimiter:
                items.append(current_item)
                current_item = text
            else:
                current_item += text
    items.append(current_item)

    assert(''.join(items) == input_text)
    return items

def scenes_list_to_dataframe(scenes: list) -> pd.DataFrame:
    scenes_dicts = [scene_to_dict(s) for s in scenes]
    scenes_df = pd.DataFrame(scenes_dicts)
    return scenes_df

def scene_to_dict(scene_txt: str) -> dict:
    act_num = re.match(RE_ACT_NUM, scene_txt).group(1)
    scene_num = re.match(RE_SCENE_NUM, scene_txt).group(1)
    scene_body = re.match(RE_SCENE_BODY, scene_txt).group(1)
    scene_dict = {'act_num':act_num, 'scene_num': scene_num, 'scene_text': scene_body}
    return scene_dict

def get_lines_in_scene(scene: str) -> pd.DataFrame:
    lines_list = custom_regex_split(scene, RE_LINE_DELIMITER)
    lines_df = lines_list_to_dataframe(lines_list)
    return lines_df

def lines_list_to_dataframe(lines: list) -> pd.DataFrame:
    lines_dicts = [lines_to_dict(line) for line in lines]
    lines_df = pd.DataFrame(lines_dicts)
    return lines_df

def lines_to_dict(line: str) -> dict:
    char_name_matched = re.match(RE_CHAR_NAME, line)
    if char_name_matched:
        char_name = char_name_matched.group(1)
        line_body = re.match(RE_LINE_BODY, line).group(1)
        line_dict = {'char_name': char_name, 'line_body': line_body}
    else:
        line_dict = {'char_name': np.NaN, 'line_body': line}
    return line_dict

In [330]:
lines_df = pd.DataFrame()
for i, scene in scenes.iterrows():
    scene_lines = get_lines_in_scene(scene.scene_text)
    scene_lines['act_num'] = scene.act_num
    scene_lines['scene_num'] = scene.scene_num
    lines_df = lines_df.append(scene_lines)


MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), m

In [331]:
lines_df

Unnamed: 0,char_name,line_body,act_num,scene_num
0,,\t[FRANCISCO at his post. Enter to him BERNARD...,I,I
1,BERNARDO,Who's there?\n,I,I
2,FRANCISCO,"Nay, answer me: stand, and unfold yourself.\n",I,I
3,BERNARDO,Long live the king!\n,I,I
4,FRANCISCO,Bernardo?\n,I,I
5,BERNARDO,He.\n,I,I
6,FRANCISCO,You come most carefully upon your hour.\n,I,I
7,BERNARDO,"'Tis now struck twelve; get thee to bed, Franc...",I,I
8,FRANCISCO,"For this relief much thanks: 'tis bitter cold,...",I,I
9,BERNARDO,Have you had quiet guard?\n,I,I


In [360]:
lines_df.char_name.unique()

array([nan, 'BERNARDO', 'FRANCISCO', 'HORATIO', 'MARCELLUS',
       'KING CLAUDIUS', 'CORNELIUS', 'VOLTIMAND', 'LAERTES',
       'LORD POLONIUS', 'HAMLET', 'QUEEN GERTRUDE', 'OPHELIA', 'REYNALDO',
       'ROSENCRANTZ', 'GUILDENSTERN', 'LUCIANUS', 'PRINCE FORTINBRAS'],
      dtype=object)

In [347]:
hamlet_txt = lines_df[lambda df: df.char_name=='CLAUDIUS'].line_body.pipe(lambda s: '\n'.join(s))

In [348]:
hamlet_txt

''

In [364]:
lines_raw = re.split(r"\n+", lines_df[lambda df: df.char_name=='KING CLAUDIUS'].line_body.pipe(lambda s: '\n'.join(s)))

In [365]:
[line for line in lines_raw if line != '']

["Though yet of Hamlet our dear brother's death",
 '\tThe memory be green, and that it us befitted',
 '\tTo bear our hearts in grief and our whole kingdom',
 '\tTo be contracted in one brow of woe,',
 '\tYet so far hath discretion fought with nature',
 '\tThat we with wisest sorrow think on him,',
 '\tTogether with remembrance of ourselves.',
 '\tTherefore our sometime sister, now our queen,',
 '\tThe imperial jointress to this warlike state,',
 "\tHave we, as 'twere with a defeated joy,--",
 '\tWith an auspicious and a dropping eye,',
 '\tWith mirth in funeral and with dirge in marriage,',
 '\tIn equal scale weighing delight and dole,--',
 "\tTaken to wife: nor have we herein barr'd",
 '\tYour better wisdoms, which have freely gone',
 '\tWith this affair along. For all, our thanks.',
 '\tNow follows, that you know, young Fortinbras,',
 '\tHolding a weak supposal of our worth,',
 "\tOr thinking by our late dear brother's death",
 '\tOur state to be disjoint and out of frame,',
 '\tColl

In [366]:
len([line for line in lines_raw if line != ''])

560

In [233]:
_.line_body.iloc[3]

'|\n\n'

In [326]:
full_book = read_file_to_str(DATA_FILE)

In [327]:
scenes = get_scenes(full_book)

In [328]:
custom_regex_split(scenes.scene_text[1], RE_LINE_DELIMITER)


MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>

MARCELLUS	
<_sre.SRE_Match object; span=(0, 11), match='\nMARCELLUS\t'>


['\t[Enter KING CLAUDIUS, QUEEN GERTRUDE, HAMLET,\n\tPOLONIUS, LAERTES, VOLTIMAND, CORNELIUS, Lords,\n\tand Attendants]\n',
 "\nKING CLAUDIUS\tThough yet of Hamlet our dear brother's death\n\tThe memory be green, and that it us befitted\n\tTo bear our hearts in grief and our whole kingdom\n\tTo be contracted in one brow of woe,\n\tYet so far hath discretion fought with nature\n\tThat we with wisest sorrow think on him,\n\tTogether with remembrance of ourselves.\n\tTherefore our sometime sister, now our queen,\n\tThe imperial jointress to this warlike state,\n\tHave we, as 'twere with a defeated joy,--\n\tWith an auspicious and a dropping eye,\n\tWith mirth in funeral and with dirge in marriage,\n\tIn equal scale weighing delight and dole,--\n\tTaken to wife: nor have we herein barr'd\n\tYour better wisdoms, which have freely gone\n\tWith this affair along. For all, our thanks.\n\tNow follows, that you know, young Fortinbras,\n\tHolding a weak supposal of our worth,\n\tOr thinking by ou