In [1]:
import os
import re
import pandas as pd
from dateparser import parse

import matplotlib.pyplot as plt
%matplotlib inline

In [96]:
# Root folder containing the scan sub-folders (trailing '/' is required:
# paths below are built by plain string concatenation)
SCAN_BASE_DIR = 'First Law Scans Cropped/'

# Sub-folders of SCAN_BASE_DIR, each with a trailing '/'.
# - sorted(): os.listdir returns entries in arbitrary order, so sort to make
#   the processing order (and therefore the output row order) reproducible.
# - os.path.isdir(): skip stray files so that only real folders are listed.
# - the '.DS_Store' filter is kept to ignore macOS Finder metadata.
SCAN_DIRS = [
    SCAN_BASE_DIR + subdir + '/'
    for subdir in sorted(os.listdir(SCAN_BASE_DIR))
    if '.DS_Store' not in subdir and os.path.isdir(SCAN_BASE_DIR + subdir)
]

In [98]:
# NOTE(review): counter is never read or updated in this cell; it is kept
# only in case another cell relies on it -- confirm and remove if unused.
counter = 0

# Regex for letter number heading: a line that starts with one or more digits
# followed by a period (e.g. "12. ...")
letter_number_pattern = re.compile(r'^\d+\..*$')

# Matches any newline convention (\r\n, bare \r, bare \n). Splitting on '\r'
# alone fails under Python 3 text mode, where '\r' and '\r\n' are translated
# to '\n' on read, and it also mishandles '\r\n' files.
_line_break_pattern = re.compile(r'\r\n|\r|\n')


def _segment_letters(lines):
    '''Group lines into (section_title, text) pairs, one pair per heading.

    A heading is any line matched by letter_number_pattern. The text of a
    section is every following line up to (not including) the next heading
    or the end of the input. Lines that appear before the first heading are
    discarded.

    lines: iterable of strings, one per line, without line terminators.
    Returns a list of (section_title, text) tuples in order of appearance.
    '''
    sections = []
    title = None
    body_parts = []

    for line in lines:
        if letter_number_pattern.match(line):
            # A new heading closes the section that was being collected
            if title is not None:
                sections.append((title, ''.join(body_parts)))
            title = line
            body_parts = []
        elif title is not None:
            body_parts.append(line)

    # Close the final section, which ends at end of input rather than at a heading
    if title is not None:
        # NOTE(review): body lines are joined with no separator, so words on
        # either side of a line break run together -- confirm this is intended.
        sections.append((title, ''.join(body_parts)))

    return sections


# Collect one record per section and build the dataframe once at the end;
# growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas >= 2.0.
records = []

for volume_dir in SCAN_DIRS:
    id2filename = {}

    # Associate each filename with its numbered id (the text before the first
    # '_') so we can iterate in order through the files.
    # int() raises ValueError for a .txt file without a numeric prefix.
    for filename in os.listdir(volume_dir):
        if filename.endswith('.txt'):
            file_id = filename.split('_')[0]
            id2filename[int(file_id)] = filename

    # Iterate by ascending numeric id; dict order alone does not guarantee this
    for file_id in sorted(id2filename):
        filename = id2filename[file_id]

        with open(volume_dir + filename, 'r') as f:
            # TODO: Figure out why reading via readlines() (instead of read())
            # is needed for these txt files. Likely Jupyter environment issue
            contents = ''.join(f.readlines())

        for section_title, text in _segment_letters(_line_break_pattern.split(contents)):
            records.append({
                'filename': filename,
                'section_title': section_title,
                'text': text,
            })

# Dataframe of segmented text; the columns are fixed explicitly so that they
# exist even when no section was found
df = pd.DataFrame(records, columns=['filename', 'section_title', 'text'])


In [99]:
# Show the first five rows of the segmented dataframe
df.head(n=5)

Unnamed: 0,filename,section_title,text
0,1_Sekondi_Vol_I_p1_21.txt,"1. James Parri s Succondee , 25 Apr. 1683",These are to give you an account of our affair...
1,1_Sekondi_Vol_I_p1_21.txt,"2. Mark Bedford Whiting Succond~e Factory, 12 ...",This is only to give your Worship an accompt t...
2,1_Sekondi_Vol_I_p1_21.txt,"3. Mark Bedford Whiting Succondee Factory, 20 ...",Since my last to your Worship here are come do...
3,1_Sekondi_Vol_I_p1_21.txt,"4. Mark Bedford Whiting Succondee Factory, 26 ...",Your Worships of the 25th instant is come to h...
4,1_Sekondi_Vol_I_p1_21.txt,"5. Mark Bedford Whiting Succondee Factory, 27 ...",Just now arrived a canoe from Axim whoe inform...


In [116]:
def parse_wrapper(str):
    '''Best-effort wrapper around dateparser's parse.

    Only the text after the last comma of the input is handed to parse
    (the whole input when it contains no comma). Any exception raised
    along the way, including one caused by a non-string input, is
    swallowed and None is returned instead.
    '''
    try:
        trailing_field = str.rsplit(',', 1)[-1]
        result = parse(trailing_field)
    except Exception:
        result = None
    return result

In [117]:
# Add a 'date' column by running parse_wrapper over every section title;
# parse_wrapper returns None whenever parsing raises
df['date'] = df.loc[:, 'section_title'].apply(parse_wrapper)

In [118]:
# Show the first five rows again, now including the 'date' column
df.head(n=5)

Unnamed: 0,filename,section_title,text,date
0,1_Sekondi_Vol_I_p1_21.txt,"1. James Parri s Succondee , 25 Apr. 1683",These are to give you an account of our affair...,1683-04-25 00:00:00
1,1_Sekondi_Vol_I_p1_21.txt,"2. Mark Bedford Whiting Succond~e Factory, 12 ...",This is only to give your Worship an accompt t...,1683-06-12 00:00:00
2,1_Sekondi_Vol_I_p1_21.txt,"3. Mark Bedford Whiting Succondee Factory, 20 ...",Since my last to your Worship here are come do...,1683-06-20 00:00:00
3,1_Sekondi_Vol_I_p1_21.txt,"4. Mark Bedford Whiting Succondee Factory, 26 ...",Your Worships of the 25th instant is come to h...,1683-06-26 00:00:00
4,1_Sekondi_Vol_I_p1_21.txt,"5. Mark Bedford Whiting Succondee Factory, 27 ...",Just now arrived a canoe from Axim whoe inform...,1683-06-27 00:00:00


In [122]:
# Export segmented text to csv
# Create the output folder first: to_csv does not create missing parent
# directories, so the export would fail on a fresh checkout without it.
if not os.path.isdir('csv'):
    os.makedirs('csv')
df.to_csv('csv/segmented.csv', index=False)