In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Load the raw scraped letters, one CSV file per volume
df1, df2, df3 = [
    pd.read_csv(csv_path)
    for csv_path in ('scraped/vol_1_all.csv',
                     'scraped/vol_2_all.csv',
                     'scraped/vol_3_all.csv')
]
# Supplementary files with the letters the initial scrape missed (volumes 2 and 3)
df2_missing, df3_missing = [
    pd.read_csv(csv_path)
    for csv_path in ('scraped/vol_2_missing.csv',
                     'scraped/vol_3_missing.csv')
]

# Extract Letter Number from Raw HTML

In [3]:
def extract_letter_number(html):
    '''Given HTML corresponding to a scraped letter, extract the letter number.

    The letter number is taken to be the text that precedes the first '.'
    in the letter, with surrounding whitespace stripped. The result is always
    a string and is not guaranteed to be numeric; callers should check
    `.isdigit()` before converting it. Missing input (None/NaN) yields ''.
    '''
    # A missing value (e.g. an empty CSV cell) cannot be parsed as HTML
    if pd.isnull(html):
        return ''

    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    # Remove every page break, not just the first one, so that page-break
    # text cannot end up in front of the letter number
    for page_break in soup.find_all('span', class_="page-break"):
        page_break.decompose()

    return soup.text.split('.')[0].strip()

In [4]:
# Collect one record per letter (volume, letter number, raw html). Records are
# gathered in a list and turned into a dataframe once: DataFrame.append copies
# the whole frame on every call and is no longer available in pandas 2.0.
records = []

# Each scraped dataframe is paired with the volume it belongs to; the "missing"
# dataframes belong to volumes 2 and 3 respectively
for vol, df in [(1, df1), (2, df2), (3, df3), (2, df2_missing), (3, df3_missing)]:
    for html in df['content']:
        # Extract letter number from raw html
        letter_no = extract_letter_number(html)

        # Keep the letter only if the letter number was successfully extracted
        if letter_no.isdigit():
            records.append({
                'Vol': vol,
                'LetterNo': int(letter_no),
                'raw_html': html,
            })

# Explicit columns keep the frame well-formed even if no record was kept
labelled_df = pd.DataFrame(records, columns=['LetterNo', 'Vol', 'raw_html'])

No handlers could be found for logger "bs4.dammit"


In [5]:
# Order the letters by volume first, then by letter number within each volume
labelled_df = labelled_df.sort_values(by=['Vol', 'LetterNo'])

# Remove non-ASCII Characters

In [6]:
def remove_non_ascii(text):
    '''Remove non-ASCII characters from a string.

    Args:
        text: string to clean, or a missing value (None/NaN).

    Returns:
        `text` with every character whose code point is 128 or above dropped,
        or np.nan when `text` is missing.
    '''
    # pd.isnull catches any missing value; an identity test against np.nan
    # misses None and NaN floats that are not the np.nan object itself
    if pd.isnull(text):
        return np.nan
    return ''.join(ch for ch in text if ord(ch) < 128)

# Extract text from Raw HTML
Removing:
* Letter headings
* Footnote numbers (span class="fragment-marker work-legend" data-type="fragment-marker")
* Page breaks (span class="page-break")

In [7]:
def extract_text(html):
    '''Given HTML corresponding to a scraped letter, extract the letter text.

    Page breaks, footnote markers and the first h2, h3 and h4 headings
    (letter number and title) are removed before the text is extracted.

    Returns the remaining text with surrounding whitespace stripped, or
    np.nan when `html` is missing (None/NaN).
    '''
    # Nothing to parse for a missing value; report it as NaN
    if pd.isnull(html):
        return np.nan

    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    # Remove page breaks if they exist
    for pgbreak in soup.find_all('span', class_="page-break"):
        pgbreak.decompose()

    # Remove footnote hyperlinks if they exist
    for footnote in soup.find_all('span', class_="fragment-marker work-legend"):
        footnote.decompose()

    # Remove header (letter number and title): only the first heading of
    # each level is dropped
    for heading_tag in ('h2', 'h3', 'h4'):
        heading = soup.find(heading_tag)
        if heading is not None:
            heading.decompose()

    return soup.text.strip()

In [8]:
# Strip the markup from each letter, then drop non-ASCII characters from the result
letter_text = labelled_df['raw_html'].apply(extract_text)
labelled_df['text'] = letter_text.apply(remove_non_ascii)

In [9]:
# Discard rows containing missing values, then keep only the first copy of
# any fully duplicated row
labelled_df = labelled_df.dropna().drop_duplicates()

In [10]:
# Save the cleaned letters to disk without the row index
labelled_df.to_csv(path_or_buf='csv/letters.csv', index=False)

# Merge with Metadata Dataframe based on Volume and Letter Number

In [11]:
# Load the cleaned letter text and the metadata spreadsheet
text_df = pd.read_csv('csv/letters.csv')
meta_df = pd.read_excel('csv/032818_RAC_Networks_Database.xlsx')

# Match metadata rows to letters sharing the same volume and letter number
merged_df = meta_df.merge(text_df, on=['Vol', 'LetterNo'])

# Columns that go through non-ASCII removal, in output order
text_cols = [u'Sender', u'Place Sent From', u'Ship Name', u'Place Going To', u'Date',
             u'Boat/Fort', u'RAC/Other Nation', u'text']

# Keep only the identifying columns followed by the text columns
merged_filtered = merged_df[[u'UID', u'Vol', u'LetterNo'] + text_cols].copy()

# Strip non-ASCII characters from each of the text columns
for col in text_cols:
    merged_filtered[col] = merged_filtered[col].apply(remove_non_ascii)

# Write the joined dataframe to a csv file
merged_filtered.to_csv('csv/metadata_text_merged.csv', index=False)