In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import os
import re

In [2]:
# Load the raw scraped letters, one CSV per volume (volumes 1 to 3)
df1, df2, df3 = map(pd.read_csv, [
    'scraped/vol_1_all.csv',
    'scraped/vol_2_all.csv',
    'scraped/vol_3_all.csv',
])
# Letters that the initial scrape of volumes 2 and 3 missed are kept in separate files
df2_missing, df3_missing = map(pd.read_csv, [
    'scraped/vol_2_missing.csv',
    'scraped/vol_3_missing.csv',
])

# Extract Letter Number from Raw HTML

In [3]:
def extract_letter_number(html):
    '''Given HTML corresponding to a scraped letter, extract the letter number.

    The letter number is taken to be the text that precedes the first '.'
    in the document once all page-break markers have been removed.

    Parameters:
        html: raw HTML of one scraped letter (None or NaN if the cell is missing).

    Returns:
        The candidate letter number as a stripped string, or '' when the
        input is missing. The result is NOT guaranteed to be numeric;
        callers are expected to validate it (e.g. with .isdigit()).
    '''
    # NaN never equals itself; a missing CSV cell would otherwise blow up
    # inside BeautifulSoup instead of simply yielding "no letter number"
    if html is None or (isinstance(html, float) and html != html):
        return ''

    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    # Remove every page-break marker (not only the first), so that a second
    # marker before the first '.' cannot leak into the extracted number
    for page_break in soup.find_all('span', class_="page-break"):
        page_break.decompose()

    return soup.text.split('.')[0].strip()

In [4]:
# Collect one record per successfully labelled letter and build the dataframe
# in a single step afterwards: appending to a DataFrame row by row copies the
# whole frame on every iteration, and DataFrame.append no longer exists in
# pandas >= 2.0.
labelled_records = []

# Each scraped frame is paired explicitly with the volume it belongs to
# (the "missing" frames fill gaps in volumes 2 and 3)
for vol, scraped in [(1, df1), (2, df2), (3, df3), (2, df2_missing), (3, df3_missing)]:
    for html in scraped['content']:
        # Nothing to parse if the content cell is missing
        if pd.isnull(html):
            continue

        # Extract letter number from raw html
        letter_no = extract_letter_number(html)

        # Keep the letter only if the letter number was successfully extracted
        if letter_no.isdigit():
            labelled_records.append({
                'Vol': vol,
                'LetterNo': int(letter_no),
                'raw_html': html,
            })

# Explicit columns keep the schema stable even if no letter was labelled
labelled_df = pd.DataFrame(labelled_records, columns=['Vol', 'LetterNo', 'raw_html'])

No handlers could be found for logger "bs4.dammit"


In [5]:
# Order the letters by volume first, then by letter number within each volume
labelled_df = labelled_df.sort_values(by=['Vol', 'LetterNo'])

# Remove non-ASCII Characters

In [77]:
def remove_non_ascii(text):
    '''Remove non-ASCII characters from a string.

    Parameters:
        text: the string to filter, or a missing value (None / NaN).

    Returns:
        The input with every character whose code point is >= 128 dropped,
        or np.nan when the input is missing.
    '''
    # NaN never equals itself, so this catches every NaN float rather than
    # only the np.nan singleton that an identity check would recognise
    if text is None or (isinstance(text, float) and text != text):
        return np.nan

    return ''.join(ch for ch in text if ord(ch) < 128)

# Extract text from Raw HTML
Removing:
* Letter headings
* Footnote numbers (span class="fragment-marker work-legend" data-type="fragment-marker")
* Page breaks (span class="page-break")

In [7]:
def extract_text(html):
    '''Given HTML corresponding to a scraped letter, extract the letter's body text.

    Page-break markers, footnote markers and the first h2/h3/h4 heading
    (letter number and title) are removed before the text is read out.

    Parameters:
        html: raw HTML of one scraped letter (None or NaN if missing).

    Returns:
        The remaining text with surrounding whitespace stripped, or np.nan
        when the input is missing.
    '''
    # NaN never equals itself; report missing input as missing output
    # rather than failing inside BeautifulSoup
    if html is None or (isinstance(html, float) and html != html):
        return np.nan

    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    # Remove page breaks if they exist
    for page_break in soup.find_all('span', class_="page-break"):
        page_break.decompose()

    # Remove footnote hyperlinks if they exist
    for footnote in soup.find_all('span', class_="fragment-marker work-legend"):
        footnote.decompose()

    # Remove header (letter number and title): only the first heading of
    # each level is dropped
    for heading_tag in ('h2', 'h3', 'h4'):
        heading = soup.find(heading_tag)
        if heading is not None:
            heading.decompose()

    return soup.text.strip()

In [8]:
# Strip the markup from every letter, then drop non-ASCII characters from the result
labelled_df['text'] = (
    labelled_df['raw_html']
    .apply(extract_text)
    .apply(remove_non_ascii)
)

In [9]:
# Discard rows containing missing values, then keep only the first copy of any repeated row
labelled_df = labelled_df.dropna().drop_duplicates()

In [10]:
# Write the labelled letters (Vol, LetterNo, raw_html and text columns) to CSV;
# index=False keeps the dataframe's row index out of the file
labelled_df.to_csv('csv/letters.csv', index=False)

# Merge with Metadata Dataframe based on Volume and Letter Number

In [92]:
# Read cleaned dataframes
text_df = pd.read_csv('csv/letters.csv')
meta_df = pd.read_csv('csv/032818_RAC_Networks_Database.csv')

# Join dataframes on Letter Number and Volume. pd.merge defaults to an inner
# join, so only letters present in both frames survive.
merged_df = pd.merge(meta_df, text_df, on=['Vol', 'LetterNo'])

# Filter out for a subset of the columns
merged_filtered = merged_df[[u'UID', u'Vol', u'LetterNo', u'Sender',  u'Place Sent From', u'Ship Name', u'Place Going To', u'Date',
          u'Boat/Fort', u'RAC/Other Nation', u'text', u'Year', u'Month']].copy()

# Remove non-ASCII from the free-text columns only (UID, Vol, LetterNo, Year and Month are left untouched)
text_columns = [u'Sender', u'Place Sent From', u'Ship Name', u'Place Going To', u'Date', u'Boat/Fort', u'RAC/Other Nation', u'text']
for col in text_columns:
    merged_filtered[col] = merged_filtered[col].apply(remove_non_ascii)

# Replace years recorded as 1000 with NaN (presumably 1000 is a placeholder for
# an unknown year -- confirm against the metadata). np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
merged_filtered['Year'] = merged_filtered['Year'].replace(1000, np.nan)

# Export joined dataframe to csv file
merged_filtered.to_csv('csv/metadata_text_merged.csv', index=False)

# Cleaning
From https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/

- lowercasing
- punctuation removal
- removal of stopwords
- Deduping of aliases

## Helper Functions for Cleaning Text

In [4]:
def replace_aliases(text, alias_to_name, aliases):
    '''Replace every alias found in text with the name it refers to.

    Aliases are applied one after another in the order given, so callers
    that want longer phrases to win should pass them longest-first. An
    alias is only replaced where it stands as a whole word or phrase, i.e.
    it is not preceded or followed by another word character; this stops a
    short alias such as 'ann' from rewriting the middle of 'cannot'.

    Parameters:
        text: the string to rewrite.
        alias_to_name: dict mapping each alias to its replacement name.
        aliases: iterable of aliases (keys of alias_to_name) to apply, in order.

    Returns:
        The rewritten string.
    '''
    for alias in aliases:
        # Cheap substring test before touching the regex engine; an empty
        # alias is skipped because it would match at every position
        if not alias or alias not in text:
            continue

        name = alias_to_name[alias]
        whole_alias = r'(?<!\w)' + re.escape(alias) + r'(?!\w)'
        # A callable replacement inserts the name literally, with no
        # backslash or group-reference expansion
        text = re.sub(whole_alias, lambda match: name, text)

    return text

def correct_common_mispellings(text):
    '''Normalise a handful of archaic spellings, word by word.

    The text is split on single spaces and each word is adjusted as follows
    (every test looks at the word as it was before any adjustment):
      * a leading 'ff' loses its first letter
      * a trailing 'tt' or 'pp' loses its last letter
      * 'comeing' becomes 'coming'
      * a final 'e' that follows two non-vowel characters is dropped

    Returns the adjusted words joined back together with single spaces.
    '''
    # TODO: an "ei" -> "ie" correction (i before e except after c) is not
    # implemented yet; it would need exceptions for words such as specie(s),
    # science, sufficient, seize, vein, weird, their, feisty, foreign, eight,
    # neighbor(s), and for receive / perceive / deceive.
    vowel_set = frozenset('aeiou')

    def _respell(token):
        fixed = token
        # Leading double f -> single f
        if token.startswith('ff'):
            fixed = fixed[1:]
        # Trailing double t -> single t
        if token.endswith('tt'):
            fixed = fixed[:-1]
        # Trailing double p -> single p
        if token.endswith('pp'):
            fixed = fixed[:-1]
        # Known one-off respelling
        if token == 'comeing':
            fixed = 'coming'
        # Final "e" after two consonants is dropped
        if (token.endswith('e') and len(token) > 2
                and token[-2] not in vowel_set and token[-3] not in vowel_set):
            fixed = fixed[:-1]
        return fixed

    return ' '.join(_respell(token) for token in text.split(' '))

In [5]:
# Load aliases that we want to dedupe in the text
alias_frames = []

# Sorted so the result does not depend on the order os.listdir happens to return
for alias_file in sorted(os.listdir('aliases/')):
    alias_path = os.path.join('aliases/', alias_file)
    # Skip sub-directories (e.g. a .ipynb_checkpoints folder)
    if not os.path.isfile(alias_path):
        continue
    alias_frames.append(pd.read_csv(alias_path, header=None))

# Concatenate once; DataFrame.append no longer exists in pandas >= 2.0
alias_df = pd.concat(alias_frames) if alias_frames else pd.DataFrame()

# Regex matching every character that is not a letter or whitespace
alpha_regex = re.compile(r'[^a-zA-Z\s]')

# Dict mapping alias to name
alias_to_name = {}

for _, row in alias_df.iterrows():
    # Column 0 holds the name, column 1 a ';'-separated list of aliases.
    # A row missing either one has nothing to map. Skipping it also stops
    # the previous row's aliases from being re-assigned to this row's name.
    if pd.isnull(row[0]) or pd.isnull(row[1]):
        continue

    name = alpha_regex.sub('', row[0].strip().lower())

    for raw_alias in row[1].split(';'):
        alias = alpha_regex.sub('', raw_alias.strip().lower())
        if alias != '':
            alias_to_name[alias] = name

# Sort aliases in decreasing order by length so that longer phrases are replaced first
# (sorted() returns a list on both Python 2 and Python 3)
aliases = sorted(alias_to_name, key=len, reverse=True)

In [6]:
# Read the merged metadata + text table back in
letters_all = pd.read_csv('csv/metadata_text_merged.csv')
# For now, keep only the letters that actually have text
letters_with_text = letters_all.dropna(subset=['text'])
# Letters sent from Barbados are excluded from this analysis
sent_elsewhere = letters_with_text['Place Sent From'] != 'Barbados'
df = letters_with_text[sent_elsewhere]

In [7]:
# lowercasing (splitting and re-joining also collapses runs of whitespace to single spaces)
df['text_cleaned'] = df['text'].apply(lambda text: " ".join(word.lower() for word in text.split()))
# remove punctuation: everything that is neither a word character nor whitespace.
# The regex is applied directly because Series.str.replace treats its pattern
# as a literal string by default from pandas 2.0, which would remove nothing.
punctuation_regex = re.compile(r'[^\w\s]')
df['text_cleaned'] = df['text_cleaned'].apply(lambda text: punctuation_regex.sub('', text))
# stopword removal (a set gives constant-time membership tests)
stop = stopwords.words('english')
stop_lookup = set(stop)
df['text_cleaned'] = df['text_cleaned'].apply(lambda text: " ".join(word for word in text.split() if word not in stop_lookup))
# Replace all aliases with the term they actually refer to
df['text_cleaned_dealiased'] = df['text_cleaned'].apply(lambda text: replace_aliases(text, alias_to_name, aliases))
# Correct common misspellings
df['text_cleaned_dealiased'] = df['text_cleaned_dealiased'].apply(correct_common_mispellings)

In [8]:
# Save processed dataframe, including the text_cleaned and text_cleaned_dealiased
# columns; index=False keeps the dataframe's row index out of the file
df.to_csv('csv/metadata_text_merged_cleaned.csv', index=False)