In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the two inputs: the segmented/cleaned letter texts (CSV) and the
# metadata spreadsheet (Excel)
text_df = pd.read_csv(filepath_or_buffer='csv/segmented_cleaned.csv')
meta_df = pd.read_excel(io='csv/032818_RAC_Networks_Database.xlsx')

# Extract Volume and Letter Numbers

In [3]:
def extract_vol(filename):
    '''Extract the volume number encoded in a text's filename.

    Parameters
    ----------
    filename : str
        Name of the source file for a text segment.

    Returns
    -------
    int or None
        3, 2 or 1 according to the first marker found in ``filename``;
        None when no known marker is present, so callers can detect
        filenames that were missed.
    '''
    if 'Volume_III' in filename:
        return 3
    # 'Vol_I' is a substring of 'Vol_II', so volume II must be tested first
    elif 'Vol_II' in filename:
        return 2
    # Each marker needs its own membership test: `'Vol_1' or 'Vol_I' in filename`
    # evaluates to the non-empty string 'Vol_1', which is always truthy and made
    # every remaining branch unreachable.
    elif 'Vol_1' in filename or 'Vol_I' in filename:
        return 1
    # Filenames without a volume marker, mapped to their volume explicitly
    elif 'Winneba_p150_152' in filename:
        return 1
    elif 'Tantumkweri_Quansas_Croome' in filename:
        return 3
    else:
        # Sanity check for whether anything was missed
        return None

def extract_letter_number(section_title):
    '''Return the part of the section title that precedes its first period.

    If the title contains no period, the whole title is returned.
    '''
    letter_no, _sep, _rest = section_title.partition('.')
    return letter_no

In [4]:
# Sanity check: extract_vol returns None for a filename it cannot classify, so
# no row may come back null. The comparison has to sit outside sum(); the
# previous `sum(mask == 0)` counted the non-null rows and therefore passed
# whenever at least one filename was recognised.
unmatched_files = text_df.loc[text_df['filename'].apply(extract_vol).isnull(), 'filename']
assert unmatched_files.empty, (
    'No volume could be extracted for: {}'.format(sorted(unmatched_files.unique()))
)

In [5]:
# Derive the two join keys per row, then attach them as new columns:
#   Vol      <- from the filename
#   LetterNo <- from the section title
vol_per_row = text_df['filename'].apply(extract_vol)
letter_no_per_row = text_df['section_title'].apply(extract_letter_number)
text_df['Vol'] = vol_per_row
text_df['LetterNo'] = letter_no_per_row

In [6]:
# Persist the texts together with their Vol / LetterNo columns (row index omitted)
text_df.to_csv(path_or_buf='csv/segmented_with_numbers.csv', index=False)

# Join with Metadata dataframe on Volume and Letter Number

In [7]:
# Load the numbered texts written by the previous step and the metadata spreadsheet
# NOTE(review): presumably the CSV round trip lets pandas re-infer the LetterNo
# dtype before the join - confirm
text_df = pd.read_csv(filepath_or_buffer='csv/segmented_with_numbers.csv')
meta_df = pd.read_excel(io='csv/032818_RAC_Networks_Database.xlsx')

In [8]:
# Inner-join metadata (left) with texts (right) on the (Vol, LetterNo) key pair;
# rows without a match on the other side are dropped
merged_df = meta_df.merge(text_df, how='inner', on=['Vol', 'LetterNo'])

In [32]:
# Keep only the metadata fields of interest plus the letter text.
# .copy() yields an independent frame, so later column assignments act on
# merged_filtered alone.
kept_columns = [
    u'UID',
    u'Vol',
    u'LetterNo',
    u'Sender',
    u'Place Sent From',
    u'Ship Name',
    u'Place Going To',
    u'Date',
    u'Boat/Fort',
    u'RAC/Other Nation',
    u'text',
]
merged_filtered = merged_df.loc[:, kept_columns].copy()

In [33]:
def remove_non_ascii(text):
    '''Remove non-ASCII characters (mostly due to OCR parsing error).

    Parameters
    ----------
    text : object
        A single cell value. Strings are cleaned; anything else (NaN, None,
        numbers, ...) is treated as "nothing to clean".

    Returns
    -------
    object
        For a string, a new string containing only the characters whose code
        point is below 128. Any non-string value is returned unchanged, so a
        missing value stays a missing value.
    '''
    # Check the type rather than `text is not np.nan`: that identity test only
    # recognises the np.nan singleton, so any other NaN object, None or a numeric
    # cell fell through to the character loop and raised TypeError.
    if not isinstance(text, str):
        return text
    return ''.join(ch for ch in text if ord(ch) < 128)
    
# Strip away non-ASCII characters from every object-dtype column except 'Date',
# which is left exactly as loaded.
for col in merged_filtered.columns:
    # Compare against the builtin `object`: the `np.object` alias was deprecated
    # in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    if merged_filtered[col].dtype == object and col != 'Date':
        merged_filtered[col] = merged_filtered[col].apply(remove_non_ascii)

In [34]:
# Write the cleaned, joined table to disk as CSV (row index omitted)
merged_filtered.to_csv(path_or_buf='csv/metadata_text_merged.csv', index=False)