In [115]:
import numpy as np
import pandas as pd
import os
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

In [208]:
# Let pandas print up to 101 rows before truncating a displayed frame/series
pd.options.display.max_rows = 101

In [155]:
# Load joined dataframe
df = pd.read_csv('csv/metadata_text_merged.csv')
# Remove rows with empty text column for now
df = df.dropna(subset=['text'])
# Filter out Barbados texts for this analysis.
# .copy() gives an independent frame, so adding columns to `df` later does
# not raise SettingWithCopyWarning (the mask result is otherwise flagged as
# a possible view of the unfiltered frame).
# NOTE(review): rows whose 'Place Sent From' is NaN pass this `!=` filter
# and are kept -- confirm that is intended.
df = df[df['Place Sent From'] != 'Barbados'].copy()

# Load Aliases

In [156]:
# Read every alias file (header-less CSV) found in aliases/ and stack them
# into one frame. Files are visited in sorted order because os.listdir()
# returns names in arbitrary order, and later rows can overwrite earlier
# ones when the alias -> name mapping is built from this frame.
alias_frames = [
    pd.read_csv(os.path.join('aliases', alias_file), header=None)
    for alias_file in sorted(os.listdir('aliases/'))
]

# Concatenate once instead of DataFrame.append inside the loop (quadratic,
# and removed in pandas 2.0). pd.concat raises on an empty list, so fall
# back to an empty frame to keep the "no alias files" case working.
alias_df = pd.concat(alias_frames) if alias_frames else pd.DataFrame()

In [157]:
# Regex matching every character that is NOT an ASCII letter or whitespace
alpha_regex = re.compile(r'[^a-zA-Z\s]')

# Dict mapping alias to name
alias_to_name = {}

# Column 0 of alias_df holds the canonical name; column 1 holds its aliases
# as a ';'-separated string (may be missing or empty).
for _, row in alias_df.iterrows():
    name = alpha_regex.sub('', row[0].strip().lower())

    raw_aliases = row[1]
    # Skip rows without aliases. Previously such rows fell through to the
    # alias loop with the *previous* row's alias list still bound, silently
    # remapping those aliases to this row's name (or raising NameError when
    # the very first row had no aliases).
    if pd.isnull(raw_aliases) or raw_aliases == '':
        continue

    for raw_alias in raw_aliases.split(';'):
        alias = alpha_regex.sub('', raw_alias.strip().lower())
        # An alias made up only of stripped characters normalises to ''
        if alias != '':
            alias_to_name[alias] = name

In [158]:
# Sort aliases in decreasing order by length so that longer phrases are replaced first.
# sorted() works on both Python 2 and 3 (dict.keys() has no .sort() on Python 3).
# Equal-length aliases are ordered alphabetically so the replacement order
# no longer depends on dict iteration order.
aliases = sorted(alias_to_name, key=lambda alias: (-len(alias), alias))

# Preprocess Text

In [183]:
def replace_aliases(text, alias_to_name, aliases):
    '''Return `text` with each alias replaced by the name it maps to.

    text          -- string to rewrite
    alias_to_name -- dict mapping an alias string to its replacement
    aliases       -- iterable of aliases, applied in the order given

    Replacements are applied one after another, so text produced by an
    earlier replacement is visible to later aliases.
    '''
    # NOTE(review): matching is plain substring matching (str.replace), so an
    # alias is also replaced when it occurs inside a longer word -- confirm
    # this is intended.
    result = text
    for candidate in aliases:
        if candidate not in result:
            continue
        result = result.replace(candidate, alias_to_name[candidate])

    return result

def replace_double_letters(text):
    '''Collapse doubled letters at word edges.

    For each word (text is split on single spaces): a leading "ff" loses its
    first character, and a trailing "tt" or "pp" loses its last character.
    Both rules are checked against the word as it appears in `text`.
    '''
    def _trim(word):
        # Work out the slice bounds first, then cut once
        start = 1 if word.startswith('ff') else 0
        stop = len(word) - 1 if word.endswith(('tt', 'pp')) else len(word)
        return word[start:stop]

    # split(' ') / join(' ') keeps runs of spaces intact
    return ' '.join(_trim(word) for word in text.split(' '))

In [194]:
# lowercasing (also collapses any run of whitespace to a single space)
df['text_cleaned'] = df['text'].apply(lambda text: " ".join(token.lower() for token in text.split()))
# remove punctuation: drop every character that is neither a word character
# nor whitespace. An explicit compiled regex is used because
# Series.str.replace treats the pattern as a literal string by default on
# pandas >= 2.0, which would silently leave all punctuation in place.
punct_regex = re.compile(r'[^\w\s]')
df['text_cleaned'] = df['text_cleaned'].apply(lambda text: punct_regex.sub('', text))
# stopword removal
# NOTE(review): punctuation is stripped before this step, so stopwords that
# contain an apostrophe can no longer match -- confirm that is acceptable.
stop = stopwords.words('english')
stop_set = set(stop)  # set lookup instead of scanning the list for every token
df['text_cleaned'] = df['text_cleaned'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_set))
# Replace all aliases with the term they actually refer to
df['text_cleaned_dealiased'] = df['text_cleaned'].apply(
    lambda text: replace_aliases(text, alias_to_name, aliases))
# Remove double letter occurrences
df['text_cleaned_dealiased'] = df['text_cleaned_dealiased'].apply(replace_double_letters)

In [200]:
def _token_counts(column):
    '''Count whitespace-separated tokens over all strings in `column`, most frequent first.'''
    tokens = ' '.join(column).split()
    return pd.Series(tokens).value_counts()

# Word frequency tables before and after alias replacement
freq = _token_counts(df['text_cleaned'])
dealiased_freq = _token_counts(df['text_cleaned_dealiased'])

In [211]:
# Vocabulary size before and after alias replacement.
# Uses a single formatted string so the output ("N M") is identical on
# Python 2 and Python 3; the bare `print a, b` statement is a syntax error
# on Python 3.
print('%d %d' % (len(freq), len(dealiased_freq)))

17478 16587


In [209]:
# First 100 entries of the frequency table (most frequent words)
freq.head(100)

send            2729
sent            2598
shall           2590
mr              2400
one             2390
would           1932
received        1875
two             1839
goods           1747
slaves          1709
per             1665
desire          1650
canoe           1624
captain         1620
may             1577
soe             1343
last            1244
come            1223
wee             1215
great           1215
worship         1206
company         1137
order           1119
much            1113
men             1111
doe             1107
good            1071
att             1046
give            1019
came            1008
people          1006
time            1000
corne            941
pleased          922
accompt          912
royall           912
itt              909
hope             899
dutch            898
noe              885
downe            882
hand             877
instant          871
also             865
hee              857
hath             853
mee              838
since        

In [210]:
# First 100 entries of the de-aliased frequency table (most frequent words)
dealiased_freq.head(100)

send         2729
sent         2605
shall        2590
mr           2400
one          2390
would        1932
received     1876
two          1839
canoe        1758
goods        1747
slaves       1709
per          1665
desire       1650
captain      1626
may          1577
so           1343
worship      1292
corn         1281
last         1244
great        1238
come         1223
wee          1215
also         1187
company      1151
order        1119
much         1113
men          1111
do           1107
time         1073
good         1071
at           1046
give         1019
came         1008
people       1006
coast         975
cape          955
royal         954
worships      951
dutch         928
pleased       922
accompt       912
it            909
hope          899
noe           885
down          882
hand          877
instant       871
castle        857
hee           857
hath          853
mee           838
since         800
us            793
want          790
place         782
upon      