In [5]:
import numpy as np
import pandas as pd
import re

In [6]:
df = pd.read_csv('csv/metadata_text_merged_cleaned.csv')

In [7]:
nationalities = ['dutch', 'french', 'flemish', 'portuguese', 'danish', 'brandenberg', 'english']

# Maintain list of UIDS of occurrences of interloper for each nationality
interloper_uids_by_nationality = {}
for nationality in nationalities:
    interloper_uids_by_nationality[nationality] = []

for nationality in nationalities:
    for uid, text in zip(df['UID'], df['text_cleaned_dealiased']):
        if re.search(nationality + '.{0,15}interloper', text) is not None:
            interloper_uids_by_nationality[nationality].append(uid)

interloper_uids_by_nationality['no_nationality'] = []

# Search for occurrences of interloper without nationality in front
for uid, text in zip(df['UID'], df['text_cleaned_dealiased']):    
    if 'interloper' in text and not any([nationality in text for nationality in nationalities]):
        interloper_uids_by_nationality['no_nationality'].append(uid)

for nationality, uids in interloper_uids_by_nationality.iteritems():
    print nationality, len(uids)

portuguese 3
flemish 3
english 35
no_nationality 98
french 5
dutch 90
brandenberg 0
danish 0


In [9]:
# keyed by uid, values are list of nationalities
interloper_by_uid = {}

for nationality, uids in interloper_uids_by_nationality.iteritems():
    for uid in uids:
        if uid not in interloper_by_uid.keys():
            interloper_by_uid[uid] = []
        
        interloper_by_uid[uid].append(nationality)
            

In [10]:
# Stringify the keys for easier conversion to excel
interloper_by_uid_str = {}

for k, v in interloper_by_uid.iteritems():
    interloper_by_uid_str[k] = ','.join(v)

In [11]:
interloper_df = pd.DataFrame.from_dict(interloper_by_uid_str, orient='index')
interloper_df.reset_index(level=0, inplace=True)
interloper_df.columns = ['UID', 'nationalities']
interloper_df.sort_values(by='UID', inplace=True)

In [12]:
interloper_df.to_csv('csv/uids_with_interloper.csv', index=False)

# UIDs with Interloper not Preceded By "Dutch"

In [18]:
# Copy dictionary so we don't modify original
interloper_by_uid_copy = interloper_by_uid.copy()
keys_to_delete = []

for key, nation_list in interloper_by_uid_copy.iteritems():
    if 'dutch' in nation_list:
        # Add to list of keys to remove
        keys_to_delete.append(key)

# Delete all UIDs corresponding to letters which have 'dutch' in them
for key in keys_to_delete:
    del interloper_by_uid_copy[key]

In [31]:
# Subset full dataset for instances where interloper is not preceded by "Dutch"
df[df['UID'].isin(interloper_by_uid_copy.keys())].to_csv('csv/letters_non_dutch_interlopers.csv', index=False)