# Distance calculations

In [1]:
# imports and setup
from   ast import literal_eval
from   collections import Counter, defaultdict
from   geopy import distance
from   itertools import pairwise
import numpy as np
import os
import pandas as pd
import string
from   unidecode import unidecode

data_dir    = os.path.join('..', 'data')
derived_dir = os.path.join(data_dir, 'derived')
raw_dir     = os.path.join(data_dir, 'booknlp')
geo_dir     = os.path.join(data_dir, 'geo')

conlit_input_file = 'conlit.csv.gz'
conlit_distances_output_file = 'conlit_distances.csv.gz'
early_distances_output_file = 'early_distances.csv.gz'

## Load derived data

In [2]:
def string_to_list(x):
    '''
    Parse the string form of a Python list (as stored in the derived CSV) into a list.

    Bare `nan` elements, which `literal_eval` cannot parse, are dropped wherever
    they occur (start, middle, or end of the list). Elements equal to 'ZZZZ' are
    dropped as well: that was the missing-value placeholder of the earlier
    implementation, so inputs without `nan` parse exactly as they did before.

    x: string such as "['paris', nan, 'rome']"
    Returns a list of the parsed elements.
    Raises SyntaxError/ValueError for text that is not a valid literal.
    '''
    import ast  # local import: the notebook's import cell only exposes literal_eval

    # literal_eval strips leading spaces/tabs before parsing; do the same here
    parsed = ast.parse(x.lstrip(' \t'), mode='eval').body
    if isinstance(parsed, (ast.List, ast.Tuple)):
        # evaluate element by element so that a bare `nan` name can be skipped
        # without touching string contents that merely contain the text "nan"
        items = [
            ast.literal_eval(node)
            for node in parsed.elts
            if not (isinstance(node, ast.Name) and node.id == 'nan')
        ]
    else:
        # not a list/tuple literal: defer entirely to literal_eval
        items = ast.literal_eval(parsed)
    return [i for i in items if i != 'ZZZZ']

# Load the derived CONLIT table, indexed by book_id. The four place columns are
# stored as stringified lists and are parsed with string_to_list while reading;
# any remaining missing values are replaced by empty strings.
conlit = pd.read_csv(
    os.path.join(derived_dir, conlit_input_file),
    index_col='book_id',
    converters=dict.fromkeys(
        ['gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences'],
        string_to_list,
    ),
).fillna('')

## CONLIT GPE distances

### Geo data

In [3]:
# regularize data to existing geo format
# translation table: every ASCII punctuation character -> a single space
punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

def regularize_string(place_string):
    '''
    Normalize a place name for lookup in the geo table.

    Steps, in order: replace punctuation with spaces, lowercase, collapse runs
    of whitespace to single spaces (trimming the ends), then transliterate the
    result with unidecode.
    '''
    without_punctuation = place_string.translate(punctuation_to_space)
    tokens = without_punctuation.lower().split()
    return unidecode(' '.join(tokens))

# Read the geo table, keep only rows whose `lang` is 'en', and index the
# result by the `text_string` column.
geo = (
    pd.read_csv(
        os.path.join(geo_dir, 'geo.tsv.gz'),
        sep='\t',
        low_memory=False,
    )
    .loc[lambda frame: frame.lang == 'en']
    .set_index('text_string')
)

# hand review data, indexed by the reviewed place string
hand = pd.read_csv(
    os.path.join(geo_dir, 'handreview.tsv'),
    sep='\t',
    index_col='text_string',
)

# restore some items from C19 hand review (clear their `ignore` flag)
restored_places = [
    'hollywood',
    'dallas',
    'florence',
    'kingston',
    'berkeley',
    'queens',
    'phoenix',
    'woodstock',
    'surrey',
    'orlando',
]
hand.loc[restored_places, 'ignore'] = 0

# improve aliases
hand.loc['kingston', 'alias_to'] = 'kingston jamaica'
hand.loc['baltic', ['ignore', 'alias_to']] = [0, 'baltic sea']
# alias places: for every reviewed place that has an alias target present in
# geo and is not flagged ignore==1, copy the target's geo record onto the
# original place name.
# Series.equals() compares whole objects and returns a single bool, so the
# former `~hand.ignore.equals(1)` term never filtered any row; use the
# elementwise comparison instead. Ignored places are dropped from geo just
# below, so the resulting geo table is the same as before.
alias_mask = (
    hand.alias_to.notna()
    & hand.ignore.ne(1)
    & hand.alias_to.isin(geo.index)
)
for original_place, alias_to in hand.loc[alias_mask, 'alias_to'].items():
    geo.loc[original_place] = geo.loc[alias_to]

# drop ignored places (labels absent from geo are skipped via errors='ignore')
geo.drop(hand.loc[hand.ignore==1].index, inplace=True, errors='ignore')

In [4]:
# drop unused places: count every regularized place that occurs in any CONLIT
# GPE sequence, then remove geo rows whose index label never occurs
used_gpes = Counter(
    regularize_string(place)
    for sequence in conlit.gpe_sequences
    for place in sequence
)
unused_labels = geo.index[~geo.index.isin(used_gpes)]
geo.drop(unused_labels, inplace=True, errors='ignore')

### Sequence distances

For each volume sequence, look up each place, get lat/lon, calculate distance from previous place, sum over sequential path.

Performs location aliasing, ignores known-bad places, and discards sequence steps that move between admin levels within the same admin entity (e.g., `Boston -> United States` or `UK -> England`); such steps contribute no distance and are not counted as hops. The last step has room for improvement: we don't deal with `admin_2` level and below (the distances involved are small), nor with continents (the source geo data doesn't place countries in continents; plus, I think there's a meaningful sense in which there's distance between, e.g., `Paris` and `Europe` in a way that there isn't between `Boston` and `USA`).

In [5]:
def get_lat_lon(place_string, geo_data, extended=False):
    '''
    Look up a place in geo_data (indexed by place string) and return its record.

    The lookup string is used as-is, so it must already be regularized if the
    index is. Returns the 'lat' and 'lon' fields, plus 'country_short',
    'admin_1_std' and 'location_type' when extended is True. Returns None if
    the lookup raises KeyError or ValueError (e.g., unknown place).
    '''
    columns = ['lat', 'lon']
    if extended:
        columns += ['country_short', 'admin_1_std', 'location_type']
    try:
        return geo_data.loc[place_string, columns]
    except (ValueError, KeyError):
        return None

def hop_distance(location1, location2, geo_data, return_zero_dist=False):
    '''
    Distance in miles between two location strings, via geopy's
    `distance.distance` (NOTE(review): geodesic by default in current geopy
    releases -- confirm for the installed version).

    Computed distances are memoized, under both orderings, in the module-level
    `distances` mapping, which must exist before the first call.

    No distance is computed when either place is missing from geo_data, when
    both places resolve to identical geo records, or when the hop moves between
    a place and its own country / level-1 admin area. In those cases the
    function returns None, or 0.0 if return_zero_dist is True.
    '''
    # cache hit: return the stored value straight away
    if location1 in distances and location2 in distances[location1]:
        return distances[location1][location2]

    no_distance = 0.0 if return_zero_dist else None

    loc1 = get_lat_lon(location1, geo_data, extended=True)
    loc2 = get_lat_lon(location2, geo_data, extended=True)

    # should never happen, but check
    if loc1 is None or loc2 is None:
        return no_distance

    # ignore identical places, even if called different names
    if loc1.equals(loc2):
        return no_distance

    # eliminate place -> higher-order place in same admin area
    involves_country = loc1.location_type=='country' or loc2.location_type=='country'
    if involves_country and loc2.country_short==loc1.country_short:
        return no_distance

    involves_admin_1 = (
        loc1.location_type=='administrative_area_level_1'
        or loc2.location_type=='administrative_area_level_1'
    )
    if involves_admin_1 and (
        loc2.country_short==loc1.country_short
        and loc2.admin_1_std==loc1.admin_1_std
    ):
        return no_distance

    # genuine hop: measure it and remember the result in both directions
    dist = distance.distance(loc1[['lat', 'lon']], loc2[['lat', 'lon']]).miles
    distances[location1][location2] = dist
    distances[location2][location1] = dist
    return dist

def sequence_distance(sequence, geo_data, return_zero_dist=True):
    # set data, regularize strings, and remove unknown locations
    seq = [regularize_string(i) for i in sequence if regularize_string(i) in geo_data.index]
    total_distance = 0.0
    hop_distances = []
    # calculate distance over pairwise hops
    for location1, location2 in pairwise(seq):
        dist = hop_distance(location1, location2, geo_data)
        if dist != None:
            total_distance += dist
            hop_distances.append(dist)
    # calculate start-finish distance
    if return_zero_dist:
        start_finish_miles = 0.0
        start_finish_Z = 0.0
    else:
        start_finish_miles = None
        start_finish_Z = None
    if len(seq) < 2:
        pass
    else:
        start_finish_miles = hop_distance(seq[0], seq[-1], geo_data, return_zero_dist=True)
    if (len(hop_distances)==1) and (start_finish_miles>0.0): # start_finish_Z = NaN for single-hop books not ending at origin
        start_finish_Z = None
    elif len(hop_distances) > 1:
        hop_std = np.std(hop_distances)
        hop_mean = np.mean(hop_distances)
        if hop_std > 0.01:
            start_finish_Z = (start_finish_miles - hop_mean)/hop_std 
    return(total_distance, len(hop_distances), start_finish_miles, start_finish_Z)

In [6]:
# fresh pairwise-distance cache, read and filled by hop_distance
distances = defaultdict(lambda: defaultdict(float))
# one (total, hops, start-finish miles, start-finish Z) tuple per book
conlit_distances = conlit['gpe_sequences'].apply(
    lambda sequence: sequence_distance(sequence, geo, True)
)

CPU times: user 24.5 s, sys: 7.54 ms, total: 24.5 s
Wall time: 24.6 s


### Save distances to CSV

In [7]:
# conlit_distances holds one 4-tuple per book; transpose it into one tuple of
# values per output column and attach each to the conlit frame
distance_columns = ['dist_miles', 'hops', 'Start_Finish_Miles', 'Start_Finish_Z']
for column_name, column_values in zip(distance_columns, zip(*conlit_distances)):
    conlit[column_name] = column_values

# write only the distance columns (plus the book_id index) to the derived directory
conlit[distance_columns].to_csv(
    os.path.join(derived_dir, conlit_distances_output_file)
)

## EARLY GPE distances

### Prepare early file

In [8]:
def read_hoplist(file_path, label='gpe'):
    '''
    Read a JSON-lines hoplist file into a frame with one row per (book, character).

    Each input line is expected to carry a `book_id` and a `chars` list; each
    entry of `chars` is a dict with a `char_id` and a `sequence` of
    {'place': ..., 'count': ...} dicts.

    file_path: path to the JSON-lines file
    label: prefix for the two list-valued output columns

    Returns a DataFrame with columns
      book_id, char_id,
      {label}_places    every place repeated `count` times, in sequence order
      {label}_sequences the places in sequence order, one entry per step
    Books whose `chars` list is empty contribute no rows.
    '''
    df = pd.read_json(
        file_path,
        lines=True
    ).explode('chars')

    book_ids = []
    char_ids = []
    gpe_lists = []
    seq_lists = []
    for book_id, char in zip(df['book_id'], df['chars']):
        # explode() leaves NaN where a book's `chars` list was empty; there is
        # no character record to read in that case, so skip the row
        if not isinstance(char, dict):
            continue
        seq = char['sequence']
        seq_list = []
        gpe_list = []
        for place_dict in seq:
            place = place_dict['place']
            seq_list.append(place)
            gpe_list.extend([place] * place_dict['count'])
        book_ids.append(book_id)
        char_ids.append(char['char_id'])
        gpe_lists.append(gpe_list)
        seq_lists.append(seq_list)

    result = pd.DataFrame(
        {
            'book_id':book_ids,
            'char_id':char_ids,
            f'{label}_places':gpe_lists,
            f'{label}_sequences':seq_lists
        }
    )
    return(result)

# read hoplists
early_gpes = read_hoplist(
    os.path.join(raw_dir, 'hoplist.gpe.all.jsonl.bz2'),
    label='gpe',
)

# read base data (the `prob` column is not needed)
base_early = pd.read_csv(
    os.path.join(raw_dir, 'inf_gender.1.tsv.gz'),
    sep='\t',
).drop(columns=['prob'])

# token counts per book; the file's own header row is skipped and replaced
early_token_counts = pd.read_csv(
    os.path.join(raw_dir, 'book_lengths.tsv'),
    sep='\t',
    skiprows=1,
    names=['book_id', 'Tokens'],
)

# Keep every row of the base data (left joins): attach the GPE lists by
# (book_id, char_id), then the token count by book_id, and add an empty
# distance column.
early = (
    base_early
    .merge(early_gpes, how='left', on=['book_id', 'char_id'])
    .merge(early_token_counts, how='left', on=['book_id'])
    .assign(dist_miles=np.nan)
)

### Calculate distances

We need to read the geo data again so that we can restrict it to the GPEs that occur in the early data.

In [9]:
# reread geo data to retain relevant locations, etc.
# (English-language rows only, indexed by `text_string`)
geo = (
    pd.read_csv(
        os.path.join(geo_dir, 'geo.tsv.gz'),
        sep='\t',
        low_memory=False,
    )
    .loc[lambda frame: frame.lang == 'en']
    .set_index('text_string')
)

# hand review data, indexed by the reviewed place string
hand = pd.read_csv(
    os.path.join(geo_dir, 'handreview.tsv'),
    sep='\t',
    index_col='text_string',
)

# restore some items from C19 hand review (clear their `ignore` flag)
restored_places = [
    'hollywood',
    'dallas',
    'florence',
    'kingston',
    'berkeley',
    'queens',
    'phoenix',
    'woodstock',
    'surrey',
    'orlando',
]
hand.loc[restored_places, 'ignore'] = 0

# improve aliases
hand.loc['kingston', 'alias_to'] = 'kingston jamaica'
hand.loc['baltic', ['ignore', 'alias_to']] = [0, 'baltic sea']

# alias places: for every reviewed place that has an alias target present in
# geo and is not flagged ignore==1, copy the target's geo record onto the
# original place name.
# Series.equals() compares whole objects and returns a single bool, so the
# former `~hand.ignore.equals(1)` term never filtered any row; use the
# elementwise comparison instead. Ignored places are dropped from geo just
# below, so the resulting geo table is the same as before.
alias_mask = (
    hand.alias_to.notna()
    & hand.ignore.ne(1)
    & hand.alias_to.isin(geo.index)
)
for original_place, alias_to in hand.loc[alias_mask, 'alias_to'].items():
    geo.loc[original_place] = geo.loc[alias_to]

# drop ignored places (labels absent from geo are skipped via errors='ignore')
geo = geo.drop(hand.loc[hand.ignore==1].index, errors='ignore')

# drop unused places: keep only geo rows whose label occurs, after
# regularization, in at least one early GPE sequence
used_gpes = Counter()
for sequence in early.gpe_sequences:
    used_gpes.update([regularize_string(i) for i in sequence])
geo.drop(geo.loc[~geo.index.isin(used_gpes)].index, inplace=True, errors='ignore')

In [10]:
# fresh pairwise-distance cache, read and filled by hop_distance
distances = defaultdict(lambda: defaultdict(float))
# one (total, hops, start-finish miles, start-finish Z) tuple per row of `early`
early_distances = early['gpe_sequences'].apply(
    lambda sequence: sequence_distance(sequence, geo, True)
)

CPU times: user 41.3 s, sys: 28.7 ms, total: 41.3 s
Wall time: 41.5 s


### Save distances to CSV

In [11]:
# early_distances holds one 4-tuple per row; transpose it into one tuple of
# values per output column and attach each to the early frame
distance_columns = ['dist_miles', 'hops', 'Start_Finish_Miles', 'Start_Finish_Z']
for column_name, column_values in zip(distance_columns, zip(*early_distances)):
    early[column_name] = column_values

# index by book_id so that it is written as the first CSV column
early = early.set_index('book_id')
early[distance_columns].to_csv(
    os.path.join(derived_dir, early_distances_output_file)
)