# Mobility distance calculations (Matt)

In [1]:
# imports and setup
from   ast import literal_eval
from   collections import Counter, defaultdict
from   geopy import distance
from   itertools import pairwise
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from   sklearn.linear_model import LinearRegression
import seaborn as sns
import string
from   unidecode import unidecode

# Data locations, relative to the working directory: ../data and ../data/derived
data_dir = os.path.join('..', 'data')
derived_dir = os.path.join(data_dir, 'derived')

## Load derived data

In [2]:
def string_to_list(x):
    '''
    Parse the string form of a Python list (as stored in a CSV cell) into a list.

    Bare ``nan`` elements (missing values written out unquoted) are dropped.
    The previous string-replacement approach merged a ``nan`` into the text of
    its neighbour (``"['a', nan]"`` became ``['a, ZZZZ']``) and raised a
    SyntaxError on a leading ``nan``; here the list is parsed structurally, so
    quoted strings that merely contain the text "nan" are left untouched.

    x: cell contents as a string, e.g. "['Paris', nan, 'Rome']"
    Returns a list of the parsed elements; an empty or blank cell gives [].
    Raises whatever ast.parse / literal_eval raise on malformed input.
    '''
    import ast  # only literal_eval is imported at file level

    text = x.strip()
    if not text:
        return []
    parsed = ast.parse(text, mode='eval').body
    if isinstance(parsed, (ast.List, ast.Tuple)):
        lst = [
            literal_eval(node)
            for node in parsed.elts
            if not (isinstance(node, ast.Name) and node.id == 'nan')
        ]
    else:
        # not a list/tuple display: defer to literal_eval, as before
        lst = literal_eval(text)
    # 'ZZZZ' was the old missing-value sentinel; keep filtering it for compatibility
    return [i for i in lst if i != 'ZZZZ']

# Load the CONLIT character table; the four list-valued columns are stored as
# strings and parsed back into Python lists on read. Remaining missing values
# are replaced with empty strings.
conlit = pd.read_csv(
    os.path.join(derived_dir, 'CONLIT_CharData_AP_6.csv.gz'),
    index_col='book_id',
    converters={
        list_column: string_to_list
        for list_column in ('gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences')
    },
).fillna('')

## CONLIT GPE distances

In [3]:
# Tally every GPE mention across all rows, then show the 20 most frequent
gpe_counts = Counter(
    place
    for places in conlit.gpe_places
    for place in places
)
gpe_counts.most_common(20)

[('New York', 6683),
 ('London', 4367),
 ('Paris', 3673),
 ('America', 3477),
 ('Washington', 2412),
 ('England', 2087),
 ('California', 2064),
 ('Chicago', 2000),
 ('Boston', 1624),
 ('France', 1560),
 ('Rome', 1304),
 ('San Francisco', 1247),
 ('Los Angeles', 1247),
 ('Texas', 1047),
 ('New York City', 1046),
 ('Europe', 952),
 ('Berlin', 938),
 ('Manhattan', 908),
 ('Philadelphia', 862),
 ('Virginia', 854)]

### Wilkens geo data

In [4]:
# regularize data to wilkens geo format
# translation table sending every ASCII punctuation character to a space
punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

def regularize_string(place_string):
    '''
    Normalize a place name for lookup: punctuation becomes spaces, text is
    lowercased, runs of whitespace collapse to single spaces, and the result
    is passed through unidecode.
    '''
    without_punctuation = place_string.translate(punctuation_to_space)
    tokens = without_punctuation.lower().split()
    return unidecode(' '.join(tokens))

# Wilkens geo data: keep English-language rows, keyed by place string
toponym_dir = os.path.join('..', '..', 'toponyms')
wi = (
    pd.read_csv(os.path.join(toponym_dir, 'geo.tsv.gz'), sep='\t', low_memory=False)
    .loc[lambda geo: geo.lang == 'en']
    .set_index('text_string')
)

# hand review data
hand = pd.read_csv(
    os.path.join(toponym_dir, 'us_handreview.tsv'),
    sep='\t',
    index_col='text_string',
)

# restore some items from C19 hand review (clear their ignore flag)
hand.loc[
    [
        'hollywood', 'dallas', 'florence', 'kingston', 'berkeley',
        'queens', 'phoenix', 'woodstock', 'surrey', 'orlando',
    ],
    'ignore',
] = 0

# improve aliases
hand.loc['kingston', 'alias_to'] = 'kingston jamaica'
hand.loc['baltic', 'alias_to'] = 'baltic sea'

In [5]:
# drop ignored places (labels missing from wi are skipped)
ignored_places = hand.index[hand.ignore == 1]
wi = wi.drop(ignored_places, errors='ignore')

# alias places: copy the alias target's geo record onto the original place string,
# for non-ignored hand-review rows whose alias target exists in wi
usable_alias = hand.alias_to.notna() & (hand.ignore == 0) & hand.alias_to.isin(wi.index)
for original_place, alias_to in hand.loc[usable_alias, 'alias_to'].items():
    wi.loc[original_place] = wi.loc[alias_to]

# drop unused places: keep only places that occur (after regularization) in CONLIT sequences
wi_gpes = Counter(
    regularize_string(i)
    for sequence in conlit.gpe_sequences
    for i in sequence
)
unused_places = wi.index[~wi.index.isin(wi_gpes)]
wi.drop(unused_places, inplace=True, errors='ignore')

### Sequence distances

For each volume sequence, look up each place, get lat/lon, calculate distance from previous place, sum over sequential path.

Because lookups use the Wilkens geo data (the functions below take a `geo_data` frame; there is no separate `source` switch), we also perform fancier location aliasing, ignore known-bad places, and zero out sequence steps that move between admin levels within the same admin entity (e.g., `Boston -> United States` or `UK -> England`). The last step has room for improvement: we don't deal with `admin_2` level and below (the distances involved are small), nor with continents (Google data doesn't place countries in continents; moreover, I think there's a meaningful sense in which there's distance between, e.g., `Paris` and `Europe` in a way that there isn't between `Boston` and `USA`).

In [6]:
def get_lat_lon(place_string, geo_data=None, extended=False):
    '''
    Look up a place in a geo table indexed by place string.
    Assumes lookup string has been regularized if necessary.

    place_string: index label to look up
    geo_data: DataFrame to search; None (default) means the module-level `wi`
        as it exists when the call is made. (A `geo_data=wi` default would be
        frozen at definition time and go stale once `wi` is reassigned.)
    extended: if True, also return 'country_short', 'admin_1_std' and
        'location_type' alongside 'lat' and 'lon'

    Returns the selected columns for the place, or None if the lookup raises
    KeyError or ValueError (e.g. unknown place).
    '''
    if geo_data is None:
        geo_data = wi
    columns = ['lat', 'lon']
    if extended:
        columns = columns + ['country_short', 'admin_1_std', 'location_type']
    try:
        return geo_data.loc[place_string, columns]
    except (ValueError, KeyError):
        return None

def hop_distance(location1, location2, geo_data=None, return_zero_dist=False):
    '''
    Takes two location strings, returns fancy distance between them in miles.

    location1, location2: place strings, already regularized for lookup
    geo_data: geo DataFrame for lookups; None (default) means the module-level
        `wi` as it exists when the call is made. (A `geo_data=wi` default would
        be frozen at definition time and go stale once `wi` is reassigned.)
    return_zero_dist: if True, return zero distances where we would otherwise return None

    Returns the distance in miles, or None (0.0 with return_zero_dist) when
    either place is unknown, both resolve to identical records, or the hop is
    between a country / admin-1 area and a place inside that same area.

    Reads and fills the module-level cache `distances` (both directions);
    only computed distances are cached, not the None cases.
    '''
    if geo_data is None:
        geo_data = wi
    dist = None
    if location1 in distances and location2 in distances[location1]:
        return distances[location1][location2]
    else:
        loc1 = get_lat_lon(location1, geo_data, extended=True)
        loc2 = get_lat_lon(location2, geo_data, extended=True)
        if loc1 is None or loc2 is None: pass # should never happen, but check
        elif loc1.equals(loc2): pass # ignore identical places, even if called different names
        # eliminate place -> higher-order place in same admin area
        elif (loc1.location_type=='country' or loc2.location_type=='country') and \
           (loc2.country_short==loc1.country_short): pass
        elif (loc1.location_type=='administrative_area_level_1' or \
              loc2.location_type=='administrative_area_level_1') and \
             (loc2.country_short==loc1.country_short and \
              loc2.admin_1_std==loc1.admin_1_std): pass
        else:
            loc1 = loc1[['lat', 'lon']]
            loc2 = loc2[['lat', 'lon']]
            dist = distance.distance(loc1, loc2).miles
            # cache symmetrically so the reverse hop is a lookup
            distances[location1][location2] = dist
            distances[location2][location1] = dist
    if return_zero_dist and dist is None:
        dist = 0.0
    return dist

def sequence_distance(sequence, geo_data=None):
    '''
    Sum hop distances along a sequence of place names and compare the
    start-to-finish distance with the individual hops.

    sequence: iterable of raw place strings, in order of occurrence
    geo_data: geo DataFrame for lookups; None (default) means the module-level
        `wi` as it exists when the call is made. (A `geo_data=wi` default would
        be frozen at definition time, so a later re-read of `wi` would be
        silently ignored.)

    Returns (total_distance, start_finish_miles, start_finish_Z):
      total_distance: sum of the non-None hop distances, in miles
      start_finish_miles: distance from first to last known place (0.0 where
          hop_distance would give None); None if fewer than two known places
      start_finish_Z: (start_finish_miles - mean hop) / std of hops; None
          unless there are at least two hop distances and their std > 0.01
    '''
    if geo_data is None:
        geo_data = wi
    # regularize each string once, then remove unknown locations
    regularized = (regularize_string(place) for place in sequence)
    seq = [place for place in regularized if place in geo_data.index]
    total_distance = 0.0
    hop_distances = []
    # calculate distance over pairwise hops; hops that resolve to None are skipped
    for location1, location2 in pairwise(seq):
        dist = hop_distance(location1, location2, geo_data)
        if dist is not None:
            total_distance += dist
            hop_distances.append(dist)
    # calculate start-finish distance
    start_finish_miles = None
    start_finish_Z = None
    if len(seq) >= 2:
        start_finish_miles = hop_distance(seq[0], seq[-1], geo_data, return_zero_dist=True)
    if len(hop_distances) > 1:
        hop_std = np.std(hop_distances)
        hop_mean = np.mean(hop_distances)
        # skip the Z-score when hops are (nearly) identical, to avoid dividing by ~0
        if hop_std > 0.01:
            start_finish_Z = (start_finish_miles - hop_mean)/hop_std
    return (total_distance, start_finish_miles, start_finish_Z)

In [11]:
# Spot check: start from an empty hop-distance cache, then measure a two-place sequence
distances = defaultdict(lambda: defaultdict(float))
sequence_distance(['baltic', 'connecticut'], geo_data=wi)

(1209.546836009187, 1209.546836009187, None)

In [10]:
# Inspect the geo record that 'baltic' currently resolves to
wi.loc['baltic']

lang                                                                en
occurs                                                        262356.0
place_id                                   ChIJyfT6OvJHiYcRTV5-9FrmUIA
formatted_address    Baltic City Hall, 130 Street Olaf Ave, Baltic,...
location_type                                  local_government_office
country_long                                             United States
country_short                                                       US
admin_1_long                                              South Dakota
admin_1_short                                                       SD
admin_1_std                                                         SD
admin_2                                               Minnehaha County
admin_3                                                            NaN
admin_4                                                            NaN
admin_5                                                            NaN
locali

In [9]:
%%time
# Reset the hop-distance cache that hop_distance reads and fills
distances = defaultdict(lambda: defaultdict(float))
# One (total miles, start-finish miles, start-finish Z) tuple per row of gpe_sequences
wi_distances = conlit['gpe_sequences'].apply(sequence_distance)

CPU times: user 26.3 s, sys: 7.11 ms, total: 26.3 s
Wall time: 26.4 s


### Save new distances to CSV

In [7]:
# Unpack the per-row (total, start-finish miles, start-finish Z) tuples into columns
distance_columns = ['dist_miles', 'Start_Finish_Miles', 'Start_Finish_Z']
for distance_column, column_values in zip(distance_columns, zip(*wi_distances)):
    conlit[distance_column] = column_values

# Write the full table, then a distances-only companion file
conlit.to_csv(os.path.join(derived_dir, 'CONLIT_CharData_AP_MW_6.csv.gz'))
conlit[distance_columns].to_csv(os.path.join(derived_dir, 'CONLIT_CharData_distances_6.csv.gz'))

## EARLY GPE distances

### Prepare early file

In [8]:
def read_hoplist(file_path, label='gpe'):
    '''
    Read a JSON-lines hoplist file into one row per (book, character).

    Each input line is expected to carry a 'book_id' and a 'chars' list; each
    chars entry is a dict with a 'char_id' and a 'sequence' of
    {'place': ..., 'count': ...} dicts.

    file_path: path handed to pandas.read_json(lines=True)
    label: prefix for the two list-valued output columns

    Returns a DataFrame with columns book_id, char_id, '<label>_places'
    (each place repeated `count` times, in sequence order) and
    '<label>_sequences' (each sequence step's place once, in order).

    Books whose 'chars' list is empty explode to a NaN placeholder row; those
    rows are skipped rather than crashing on the missing character dict.
    '''
    df = pd.read_json(
        file_path,
        lines=True
    ).explode('chars')

    book_ids = []
    char_ids = []
    gpe_lists = []
    seq_lists = []
    for book_id, char in zip(df['book_id'], df['chars']):
        if not isinstance(char, dict):
            # NaN left behind by explode() for a book with no characters
            continue
        seq = char['sequence']
        seq_lists.append([place_dict['place'] for place_dict in seq])
        gpe_lists.append(
            [place_dict['place'] for place_dict in seq for _ in range(place_dict['count'])]
        )
        book_ids.append(book_id)
        char_ids.append(char['char_id'])

    result = pd.DataFrame(
        {
            'book_id':book_ids,
            'char_id':char_ids,
            f'{label}_places':gpe_lists,
            f'{label}_sequences':seq_lists
        }
    )
    return result

# read hoplists (GPE, non-GPE, and all places)
early_gpes = read_hoplist(
    os.path.join(derived_dir, 'mb.hoplist.gpe.all.jsonl.bz2'), label='gpe')
early_nongpes = read_hoplist(
    os.path.join(derived_dir, 'mb.hoplist.non_gpe.all.jsonl.bz2'), label='nongpe')
early_allplaces = read_hoplist(
    os.path.join(derived_dir, 'mb.hoplist.all.all.jsonl.bz2'), label='all')

# read base data (the 'prob' column is not needed)
base_early = pd.read_csv(os.path.join(derived_dir, 'inf_gender.1.tsv.gz'), sep='\t')
base_early = base_early.drop(columns=['prob'])

# attach each hoplist to the base character rows; left merges keep every base row
# NOTE(review): presumably the base table already holds protagonists only - confirm
early = base_early
for hoplist in (early_gpes, early_nongpes, early_allplaces):
    early = early.merge(hoplist, how='left', on=['book_id', 'char_id'])

# reference format of original data for CONLIT (used below for column order)
ref = pd.read_csv(os.path.join(derived_dir, 'book_char_mobility.tsv.bz2'), sep='\t')

# calculate simple derived columns: number of distinct places per place type
for place_kind in ('gpe', 'nongpe', 'all'):
    early[f'num_{place_kind}_places'] = early[f'{place_kind}_places'].apply(
        lambda places: len(set(places))
    )
early['char_rank'] = 1.0

# add token count (file header row is skipped and replaced with our column names)
early_token_counts = pd.read_csv(
    os.path.join(derived_dir, 'mb.book_lengths.tsv'),
    sep='\t',
    skiprows=1,
    names=['book_id', 'Tokens'],
)
early = early.merge(early_token_counts, how='left', on=['book_id'])

# add empty distance column and reorder columns to match reference
early['dist_miles'] = 0.0
proper_cols = list(ref.columns.drop('num_words')) + ['Tokens']
early = early[proper_cols]

### Calculate distances

In [9]:
# reread geo data to retain relevant locations, etc.
# (the CONLIT pass pruned `wi` to CONLIT-only places, so start again from the full file)
wi = pd.read_csv(
    os.path.join('..', '..', 'toponyms', 'geo.tsv.gz'),
    sep='\t',
    low_memory=False,
)
wi = wi.loc[wi.lang=='en']
wi.set_index('text_string', inplace=True)

# hand review data
hand = pd.read_csv(
    os.path.join('..', '..', 'toponyms', 'us_handreview.tsv'),
    sep='\t',
    index_col='text_string'
)

# restore some items from C19 hand review
hand.loc[
    [
        'hollywood', 
        'dallas', 
        'florence', 
        'kingston',
        'berkeley', 
        'queens', 
        'phoenix', 
        'woodstock', 
        'surrey',
        'orlando'
    ], 
    'ignore'
] = 0

# improve aliases
hand.loc['kingston', 'alias_to'] = 'kingston jamaica'
# keep in step with the CONLIT geo setup, which also aliases 'baltic';
# this copy had drifted and was missing the line
hand.loc['baltic', 'alias_to'] = 'baltic sea'

# drop ignored places
wi = wi.drop(hand.loc[hand.ignore==1].index, errors='ignore')

# alias places: copy the alias target's record onto the original place string
for original_place, alias_to in hand.loc[(~hand.alias_to.isna()) & (hand.ignore==0) & (hand.alias_to.isin(wi.index)), 'alias_to'].items():
    wi.loc[original_place] = wi.loc[alias_to]

In [10]:
# drop unused places: keep only places that occur (after regularization) in EARLY sequences
wi_gpes = Counter(
    regularize_string(i)
    for sequence in early.gpe_sequences
    for i in sequence
)
unused_places = wi.index[~wi.index.isin(wi_gpes)]
wi.drop(unused_places, inplace=True, errors='ignore')

In [11]:
%%time
# Reset the hop-distance cache: cached values from the CONLIT pass were computed
# against a different geo table
distances = defaultdict(lambda: defaultdict(float))
# Pass the re-read `wi` explicitly. sequence_distance was defined with a
# `geo_data=wi` default, which is bound at definition time and would otherwise
# still point at the CONLIT-filtered frame. Series.apply forwards extra keyword
# arguments to the function.
early_distances = early['gpe_sequences'].apply(sequence_distance, geo_data=wi)

CPU times: user 32.5 s, sys: 8.1 ms, total: 32.5 s
Wall time: 32.6 s


### Save new distances to CSV

In [12]:
# Unpack the per-row (total, start-finish miles, start-finish Z) tuples into columns
distance_columns = ['dist_miles', 'Start_Finish_Miles', 'Start_Finish_Z']
for distance_column, column_values in zip(distance_columns, zip(*early_distances)):
    early[distance_column] = column_values

# Index by book_id, then write the distances-only file
early = early.set_index('book_id')
early[distance_columns].to_csv(os.path.join(derived_dir, 'EARLY_CharData_distances_2.csv.gz'))