# Mobility data prep (Matt)

In [1]:
# imports and setup
from   ast import literal_eval
from   collections import Counter, defaultdict
from   geopy import distance
from   itertools import pairwise
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from   sklearn.linear_model import LinearRegression
import seaborn as sns
import string
from   unidecode import unidecode

# project data directory and its 'derived' subdirectory, relative to this notebook
data_dir = os.path.join(os.pardir, 'data')
derived_dir = os.path.join(data_dir, 'derived')

## Load derived data

In [2]:
def string_to_list(x):
    '''Parse the string form of a list (as stored in the CSV) back into a list.

    Bare `nan` elements (missing values written out unquoted) are dropped.
    The text is parsed with `ast` rather than patched with string replacement,
    so a `nan` in any position is handled and quoted values that merely
    contain the letters "nan" are left untouched.

    x: cell text, e.g. "['Paris', nan, 'London']".
    Returns a list of the literal elements; an empty/blank cell gives [].
    Raises SyntaxError/ValueError for text that is not a valid literal.
    '''
    import ast  # function-scope import: the notebook only imports literal_eval by name

    text = x.strip()
    if not text:
        return []
    parsed = ast.parse(text, mode='eval').body
    if not isinstance(parsed, (ast.List, ast.Tuple)):
        # not a list literal: evaluate as a plain literal and listify
        return list(ast.literal_eval(text))
    return [
        ast.literal_eval(element)
        for element in parsed.elts
        if not (isinstance(element, ast.Name) and element.id == 'nan')
    ]

# load the CONLIT character table, indexed by book; missing cells become ''
conlit = pd.read_csv(
    os.path.join(derived_dir, 'CONLIT_CharData_AP_6.csv.gz'),
    index_col='book_id',
    # these columns hold stringified lists; parse them back into Python lists
    converters=dict.fromkeys(
        ['gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences'],
        string_to_list,
    ),
).fillna('')

## CONLIT GPE distances

In [3]:
# tally every GPE mention across all rows, then show the 20 most frequent
gpe_counts = Counter(
    place
    for places in conlit.gpe_places
    for place in places
)
gpe_counts.most_common(20)

[('New York', 6683),
 ('London', 4367),
 ('Paris', 3673),
 ('America', 3477),
 ('Washington', 2412),
 ('England', 2087),
 ('California', 2064),
 ('Chicago', 2000),
 ('Boston', 1624),
 ('France', 1560),
 ('Rome', 1304),
 ('San Francisco', 1247),
 ('Los Angeles', 1247),
 ('Texas', 1047),
 ('New York City', 1046),
 ('Europe', 952),
 ('Berlin', 938),
 ('Manhattan', 908),
 ('Philadelphia', 862),
 ('Virginia', 854)]

### Wilkens geo data

In [4]:
# regularize data to wilkens geo format
# translation table sending every character of string.punctuation to a space
punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

def regularize_string(place_string):
    '''Normalize a place name for lookup in the geo table.

    Steps, in order: replace string.punctuation characters with spaces,
    lowercase, collapse runs of whitespace to single spaces (also trimming
    the ends), then pass the result through unidecode.

    NOTE(review): unidecode runs last, so any punctuation or capitals it
    emits for non-ASCII input would not be stripped/lowercased -- confirm
    this matches how the geo index strings were built before reordering.
    '''
    depunctuated = place_string.translate(punctuation_to_space)
    collapsed = ' '.join(depunctuated.lower().split())
    return unidecode(collapsed)

# geo table: keep rows whose `lang` is 'en', indexed by `text_string`
wi = pd.read_csv(
    os.path.join('..', '..', 'toponyms', 'geo.tsv.gz'),
    sep='\t',
    low_memory=False,
)
wi = wi.loc[wi.lang == 'en'].set_index('text_string')

# hand review data
hand = pd.read_csv(
    os.path.join('..', '..', 'toponyms', 'us_handreview.tsv'),
    sep='\t',
    index_col='text_string',
)

# restore some items from C19 hand review
hand.loc[
    [
        'hollywood', 'dallas', 'florence', 'kingston', 'berkeley',
        'queens', 'phoenix', 'woodstock', 'surrey', 'orlando',
    ],
    'ignore'
] = 0

# improve aliases
hand.loc['kingston', 'alias_to'] = 'kingston jamaica'

# drop ignored places (labels absent from the geo table are skipped)
ignored_places = hand.loc[hand.ignore == 1].index
wi = wi.drop(ignored_places, errors='ignore')

# alias places: copy the target's row onto the aliased name, but only for
# non-ignored entries whose alias target exists in the geo table
aliasable = hand.alias_to.notna() & (hand.ignore == 0) & hand.alias_to.isin(wi.index)
for original_place, alias_to in hand.loc[aliasable, 'alias_to'].items():
    wi.loc[original_place] = wi.loc[alias_to]

# drop unused places: keep only strings that occur in some CONLIT sequence
wi_gpes = Counter(
    regularize_string(place)
    for sequence in conlit.gpe_sequences
    for place in sequence
)
unused_places = wi.index[~wi.index.isin(wi_gpes)]
wi.drop(unused_places, inplace=True, errors='ignore')

### Sequence distances

For each volume sequence, look up each place, get lat/lon, calculate distance from previous place, sum over sequential path.

When working from the Wilkens geo data (the `wi` table, passed to the functions below as `geo_data`; they take no separate `source` argument), we also perform fancier location aliasing, ignore known-bad places, and zero out sequence steps that move between admin levels within the same admin entity (e.g., `Boston -> United States` or `UK -> England`). The last step has room for improvement: we don't deal with the `admin_2` level and below (the distances involved are small), nor with continents (Google data doesn't place countries in continents; also, I think there's a meaningful sense in which there's distance between, e.g., `Paris` and `Europe` in a way that there isn't between `Boston` and `USA`).

In [43]:
def get_lat_lon(place_string, geo_data, extended=False):
    '''Look up one place in `geo_data` and return its location fields.

    Assumes lookup string has been regularized if necessary.

    place_string: index label to look up.
    geo_data: DataFrame indexed by place string with `lat` and `lon`
        columns (and `country_short`, `admin_1_std`, `location_type`
        when extended=True).
    extended: if True, also return the country/admin/type columns.

    Returns the `.loc` selection for the label, or None when the label is
    not in the index. (The original caught only ValueError, but `.loc`
    reports an unknown label with KeyError, so the None path was never
    reached.) A missing column still raises, since that is a schema
    problem rather than an unknown place.
    '''
    columns = ['lat', 'lon']
    if extended:
        columns = columns + ['country_short', 'admin_1_std', 'location_type']
    if place_string not in geo_data.index:
        return None
    try:
        return geo_data.loc[place_string, columns]
    except ValueError:
        # retained from the original for backward compatibility
        return None

def hop_distance(location1, location2, geo_data):
    '''Distance in miles for one hop between two place strings.

    Uses the module-level `distances` mapping as a cache: a previously
    computed pair is returned directly, and each newly computed distance is
    stored under both orderings of the pair.

    Returns None (the hop is not counted) when either lookup fails, when
    both lookups give identical records, or when one place is a country or
    level-1 admin area and the other place shares that country (and, for
    level-1 areas, the same `admin_1_std`).
    '''
    if location1 in distances and location2 in distances[location1]:
        return distances[location1][location2]

    place1 = get_lat_lon(location1, geo_data, extended=True)
    place2 = get_lat_lon(location2, geo_data, extended=True)

    # should never happen, but check
    if place1 is None or place2 is None:
        return None

    # ignore identical places, even if called different names
    if place1.equals(place2):
        return None

    # eliminate place -> higher-order place in same admin area
    involves_country = (
        place1.location_type == 'country' or place2.location_type == 'country'
    )
    if involves_country and (place2.country_short == place1.country_short):
        return None

    involves_admin_1 = (
        place1.location_type == 'administrative_area_level_1'
        or place2.location_type == 'administrative_area_level_1'
    )
    if involves_admin_1 and (
        place2.country_short == place1.country_short
        and place2.admin_1_std == place1.admin_1_std
    ):
        return None

    miles = distance.distance(place1[['lat', 'lon']], place2[['lat', 'lon']]).miles
    distances[location1][location2] = miles
    distances[location2][location1] = miles
    return miles

def sequence_distance(sequence, geo_data=None):
    '''Summarize the path travelled over a sequence of place names.

    sequence: iterable of raw place strings, in order of occurrence.
    geo_data: geo table to look places up in. When omitted, the module-level
        `wi` is used as it exists at call time. (A `geo_data=wi` default is
        evaluated once, when the function is defined, so it kept pointing at
        the old table after `wi` was re-read later in the notebook.)

    Returns a tuple (total_distance, start_finish_miles, start_finish_Z):
      total_distance: sum of counted hop distances, in miles (0.0 if none).
      start_finish_miles: hop distance between the first and last known
        places, or None if fewer than two known places or the hop is not
        counted.
      start_finish_Z: abs(start_finish_miles - mean hop) / std of hops, or
        None unless there are at least two counted hops, the hop std is
        positive, and start_finish_miles is available.
    '''
    if geo_data is None:
        geo_data = wi

    # regularize strings and remove unknown locations
    known_places = geo_data.index
    seq = []
    for raw_place in sequence:
        place = regularize_string(raw_place)
        if place in known_places:
            seq.append(place)

    # calculate distance over pairwise hops; None marks a hop that is skipped
    total_distance = 0.0
    hop_distances = []
    for location1, location2 in pairwise(seq):
        dist = hop_distance(location1, location2, geo_data)
        if dist is not None:
            total_distance += dist
            hop_distances.append(dist)

    # calculate start-finish distance
    start_finish_miles = None
    start_finish_Z = None
    if len(seq) >= 2:
        start_finish_miles = hop_distance(seq[0], seq[-1], geo_data)
    if len(hop_distances) > 1:
        hop_std = np.std(hop_distances)
        hop_mean = np.mean(hop_distances)
        # explicit None test: a legitimate 0.0 distance must not be treated as missing
        if start_finish_miles is not None and hop_std > 0:
            start_finish_Z = abs(start_finish_miles - hop_mean) / hop_std

    return (total_distance, start_finish_miles, start_finish_Z)

In [45]:
%%time
# fresh two-level cache read and written by hop_distance
distances = defaultdict(lambda: defaultdict(float))
# one (total, start-finish miles, start-finish Z) tuple per row
wi_distances = conlit.gpe_sequences.apply(sequence_distance)

CPU times: user 27.6 s, sys: 15.3 ms, total: 27.6 s
Wall time: 27.7 s


### Save new distances to CSV

In [55]:
# split the per-row result tuples into three parallel columns
total_miles, start_finish_miles, start_finish_z = zip(*wi_distances)
conlit['dist_miles'] = total_miles
conlit['Start_Finish_Miles'] = start_finish_miles
conlit['Start_Finish_Z'] = start_finish_z

In [56]:
# write the full table, now including the distance columns, to the derived directory
conlit.to_csv(os.path.join(derived_dir, 'CONLIT_CharData_AP_MW_6.csv.gz'))

In [57]:
# also write the dist_miles column on its own (indexed by book_id)
conlit['dist_miles'].to_csv(os.path.join(derived_dir, 'CONLIT_CharData_dist_miles.csv.gz'))

In [58]:
# display the updated table (pandas truncates rows and columns in the rendering)
conlit

Unnamed: 0_level_0,char_id,char_count,inf_gender,gpe_places,num_gpe_places,nongpe_places,num_nongpe_places,all_places,num_all_places,gpe_sequences,...,non_gpe_ratio,avg_Distance_GPE_Tokens,deixis_count_perplace,semantic_dist_mean,semantic_dist_total,dist_miles_allChars,dist_miles_allChars_norm_Tokens,num_gpe_places_allChars_norm_Tokens,Start_Finish_Miles,Start_Finish_Z
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
[Heist Society 2] Uncommon Criminals - Ally Carter,138,3238,she/her,"[Paraguay, Paraguay, London, New York City, mi...",15,"[there, there, there, the museum, the museum, ...",103,"[there, there, there, Paraguay, Paraguay, the ...",118,"[Paraguay, London, New York City, midtown Manh...",...,6.866667,0.662652,0.084034,0.501414,70.699392,88408.215954,1.099714,0.000460,6223.878880,1.483659
"2001_2011_Wilson,RobertCharles_TheChronoliths_SF",0,2742,she/her,"[Bangkok, Minneapolis / St. Paul, Chumphon, Ch...",22,"[our household, a gas station hawng nam, the c...",122,"[our household, a gas station hawng nam, the c...",143,"[Bangkok, Minneapolis / St. Paul, Chumphon, Ba...",...,5.545455,1.839428,0.135417,0.510990,72.049542,410727.049637,3.978757,0.000746,9671.262067,1.265047
"2001_Martel,Yann_LifeofPi_BS",0,4754,he/him/his,"[Canada, India, India, India, India, India, Ca...",15,"[the north, the rich , noisy , functioning mad...",142,"[Canada, India, the north, India, India, the r...",157,"[Canada, India, Canada, India, Toronto, Safed,...",...,9.466667,1.579137,0.031915,0.600678,111.726188,225403.288506,1.811268,0.000249,,
"2002_2011_Anderson,MT_Feed_SF",0,2862,she/her,"[Lonely, School, School, Switzerland, Io, Amer...",5,"[the craters, a crummy hotel, here, here, the ...",68,"[the craters, a crummy hotel, here, here, the ...",72,"[Lonely, School, Switzerland, Io, America]",...,13.6,0.066334,0.097902,0.494841,48.494434,19635.975898,0.292747,0.000194,4990.393324,
"2002_Baker,Jo_Offcomer_CT",120,4723,she/her,"[Belfast, Belfast, Belfast, Belfast, Belfast, ...",11,"[the bath, the bath, the bath, it, the bath, t...",162,"[the bath, the bath, the bath, it, the bath, t...",172,"[Belfast, Conroys, October, Belfast, Somervill...",...,14.727273,0.146856,0.075000,0.526924,125.934781,25238.249281,0.286661,0.000295,276.146057,0.365448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Woman Behind the New Deal, The - Kirstin Downey",625,6543,she/her,"[Columbia, Cornell, Cornell, New York City, Ma...",61,"[its, more than a dozen archives across the no...",288,"[its, more than a dozen archives across the no...",344,"[Columbia, Cornell, New York City, Maine, Bost...",...,4.721311,0.900087,0.055866,0.475163,172.959514,359423.555550,2.058943,0.000791,865.742012,0.165683
"Woman of No Importance, A - Sonia Purnell",553,4519,she/her,"[France, France, France, France, Europe, Franc...",70,"[the France, the whole of Europe, the whole of...",230,"[the France, France, France, France, France, t...",296,"[France, Europe, France, Europe, Cambridge, Ma...",...,3.285714,2.493816,0.000000,0.596225,178.867514,485001.109979,3.619871,0.000911,4775.068411,1.456236
"Woodrow Wilson - John Milton Cooper, Jr_",1465,3676,he/him/his,"[America, Washington, Tennessee, Tennessee, Ge...",62,"[his State of the Union, his State of the Unio...",102,"[his State of the Union, his State of the Unio...",161,"[America, Washington, Tennessee, Germany, Wash...",...,1.645161,0.430231,0.024896,0.466782,77.485743,631512.567676,2.007089,0.000969,,
You Never Forget Your First - Alexis Coe,276,417,he/him/his,"[Brooklyn Heights, Lower Manhattan, Philadelph...",11,"[the river, its, his plantation, his plantatio...",14,"[Brooklyn Heights, Lower Manhattan, the river,...",24,"[Brooklyn Heights, Lower Manhattan, Philadelph...",...,1.272727,0.128522,0.166667,0.593350,7.120198,97633.602092,1.515838,0.001118,78.422808,0.446969


## EARLY GPE distances

### Prepare early file

In [13]:
def read_hoplist(file_path, label='gpe'):
    '''Read a JSON-lines hoplist file into one row per (book, character).

    Each input line is expected to carry a `book_id` and a `chars` list;
    each entry of `chars` has a `char_id` and a `sequence` of
    {'place': ..., 'count': ...} dicts.

    file_path: path passed to pandas.read_json (lines=True).
    label: prefix for the two output list columns.

    Returns a DataFrame with columns book_id, char_id,
    `{label}_places` (each place repeated `count` times, in sequence order)
    and `{label}_sequences` (each place once per sequence entry).
    '''
    exploded = pd.read_json(file_path, lines=True).explode('chars')
    # (book_id, character dict) pairs, one per exploded row
    pairs = list(zip(exploded['book_id'], exploded['chars']))

    result = pd.DataFrame(
        {
            'book_id': [book_id for book_id, _ in pairs],
            'char_id': [char['char_id'] for _, char in pairs],
            f'{label}_places': [
                [hop['place'] for hop in char['sequence'] for _ in range(hop['count'])]
                for _, char in pairs
            ],
            f'{label}_sequences': [
                [hop['place'] for hop in char['sequence']]
                for _, char in pairs
            ],
        }
    )
    return result

# read hoplists
early_gpes = read_hoplist(os.path.join(derived_dir, 'mb.hoplist.gpe.all.jsonl.bz2'), label='gpe')
early_nongpes = read_hoplist(os.path.join(derived_dir, 'mb.hoplist.non_gpe.all.jsonl.bz2'), label='nongpe')
early_allplaces = read_hoplist(os.path.join(derived_dir, 'mb.hoplist.all.all.jsonl.bz2'), label='all')

# read base data
base_early = pd.read_csv(os.path.join(derived_dir, 'inf_gender.1.tsv.gz'), sep='\t').drop(columns=['prob'])

# restrict to protagonists: left-join each hoplist onto the base rows
early = base_early
for hoplist in (early_gpes, early_nongpes, early_allplaces):
    early = early.merge(hoplist, how='left', on=['book_id', 'char_id'])

# reference format of original data for CONLIT
ref = pd.read_csv(os.path.join(derived_dir, 'book_char_mobility.tsv.bz2'), sep='\t')

# calculate simple derived columns: number of distinct places per list column
for place_kind in ('gpe', 'nongpe', 'all'):
    early[f'num_{place_kind}_places'] = early[f'{place_kind}_places'].apply(lambda x: len(set(x)))
early['char_rank'] = 1.0

# add token count (the file's own header row is skipped and replaced)
early_token_counts = pd.read_csv(
    os.path.join(derived_dir, 'mb.book_lengths.tsv'),
    sep='\t',
    skiprows=1,
    names=['book_id', 'Tokens'],
)
early = early.merge(early_token_counts, how='left', on=['book_id'])

# add empty distance column and reorder columns to match reference
early['dist_miles'] = 0.0
proper_cols = list(ref.columns.drop('num_words')) + ['Tokens']
early = early[proper_cols]

In [14]:
# glance at the data: three randomly chosen rows (no random_state is set,
# so the rows shown differ from run to run)
early.sample(3)

Unnamed: 0,book_id,char_id,char_count,inf_gender,gpe_places,num_gpe_places,nongpe_places,num_nongpe_places,all_places,num_all_places,gpe_sequences,dist_miles,char_rank,Tokens
10323,23517,0,5007,he/him/his,"[Rome, Genoa, Florence, Perugia, Florence, Gen...",15,"[the road, the building, here, that hotel in P...",210,"[Rome, the road, the building, Genoa, here, Fl...",224,"[Rome, Genoa, Florence, Perugia, Florence, Gen...",0.0,1.0,128635
6531,24524,52,3690,he/him/his,"[Valdemaran, Valdemaran, Valdemar, Valdemar, G...",6,"[the woods, the cottage, the cottage, the cott...",82,"[the woods, the cottage, the cottage, the cott...",87,"[Valdemaran, Valdemar, Grove, Errold ’s Grove,...",0.0,1.0,134486
4501,22954,0,2718,he/him/his,"[Maasailand, Maasailand, Earth, Earth, 882 G.E...",5,"[the Kikuyu, Wilford Braxton 's, Wilford Braxt...",93,"[the Kikuyu, Maasailand, Maasailand, Wilford B...",97,"[Maasailand, Earth, 882 G.E., Tahiti Benoit, K...",0.0,1.0,131175


### Calculate distances

In [15]:
# reread geo data to retain relevant locations, etc.
# (the earlier copy of `wi` was reduced to strings occurring in CONLIT)
wi = pd.read_csv(
    os.path.join('..', '..', 'toponyms', 'geo.tsv.gz'),
    sep='\t',
    low_memory=False,
)
wi = wi.loc[wi.lang == 'en'].set_index('text_string')

# hand review data
hand = pd.read_csv(
    os.path.join('..', '..', 'toponyms', 'us_handreview.tsv'),
    sep='\t',
    index_col='text_string',
)

# restore some items from C19 hand review
hand.loc[
    [
        'hollywood', 'dallas', 'florence', 'kingston', 'berkeley',
        'queens', 'phoenix', 'woodstock', 'surrey', 'orlando',
    ],
    'ignore'
] = 0

# improve aliases
hand.loc['kingston', 'alias_to'] = 'kingston jamaica'

# drop ignored places (labels absent from the geo table are skipped)
ignored_places = hand.loc[hand.ignore == 1].index
wi = wi.drop(ignored_places, errors='ignore')

# alias places: copy the target's row onto the aliased name, but only for
# non-ignored entries whose alias target exists in the geo table
aliasable = hand.alias_to.notna() & (hand.ignore == 0) & hand.alias_to.isin(wi.index)
for original_place, alias_to in hand.loc[aliasable, 'alias_to'].items():
    wi.loc[original_place] = wi.loc[alias_to]

In [16]:
# drop unused places: keep only strings that occur in some EARLY sequence
wi_gpes = Counter(
    regularize_string(place)
    for sequence in early.gpe_sequences
    for place in sequence
)
unused_places = wi.index[~wi.index.isin(wi_gpes)]
wi.drop(unused_places, inplace=True, errors='ignore')

In [17]:
%%time
# calculate distances
# reset the hop cache: it is keyed by place string only, so values computed
# against the previous geo table must not be reused here
distances = defaultdict(lambda: defaultdict(float))
# sequence_distance has no `source` parameter (forwarding source='wilkens'
# through apply raises TypeError), and its `geo_data` default was bound when the
# function was defined, so pass the re-read `wi` explicitly. It returns
# (total, start-finish miles, start-finish Z); keep only the total, because the
# result is assigned to the scalar `dist_miles` column.
early_distances = early['gpe_sequences'].apply(
    lambda seq: sequence_distance(seq, geo_data=wi)[0]
)

CPU times: user 40.9 s, sys: 47.4 ms, total: 40.9 s
Wall time: 40.9 s


### Save new distances to CSV

In [18]:
# overwrite the placeholder distance column with the computed values
early['dist_miles'] = early_distances
# index by book for export; this is done in place, so re-running the cell
# fails because `book_id` is no longer a column
early.set_index('book_id', inplace=True)
# write the full table and a dist_miles-only companion file
early.to_csv(os.path.join(derived_dir, 'EARLY_CharData_MW.csv.gz'))
early['dist_miles'].to_csv(os.path.join(derived_dir, 'EARLY_CharData_dist_miles.csv.gz'))

### Check EARLY distances

In [19]:
# summary statistics for the numeric columns, as a sanity check on dist_miles
early.describe()

Unnamed: 0,char_id,char_count,num_gpe_places,num_nongpe_places,num_all_places,dist_miles,char_rank,Tokens
count,10693.0,10693.0,10693.0,10693.0,10693.0,10693.0,10693.0,10693.0
mean,133.157019,2928.077153,11.285794,93.092677,104.060881,29417.895816,1.0,115764.9
std,217.618373,2021.123267,10.486673,61.809811,69.181629,48108.827763,0.0,66509.97
min,0.0,19.0,0.0,0.0,0.0,0.0,1.0,319.0
25%,0.0,1560.0,4.0,49.0,55.0,2240.269955,1.0,76921.0
50%,90.0,2519.0,9.0,82.0,91.0,13369.838032,1.0,101941.0
75%,190.0,3780.0,15.0,123.0,137.0,36431.983953,1.0,139017.0
max,9403.0,22946.0,146.0,623.0,726.0,788452.017943,1.0,1331338.0


In [20]:
# mean miles travelled per token, CONLIT
(conlit['dist_miles'] / conlit['Tokens']).mean()

0.502130632355569

In [21]:
# mean miles travelled per token, EARLY (compare with the CONLIT figure)
(early['dist_miles'] / early['Tokens']).mean()

0.24894877927328699

In [22]:
# tally every GPE mention across all EARLY rows, then show the 20 most frequent
early_gpe_counts = Counter(
    place
    for places in early.gpe_places
    for place in places
)
early_gpe_counts.most_common(20)

[('New York', 14926),
 ('London', 8615),
 ('Paris', 7636),
 ('America', 5850),
 ('England', 5622),
 ('Chicago', 4830),
 ('California', 4201),
 ('Boston', 4026),
 ('France', 3334),
 ('Washington', 3168),
 ('Rome', 3084),
 ('San Francisco', 2982),
 ('Virginia', 2133),
 ('Texas', 2102),
 ('Mexico', 2057),
 ('New Orleans', 1844),
 ('Los Angeles', 1812),
 ('Europe', 1645),
 ('Florida', 1563),
 ('Manhattan', 1348)]