In [186]:
import ast
from   ast import literal_eval
from   collections import Counter
from   geopy import distance
from   itertools import pairwise
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from   scipy.stats import permutation_test
import seaborn
import string

## Load derived data

In [36]:
# Data locations, relative to the notebook's working directory.
data_dir = os.path.join(os.pardir, 'data')
derived_dir = os.path.join(os.pardir, 'data', 'derived')

def string_to_list(x):
    '''Parse the text repr of a list (as stored in the CSV) into a list, dropping NaNs.

    Bare `nan` elements (e.g. "['Paris', nan, 'London']") are not valid literals,
    so they are removed from the parsed syntax tree before evaluation. Working on
    the tree rather than on the raw text means quoted strings that merely contain
    "nan" (", nan's house", "Hunan") are left untouched.

    Parameters:
        x: cell text, e.g. "['Paris', nan]". Blank text yields an empty list.

    Returns:
        list of the literal elements, in order, without the `nan` entries.

    Raises:
        SyntaxError / ValueError: if the text is not a valid Python literal.
    '''
    text = x.strip()
    if not text:
        # an empty CSV cell reaches a converter as '', which is not a literal
        return []
    tree = ast.parse(text, mode='eval')
    if isinstance(tree.body, (ast.List, ast.Tuple)):
        tree.body.elts = [
            element for element in tree.body.elts
            if not (isinstance(element, ast.Name) and element.id == 'nan')
        ]
    # literal_eval accepts an already-parsed expression node as well as a string
    return list(literal_eval(tree))

# Load the per-book character/place table; the list-valued columns are stored
# as text in the CSV and are parsed back into Python lists on read.
df = pd.read_csv(
    os.path.join(derived_dir, 'CONLIT_CharData_AP_3.csv.gz'),
    index_col='book_id',
    converters={
        column: string_to_list
        for column in ('gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences')
    },
)
# NOTE: a bare `df.fillna('')` used to follow here. It returned a filled copy
# that was immediately discarded, so it never changed `df`. Missing values are
# intentionally left as NaN: compare_facets() drops rows via dropna() on the
# facet column and averages the numeric columns, both of which rely on NaN.
df.head()

Unnamed: 0_level_0,X,char_id,char_count,inf_gender,gpe_places,num_gpe_places,nongpe_places,num_nongpe_places,all_places,num_all_places,...,char_count_norm,nongpe_places_cleaned,nongpe_places_total,gpe_places_total,ttr_nongpe,ttr_gpe,avg_Distance_GPE,non_gpe_ratio,avg_Distance_GPE_Tokens,deixis_count_perplace
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
[Heist Society 2] Uncommon Criminals - Ally Carter,1,138,3238,she/her,"[Paraguay, Paraguay, London, New York City, mi...",15,"[there, there, there, the museum, the museum, ...",103,"[there, there, there, Paraguay, Paraguay, the ...",118,...,0.040278,"['', '', '', 'the museum', 'the museum', 'the ...",238,32,0.432773,0.46875,1664.751474,7.4375,0.662654,0.084034
"2001_2011_Wilson,RobertCharles_TheChronoliths_SF",2,0,2742,she/her,"[Bangkok, Minneapolis / St. Paul, Chumphon, Ch...",22,"[our household, a gas station hawng nam, the c...",122,"[our household, a gas station hawng nam, the c...",143,...,0.026562,"['our household', 'a gas station hawng nam', '...",192,62,0.635417,0.354839,2895.067535,3.096774,1.738779,0.135417
"2001_Martel,Yann_LifeofPi_BS",3,0,4754,he/him/his,"[Canada, India, India, India, India, India, Ca...",15,"[the north, the rich , noisy , functioning mad...",142,"[Canada, India, the north, India, India, the r...",157,...,0.038202,"['the north', 'the rich , noisy , functioning ...",282,54,0.503546,0.277778,3695.789883,5.222222,1.603702,0.031915
"2002_2011_Anderson,MT_Feed_SF",4,0,2862,she/her,"[Lonely, School, School, Switzerland, Io, Amer...",5,"[the craters, a crummy hotel, here, here, the ...",68,"[the craters, a crummy hotel, here, here, the ...",72,...,0.042669,"['the craters', 'a crummy hotel', '', '', 'the...",143,7,0.475524,0.714286,713.296839,20.428571,0.07444,0.097902
"2002_Baker,Jo_Offcomer_CT",5,120,4723,she/her,"[Belfast, Belfast, Belfast, Belfast, Belfast, ...",11,"[the bath, the bath, the bath, it, the bath, t...",162,"[the bath, the bath, the bath, it, the bath, t...",172,...,0.053645,"['the bath', 'the bath', 'the bath', '', 'the ...",400,36,0.405,0.305556,524.871736,11.111111,0.214618,0.075


In [4]:
# List every column name; the head() preview above elides the middle columns.
df.columns

Index(['X', 'char_id', 'char_count', 'inf_gender', 'gpe_places',
       'num_gpe_places', 'nongpe_places', 'num_nongpe_places', 'all_places',
       'num_all_places', 'gpe_sequences', 'dist_miles', 'char_rank',
       'num_words', 'Category', 'Genre', 'Tokens', 'num_gpe_places_norm',
       'num_nongpe_places_norm', 'num_gpe_places_norm_byCharacter',
       'num_nongpe_places_norm_byCharacter', 'char_count_norm',
       'nongpe_places_cleaned', 'nongpe_places_total', 'gpe_places_total',
       'ttr_nongpe', 'ttr_gpe', 'avg_Distance_GPE', 'non_gpe_ratio',
       'avg_Distance_GPE_Tokens', 'deixis_count_perplace'],
      dtype='object')

In [5]:
# Number of books per Category level.
df['Category'].value_counts()

Category
FIC    1934
NON     820
Name: count, dtype: int64

In [6]:
# Number of books per Genre level.
df['Genre'].value_counts()

Genre
NYT     419
PW      258
BS      249
MY      234
MEM     229
SF      223
ROM     208
HIST    205
MIX     193
BIO     193
YA      177
MID     166
Name: count, dtype: int64

## Feature differences by facet

In [12]:
# Peek at one of the numeric feature columns.
df['nongpe_places_total']

book_id
[Heist Society 2] Uncommon Criminals - Ally Carter    238
2001_2011_Wilson,RobertCharles_TheChronoliths_SF      192
2001_Martel,Yann_LifeofPi_BS                          282
2002_2011_Anderson,MT_Feed_SF                         143
2002_Baker,Jo_Offcomer_CT                             400
                                                     ... 
Woman Behind the New Deal, The - Kirstin Downey       537
Woman of No Importance, A - Sonia Purnell             445
Woodrow Wilson - John Milton Cooper, Jr_              241
You Never Forget Your First - Alexis Coe               18
Young Stalin - Montefiore, Simon Sebag                369
Name: nongpe_places_total, Length: 2754, dtype: int64

In [15]:
# Categorical columns whose levels are each compared against all other rows.
facet_by = ['inf_gender', 'Category', 'Genre']

# Numeric feature columns to compare across facet levels
# (line breaks are for readability only; order is significant for the output tables).
cols_to_test = [
    'char_count', 'num_gpe_places', 'num_nongpe_places', 'num_all_places',
    'dist_miles', 'Tokens',
    'num_gpe_places_norm', 'num_nongpe_places_norm',
    'num_gpe_places_norm_byCharacter', 'num_nongpe_places_norm_byCharacter',
    'char_count_norm',
    'nongpe_places_total', 'gpe_places_total',
    'ttr_nongpe', 'ttr_gpe',
    'avg_Distance_GPE', 'non_gpe_ratio', 'avg_Distance_GPE_Tokens',
    'deixis_count_perplace',
]

def significance_label(pvalue, levels=(0.05, 0.01, 0.001)):
    '''Return a star label for a p-value: one '*' per threshold it falls strictly below.

    With the default thresholds: p < 0.001 -> '***', p < 0.01 -> '**',
    p < 0.05 -> '*', otherwise ''. A NaN p-value compares False against every
    threshold and therefore yields ''.

    Parameters:
        pvalue: the p-value to label.
        levels: iterable of significance thresholds. Any number of thresholds
            may be given, in any order (the default is an immutable tuple so
            it cannot be altered between calls).

    Returns:
        str of zero or more '*' characters.
    '''
    stars = sum(1 for level in levels if pvalue < level)
    return '*' * stars

def diff_means(x, y, axis):
    return np.mean(x, axis=axis) - np.mean(y, axis=axis)

def compare_facets(df, facets, cols, statistic=diff_means):
    '''Compare statistic in cols of df via permutation. Display results, return nothing.'''
    for facet in facets:
        print(f"\n==============\nFacet: {facet}\n==============")
        df = df.dropna(subset=[facet])
        for level in df[facet].unique():
            data = (df.loc[df[facet]==level, cols], df.loc[~(df[facet]==level), cols])
            level_mean = np.mean(data[0], axis=0)
            other_mean = np.mean(data[1], axis=0)
            output = pd.DataFrame(level_mean, columns=[level]).join(pd.DataFrame(other_mean, columns=['others']))
            res = permutation_test(data, statistic)
            output['diff'] = res.statistic
            output['p'] = res.pvalue
            output['sig'] = output['p'].apply(significance_label)
            display(output)

In [16]:
# Run the one-vs-rest permutation comparison for every facet level.
compare_facets(df, facets=facet_by, cols=cols_to_test, statistic=diff_means)


Facet: inf_gender


Unnamed: 0,she/her,others,diff,p,sig
char_count,3752.279775,3844.495428,-92.215652,0.394,
num_gpe_places,16.35618,21.814954,-5.458775,0.0002,***
num_nongpe_places,122.757303,123.285637,-0.528334,0.8636,
num_all_places,138.620225,144.524476,-5.904251,0.0996,
dist_miles,56148.685011,79583.242665,-23434.557654,0.0002,***
Tokens,119516.989888,131900.656267,-12383.666379,0.0002,***
num_gpe_places_norm,0.000141,0.00017,-2.9e-05,0.0002,***
num_nongpe_places_norm,0.001127,0.001048,7.9e-05,0.0012,**
num_gpe_places_norm_byCharacter,0.005224,0.009569,-0.004345,0.0002,***
num_nongpe_places_norm_byCharacter,0.036178,0.037657,-0.001479,0.04,*


Unnamed: 0,he/him/his,others,diff,p,sig
char_count,4012.726801,3475.705128,537.021673,0.0002,***
num_gpe_places,22.02536,16.663708,5.361652,0.0002,***
num_nongpe_places,127.342939,115.879684,11.463255,0.0002,***
num_all_places,148.785591,132.051282,16.734309,0.0002,***
dist_miles,80558.158976,57346.323422,23211.835554,0.0002,***
Tokens,133472.337752,118342.14497,15130.192782,0.0002,***
num_gpe_places_norm,0.000169,0.000146,2.3e-05,0.0002,***
num_nongpe_places_norm,0.001073,0.001074,-1e-06,0.9472,
num_gpe_places_norm_byCharacter,0.008396,0.007762,0.000634,0.1794,
num_nongpe_places_norm_byCharacter,0.03688,0.037687,-0.000807,0.2436,


Unnamed: 0,they/them/their,others,diff,p,sig
char_count,1490.612903,3924.422857,-2433.809954,0.0002,***
num_gpe_places,18.870968,20.103238,-1.23227,0.5624,
num_nongpe_places,66.516129,125.78819,-59.272061,0.0002,***
num_all_places,84.903226,145.339048,-60.435822,0.0002,***
dist_miles,65942.276537,72282.184946,-6339.908409,0.5872,
Tokens,109909.790323,128740.810286,-18831.019963,0.001,**
num_gpe_places_norm,0.000183,0.00016,2.4e-05,0.0876,
num_nongpe_places_norm,0.000695,0.001091,-0.000396,0.0002,***
num_gpe_places_norm_byCharacter,0.025981,0.007321,0.01866,0.0002,***
num_nongpe_places_norm_byCharacter,0.048523,0.036642,0.011881,0.0002,***



Facet: Category


Unnamed: 0,FIC,others,diff,p,sig
char_count,4206.708441,2889.110024,1317.598417,0.0002,***
num_gpe_places,13.348006,35.863081,-22.515074,0.0002,***
num_nongpe_places,123.981875,121.067237,2.914638,0.356,
num_all_places,136.91507,156.06357,-19.1485,0.0002,***
dist_miles,40961.126653,145258.731303,-104297.60465,0.0002,***
Tokens,121689.747281,142531.221271,-20841.47399,0.0002,***
num_gpe_places_norm,0.000117,0.000263,-0.000146,0.0002,***
num_nongpe_places_norm,0.001116,0.000972,0.000144,0.0002,***
num_gpe_places_norm_byCharacter,0.003801,0.018457,-0.014656,0.0002,***
num_nongpe_places_norm_byCharacter,0.033272,0.046399,-0.013127,0.0002,***


Unnamed: 0,NON,others,diff,p,sig
char_count,2889.110024,4206.708441,-1317.598417,0.0002,***
num_gpe_places,35.863081,13.348006,22.515074,0.0002,***
num_nongpe_places,121.067237,123.981875,-2.914638,0.3506,
num_all_places,156.06357,136.91507,19.1485,0.0002,***
dist_miles,145258.731303,40961.126653,104297.60465,0.0002,***
Tokens,142531.221271,121689.747281,20841.47399,0.0002,***
num_gpe_places_norm,0.000263,0.000117,0.000146,0.0002,***
num_nongpe_places_norm,0.000972,0.001116,-0.000144,0.0002,***
num_gpe_places_norm_byCharacter,0.018457,0.003801,0.014656,0.0002,***
num_nongpe_places_norm_byCharacter,0.046399,0.033272,0.013127,0.0002,***



Facet: Genre


Unnamed: 0,YA,others,diff,p,sig
char_count,5664.129944,3687.361975,1976.767968,0.0002,***
num_gpe_places,11.519774,20.634526,-9.114752,0.0002,***
num_nongpe_places,130.824859,122.583981,8.240877,0.1582,
num_all_places,141.903955,142.661742,-0.757787,0.9398,
dist_miles,31568.513293,74778.3635,-43209.850206,0.0002,***
Tokens,106057.067797,129393.989114,-23336.921317,0.0002,***
num_gpe_places_norm,0.000113,0.000164,-5.1e-05,0.0002,***
num_nongpe_places_norm,0.001257,0.001061,0.000196,0.0004,***
num_gpe_places_norm_byCharacter,0.002359,0.008562,-0.006202,0.0002,***
num_nongpe_places_norm_byCharacter,0.024377,0.038059,-0.013682,0.0002,***


Unnamed: 0,SF,others,diff,p,sig
char_count,3852.243243,3811.336763,40.90648,0.8078,
num_gpe_places,12.747748,20.688959,-7.941211,0.0002,***
num_nongpe_places,116.711712,123.677087,-6.965376,0.173,
num_all_places,128.842342,143.822715,-14.980372,0.0134,*
dist_miles,34477.256623,75292.293947,-40815.037324,0.0002,***
Tokens,144376.954955,126443.117135,17933.83782,0.0006,***
num_gpe_places_norm,9.8e-05,0.000166,-6.8e-05,0.0002,***
num_nongpe_places_norm,0.000902,0.001088,-0.000186,0.0002,***
num_gpe_places_norm_byCharacter,0.003721,0.008553,-0.004831,0.0002,***
num_nongpe_places_norm_byCharacter,0.031815,0.037649,-0.005835,0.0002,***


Unnamed: 0,BS,others,diff,p,sig
char_count,4729.730924,3723.4972,1006.233724,0.0002,***
num_gpe_places,17.694779,20.282,-2.587221,0.0622,
num_nongpe_places,143.425703,121.0916,22.334103,0.0002,***
num_all_places,160.618474,140.8196,19.798874,0.0014,**
dist_miles,53612.079193,73827.268022,-20215.188828,0.0044,**
Tokens,155119.140562,125179.51,29939.630562,0.0002,***
num_gpe_places_norm,0.000126,0.000164,-3.8e-05,0.0002,***
num_nongpe_places_norm,0.001017,0.001079,-6.2e-05,0.1234,
num_gpe_places_norm_byCharacter,0.004617,0.008516,-0.003899,0.0002,***
num_nongpe_places_norm_byCharacter,0.034983,0.037397,-0.002413,0.0364,*


Unnamed: 0,NYT,others,diff,p,sig
char_count,3719.703349,3831.664522,-111.961172,0.4306,
num_gpe_places,16.351675,20.710425,-4.35875,0.0002,***
num_nongpe_places,129.624402,121.947233,7.677169,0.058,
num_all_places,145.58134,142.080652,3.500688,0.4744,
dist_miles,55524.251111,74949.996057,-19425.744946,0.0004,***
Tokens,118268.789474,129616.93994,-11348.150466,0.0012,**
num_gpe_places_norm,0.00015,0.000163,-1.3e-05,0.0882,
num_nongpe_places_norm,0.001194,0.001052,0.000142,0.0002,***
num_gpe_places_norm_byCharacter,0.004899,0.008748,-0.003848,0.0002,***
num_nongpe_places_norm_byCharacter,0.037954,0.037039,0.000915,0.3444,


Unnamed: 0,MY,others,diff,p,sig
char_count,4099.149573,3788.168986,310.980587,0.0928,
num_gpe_places,13.286325,20.67674,-7.390415,0.0002,***
num_nongpe_places,132.316239,122.258449,10.05779,0.0506,
num_all_places,145.226496,142.369781,2.856714,0.6188,
dist_miles,35709.909859,75372.34945,-39662.439591,0.0002,***
Tokens,124630.961538,128194.749901,-3563.788362,0.4406,
num_gpe_places_norm,0.000109,0.000166,-5.7e-05,0.0002,***
num_nongpe_places_norm,0.001113,0.00107,4.4e-05,0.2878,
num_gpe_places_norm_byCharacter,0.003733,0.008574,-0.004841,0.0002,***
num_nongpe_places_norm_byCharacter,0.03418,0.037457,-0.003277,0.0064,**


Unnamed: 0,ROM,others,diff,p,sig
char_count,5970.480769,3638.168438,2332.312332,0.0002,***
num_gpe_places,7.625,21.064542,-13.439542,0.0002,***
num_nongpe_places,108.990385,124.27076,-15.280375,0.002,**
num_all_places,116.403846,144.758363,-28.354517,0.0002,***
dist_miles,23115.93848,75997.427221,-52881.488741,0.0002,***
Tokens,103573.639423,129881.985045,-26308.345622,0.0002,***
num_gpe_places_norm,7.5e-05,0.000168,-9.3e-05,0.0002,***
num_nongpe_places_norm,0.001099,0.001071,2.8e-05,0.5262,
num_gpe_places_norm_byCharacter,0.001397,0.008716,-0.007319,0.0002,***
num_nongpe_places_norm_byCharacter,0.019871,0.038595,-0.018724,0.0002,***


Unnamed: 0,HIST,others,diff,p,sig
char_count,987.385366,4042.465409,-3055.080043,0.0002,***
num_gpe_places,20.839024,19.983884,0.855141,0.5758,
num_nongpe_places,48.063415,129.162343,-81.098928,0.0002,***
num_all_places,68.556098,148.580582,-80.024484,0.0002,***
dist_miles,68072.449642,72312.392137,-4239.942495,0.6292,
Tokens,148252.897561,126250.627752,22002.269809,0.0002,***
num_gpe_places_norm,0.000152,0.000161,-9e-06,0.3808,
num_nongpe_places_norm,0.000349,0.001132,-0.000783,0.0002,***
num_gpe_places_norm_byCharacter,0.030387,0.006371,0.024016,0.0002,***
num_nongpe_places_norm_byCharacter,0.051873,0.035994,0.015879,0.0002,***


Unnamed: 0,PW,others,diff,p,sig
char_count,3177.618677,3880.336276,-702.717599,0.0002,***
num_gpe_places,14.801556,20.588684,-5.787127,0.0002,***
num_nongpe_places,121.428016,123.288523,-1.860508,0.703,
num_all_places,135.782101,143.317416,-7.535315,0.1788,
dist_miles,52226.899827,74035.017864,-21808.118037,0.0016,**
Tokens,122261.315175,128472.023676,-6210.708501,0.147,
num_gpe_places_norm,0.000134,0.000163,-2.9e-05,0.001,**
num_nongpe_places_norm,0.001108,0.00107,3.8e-05,0.3422,
num_gpe_places_norm_byCharacter,0.005194,0.008469,-0.003275,0.0002,***
num_nongpe_places_norm_byCharacter,0.042319,0.036648,0.005672,0.0002,***


Unnamed: 0,MIX,others,diff,p,sig
char_count,1869.167539,3959.904222,-2090.736683,0.0002,***
num_gpe_places,25.450262,19.644253,5.806008,0.0012,**
num_nongpe_places,84.507853,125.997263,-41.48941,0.0002,***
num_all_places,109.403141,145.092651,-35.689509,0.0002,***
dist_miles,103312.422699,69657.898764,33654.523935,0.0012,**
Tokens,126088.335079,128026.023847,-1937.688768,0.7044,
num_gpe_places_norm,0.000204,0.000157,4.7e-05,0.0002,***
num_nongpe_places_norm,0.000745,0.001098,-0.000352,0.0002,***
num_gpe_places_norm_byCharacter,0.018654,0.007379,0.011275,0.0002,***
num_nongpe_places_norm_byCharacter,0.046469,0.036484,0.009985,0.0002,***


Unnamed: 0,MEM,others,diff,p,sig
char_count,4260.253275,3774.146032,486.107243,0.012,*
num_gpe_places,37.877729,18.427381,19.450348,0.0002,***
num_nongpe_places,170.908297,118.771429,52.136868,0.0002,***
num_all_places,207.737991,136.694841,71.04315,0.0002,***
dist_miles,159239.594297,64068.139159,95171.455138,0.0002,***
Tokens,105611.161572,129916.065476,-24304.903904,0.0002,***
num_gpe_places_norm,0.000355,0.000143,0.000212,0.0002,***
num_nongpe_places_norm,0.001666,0.001019,0.000646,0.0002,***
num_gpe_places_norm_byCharacter,0.009739,0.008019,0.00172,0.049,*
num_nongpe_places_norm_byCharacter,0.041595,0.036777,0.004819,0.0002,***


Unnamed: 0,MID,others,diff,p,sig
char_count,3103.355422,3860.351916,-756.996495,0.0002,***
num_gpe_places,7.024096,20.88463,-13.860534,0.0002,***
num_nongpe_places,94.024096,124.984127,-30.960031,0.0002,***
num_all_places,100.753012,145.303136,-44.550124,0.0002,***
dist_miles,16320.877678,75574.259419,-59253.381741,0.0002,***
Tokens,84156.487952,130702.076655,-46545.588703,0.0002,***
num_gpe_places_norm,8.9e-05,0.000165,-7.6e-05,0.0002,***
num_nongpe_places_norm,0.001243,0.001062,0.000181,0.0002,***
num_gpe_places_norm_byCharacter,0.002412,0.008532,-0.00612,0.0002,***
num_nongpe_places_norm_byCharacter,0.031853,0.03752,-0.005667,0.0002,***


Unnamed: 0,BIO,others,diff,p,sig
char_count,4291.549223,3778.629499,512.919724,0.015,*
num_gpe_places,59.735751,17.050861,42.684891,0.0002,***
num_nongpe_places,175.65285,119.147496,56.505354,0.0002,***
num_all_places,233.875648,135.721831,98.153817,0.0002,***
dist_miles,252167.099482,58391.755702,193775.34378,0.0002,***
Tokens,196532.989637,122708.362285,73824.627352,0.0002,***
num_gpe_places_norm,0.000331,0.000148,0.000184,0.0002,***
num_nongpe_places_norm,0.001036,0.001076,-4e-05,0.3754,
num_gpe_places_norm_byCharacter,0.015934,0.007576,0.008359,0.0002,***
num_nongpe_places_norm_byCharacter,0.046214,0.036496,0.009718,0.0002,***


## GPE distances

In [39]:
# Names of the two GPE list columns (both are parsed into lists by string_to_list at load time).
gpe_cols = ['gpe_places', 'gpe_sequences']

In [69]:
# Tally every GPE mention across all books, then show the 20 most frequent.
gpe_counts = Counter(
    place
    for book_places in df.gpe_places
    for place in book_places
)
gpe_counts.most_common(20)

[('New York', 6683),
 ('London', 4367),
 ('Paris', 3673),
 ('America', 3477),
 ('Washington', 2412),
 ('England', 2087),
 ('California', 2064),
 ('Chicago', 2000),
 ('Boston', 1624),
 ('France', 1560),
 ('Rome', 1304),
 ('San Francisco', 1247),
 ('Los Angeles', 1247),
 ('Texas', 1047),
 ('New York City', 1046),
 ('Europe', 952),
 ('Berlin', 938),
 ('Manhattan', 908),
 ('Philadelphia', 862),
 ('Virginia', 854)]

### Geonames

It **looks** like `gpe_places` is the observed sequence of GPEs in the source text, while `gpe_sequences` de-dupes (naïvely) immediate repetitions (but not repetitions with an intervening place). **We should confirm this with Sandeep if we're not sure.** 

In [211]:
# Column names applied to the GeoNames dump on read (passed via `names=`).
gn_col_names = [
    'geonameid', 'name', 'asciiname', 'alternatenames',
    'lat', 'lon',
    'feature_class', 'feature_code',
    'country_code', 'cc2',
    'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
    'population', 'elevation', 'dem',
    'timezone', 'mod_date',
]

gn = (
    pd.read_csv(
        os.path.join(data_dir, 'geonames', 'allCountries.zip'),
        sep='\t',
        names=gn_col_names,
        index_col=0,
        low_memory=False,
    )
    # retain only most-populous place per unique name
    .sort_values(by='population', ascending=False)
    .drop_duplicates(subset=['name'])
    .set_index('name')
)

In [41]:
# GeoNames parent/child relations between places (tab-separated).
hierarchy = pd.read_table(
    os.path.join(data_dir, 'geonames', 'hierarchy.zip'),
    names=['parent', 'child', 'type'],
)

# GeoNames feature-code lookup table (tab-separated).
feature_codes = pd.read_table(
    os.path.join(data_dir, 'geonames', 'featureCodes_en.txt'),
    names=['feature_code', 'feature_name', 'feature_description'],
)

### Wilkens geo data

In [209]:
# Keep only rows whose `lang` is 'en', indexed by their `text_string`.
wi = (
    pd.read_csv(
        os.path.join('..', '..', 'toponyms', 'geo.tsv.gz'),
        sep='\t',
        low_memory=False,
    )
    .loc[lambda frame: frame.lang == 'en']
    .set_index('text_string')
)

### Sequences, naively

For each volume sequence, look up each place, get lat/lon, calculate distance from previous place, sum over sequential path.

Issues:

* Allows the Boston -> USA distance
* Lookup on `name` field in GeoNames is brittle

Should match Sandeep's distance calculations.

In [206]:
# Translation table that turns every ASCII punctuation character into a space.
punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))

def regularize_string(place_string):
    '''Normalize a place name: punctuation -> spaces, lowercase, collapse whitespace.'''
    without_punctuation = place_string.translate(punctuation_to_space)
    tokens = without_punctuation.lower().split()
    return ' '.join(tokens)

def get_lat_lon(place_string, geo_data, source='geonames'):
    '''Look up a place in geo_data and return its coordinates.

    Assumes lookup string has been regularized if necessary.

    Parameters:
        place_string: label to look up in geo_data's index.
        geo_data: DataFrame indexed by place name, with 'lat' and 'lon' columns.
        source: accepted so that callers passing `source=` keep working; the
            lookup itself does not depend on it.

    Returns:
        (lat, lon) tuple of floats, or None if the place is not in the index
        or either coordinate is missing. If the index holds several rows for
        the label, the first one is used.
    '''
    # Test membership explicitly rather than catching KeyError from .loc, so
    # that a missing 'lat'/'lon' column still surfaces as an error instead of
    # silently turning every lookup into None.
    if place_string not in geo_data.index:
        return None
    match = geo_data.loc[place_string, ['lat', 'lon']]
    if isinstance(match, pd.DataFrame):
        # non-unique index label: .loc returns one row per duplicate
        match = match.iloc[0]
    lat, lon = match['lat'], match['lon']
    if pd.isna(lat) or pd.isna(lon):
        return None
    return (float(lat), float(lon))
                   
def sequence_distance(sequence, source='geonames'):
    # set data, regularize strings, and remove unknown locations
    if source=='geonames':
        geo_data = gn
        seq = [i for i in sequence if i in geo_data.Index]
    if source=='wilkens':
        geo_data = wi
        seq = [regularize_string(i) for i in sequence if regularize_string(i) in valid_names]
    total_distance = 0.0
    for location_pair in pairwise(seq):
        loc1 = get_lat_lon(location_pair[0], geo_data, source=source)
        loc2 = get_lat_lon(location_pair[1], geo_data, source=source)
        if loc1 is None or loc2 is None:
            pass
        else:
            total_distance += distance.distance(loc1, loc2).miles
    return(total_distance)

In [None]:
# Scratch check of label -> position lookup on the name index.
# ('London' is only an example key; the attribute is `.index`, and get_loc needs a label.)
gn.index.get_loc('London')

In [207]:
# Path length for the first five books, using GeoNames coordinates.
df['gpe_sequences'].head().apply(sequence_distance, source='geonames')

book_id
[Heist Society 2] Uncommon Criminals - Ally Carter     62668.457945
2001_2011_Wilson,RobertCharles_TheChronoliths_SF      148330.228581
2001_Martel,Yann_LifeofPi_BS                          143346.329971
2002_2011_Anderson,MT_Feed_SF                          15291.137887
2002_Baker,Jo_Offcomer_CT                              23299.077240
Name: gpe_sequences, dtype: float64

In [197]:
# Path length for the first five books, using the Wilkens geo data.
df['gpe_sequences'].head().apply(sequence_distance, source='wilkens')

book_id
[Heist Society 2] Uncommon Criminals - Ally Carter     53272.047166
2001_2011_Wilson,RobertCharles_TheChronoliths_SF      179494.187170
2001_Martel,Yann_LifeofPi_BS                          199572.653692
2002_2011_Anderson,MT_Feed_SF                           4993.077876
2002_Baker,Jo_Offcomer_CT                              18895.382501
Name: gpe_sequences, dtype: float64

In [136]:
# Distances shipped with the derived data, for comparison with the values computed above.
df.loc[:, 'dist_miles'].head()

book_id
[Heist Society 2] Uncommon Criminals - Ally Carter     53272.047166
2001_2011_Wilson,RobertCharles_TheChronoliths_SF      179494.187170
2001_Martel,Yann_LifeofPi_BS                          199572.653692
2002_2011_Anderson,MT_Feed_SF                           4993.077876
2002_Baker,Jo_Offcomer_CT                              18895.382501
Name: dist_miles, dtype: float64

To do:

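* Duplicate removal for GeoNames: for all duplicates in `name`, retain only the one with the largest population. This will speed up the lookups a lot. (The cell that loads `gn` above already does this with `sort_values` + `drop_duplicates`, so this item looks done — confirm and remove.)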

### Hop mobility