# Data prep and integration

Load and integrate the CONLIT and EARLY data and metadata for use in the full analysis.

In [1]:
# imports and setup
from   ast import literal_eval
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from   scipy.stats import permutation_test
import seaborn as sns
from   sklearn.preprocessing import LabelBinarizer
import string
import warnings

# Directory layout: derived tables and metadata both sit under ../data
data_dir = os.path.join('..', 'data')
derived_dir, metadata_dir = (
    os.path.join(data_dir, subdir) for subdir in ('derived', 'metadata')
)

# inputs (all read from derived_dir below)
conlit_file          = 'CONLIT_CharData_AP_MW_11.csv.gz'
conlit_distance_file = 'CONLIT_CharData_distances_10.csv.gz'
early_file_wi        = 'EARLY_CharData_distances_10.csv.gz'
early_file_ap        = 'EARLY_CharData_AP_MW_11.csv.gz'

# outputs (written to derived_dir)
# NOTE(review): the CONLIT output name is disabled here, but the final dump cell
# still looks for `conlit_out_file` -- confirm whether CONLIT should be written.
#conlit_out_file = 'CONLIT_CharData_AP_MW_10.csv.gz'
early_out_file = 'EARLY_CharData_AP_MW_11_with_author_title.csv.gz'

## Data

In [2]:
def string_to_list(x):
    '''Parse the string form of a Python list, dropping missing (`nan`) entries.

    Used as a `pd.read_csv` converter for the list-valued place columns.

    The previous implementation rewrote `nan` with string replacements, which
    produced adjacent string literals (`'a'', ZZZZ'` evaluates to `'a, ZZZZ'`),
    raised SyntaxError for a leading `nan`, and corrupted any element whose
    text contained `, nan`. Here the text is parsed once and bare `nan` names
    are skipped at the syntax-tree level, so quoted text is never touched.

    Parameters
    ----------
    x : str
        Cell text such as "['Paris', nan, 'Berlin']".

    Returns
    -------
    list
        The parsed elements without `nan` entries. An empty or
        whitespace-only cell yields an empty list.

    Raises
    ------
    SyntaxError, ValueError
        If the text is not a valid Python literal.
    '''
    import ast  # function-scope import: the notebook only imports `literal_eval` from ast

    text = x.strip()
    if not text:
        return []

    parsed = ast.parse(text, mode='eval').body
    if isinstance(parsed, (ast.List, ast.Tuple)):
        values = [
            ast.literal_eval(element)
            for element in parsed.elts
            # an unquoted nan parses as a Name node, which literal_eval rejects
            if not (isinstance(element, ast.Name) and element.id == 'nan')
        ]
    else:
        # not a list display: evaluate as-is, exactly as literal_eval would
        values = ast.literal_eval(parsed)

    # 'ZZZZ' was the placeholder the old implementation filtered out; keep
    # dropping it so input without nan gives the same result as before.
    return [i for i in values if i != 'ZZZZ']

def index_int_string(idx):
    '''Left-pad integer-like labels with zeros to eight characters.

    Labels that `int()` rejects with ValueError are returned unchanged.
    '''
    try:
        int(idx)
    except ValueError:
        # not an integer label: leave it as it is
        return idx
    else:
        padded = str(idx).rjust(8, '0')
        return padded
def source_mapper(label):
    '''Map a label to a corpus name based on its prefix.

    Returns 'eaf' for labels starting with 'eaf', 'wright' for labels starting
    with 'Wright', and 'chicago' for everything else.
    '''
    prefix_to_corpus = (('eaf', 'eaf'), ('Wright', 'wright'))
    for prefix, corpus in prefix_to_corpus:
        if label.startswith(prefix):
            return corpus
    return 'chicago'

# read CONLIT
# The place columns are stored as stringified lists; parse them back on read.
conlit_list_columns = ['gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences']
conlit_path = os.path.join(derived_dir, conlit_file)
conlit = pd.read_csv(
    conlit_path,
    index_col='book_id',
    converters=dict.fromkeys(conlit_list_columns, string_to_list),
)
conlit['source'] = 'conlit'

# distance table, indexed by book_id like the main table
conlit_distances_path = os.path.join(derived_dir, conlit_distance_file)
conlit_distances = pd.read_csv(conlit_distances_path, index_col='book_id')

In [3]:
# Have the base distance calculations changed in CONLIT?
# Join on book_id first so the comparison is index-aligned. Comparing the two
# frames' columns positionally raises when their lengths differ and reports
# spurious differences when their row order differs.
exam = conlit.join(conlit_distances, lsuffix='_ap')
# equal_nan=True: a value missing on both sides is not a changed calculation
conlit_dist_changed = ~np.isclose(exam.dist_miles_ap, exam.dist_miles, equal_nan=True)
if conlit_dist_changed.any():
    display(exam.loc[conlit_dist_changed, ['dist_miles', 'dist_miles_ap', 'gpe_sequences']])

Unnamed: 0_level_0,dist_miles,dist_miles_ap,gpe_sequences
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"2002_Sebold,Alice_LovelyBones_BS",16021.97637,11692.015613,"[Baltic, Boardwalk, England, New Hampshire, De..."
"2013_MacMillan,Margaret_TheWarthatEndedPeace_HIST",118571.830892,126475.273084,"[Germany, Berlin, Paris, Berlin, Bavaria, Prus..."


In [3]:
# read EARLY
# Both EARLY tables store these columns as stringified lists.
early_list_columns = ['gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences']

if early_file_wi:
    early_wi_path = os.path.join(derived_dir, early_file_wi)
    early_wi = pd.read_csv(
        early_wi_path,
        index_col='book_id',
        converters=dict.fromkeys(early_list_columns, string_to_list),
    )
    # zero-pad integer-like book_ids so the labels have one consistent form
    early_wi.index = early_wi.index.to_series().apply(index_int_string)
    # set source corpus for EARLY data
    #early_wi['source'] = early_wi.index.to_series().apply(source_mapper)
    # keep only the first row for any repeated book_id
    repeated_ids = early_wi.index.duplicated(keep='first')
    early_wi = early_wi.loc[~repeated_ids]

if early_file_ap:
    early_ap_path = os.path.join(derived_dir, early_file_ap)
    early_ap = pd.read_csv(
        early_ap_path,
        index_col='book_id',
        converters=dict.fromkeys(early_list_columns, string_to_list),
    )
    # NOTE(review): index normalization is applied to early_wi above but is
    # disabled here -- confirm the book_id labels of the two tables already match.
    #early_ap.index = early.index.to_series().apply(index_int_string)
    # set source corpus for EARLY data
    #early['source'] = early.index.to_series().apply(source_mapper)

In [4]:
# check for distance differences in early
df = early_ap.join(early_wi, lsuffix='_ap')
# Use one tolerance-based mask for both the guard and the row filter. The old
# guard used exact equality while the filter used np.isclose, so float noise
# alone could trigger the display of an empty table. equal_nan=True keeps the
# old guard's treatment of values missing on both sides as equal.
early_dist_changed = ~np.isclose(df.dist_miles_ap, df.dist_miles, equal_nan=True)
if early_dist_changed.any():
    display(df.loc[early_dist_changed, ['dist_miles', 'dist_miles_ap', 'gpe_sequences']])

Unnamed: 0_level_0,dist_miles,dist_miles_ap,gpe_sequences
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11772,59292.095962,66988.406703,"[Central America, Upper Deal, Paris, Lydia, Sw..."
25357,32457.774767,32602.173637,"[Brasenose, England, Oxford, Oxfordshire, Eves..."


## Metadata

In [6]:
# # EAF and Wright
# eaf = pd.read_csv(
#     os.path.join(data_dir, 'metadata', 'eaf-wright-metadata.tsv'), 
#     sep='\t', 
#     index_col='source_id'
# )
# eaf.index.rename('book_id', inplace=True)
# eaf.rename(columns={'gender':'author_gender'}, inplace=True)

# # Chicago
# chi = pd.read_csv(
#     os.path.join(metadata_dir, 'chicago-books.csv'),
#     index_col='BOOK_ID'
# )
# chi.index.rename('book_id', inplace=True)
# chi.index = chi.index.to_series().apply(index_int_string)
# chi.columns = [i.lower() for i in chi.columns]

# chi_auth = pd.read_csv(
#     os.path.join(metadata_dir, 'chicago-authors.csv'),
# )
# chi_auth.columns = [i.lower() for i in chi_auth.columns]

# chi_idx = chi.index
# chi = chi.merge(chi_auth[['auth_id', 'gender']], how='left', on='auth_id').set_index(chi_idx)
# chi.rename(columns={'gender':'author_gender', 'publ_date':'pub_date'}, inplace=True)
# chi['author'] = chi[['auth_last', 'auth_first']].agg(', '.join, axis=1)

# # CONLIT
# con = pd.read_csv(
#     os.path.join(metadata_dir, 'CONLIT_META.csv'),
#     index_col='ID'
# )
# con.index.rename('book_id', inplace=True)
# con.index = con.index.to_series().apply(lambda x: x[:-4]) #delete '.txt'
# # fix up indexing errors
# con.rename(
#     index={
#         '2009_LaFleur,Suzanne_Love,Aubrey_MGtxt':'2009_LaFleur,Suzanne_Love,Aubrey_M', 
#         '2015_Jackson,AL_ComeToMeRecklessly_ROM.txt':'2015_Jackson,AL_ComeToMeRecklessly_ROM'
#     }, \
#     inplace=True
# )
# con.columns = [i.lower() for i in con.columns]
# con.rename(columns={'pubdate':'pub_date', 'work_title':'title'}, inplace=True)
# con['author'] = con[['author_last', 'author_first']].fillna('').astype(str).agg(', '.join, axis=1)

# # integrate target columns
# cols = ['author', 'title', 'pub_date', 'author_gender']
# meta = pd.concat([eaf[cols], chi[cols], con[cols]], axis=0)

## Integrate

In [7]:
# conlit = conlit.join(meta[['pub_date', 'author_gender']])
# early_wi = early_wi.join(meta[['pub_date', 'author_gender']])

# # set canonical values where needed
# early_wi['Category'] = 'FIC'
# early_wi.loc[(early_wi.source=='chicago') & (early_wi.pub_date<=1945), ['source']] = 'chicago_1'
# early_wi.loc[(early_wi.source=='chicago') & (early_wi.pub_date>1945), ['source']] = 'chicago_2'

In [8]:
#early = early_ap.join(early_wi[['Start_Finish_Miles', 'Start_Finish_Z', 'Category', 'author_gender']])
distance_columns = ['dist_miles', 'Start_Finish_Miles', 'Start_Finish_Z']

# `early` keeps early_ap as loaded; early_ap then takes the three distance
# columns from early_wi.
early = early_ap.copy()
early_ap[distance_columns] = early_wi[distance_columns]

# Same for CONLIT: stash the loaded values, then take the columns from
# conlit_distances.
conlit_stash = conlit.copy()
conlit[distance_columns] = conlit_distances[distance_columns]

In [9]:
# rows of `early` whose dist_miles is not close to the value now in early_ap
early_dist_differs = ~np.isclose(early.dist_miles, early_ap.dist_miles)
early.loc[early_dist_differs]

Unnamed: 0_level_0,pub_date,collection,char_id,char_count,inf_gender,gpe_places,num_gpe_places,nongpe_places,num_nongpe_places,all_places,...,non_gpe_total_rank,dist_miles_rank,deixis_count_perplace,Start_Finish_Miles,Start_Finish_Z,Category,author_gender,semantic_dist_total,semantic_dist_mean,first_last_SemanticDist
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11772,1945,chicago_1,298,4392,he/him/his,"[Central America, Upper Deal, Paris, Lydia, Sw...",30,"[his bath, his bath, his bath, there, the new ...",113,"[his bath, his bath, his bath, Central America...",...,7852,9350,0.011152,5936.531092,1.73248,FIC,M,99.40172,0.633132,
25357,1992,chicago_2,400,5515,she/her,"[Brasenose, Brasenose, England, Oxford, Oxford...",14,"[there, there, Infirmary, Infirmary, Infirmary...",135,"[there, there, Brasenose, Brasenose, Infirmary...",...,5097,5235,0.129288,310.926865,1.905819,FIC,F,148.064025,0.611835,0.166


In [10]:
# rows of `conlit` whose dist_miles is not close to the stashed value
conlit_dist_differs = ~np.isclose(conlit.dist_miles, conlit_stash.dist_miles)
conlit.loc[conlit_dist_differs]

Unnamed: 0_level_0,char_id,char_count,inf_gender,gpe_places,num_gpe_places,nongpe_places,num_nongpe_places,all_places,num_all_places,gpe_sequences,...,dist_miles_allChars_norm_Tokens,num_gpe_places_allChars_norm_Tokens,non_gpe_total_rank,dist_miles_rank,source,pub_date,author_gender,Start_Finish_Miles,Start_Finish_Z,first_last_SemanticDist
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"2002_Sebold,Alice_LovelyBones_BS",0,3762,he/him/his,"[Baltic, Boardwalk, England, New Hampshire, De...",12,"[the classroom, the classroom, my memorial, th...",213,"[the classroom, the classroom, my memorial, th...",225,"[Baltic, Boardwalk, England, New Hampshire, De...",...,0.448292,0.000405,2489,772,conlit,2002,F,3943.226913,1.407251,0.589
"2013_MacMillan,Margaret_TheWarthatEndedPeace_HIST",996,1188,he/him/his,"[Germany, Germany, Germany, Germany, Germany, ...",36,"[his household, the North Sea, the North Sea, ...",36,"[Germany, Germany, Germany, Germany, Germany, ...",71,"[Germany, Berlin, Paris, Berlin, Bavaria, Prus...",...,1.455149,0.000445,130,1805,conlit,2013,F,0.0,-1.26889,0.59


In [6]:
# Inspect early_ap; as the cell's last expression it is rendered as the output.
early_ap

Unnamed: 0_level_0,pub_date,collection,char_id,char_count,inf_gender,gpe_places,num_gpe_places,nongpe_places,num_nongpe_places,all_places,...,non_gpe_total_rank,dist_miles_rank,deixis_count_perplace,Start_Finish_Miles,Start_Finish_Z,Category,author_gender,semantic_dist_total,semantic_dist_mean,first_last_SemanticDist
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000001,1880,chicago_1,125,1563,she/her,"[New York, New York, New York, New York, New Y...",8,"[the brown stone houses they lived in, the ric...",30,"[New York, New York, New York, New York, the b...",...,2373,3414,0.042857,203.756288,-1.095404,FIC,M,13.928088,0.397945,0.328
00000003,1880,chicago_1,81,1534,he/him/his,"[Stillwater, Stillwater, Stillwater, Stillwate...",3,[the crowded family tomb behind the South Chur...,74,"[Stillwater, Stillwater, Stillwater, the crowd...",...,6523,1797,0.076923,117.221770,,FIC,M,37.172396,0.482758,0.520
00000013,1880,chicago_1,62,594,she/her,[],0,"[here, the lawn, here, here, here, here, here,...",22,"[here, the lawn, here, here, here, here, here,...",...,3314,1,0.326087,,,FIC,F,11.574699,0.413382,
00000015,1880,chicago_1,307,1358,he/him/his,"[Louisiana, Louisiana, Louisiana, Louisiana, L...",2,"[the room, the room, the room, here, here, The...",42,"[Louisiana, Louisiana, Louisiana, the room, th...",...,769,1,0.028571,,,FIC,M,17.531069,0.407699,0.000
00000021,1880,chicago_1,311,2242,he/him/his,"[Philadelphia, Indiana, Yerbury, Yerbury, Orie...",6,"[the academy, the academy, the walk, the walk,...",66,"[the academy, the academy, the walk, the walk,...",...,1651,6285,0.155340,82.962585,-1.455046,FIC,F,35.752619,0.558635,0.772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wright2-2801,1870,wright,13,1439,she/her,[],0,"[the old Union Theatre, the neighborhood of th...",40,"[the old Union Theatre, the neighborhood of th...",...,5595,1,0.060870,,,FIC,M,23.676403,0.446725,
Wright2-2802,1864,wright,0,1441,he/him/his,"[Atlanta, Georgia, Georgia, Atlanta, St. Augus...",4,"[the inner temple, the Sunny South, the Sunny ...",44,"[the inner temple, the Sunny South, the Sunny ...",...,4974,2284,0.068966,686.019090,,FIC,M,22.332137,0.429464,0.461
Wright2-2803,1870,wright,103,1464,he/him/his,"[Northville, Northville, Northville, Munich, M...",2,"[here, the lounge where Philip Lester lay, thi...",62,"[here, the lounge where Philip Lester lay, thi...",...,2991,4138,0.145455,,,FIC,F,38.182576,0.561508,0.000
Wright2-2804,1871,wright,0,1357,he/him/his,"[New York, New York, New York, New York, New Y...",3,"[my house, my house, home, home, here, my gard...",32,"[my house, my house, home, home, here, my gard...",...,2339,4249,0.016667,5728.554425,2.661596,FIC,F,15.212632,0.447430,1.000


In [9]:
# add author and title fields to early file
# EARLY_META.tsv is tab-separated and indexed by book_id
early_meta_path = os.path.join(metadata_dir, 'EARLY_META.tsv')
early_meta = pd.read_csv(
    early_meta_path,
    sep='\t',
    index_col='book_id',
)
early_meta

Unnamed: 0_level_0,source,author,title,pub_date,author_gender
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00010412,chicago_1,"Nathan, Robert",One more spring,1933,M
00011648,chicago_1,"Stewart, Alfred Walter",The four defences,1940,M
00010576,chicago_1,"Dixon, Thomas",The flaming sword,1939,M
00011480,chicago_1,"Ambler, Eric",The dark frontier,1936,M
Wright2-0416,wright,"Bunce, Oliver Bell",A bachelor's story,1859,M
...,...,...,...,...,...
00022574,chicago_2,"Saberhagen, Fred",Woundhealer's story,1986,M
00020441,chicago_2,"Caldwell, Taylor",A prologue to love,1961,F
00022410,chicago_2,"Pineda, Cecile",Face,1985,F
00003825,chicago_1,"Day, Holman",The landloper,1915,M


In [16]:
# Put author and title first, then attach the early_ap columns by book_id.
# NOTE(review): DataFrame.join defaults to a left join, so the result follows
# early_meta's rows -- confirm that books absent from EARLY_META should be left out.
bib_fields = early_meta[['author', 'title']]
early_with_bib = bib_fields.join(early_ap)

## Dump integrated data to disk

In [11]:
# # just early metadata
# early_wi[['source']].join(meta).to_csv(os.path.join(metadata_dir, 'EARLY_META.tsv'), sep='\t')

In [18]:
# just early with bib data
early_bib_path = os.path.join(derived_dir, early_out_file)
early_with_bib.to_csv(early_bib_path)

In [12]:
# full CONLIT
#
# EARLY is deliberately not rewritten here. `early_out_file` already receives
# `early_with_bib` in the cell above, and `early` is the copy of early_ap taken
# before its distance columns were replaced. Dumping `early` to the same path
# would overwrite the author/title file with a frame that lacks those fields
# and carries the superseded distance values.
#
# `conlit_out_file` is commented out in the setup cell, so referencing it
# directly raises NameError. Look it up defensively and skip the write, with a
# warning, when no CONLIT output name has been configured.
conlit_out_name = globals().get('conlit_out_file')
if conlit_out_name:
    conlit.to_csv(os.path.join(derived_dir, conlit_out_name))
else:
    warnings.warn('conlit_out_file is not set; skipping CONLIT output')