# Data prep and integration

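Load the CONLIT and EARLY character data, attach book-level metadata (publication date and author gender), and write the integrated tables to disk for use in the full analysis.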

In [1]:
# imports and setup
from   ast import literal_eval
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from   scipy.stats import permutation_test
import seaborn as sns
from   sklearn.preprocessing import LabelBinarizer
import string
import warnings

# directory layout, relative to the notebook's working directory
data_dir = os.path.join('..', 'data')
derived_dir, metadata_dir = (
    os.path.join(data_dir, subdir) for subdir in ('derived', 'metadata')
)

# inputs
conlit_file = 'CONLIT_CharData_AP_MW_10.csv.gz'
conlit_distance_file = 'CONLIT_CharData_distances_10.csv.gz'
early_file_wi = 'EARLY_CharData_distances_10.csv.gz'
early_file_ap = 'EARLY_CharData_AP_MW_9.csv.gz'

# outputs
# NOTE: conlit_out_file has the same name as conlit_file above.
conlit_out_file = 'CONLIT_CharData_AP_MW_10.csv.gz'
early_out_file = 'EARLY_CharData_AP_MW_10.csv.gz'

## Data

In [2]:
def string_to_list(x):
    '''Parse the string form of a Python list, dropping missing (nan) entries.

    Intended as a ``pd.read_csv`` converter for list-valued columns that were
    written to CSV as their ``repr``. A bare ``nan`` token is not a Python
    literal, so it is substituted at the syntax-tree level before evaluation.
    Doing this on the parsed tree rather than with text replacement means
    that ``nan`` is handled in any position (first, middle, last, only
    element) and that strings which merely contain ", nan" are left alone.

    Parameters
    ----------
    x : str
        Cell contents, e.g. ``"['Paris', nan, 'Rome']"``.

    Returns
    -------
    list
        The parsed items without nan entries. An empty or whitespace-only
        cell yields ``[]``.

    Raises
    ------
    SyntaxError, ValueError
        If ``x`` is not a valid Python literal once ``nan`` is accounted for.
    '''
    import ast  # local import: the notebook imports only `literal_eval` by name

    text = x.strip()
    if not text:
        return []

    tree = ast.parse(text, mode='eval')
    for node in ast.walk(tree):
        if isinstance(node, (ast.List, ast.Tuple)):
            # swap bare `nan` names for a real float NaN constant
            node.elts = [
                ast.Constant(value=float('nan'))
                if isinstance(elt, ast.Name) and elt.id == 'nan'
                else elt
                for elt in node.elts
            ]
    lst = literal_eval(tree)

    # `i != i` is true only for NaN; 'ZZZZ' is the placeholder the previous
    # implementation filtered out, still dropped for backward compatibility.
    return [
        i for i in lst
        if not (isinstance(i, float) and i != i) and i != 'ZZZZ'
    ]

def index_int_string(idx):
    '''Zero-pad integer-like index labels to eight characters.

    Labels that ``int()`` accepts are returned as strings, left-padded with
    '0' to a width of 8; labels that raise ``ValueError`` are returned
    unchanged.
    '''
    try:
        int(idx)
    except ValueError:
        return idx
    return format(str(idx), '0>8')
def source_mapper(label):
    '''Return the source-corpus name for a book label.

    Labels starting with 'eaf' map to 'eaf', labels starting with 'Wright'
    map to 'wright', and every other label maps to 'chicago'.
    '''
    prefix_to_source = (('eaf', 'eaf'), ('Wright', 'wright'))
    for prefix, source in prefix_to_source:
        if label.startswith(prefix):
            return source
    return 'chicago'

# read CONLIT; the list-valued columns are parsed from their string form on load
conlit_list_cols = ['gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences']
conlit_path = os.path.join(derived_dir, conlit_file)
conlit = pd.read_csv(
    conlit_path,
    index_col='book_id',
    converters=dict.fromkeys(conlit_list_cols, string_to_list),
)

In [3]:
# Inspect the CONLIT frame as loaded from disk
conlit

Unnamed: 0_level_0,char_id,char_count,inf_gender,gpe_places,num_gpe_places,nongpe_places,num_nongpe_places,all_places,num_all_places,gpe_sequences,...,dist_miles_allChars_norm_Tokens,num_gpe_places_allChars_norm_Tokens,non_gpe_total_rank,dist_miles_rank,source,pub_date,author_gender,Start_Finish_Miles,Start_Finish_Z,first_last_SemanticDist
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
[Heist Society 2] Uncommon Criminals - Ally Carter,138,3238,she/her,"[Paraguay, Paraguay, London, New York City, mi...",15,"[there, there, there, the museum, the museum, ...",103,"[there, there, there, Paraguay, Paraguay, the ...",118,"[Paraguay, London, New York City, midtown Manh...",...,1.099714,0.000460,1950,2090,conlit,2011,F,6223.878880,1.483659,0.563
"2001_2011_Wilson,RobertCharles_TheChronoliths_SF",0,2742,she/her,"[Bangkok, Minneapolis / St. Paul, Chumphon, Ch...",22,"[our household, a gas station hawng nam, the c...",122,"[our household, a gas station hawng nam, the c...",143,"[Bangkok, Minneapolis / St. Paul, Chumphon, Ba...",...,3.978757,0.000746,1164,2604,conlit,2001,M,9671.262067,1.265047,0.642
"2001_Martel,Yann_LifeofPi_BS",0,4754,he/him/his,"[Canada, India, India, India, India, India, Ca...",15,"[the north, the rich , noisy , functioning mad...",142,"[Canada, India, the north, India, India, the r...",157,"[Canada, India, Canada, India, Toronto, Safed,...",...,1.811268,0.000249,1487,2554,conlit,2001,M,0.000000,-3.280330,0.249
"2002_2011_Anderson,MT_Feed_SF",0,2862,she/her,"[Lonely, School, School, Switzerland, Io, Amer...",5,"[the craters, a crummy hotel, here, here, the ...",68,"[the craters, a crummy hotel, here, here, the ...",72,"[Lonely, School, Switzerland, Io, America]",...,0.292747,0.000194,1376,603,conlit,2002,M,4990.393324,0.000000,0.672
"2002_Baker,Jo_Offcomer_CT",120,4723,she/her,"[Belfast, Belfast, Belfast, Belfast, Belfast, ...",11,"[the bath, the bath, the bath, it, the bath, t...",162,"[the bath, the bath, the bath, it, the bath, t...",172,"[Belfast, Conroys, October, Belfast, Somervill...",...,0.286661,0.000295,2548,1014,conlit,2002,F,276.146057,-0.365448,0.435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Woman Behind the New Deal, The - Kirstin Downey",625,6543,she/her,"[Columbia, Cornell, Cornell, New York City, Ma...",61,"[its, more than a dozen archives across the no...",288,"[its, more than a dozen archives across the no...",344,"[Columbia, Cornell, New York City, Maine, Bost...",...,2.058943,0.000791,2019,2281,conlit,2009,F,865.742012,-0.165683,0.354
"Woman of No Importance, A - Sonia Purnell",553,4519,she/her,"[France, France, France, France, Europe, Franc...",70,"[the France, the whole of Europe, the whole of...",230,"[the France, France, France, France, France, t...",296,"[France, Europe, France, Europe, Cambridge, Ma...",...,3.619871,0.000911,2160,2688,conlit,2019,F,4775.068411,1.456236,0.481
"Woodrow Wilson - John Milton Cooper, Jr_",1465,3676,he/him/his,"[America, Washington, Tennessee, Tennessee, Ge...",62,"[his State of the Union, his State of the Unio...",102,"[his State of the Union, his State of the Unio...",161,"[America, Washington, Tennessee, Germany, Wash...",...,2.007089,0.000969,393,1780,conlit,2009,M,0.000000,-0.925339,0.647
You Never Forget Your First - Alexis Coe,276,417,he/him/his,"[Brooklyn Heights, Lower Manhattan, Philadelph...",11,"[the river, its, his plantation, his plantatio...",14,"[Brooklyn Heights, Lower Manhattan, the river,...",24,"[Brooklyn Heights, Lower Manhattan, Philadelph...",...,1.515838,0.001118,165,924,conlit,2020,F,78.422808,-0.446969,0.430


In [2]:
conlit['source'] = 'conlit'
conlit_distances = pd.read_csv(
    os.path.join(derived_dir, conlit_distance_file),
    index_col='book_id'
)


def _read_early(file_name):
    '''Read one EARLY character-data file from ``derived_dir``.

    The list-valued columns are parsed with ``string_to_list`` and the
    ``book_id`` index labels are passed through ``index_int_string`` so that
    integer-like labels have one consistent zero-padded string form.
    '''
    frame = pd.read_csv(
        os.path.join(derived_dir, file_name),
        index_col='book_id',
        converters=dict.fromkeys(
            ['gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences'],
            string_to_list,
        ),
    )
    frame.index = frame.index.to_series().apply(index_int_string)
    return frame


# read EARLY
if early_file_wi:
    early_wi = _read_early(early_file_wi)
    # set source corpus for EARLY data
    early_wi['source'] = early_wi.index.to_series().apply(source_mapper)
    # keep only the first row for each book_id
    early_wi = early_wi.loc[~early_wi.index.duplicated()]
if early_file_ap:
    # early_ap is later joined to early_wi on the index, so its labels get the
    # same normalisation; labels that are already padded strings are unchanged.
    early_ap = _read_early(early_file_ap)

## Metadata

In [3]:
# EAF and Wright
eaf = (
    pd.read_csv(
        os.path.join(metadata_dir, 'eaf-wright-metadata.tsv'),
        sep='\t',
        index_col='source_id',
    )
    .rename_axis('book_id')
    .rename(columns={'gender': 'author_gender'})
)

# Chicago
chi = pd.read_csv(
    os.path.join(metadata_dir, 'chicago-books.csv'),
    index_col='BOOK_ID'
)
chi.index = chi.index.to_series().apply(index_int_string)
chi = chi.rename_axis('book_id').rename(columns=str.lower)

chi_auth = pd.read_csv(
    os.path.join(metadata_dir, 'chicago-authors.csv'),
).rename(columns=str.lower)

# merge() discards the left index, so it is saved and restored afterwards.
# The author table is reduced to one row per auth_id first: a repeated
# auth_id would add rows to the merge and make the saved index the wrong length.
chi_idx = chi.index
chi_auth_gender = chi_auth[['auth_id', 'gender']].drop_duplicates(subset='auth_id')
chi = chi.merge(chi_auth_gender, how='left', on='auth_id').set_index(chi_idx)
chi = chi.rename(columns={'gender': 'author_gender', 'publ_date': 'pub_date'})
# fillna/astype mirror the CONLIT author construction below, so a missing
# name part gives an empty string instead of a TypeError from str.join
chi['author'] = (
    chi[['auth_last', 'auth_first']].fillna('').astype(str).agg(', '.join, axis=1)
)

# CONLIT
con = pd.read_csv(
    os.path.join(metadata_dir, 'CONLIT_META.csv'),
    index_col='ID'
)
# drop the last four characters of every ID (the '.txt' extension)
con.index = con.index.to_series().apply(lambda x: x[:-4])
con = (
    con.rename_axis('book_id')
    # fix up indexing errors
    .rename(
        index={
            '2009_LaFleur,Suzanne_Love,Aubrey_MGtxt': '2009_LaFleur,Suzanne_Love,Aubrey_M',
            '2015_Jackson,AL_ComeToMeRecklessly_ROM.txt': '2015_Jackson,AL_ComeToMeRecklessly_ROM',
        }
    )
    .rename(columns=str.lower)
    .rename(columns={'pubdate': 'pub_date', 'work_title': 'title'})
)
con['author'] = (
    con[['author_last', 'author_first']].fillna('').astype(str).agg(', '.join, axis=1)
)

# integrate target columns
# NOTE(review): the combined index is not checked for uniqueness; a repeated
# book_id here would repeat rows in any frame joined against `meta` - confirm.
cols = ['author', 'title', 'pub_date', 'author_gender']
meta = pd.concat([eaf[cols], chi[cols], con[cols]], axis=0)

## Integrate

In [4]:
# Attach publication date and author gender from the metadata table.
# Copies of these columns already present in a frame are dropped first:
# DataFrame.join raises on overlapping column names, which would otherwise
# happen whenever the loaded data already carries them (e.g. on a re-run).
meta_cols = ['pub_date', 'author_gender']
conlit = conlit.drop(columns=meta_cols, errors='ignore').join(meta[meta_cols])
early_wi = early_wi.drop(columns=meta_cols, errors='ignore').join(meta[meta_cols])

# set canonical values where needed
early_wi['Category'] = 'FIC'
# split the Chicago corpus at 1945; rows with a missing pub_date match neither
# comparison and keep the plain 'chicago' label
is_chicago = early_wi['source'] == 'chicago'
early_wi.loc[is_chicago & (early_wi['pub_date'] <= 1945), 'source'] = 'chicago_1'
early_wi.loc[is_chicago & (early_wi['pub_date'] > 1945), 'source'] = 'chicago_2'

In [5]:
# bring the distance measures, category and author gender from early_wi onto early_ap
early = early_ap.join(
    early_wi.loc[:, ['Start_Finish_Miles', 'Start_Finish_Z', 'Category', 'author_gender']]
)

In [6]:
# Attach the start-to-finish distance measures to CONLIT. Copies already
# present in `conlit` are dropped first, because DataFrame.join raises on
# overlapping column names and the cell would otherwise fail on a re-run.
distance_cols = ['Start_Finish_Miles', 'Start_Finish_Z']
conlit = (
    conlit
    .drop(columns=distance_cols, errors='ignore')
    .join(conlit_distances[distance_cols])
)

## Dump integrated data to disk

In [7]:
# just early metadata: the source label of each EARLY book plus its metadata columns
early_meta = early_wi[['source']].join(meta)
early_meta.to_csv(os.path.join(metadata_dir, 'EARLY_META.tsv'), sep='\t')

In [8]:
# full CONLIT and EARLY, written to the derived-data directory (EARLY first)
for frame, out_name in ((early, early_out_file), (conlit, conlit_out_file)):
    frame.to_csv(os.path.join(derived_dir, out_name))