# Data prep and integration

Load and integrate data and metadata for use in the full analysis.

In [1]:
# imports and setup
from   ast import literal_eval
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from   scipy.stats import permutation_test
import seaborn as sns
from   sklearn.preprocessing import LabelBinarizer
import string
import warnings

# Directory layout: derived data and metadata both live under `data_dir`.
data_dir = os.path.join('..', 'data')
derived_dir, metadata_dir = (
    os.path.join(data_dir, subdir) for subdir in ('derived', 'metadata')
)

# Input file names.
conlit_file = 'CONLIT_CharData_AP_7.csv.gz'
conlit_distance_file = 'CONLIT_CharData_distances_6.csv.gz'
early_file_wi = 'EARLY_CharData_MW_2.csv.gz'
early_file_ap = 'EARLY_CharData_02.csv.gz'
early_file_full = None  # no file configured

# Output file names.
conlit_out_file = 'CONLIT_CharData_AP_MW_7.csv.gz'
early_out_file = 'EARLY_CharData_AP_MW_7.csv.gz'

## Data

In [2]:
def string_to_list(x):
    '''Parse a stringified Python list, discarding bare ``nan`` entries.

    The input is the text of a list literal such as ``"['a', nan, 'b']"``.
    A bare ``nan`` is not a valid literal, so the text is parsed into an
    AST, every ``nan`` name is removed from every list in it (nested lists
    included), and the remainder is evaluated with ``literal_eval``.
    Working on the AST rather than on the raw text means a ``nan`` in any
    position is handled and quoted strings that happen to contain
    ``, nan`` are left untouched.

    Parameters:
        x: string holding a Python list literal. An empty or
           whitespace-only string yields an empty list.

    Returns:
        list of the parsed elements with the ``nan`` entries dropped.

    Raises:
        SyntaxError: if ``x`` is not parseable as a Python expression.
        ValueError: if ``x`` contains anything other than literals once
            the ``nan`` entries have been removed.
    '''
    import ast  # local import: only `literal_eval` is imported at file level

    text = x.strip()
    if not text:
        # an empty cell carries no entries
        return []

    tree = ast.parse(text, mode='eval')
    for node in ast.walk(tree):
        if isinstance(node, ast.List):
            # keep everything except the bare name `nan`
            node.elts = [
                elt for elt in node.elts
                if not (isinstance(elt, ast.Name) and elt.id == 'nan')
            ]
    return list(ast.literal_eval(tree))

def index_int_string(idx):
    '''Make Chicago index labels consistent.

    A label that ``int()`` accepts is converted to ``str`` and left-padded
    with ``'0'`` to a width of eight characters; labels already eight or
    more characters long come back as their unpadded string form. A label
    for which ``int()`` raises ``ValueError`` is returned unchanged.
    '''
    try:
        int(idx)
    except ValueError:
        # not integer-like: hand the label back untouched
        return idx
    else:
        return format(str(idx), '0>8')
def source_mapper(label):
    '''Return the source corpus name implied by a book id's prefix.

    Ids starting with ``'eaf'`` map to ``'eaf'``, ids starting with
    ``'Wright'`` map to ``'wright'``, and every other id maps to
    ``'chicago'``. The prefix match is case-sensitive.
    '''
    prefix_to_source = (('eaf', 'eaf'), ('Wright', 'wright'))
    for prefix, source in prefix_to_source:
        if label.startswith(prefix):
            return source
    # no known prefix matched
    return 'chicago'

# The same four columns hold stringified lists in every character-data
# file, so one converter mapping serves all of the reads below.
list_converters = dict.fromkeys(
    ['gpe_places', 'nongpe_places', 'all_places', 'gpe_sequences'],
    string_to_list,
)

# read CONLIT: character data, then the separate distance table
conlit = pd.read_csv(
    os.path.join(derived_dir, conlit_file),
    index_col='book_id',
    converters=list_converters,
)
conlit['source'] = 'conlit'
conlit_distances = pd.read_csv(
    os.path.join(derived_dir, conlit_distance_file),
    index_col='book_id',
)

# read EARLY
if early_file_wi:
    early_wi = pd.read_csv(
        os.path.join(derived_dir, early_file_wi),
        index_col='book_id',
        converters=list_converters,
    )
    # normalize integer-like ids before deriving anything from the index
    early_wi.index = early_wi.index.to_series().apply(index_int_string)
    # set source corpus for EARLY data
    early_wi['source'] = early_wi.index.to_series().apply(source_mapper)
    # keep only the first row for each book id
    early_wi = early_wi[~early_wi.index.duplicated()]
if early_file_ap:
    early_ap = pd.read_csv(
        os.path.join(derived_dir, early_file_ap),
        index_col='book_id',
        converters=list_converters,
    )
    # NOTE: unlike early_wi, this table is used as read -- index
    # normalization and source labelling are disabled for it.


## Metadata

In [3]:
# EAF and Wright: tab-separated metadata keyed by source id; the index and
# the gender column are renamed to the names used by the other sources.
eaf = (
    pd.read_csv(
        os.path.join(data_dir, 'metadata', 'eaf-wright-metadata.tsv'),
        sep='\t',
        index_col='source_id',
    )
    .rename_axis('book_id')
    .rename(columns={'gender': 'author_gender'})
)

# Chicago: book-level metadata, keyed by book id
chi = pd.read_csv(
    os.path.join(metadata_dir, 'chicago-books.csv'),
    index_col='BOOK_ID'
)
chi.index.rename('book_id', inplace=True)
# make integer-like ids consistent with the character-data index
chi.index = chi.index.to_series().apply(index_int_string)
chi.columns = [i.lower() for i in chi.columns]

# Chicago: author-level metadata (source of the gender column)
chi_auth = pd.read_csv(
    os.path.join(metadata_dir, 'chicago-authors.csv'),
)
chi_auth.columns = [i.lower() for i in chi_auth.columns]

# merge() on a column discards the book-id index, so save it and re-attach
# it afterwards.
# NOTE(review): re-attaching relies on the left merge returning exactly one
# row per book, i.e. on auth_id being unique in chi_auth -- confirm.
chi_idx = chi.index
chi = chi.merge(chi_auth[['auth_id', 'gender']], how='left', on='auth_id').set_index(chi_idx)
chi.rename(columns={'gender':'author_gender', 'publ_date':'pub_date'}, inplace=True)
# Build "last, first". Missing name parts are blanked and everything is cast
# to str first, as for CONLIT, because ', '.join raises TypeError on NaN.
chi['author'] = chi[['auth_last', 'auth_first']].fillna('').astype(str).agg(', '.join, axis=1)

# CONLIT: metadata keyed by file name
con = pd.read_csv(
    os.path.join(metadata_dir, 'CONLIT_META.csv'),
    index_col='ID',
).rename_axis('book_id')
# drop the last four characters of every id (the '.txt' extension)
con.index = con.index.to_series().apply(lambda file_name: file_name[:-4])
# fix up indexing errors: ids that are still wrong after the strip above
id_fixes = {
    '2009_LaFleur,Suzanne_Love,Aubrey_MGtxt': '2009_LaFleur,Suzanne_Love,Aubrey_M',
    '2015_Jackson,AL_ComeToMeRecklessly_ROM.txt': '2015_Jackson,AL_ComeToMeRecklessly_ROM',
}
con = con.rename(index=id_fixes)
# lower-case the column names, then align them with the other sources
con.columns = list(map(str.lower, con.columns))
con = con.rename(columns={'pubdate': 'pub_date', 'work_title': 'title'})
# "last, first", with missing name parts blanked so the join cannot fail
name_parts = con[['author_last', 'author_first']].fillna('').astype(str)
con['author'] = name_parts.agg(', '.join, axis=1)

# integrate target columns: stack the shared columns of the three metadata
# tables (EAF/Wright, Chicago, CONLIT, in that order) into one frame
cols = ['author', 'title', 'pub_date', 'author_gender']
meta = pd.concat(
    [frame[cols] for frame in (eaf, chi, con)],
    axis=0,
)

## Integrate

In [4]:
# attach publication date and author gender from the combined metadata
date_gender = meta[['pub_date', 'author_gender']]
conlit = conlit.join(date_gender)
early_wi = early_wi.join(date_gender)

# set canonical values where needed
early_wi['Category'] = 'FIC'
# split the Chicago rows at 1945; rows whose pub_date satisfies neither
# comparison keep the plain 'chicago' label
is_chicago = early_wi['source'] == 'chicago'
early_wi.loc[is_chicago & (early_wi['pub_date'] <= 1945), 'source'] = 'chicago_1'
early_wi.loc[is_chicago & (early_wi['pub_date'] > 1945), 'source'] = 'chicago_2'

In [5]:
# add the distance, category, and author-gender columns from early_wi to
# the early_ap table, matching rows on the index
early = early_ap.join(
    early_wi.loc[
        :,
        ['Start_Finish_Miles', 'Start_Finish_Z', 'Category', 'author_gender'],
    ]
)

In [6]:
# add the two start-to-finish distance columns to CONLIT, matching rows on
# the index
conlit = conlit.join(
    conlit_distances.loc[:, ['Start_Finish_Miles', 'Start_Finish_Z']]
)

## Dump integrated data to disk

In [7]:
# just early metadata: the source label of each EARLY book joined to the
# combined metadata columns, written as a tab-separated file
early_meta = early_wi[['source']].join(meta)
early_meta.to_csv(
    os.path.join(metadata_dir, 'EARLY_META.tsv'),
    sep='\t',
)

In [8]:
# full CONLIT and EARLY: write each integrated table to the derived-data
# directory (EARLY first, then CONLIT)
for frame, out_name in ((early, early_out_file), (conlit, conlit_out_file)):
    frame.to_csv(os.path.join(derived_dir, out_name))