In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from   scipy.stats import permutation_test
import seaborn

In [2]:
# Read the gzipped CSV from ../data/derived, using the book_id column as the index.
data_dir = os.path.join('..', 'data', 'derived')
df = pd.read_csv(os.path.join(data_dir, 'CONLIT_CharData_AP.csv.gz'), index_col='book_id')
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0_level_0,char_id,char_count,inf_gender,gpe_places,num_gpe_places,nongpe_places,num_nongpe_places,all_places,num_all_places,gpe_sequences,...,char_rank,num_words,Category,Genre,Tokens,num_gpe_places_norm,num_nongpe_places_norm,num_gpe_places_norm_byCharacter,num_nongpe_places_norm_byCharacter,char_count_norm
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
[Heist Society 2] Uncommon Criminals - Ally Carter,138,3238,she/her,"['Paraguay', 'Paraguay', 'London', 'New York C...",15,"['there', 'there', 'there', 'the museum', 'the...",103,"['there', 'there', 'there', 'Paraguay', 'Parag...",118,"['Paraguay', 'London', 'New York City', 'midto...",...,1,320000,FIC,YA,80392,0.000187,0.001281,0.004632,0.03181,0.040278
"2001_2011_Wilson,RobertCharles_TheChronoliths_SF",0,2742,she/her,"['Bangkok', 'Minneapolis / St. Paul', 'Chumpho...",22,"['our household', 'a gas station hawng nam', '...",122,"['our household', 'a gas station hawng nam', '...",143,"['Bangkok', 'Minneapolis / St. Paul', 'Chumpho...",...,1,320000,FIC,SF,103230,0.000213,0.001182,0.008023,0.044493,0.026562
"2001_Martel,Yann_LifeofPi_BS",0,4754,he/him/his,"['Canada', 'India', 'India', 'India', 'India',...",15,"['the north', 'the rich , noisy , functioning ...",142,"['Canada', 'India', 'the north', 'India', 'Ind...",157,"['Canada', 'India', 'Canada', 'India', 'Toront...",...,1,320000,FIC,BS,124445,0.000121,0.001141,0.003155,0.02987,0.038202
"2002_2011_Anderson,MT_Feed_SF",0,2862,she/her,"['Lonely', 'School', 'School', 'Switzerland', ...",5,"['the craters', 'a crummy hotel', 'here', 'her...",68,"['the craters', 'a crummy hotel', 'here', 'her...",72,"['Lonely', 'School', 'Switzerland', 'Io', 'Ame...",...,1,320000,FIC,SF,67075,7.5e-05,0.001014,0.001747,0.02376,0.042669
"2002_Baker,Jo_Offcomer_CT",120,4723,she/her,"['Belfast', 'Belfast', 'Belfast', 'Belfast', '...",11,"['the bath', 'the bath', 'the bath', 'it', 'th...",162,"['the bath', 'the bath', 'the bath', 'it', 'th...",172,"['Belfast', 'Conroys', 'October', 'Belfast', '...",...,1,320000,FIC,NYT,88042,0.000125,0.00184,0.002329,0.0343,0.053645


In [3]:
# List the column names available in the loaded table.
df.columns

Index(['char_id', 'char_count', 'inf_gender', 'gpe_places', 'num_gpe_places',
       'nongpe_places', 'num_nongpe_places', 'all_places', 'num_all_places',
       'gpe_sequences', 'dist_miles', 'char_rank', 'num_words', 'Category',
       'Genre', 'Tokens', 'num_gpe_places_norm', 'num_nongpe_places_norm',
       'num_gpe_places_norm_byCharacter', 'num_nongpe_places_norm_byCharacter',
       'char_count_norm'],
      dtype='object')

In [4]:
# Number of rows at each level of the Category column.
df['Category'].value_counts()

Category
FIC    1934
NON     820
Name: count, dtype: int64

In [5]:
# Number of rows at each level of the Genre column.
df['Genre'].value_counts()

Genre
NYT     419
PW      258
BS      249
MY      234
MEM     229
SF      223
ROM     208
HIST    205
MIX     193
BIO     193
YA      177
MID     166
Name: count, dtype: int64

In [10]:
# Grouping columns: each distinct level of each of these is compared against all other rows.
facet_by = ['inf_gender', 'Category', 'Genre']
# Columns whose group means are compared in every test.
cols_to_test = [
    'char_count',
    'num_gpe_places',
    'num_nongpe_places',
    'num_all_places',
    'dist_miles', 
    'Tokens', 
    'num_gpe_places_norm', 
    'num_nongpe_places_norm',
    'num_gpe_places_norm_byCharacter', 
    'num_nongpe_places_norm_byCharacter',
    'char_count_norm'
]

def significance_label(pvalue, levels=[0.05, 0.01, 0.001]):
    '''Return a star string ('***', '**', '*' or '') for a p-value.

    `levels` holds three thresholds; the check starts with levels[2] and
    falls back to levels[1], then levels[0]. The first threshold that
    `pvalue` is strictly below decides the label. If none matches (which
    includes NaN), the empty string is returned.
    '''
    # (index into levels, label), tested in this order.
    ladder = ((2, '***'), (1, '**'), (0, '*'))
    for position, stars in ladder:
        if pvalue < levels[position]:
            return stars
    return ''

def diff_means(x, y, axis):
    '''Return mean(x) - mean(y), with both means taken along `axis`.'''
    mean_x = np.mean(x, axis=axis)
    mean_y = np.mean(y, axis=axis)
    return mean_x - mean_y

def compare_facets(df, facets, cols, statistic=diff_means, **permutation_kwargs):
    '''Compare each level of each facet against all other rows via permutation tests.

    For every facet column and every distinct level in it, the rows are split
    into "this level" and "others", and `statistic` is permutation-tested for
    all of `cols` in a single call. One summary table per level is displayed
    (level mean, others mean, observed statistic, p-value, significance
    stars). Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    facets : iterable of str
        Names of the columns whose levels define the groups.
    cols : list of str
        Names of the columns to compare.
    statistic : callable, optional
        Statistic handed to scipy.stats.permutation_test. The default,
        diff_means, takes (x, y, axis).
    **permutation_kwargs
        Forwarded unchanged to scipy.stats.permutation_test, e.g. to set
        n_resamples or to seed the resampling for reproducible p-values.
        With no extra keywords the call is the same as before.

    Notes
    -----
    * Rows with a missing facet value are dropped for that facet only, so
      the result for one facet does not depend on the order of `facets`.
    * Rows with missing values in `cols` are not removed here.
    * p-values are reported as returned by the test; no multiple-comparison
      correction is applied.
    '''
    for facet in facets:
        print(f"\n==============\nFacet: {facet}\n==============")
        # Use a per-facet frame instead of rebinding `df`: rebinding would carry
        # this facet's dropped rows over into every later facet.
        facet_df = df.dropna(subset=[facet])
        for level in facet_df[facet].unique():
            is_level = facet_df[facet] == level  # computed once, reused for both groups
            level_data = facet_df.loc[is_level, cols]
            other_data = facet_df.loc[~is_level, cols]
            if other_data.empty:
                # A facet with a single level has nothing to be compared against.
                print(f"Skipping {facet} = {level!r}: no other rows to compare against")
                continue
            output = pd.DataFrame({
                level: level_data.mean(axis=0),
                'others': other_data.mean(axis=0),
            })
            res = permutation_test((level_data, other_data), statistic, **permutation_kwargs)
            output['diff'] = res.statistic
            output['p'] = res.pvalue
            output['sig'] = output['p'].apply(significance_label)
            # `display` is the rich-output helper IPython/Jupyter provides in notebooks.
            display(output)

In [11]:
# Permutation-test every column in cols_to_test for each level of each facet in facet_by.
compare_facets(df, facet_by, cols_to_test, diff_means)


Facet: inf_gender


Unnamed: 0,she/her,others,diff,p,sig
char_count,3752.279775,3844.495428,-92.215652,0.404,
num_gpe_places,16.35618,21.814954,-5.458775,0.0002,***
num_nongpe_places,122.757303,123.285637,-0.528334,0.849,
num_all_places,138.620225,144.524476,-5.904251,0.0978,
dist_miles,56148.685011,79583.242665,-23434.557654,0.0002,***
Tokens,119516.989888,131900.656267,-12383.666379,0.0002,***
num_gpe_places_norm,0.000141,0.00017,-2.9e-05,0.0002,***
num_nongpe_places_norm,0.001127,0.001048,7.9e-05,0.0018,**
num_gpe_places_norm_byCharacter,0.005224,0.009569,-0.004345,0.0002,***
num_nongpe_places_norm_byCharacter,0.036178,0.037657,-0.001479,0.0362,*


Unnamed: 0,he/him/his,others,diff,p,sig
char_count,4012.726801,3475.705128,537.021673,0.0002,***
num_gpe_places,22.02536,16.663708,5.361652,0.0002,***
num_nongpe_places,127.342939,115.879684,11.463255,0.0002,***
num_all_places,148.785591,132.051282,16.734309,0.0002,***
dist_miles,80558.158976,57346.323422,23211.835554,0.0002,***
Tokens,133472.337752,118342.14497,15130.192782,0.0002,***
num_gpe_places_norm,0.000169,0.000146,2.3e-05,0.0002,***
num_nongpe_places_norm,0.001073,0.001074,-1e-06,0.9414,
num_gpe_places_norm_byCharacter,0.008396,0.007762,0.000634,0.1722,
num_nongpe_places_norm_byCharacter,0.03688,0.037687,-0.000807,0.2564,


Unnamed: 0,they/them/their,others,diff,p,sig
char_count,1490.612903,3924.422857,-2433.809954,0.0002,***
num_gpe_places,18.870968,20.103238,-1.23227,0.539,
num_nongpe_places,66.516129,125.78819,-59.272061,0.0002,***
num_all_places,84.903226,145.339048,-60.435822,0.0002,***
dist_miles,65942.276537,72282.184946,-6339.908409,0.566,
Tokens,109909.790323,128740.810286,-18831.019963,0.0016,**
num_gpe_places_norm,0.000183,0.00016,2.4e-05,0.0906,
num_nongpe_places_norm,0.000695,0.001091,-0.000396,0.0002,***
num_gpe_places_norm_byCharacter,0.025981,0.007321,0.01866,0.0002,***
num_nongpe_places_norm_byCharacter,0.048523,0.036642,0.011881,0.0002,***



Facet: Category


Unnamed: 0,FIC,others,diff,p,sig
char_count,4206.708441,2889.110024,1317.598417,0.0002,***
num_gpe_places,13.348006,35.863081,-22.515074,0.0002,***
num_nongpe_places,123.981875,121.067237,2.914638,0.3398,
num_all_places,136.91507,156.06357,-19.1485,0.0002,***
dist_miles,40961.126653,145258.731303,-104297.60465,0.0002,***
Tokens,121689.747281,142531.221271,-20841.47399,0.0002,***
num_gpe_places_norm,0.000117,0.000263,-0.000146,0.0002,***
num_nongpe_places_norm,0.001116,0.000972,0.000144,0.0002,***
num_gpe_places_norm_byCharacter,0.003801,0.018457,-0.014656,0.0002,***
num_nongpe_places_norm_byCharacter,0.033272,0.046399,-0.013127,0.0002,***


Unnamed: 0,NON,others,diff,p,sig
char_count,2889.110024,4206.708441,-1317.598417,0.0002,***
num_gpe_places,35.863081,13.348006,22.515074,0.0002,***
num_nongpe_places,121.067237,123.981875,-2.914638,0.3442,
num_all_places,156.06357,136.91507,19.1485,0.0002,***
dist_miles,145258.731303,40961.126653,104297.60465,0.0002,***
Tokens,142531.221271,121689.747281,20841.47399,0.0002,***
num_gpe_places_norm,0.000263,0.000117,0.000146,0.0002,***
num_nongpe_places_norm,0.000972,0.001116,-0.000144,0.0002,***
num_gpe_places_norm_byCharacter,0.018457,0.003801,0.014656,0.0002,***
num_nongpe_places_norm_byCharacter,0.046399,0.033272,0.013127,0.0002,***



Facet: Genre


Unnamed: 0,YA,others,diff,p,sig
char_count,5664.129944,3687.361975,1976.767968,0.0002,***
num_gpe_places,11.519774,20.634526,-9.114752,0.0002,***
num_nongpe_places,130.824859,122.583981,8.240877,0.1682,
num_all_places,141.903955,142.661742,-0.757787,0.938,
dist_miles,31568.513293,74778.3635,-43209.850206,0.0002,***
Tokens,106057.067797,129393.989114,-23336.921317,0.0002,***
num_gpe_places_norm,0.000113,0.000164,-5.1e-05,0.0002,***
num_nongpe_places_norm,0.001257,0.001061,0.000196,0.0002,***
num_gpe_places_norm_byCharacter,0.002359,0.008562,-0.006202,0.0002,***
num_nongpe_places_norm_byCharacter,0.024377,0.038059,-0.013682,0.0002,***


Unnamed: 0,SF,others,diff,p,sig
char_count,3852.243243,3811.336763,40.90648,0.8254,
num_gpe_places,12.747748,20.688959,-7.941211,0.0002,***
num_nongpe_places,116.711712,123.677087,-6.965376,0.1734,
num_all_places,128.842342,143.822715,-14.980372,0.011,*
dist_miles,34477.256623,75292.293947,-40815.037324,0.0002,***
Tokens,144376.954955,126443.117135,17933.83782,0.0006,***
num_gpe_places_norm,9.8e-05,0.000166,-6.8e-05,0.0002,***
num_nongpe_places_norm,0.000902,0.001088,-0.000186,0.0004,***
num_gpe_places_norm_byCharacter,0.003721,0.008553,-0.004831,0.0002,***
num_nongpe_places_norm_byCharacter,0.031815,0.037649,-0.005835,0.0002,***


Unnamed: 0,BS,others,diff,p,sig
char_count,4729.730924,3723.4972,1006.233724,0.0002,***
num_gpe_places,17.694779,20.282,-2.587221,0.0662,
num_nongpe_places,143.425703,121.0916,22.334103,0.0002,***
num_all_places,160.618474,140.8196,19.798874,0.0028,**
dist_miles,53612.079193,73827.268022,-20215.188828,0.0044,**
Tokens,155119.140562,125179.51,29939.630562,0.0002,***
num_gpe_places_norm,0.000126,0.000164,-3.8e-05,0.0002,***
num_nongpe_places_norm,0.001017,0.001079,-6.2e-05,0.1362,
num_gpe_places_norm_byCharacter,0.004617,0.008516,-0.003899,0.0002,***
num_nongpe_places_norm_byCharacter,0.034983,0.037397,-0.002413,0.0332,*


Unnamed: 0,NYT,others,diff,p,sig
char_count,3719.703349,3831.664522,-111.961172,0.434,
num_gpe_places,16.351675,20.710425,-4.35875,0.0004,***
num_nongpe_places,129.624402,121.947233,7.677169,0.057,
num_all_places,145.58134,142.080652,3.500688,0.459,
dist_miles,55524.251111,74949.996057,-19425.744946,0.0002,***
Tokens,118268.789474,129616.93994,-11348.150466,0.0008,***
num_gpe_places_norm,0.00015,0.000163,-1.3e-05,0.0948,
num_nongpe_places_norm,0.001194,0.001052,0.000142,0.0002,***
num_gpe_places_norm_byCharacter,0.004899,0.008748,-0.003848,0.0002,***
num_nongpe_places_norm_byCharacter,0.037954,0.037039,0.000915,0.3462,


Unnamed: 0,MY,others,diff,p,sig
char_count,4099.149573,3788.168986,310.980587,0.0988,
num_gpe_places,13.286325,20.67674,-7.390415,0.0002,***
num_nongpe_places,132.316239,122.258449,10.05779,0.0524,
num_all_places,145.226496,142.369781,2.856714,0.625,
dist_miles,35709.909859,75372.34945,-39662.439591,0.0002,***
Tokens,124630.961538,128194.749901,-3563.788362,0.4594,
num_gpe_places_norm,0.000109,0.000166,-5.7e-05,0.0002,***
num_nongpe_places_norm,0.001113,0.00107,4.4e-05,0.2928,
num_gpe_places_norm_byCharacter,0.003733,0.008574,-0.004841,0.0002,***
num_nongpe_places_norm_byCharacter,0.03418,0.037457,-0.003277,0.0038,**


Unnamed: 0,ROM,others,diff,p,sig
char_count,5970.480769,3638.168438,2332.312332,0.0002,***
num_gpe_places,7.625,21.064542,-13.439542,0.0002,***
num_nongpe_places,108.990385,124.27076,-15.280375,0.0028,**
num_all_places,116.403846,144.758363,-28.354517,0.0002,***
dist_miles,23115.93848,75997.427221,-52881.488741,0.0002,***
Tokens,103573.639423,129881.985045,-26308.345622,0.0002,***
num_gpe_places_norm,7.5e-05,0.000168,-9.3e-05,0.0002,***
num_nongpe_places_norm,0.001099,0.001071,2.8e-05,0.5268,
num_gpe_places_norm_byCharacter,0.001397,0.008716,-0.007319,0.0002,***
num_nongpe_places_norm_byCharacter,0.019871,0.038595,-0.018724,0.0002,***


Unnamed: 0,HIST,others,diff,p,sig
char_count,987.385366,4042.465409,-3055.080043,0.0002,***
num_gpe_places,20.839024,19.983884,0.855141,0.5786,
num_nongpe_places,48.063415,129.162343,-81.098928,0.0002,***
num_all_places,68.556098,148.580582,-80.024484,0.0002,***
dist_miles,68072.449642,72312.392137,-4239.942495,0.6452,
Tokens,148252.897561,126250.627752,22002.269809,0.0002,***
num_gpe_places_norm,0.000152,0.000161,-9e-06,0.3864,
num_nongpe_places_norm,0.000349,0.001132,-0.000783,0.0002,***
num_gpe_places_norm_byCharacter,0.030387,0.006371,0.024016,0.0002,***
num_nongpe_places_norm_byCharacter,0.051873,0.035994,0.015879,0.0002,***


Unnamed: 0,PW,others,diff,p,sig
char_count,3177.618677,3880.336276,-702.717599,0.0002,***
num_gpe_places,14.801556,20.588684,-5.787127,0.0002,***
num_nongpe_places,121.428016,123.288523,-1.860508,0.706,
num_all_places,135.782101,143.317416,-7.535315,0.1872,
dist_miles,52226.899827,74035.017864,-21808.118037,0.0022,**
Tokens,122261.315175,128472.023676,-6210.708501,0.159,
num_gpe_places_norm,0.000134,0.000163,-2.9e-05,0.0012,**
num_nongpe_places_norm,0.001108,0.00107,3.8e-05,0.3532,
num_gpe_places_norm_byCharacter,0.005194,0.008469,-0.003275,0.0002,***
num_nongpe_places_norm_byCharacter,0.042319,0.036648,0.005672,0.0002,***


Unnamed: 0,MIX,others,diff,p,sig
char_count,1869.167539,3959.904222,-2090.736683,0.0002,***
num_gpe_places,25.450262,19.644253,5.806008,0.0008,***
num_nongpe_places,84.507853,125.997263,-41.48941,0.0002,***
num_all_places,109.403141,145.092651,-35.689509,0.0002,***
dist_miles,103312.422699,69657.898764,33654.523935,0.0008,***
Tokens,126088.335079,128026.023847,-1937.688768,0.7124,
num_gpe_places_norm,0.000204,0.000157,4.7e-05,0.0004,***
num_nongpe_places_norm,0.000745,0.001098,-0.000352,0.0002,***
num_gpe_places_norm_byCharacter,0.018654,0.007379,0.011275,0.0002,***
num_nongpe_places_norm_byCharacter,0.046469,0.036484,0.009985,0.0002,***


Unnamed: 0,MEM,others,diff,p,sig
char_count,4260.253275,3774.146032,486.107243,0.0136,*
num_gpe_places,37.877729,18.427381,19.450348,0.0002,***
num_nongpe_places,170.908297,118.771429,52.136868,0.0002,***
num_all_places,207.737991,136.694841,71.04315,0.0002,***
dist_miles,159239.594297,64068.139159,95171.455138,0.0002,***
Tokens,105611.161572,129916.065476,-24304.903904,0.0002,***
num_gpe_places_norm,0.000355,0.000143,0.000212,0.0002,***
num_nongpe_places_norm,0.001666,0.001019,0.000646,0.0002,***
num_gpe_places_norm_byCharacter,0.009739,0.008019,0.00172,0.0486,*
num_nongpe_places_norm_byCharacter,0.041595,0.036777,0.004819,0.0004,***


Unnamed: 0,MID,others,diff,p,sig
char_count,3103.355422,3860.351916,-756.996495,0.0006,***
num_gpe_places,7.024096,20.88463,-13.860534,0.0002,***
num_nongpe_places,94.024096,124.984127,-30.960031,0.0002,***
num_all_places,100.753012,145.303136,-44.550124,0.0002,***
dist_miles,16320.877678,75574.259419,-59253.381741,0.0002,***
Tokens,84156.487952,130702.076655,-46545.588703,0.0002,***
num_gpe_places_norm,8.9e-05,0.000165,-7.6e-05,0.0002,***
num_nongpe_places_norm,0.001243,0.001062,0.000181,0.0004,***
num_gpe_places_norm_byCharacter,0.002412,0.008532,-0.00612,0.0002,***
num_nongpe_places_norm_byCharacter,0.031853,0.03752,-0.005667,0.0002,***


Unnamed: 0,BIO,others,diff,p,sig
char_count,4291.549223,3778.629499,512.919724,0.0172,*
num_gpe_places,59.735751,17.050861,42.684891,0.0002,***
num_nongpe_places,175.65285,119.147496,56.505354,0.0002,***
num_all_places,233.875648,135.721831,98.153817,0.0002,***
dist_miles,252167.099482,58391.755702,193775.34378,0.0002,***
Tokens,196532.989637,122708.362285,73824.627352,0.0002,***
num_gpe_places_norm,0.000331,0.000148,0.000184,0.0002,***
num_nongpe_places_norm,0.001036,0.001076,-4e-05,0.3848,
num_gpe_places_norm_byCharacter,0.015934,0.007576,0.008359,0.0002,***
num_nongpe_places_norm_byCharacter,0.046214,0.036496,0.009718,0.0002,***
