# Detecting Americans

Let's start by loading some ground truth.

In [59]:
import pandas as pd
from difflib import SequenceMatcher
from collections import Counter
import statsmodels.api as sm
import numpy as np
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [147]:
chicago = pd.read_csv('/Users/tunder/Dropbox/CHICAGO_CORPUS/CHICAGO_NOVEL_CORPUS_METADATA/CHICAGO_CORPUS_AUTHORS.csv')

In [149]:
nationalities = dict()

for idx, row in chicago.iterrows():
    name = row['AUTH_LAST'] + ', ' + row['AUTH_FIRST']
    
    if row['NATIONALITY'].lower().startswith('ameri'):
        nation = 'us'
    else:
        nation = 'non'
    
    nationalities[name] = nation
    


In [161]:
nationalities['Koontz, Dean R.']


'us'

In [153]:
an = pd.read_csv('../../metascrape/authornationalities.csv')
an = an.rename(columns = {'name': 'author'})

In [154]:
def add2nations(df):
    global nationalities
    for idx, row in df.iterrows():
        if not pd.isnull(row['nationality']) and not pd.isnull(row['author']):
            nationalities[row['author']] = row['nationality']

In [155]:
add2nations(an)
print(len(nationalities))

4820


In [157]:
p = pd.read_csv('../../meta2018/patrick.tsv')
add2nations(p)
print(len(nationalities))

5382


In [158]:
j = pd.read_csv('../../meta2018/jessica.tsv')
add2nations(j)
print(len(nationalities))

5957


In [160]:
ssm = pd.read_csv('../supplement2/second_supplement_refined.tsv', sep = '\t')
add2nations(ssm)
print(len(nationalities))

6549


In [136]:
blocks = dict()

for author, nation in nationalities.items():
    if len(author) < 2:
        code = 'xx'
    else:
        code = author.lower()[0:2]
        
    if code not in blocks:
        blocks[code] = []
    blocks[code].append(author)

In [162]:
mapped = dict()
supp2 = pd.read_csv('../supplement2/supp2allmeta.tsv', sep = '\t')

In [163]:
def fuzzymatch(str1, str2):
    
    m = SequenceMatcher(None, str1, str2)
    match = m.real_quick_ratio()
    if match > 0.7:
        match = m.ratio()
    
    return match

def trim(astring):
    astring = astring.strip('[]().,')
    if '(' in astring:
        return astring.split('(')[0]
    else:
        return astring
    
ctr = 0
for a in supp2.author:
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)
    if pd.isnull(a):
        continue
    matches = []
    trimmed = trim(a)
    if a in mapped:
        continue
    elif len(a) < 2:
        continue
    else:
        code = a.lower()[0: 2]
        if code not in blocks:
            continue
        block = blocks[code]
        for a2 in block:
            trim2 = trim(a2)
            sim = fuzzymatch(a, a2)
            sim2 = fuzzymatch(trimmed, trim2)
            similarity = max(sim, sim2)
            if similarity > 0.9:
                matches.append((similarity, a2))
    
    if len(matches) > 0:
        matches.sort()
        closest = matches[-1][1]
        mapped[a] = closest

print(len(mapped))
                
            
            
        
        

1
101
201
301
401
501
601
701
801
901
1001
1101
1201
1301
1401
1501
1601
1701
1801
1901
2001
2101
2201
2301
2401
2501
2601
2701
2801
2901
3001
3101
3201
3301
3401
3501
3601
3701
3801
3901
4001
4101
4201
4301
4401
4501
4601
4701
4801
4901
5001
5101
5201
5301
5401
5501
5601
5701
5801
5901
6001
6101
6201
6301
6401
6501
6601
6701
6801
6901
7001
7101
7201
7301
7401
7501
7601
7701
7801
7901
8001
8101
8201
8301
8401
8501
8601
8701
8801
8901
9001
9101
9201
9301
9401
9501
9601
9701
9801
9901
10001
10101
10201
10301
10401
10501
10601
10701
10801
10901
11001
11101
11201
11301
11401
11501
11601
11701
11801
11901
12001
12101
12201
12301
12401
12501
12601
12701
12801
12901
13001
13101
13201
13301
13401
13501
13601
13701
13801
13901
14001
14101
14201
14301
14401
14501
14601
14701
14801
14901
15001
15101
15201
15301
15401
15501
15601
15701
15801
15901
16001
16101
16201
16301
16401
16501
16601
16701
16801
16901
17001
17101
17201
17301
17401
17501
17601
17701
17801
17901
18001
18101
18201
18301
18401
18

In [164]:
def usethemap(author):
    global mapped, nationalities
    if pd.isnull(author):
        return float('nan')
    elif author in mapped:
        return nationalities[mapped[author]]
    else:
        return float('nan')

supp2 = supp2.assign(nationality = supp2.author.map(usethemap))

In [165]:
print(sum(supp2.nationality == 'us'))

6369


In [78]:
cities = Counter()
for imprint in supp2.imprint:
    if pd.isnull(imprint):
        continue
    parts = imprint.split('|')
    city = parts[0]
    if ';' in city:
        city = city.split(';')[0]
    cities[city] += 1

cities.most_common(100)

[('New York', 13786),
 ('London', 11817),
 ('Boston', 2712),
 ('Philadelphia', 1539),
 ('Edinburgh', 619),
 ('Garden City, N.Y.', 619),
 ('Chicago', 467),
 ('Leipzig', 380),
 ('Indianapolis', 223),
 ('Toronto', 206),
 ('New Delhi', 191),
 ('New York, N.Y.', 170),
 ('Garden City, N. Y.', 140),
 ('San Francisco', 128),
 ('New-York', 128),
 ('Dublin', 115),
 ('Sydney', 113),
 ('Moscow', 111),
 ('Paris', 80),
 ('Cincinnati', 74),
 ('New York and London', 70),
 ('Garden City, New York', 65),
 ('Baltimore', 59),
 ('New York, NY', 57),
 ('Calcutta', 51),
 ('S.l.', 50),
 ('New York [etc.', 50),
 ('Los Angeles', 47),
 ('Bombay', 47),
 ('New York, N.Y., U.S.A.', 46),
 ('Glasgow', 45),
 ('Delhi', 44),
 ('Cleveland', 44),
 ('Singapore', 44),
 ('Melbourne', 44),
 ('Nairobi', 43),
 ('Oxford', 42),
 ('Boston, Mass.', 38),
 ('Quezon City', 38),
 ('Washington', 36),
 ('London [etc.', 36),
 ('Chapel Hill, N.C.', 35),
 ('Pleasantville, N.Y.', 33),
 ('Lincoln', 32),
 ('Cape Town', 32),
 ('Tokyo', 32),
 ('

In [142]:
yankeecities = {'New York', 'Boston', 'Philadelphia', 'Garden City, N.Y.', 'Chicago', 
           'Indianapolis', 'New York, N.Y.', 'Garden City, N. Y.',
           'San Francisco', 'New-York', 'Cincinnati', 'Garden City, New York',
           'Baltimore', 'New York, NY', 'Los Angeles', 'New York, N.Y., U.S.A.',
           'Cleveland', 'Pleasantville, N.Y.', 'Washington', 'Boston and New York',
           'Boston, Mass.', 'Washington, D.C.', 'Tallahassee, Fla.',
           'N.Y.', 'Hartford, Conn.', 'Los Angeles, Calif.', 'San Diego',
               'Evanston, Ill.', 'Hartford', 'Cambridge, Mass.', 'Providence', 'Pittsburgh, Pa.',
               'Chicago, Ill.', 'Minneapolis, Minn.', 'Albany', 'Santa Barbara', 'St. Louis',
               'Berkeley', 'Englewood Cliffs, N.J.', 'Iowa City', 'Richmond, Va.'}

localus = {'Indianapolis', 'San Francisco', 'Cincinnati',
           'Baltimore', 'Los Angeles',
           'Cleveland', 'Pleasantville, N.Y.', 'Washington', 'Washington, D.C.', 
           'Tallahassee, Fla.', 'Hartford, Conn.', 'Los Angeles, Calif.', 'San Diego',
               'Evanston, Ill.', 'Hartford', 'Cambridge, Mass.', 'Providence', 'Pittsburgh, Pa.',
               'Chicago, Ill.', 'Minneapolis, Minn.', 'Albany', 'Santa Barbara', 'St. Louis',
               'Berkeley', 'Englewood Cliffs, N.J.', 'Iowa City', 'Richmond, Va.'}

localnonus = {'Edinburgh', 'New Delhi', 'Sydney', 'Dublin', 'Nairobi', 'Moscow',
             'Paris', 'Calcutta', 'Bombay', 'Glasgow', 'Delhi', 'Melbourne', 'Cape Town'}

In [167]:
authorgroups = supp2.groupby('author')
ratios = []
numbernot = []
numberyes = []
dates = []
nations = []
kansas = []
notinkansas = []

for author, df in authorgroups:
    if author not in mapped:
        continue
    else:
        newctr = 0
        kansasctr = 0
        notinkansasctr = 0
        for imprint in df.imprint:
            if pd.isnull(imprint):
                continue
            parts = imprint.split('|')
            city = parts[0]
            if ';' in city:
                city = city.split(';')[0]
                
            if city in yankeecities:
                newctr += 1
            if city in localus:
                kansasctr += 1
            if city in localnonus:
                notinkansasctr += 1
        totalvols = len(df.imprint)
        ratio = newctr / totalvols
        
        nation = nationalities[mapped[author]]
        if nation == 'us':
            nations.append(1)
        else:
            nations.append(0)
        
        ratios.append(ratio)
        dates.append(np.mean(df.latestcomp))
        numberyes.append(newctr)
        numbernot.append(len(df.imprint) - newctr)
        kansas.append(kansasctr)
        notinkansas.append((notinkansasctr + .01) / totalvols)
        
        
X = pd.DataFrame({'pubplaces': ratios, 'latestcomp': dates, 'not': numbernot, 
                  'yes': numberyes, 'kansas': kansas, 'notinks': notinkansas})
y = np.array(nations)

In [168]:
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.469425
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 3637
Model:                          Logit   Df Residuals:                     3631
Method:                           MLE   Df Model:                            5
Date:                Sat, 14 Jul 2018   Pseudo R-squ.:                  0.3176
Time:                        14:03:10   Log-Likelihood:                -1707.3
converged:                       True   LL-Null:                       -2502.0
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
kansas         0.8334      0.217      3.848      0.000       0.409       1.258
latestcomp -1.691e-05   6.25e

In [169]:
predictions = result.predict(X)
predictions[0:10]

0    0.335009
1    0.335024
2    0.334994
3    0.787403
4    0.025506
5    0.787440
6    0.334893
7    0.787304
8    0.335284
9    0.679127
dtype: float64

In [170]:
gotright = 0
gotwrong = 0

for p, r in zip(predictions, y):
    if p > 0.5 and r > 0.5:
        gotright += 1
    elif p < 0.5 and r < 0.5:
        gotright += 1
    else:
        gotwrong += 1

gotright / (gotright + gotwrong)

0.7954357987352213

In [171]:
((39817 - 16577) * 0.205) / 39817

0.11965240977471933

In [172]:
unmatched = supp2.loc[pd.isnull(supp2.nationality), : ]
unmatched.shape

(23240, 12)

In [173]:
ctr = 1
for idx, row in unmatched.iterrows():
    if pd.isnull(row['author']) or len(row['author']) < 3:
        unmatched.loc[idx, 'author'] = 'anonym' + str(ctr)
        ctr += 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [174]:
missingauths = unmatched.groupby('author')
print(len(missingauths))

17705


In [175]:
ratios = []
numbernot = []
numberyes = []
dates = []
kansas = []
notinkansas = []
themissingauthors = []

for author, df in missingauths:
    newctr = 0
    kansasctr = 0
    notinkansasctr = 0
    for imprint in df.imprint:
        if pd.isnull(imprint):
            continue
        parts = imprint.split('|')
        city = parts[0]
        if ';' in city:
            city = city.split(';')[0]

        if city in yankeecities:
            newctr += 1
        if city in localus:
            kansasctr += 1
        if city in localnonus:
            notinkansasctr += 1
    totalvols = len(df.imprint)
    ratio = newctr / totalvols

    ratios.append(ratio)
    dates.append(np.mean(df.latestcomp))
    numberyes.append(newctr)
    numbernot.append(len(df.imprint) - newctr)
    kansas.append(kansasctr)
    notinkansas.append((notinkansasctr + .01) / totalvols)
    themissingauthors.append(author)
        
        
X = pd.DataFrame({'pubplaces': ratios, 'latestcomp': dates, 'not': numbernot, 
                  'yes': numberyes, 'kansas': kansas, 'notinks': notinkansas})

In [176]:
newpredictions = result.predict(X)
X = X.assign(pred = newpredictions)
X = X.assign(auth = themissingauthors)

In [177]:
X.head()

Unnamed: 0,kansas,latestcomp,not,notinks,pubplaces,yes,pred,auth
0,0,1830.0,1,0.01,0.0,0,0.335371,"(Goodwin, William)"
1,0,2009.0,1,1.01,0.0,0,0.140886,"&#xd6;stergren, Klas"
2,0,1937.0,1,0.01,0.0,0,0.334968,"'Edith', pseud"
3,0,1824.428571,6,0.287143,0.142857,1,0.019486,", Lady"
4,0,1993.0,1,0.01,0.0,0,0.334757,"1947- Tan, Mark"


In [178]:
mapthemissing = dict()
for idx, row in X.iterrows():
    if row['pred'] > 0.5:
        mapthemissing[row['auth']] = 'guess: us'
    else:
        mapthemissing[row['auth']] = 'guess: non-us'

In [179]:
biggest_missing = Counter()
for name, df in missingauths:
    biggest_missing[name] = len(df.author)
bigs = biggest_missing.most_common(100)
for aname, acount in bigs:
    print(aname, acount, mapthemissing[aname])

Gore, (Catherine Grace Frances), Mrs 63 guess: non-us
Riddell, J. H., Mrs 32 guess: non-us
Marryat, Florence R. M. Church Lean 31 guess: non-us
Ward, R. Plumer (Robert Plumer) 25 guess: non-us
Lee, Holme 22 guess: non-us
Carlyle, Thomas 22 guess: non-us
Grey, (Elizabeth Caroline), Mrs 20 guess: non-us
Countess of, Blessington, Marguerite 19 guess: non-us
Hanley, James 18 guess: non-us
Zola, Émile 18 guess: non-us
Chatterton, Georgiana, Lady 17 guess: non-us
Aimard, Gustave 17 guess: non-us
Haliburton, Thomas Chandler 17 guess: non-us
Bray 16 guess: non-us
Charlotte Elizabeth 16 guess: us
M'Donogh, Felix 15 guess: non-us
Yerby, Frank 14 guess: us
Andersen, H. C. (Hans Christian) 14 guess: us
Morgan 14 guess: non-us
Sienkiewicz, Henryk 14 guess: us
Mühlbach, L. (Luise) 13 guess: us
Praed, Campbell, Mrs 13 guess: non-us
Lauder, Thomas Dick, Sir 13 guess: non-us
Hentz, Caroline Lee 13 guess: us
Jókai, Mór 13 guess: non-us
Freiherr de, La Motte-Fouqué, Friedrich Heinrich Karl 12 guess: non-

In [180]:
mapthemissing['Mühlbach, L. (Luise)'] = 'de'
mapthemissing['M?_hlbach, L'] = 'de'
mapthemissing['Andersen, H. C. (Hans Christian)'] = 'da'
mapthemissing['Sienkiewicz, Henryk'] = 'po'

In [181]:
docid2nation = dict()

for idx, row in unmatched.iterrows():
    auth = row['author']
    if auth not in mapthemissing:
        print('Danger, Will Robinson.')
    else:
        docid2nation[row['docid']] = mapthemissing[auth]


In [182]:
for idx, row in supp2.iterrows():
    if not pd.isnull(row['nationality']):
        continue
    elif row['docid'] not in docid2nation:
        print('Danger!')
    else:
        supp2.loc[idx, 'nationality'] = docid2nation[row['docid']]
        

In [183]:
sum(pd.isnull(supp2.nationality))

0

In [184]:
supp2.to_csv('../supplement2/supp2nationalitymeta.tsv', sep = '\t', index = False)