In [1]:
import pandas as pd

# name_common,age,mlb_ID,player_ID,year_ID,team_ID,stint_ID,lg_ID,PA,G,Inn,runs_bat,runs_br,runs_dp,runs_field,runs_infield,runs_outfield,runs_catcher,runs_good_plays,runs_defense,runs_position,runs_position_p,runs_replacement,runs_above_rep,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAA,WAA_off,WAA_def,WAR,WAR_def,WAR_off,WAR_rep,salary,pitcher,teamRpG,oppRpG,oppRpPA_rep,oppRpG_rep,pyth_exponent,pyth_exponent_rep,waa_win_perc,waa_win_perc_off,waa_win_perc_def,waa_win_perc_rep,OPS_plus,TOB_lg,TB_lg
# David Aardsma,22,430911,aardsda01,2004,SFG,1,NL,0,11,10.7,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.00,0.00,-0.01,0.00,-0.01,0.00,0.00,300000,Y,4.67092,4.67092,0.08651,4.67092,1.890,1.890,0.5000,0.5000,0.5000,0.5000,NULL,0.000,0.000

# Read the batting and pitching WAR tables (comma-separated, first row is the header)
df1 = pd.read_csv('war/war_daily_bat.txt')
df2 = pd.read_csv('war/war_daily_pitch.txt')

# Pull the player-name column out of each table
names1 = df1.loc[:, 'name_common']
names2 = df2.loc[:, 'name_common']

# Stack both name columns and keep only the first occurrence of each name
names = pd.concat([names1, names2]).drop_duplicates()

# Build one newline-separated string of every distinct name, then lowercase it
joined_names = '\n'.join(names)
full_mlb_names = joined_names.lower()

In [2]:
seen = set()
uniq = []
# Key every name by all of its space-separated tokens except the last one and
# keep only the first name encountered for each key
for full_name in names:
    *leading_tokens, _last_token = full_name.split(' ')
    key = tuple(leading_tokens)
    if key not in seen:
        seen.add(key)
        uniq.append(full_name)

uniq.sort()
# Write the surviving names to mlb.txt, one per line
with open('mlb.txt', 'w') as f:
    f.writelines(f'{entry}\n' for entry in uniq)

In [3]:
from collections import defaultdict

cumulative_count = defaultdict(int)
cumulative_percent = defaultdict(int) # didn't end up using this

# Walk the yearly files yob1901.txt through yob2000.txt
for year in range(1901, 2001):
    # Collect (name, count) for every line whose second field is 'M'
    male_rows = []
    with open(f'names/yob{year}.txt', 'r') as f:
        for line in f:
            fields = line.split(',')
            if fields[1] == 'M':
                male_rows.append((fields[0], int(fields[2])))
    # Sum of the counts kept for this year
    total = sum(count for _, count in male_rows)
    # Accumulate the raw count and this year's share of the total for each name
    for name, count in male_rows:
        cumulative_count[name] += count
        cumulative_percent[name] += count / total

In [4]:
# Order the names from the largest cumulative count to the smallest
count_sorted_names = sorted(cumulative_count, key=cumulative_count.get, reverse=True)
count_sorted_names[:10]

['James',
 'John',
 'Robert',
 'Michael',
 'William',
 'David',
 'Richard',
 'Joseph',
 'Charles',
 'Thomas']

In [5]:
import re
# Read mlb.txt as one lowercase string. The leading newline lets the first
# line match the '\n<name> ' probe exactly like every other line.
with open('mlb.txt', 'r') as f:
    mlb_names = '\n' + f.read().lower()

# Stop once this many missing names have been collected
MAX_MISSING_NAMES = 50

missing_names = []
# Walk the names from highest to lowest cumulative count and record
# (name, index in count_sorted_names) for each one that never appears as the
# first space-terminated token of a line in mlb_names.
for i, name in enumerate(count_sorted_names):
    # Plain substring test: the name is matched literally, so it cannot be
    # misread as regex syntax, and no pattern has to be compiled per name.
    if f'\n{name.lower()} ' not in mlb_names:
        missing_names.append((name, i))
        if len(missing_names) == MAX_MISSING_NAMES:
            break

missing_names

[('Kenneth', 17),
 ('Douglas', 46),
 ('Benjamin', 53),
 ('Zachary', 74),
 ('Jeffery', 126),
 ('Clinton', 239),
 ('Raul', 293),
 ('Clifton', 298),
 ('Johnathan', 305),
 ('Mathew', 314),
 ('Fredrick', 319),
 ('Jonathon', 335),
 ('Geoffrey', 344),
 ('Salvatore', 351),
 ('Edmund', 367),
 ('Delbert', 373),
 ('Nicolas', 385),
 ('Roderick', 389),
 ('Dominick', 409),
 ('Woodrow', 412),
 ('Bryant', 413),
 ('Laurence', 414),
 ('Maxwell', 420),
 ('Terence', 456),
 ('Billie', 462),
 ('Stewart', 481),
 ('Sebastian', 482),
 ('Wilfred', 492),
 ('Quentin', 497),
 ('Nickolas', 500),
 ('Trenton', 505),
 ('Demetrius', 513),
 ('August', 522),
 ('Timmy', 523),
 ('Eldon', 525),
 ('Stephan', 534),
 ('Cary', 538),
 ('Malik', 543),
 ('Zachery', 548),
 ('Avery', 551),
 ('Guadalupe', 557),
 ('Edmond', 558),
 ('Antoine', 569),
 ('Burton', 573),
 ('Rodger', 575),
 ('Lamont', 581),
 ('Tomas', 588),
 ('Sammie', 589),
 ('Solomon', 590),
 ('Elwood', 604)]

In [6]:
# Quality check: print any "missing" name that does in fact appear as the
# first space-terminated token of a full MLB name.
# full_mlb_names was lowercased when it was built, so the probe must be
# lowercased too; with the capitalised name the test could never match and the
# check would pass vacuously. A newline is prepended so the very first name in
# the string can match the '\n<name> ' probe as well.
searchable_names = '\n' + full_mlb_names
for name, i in missing_names:
    if f'\n{name.lower()} ' in searchable_names:
        print(name)

In [7]:
# Report each missing name alongside its index in the count-sorted list
for missing_name, rank in missing_names:
    print(missing_name, rank, sep=': ')

Kenneth: 17
Douglas: 46
Benjamin: 53
Zachary: 74
Jeffery: 126
Clinton: 239
Raul: 293
Clifton: 298
Johnathan: 305
Mathew: 314
Fredrick: 319
Jonathon: 335
Geoffrey: 344
Salvatore: 351
Edmund: 367
Delbert: 373
Nicolas: 385
Roderick: 389
Dominick: 409
Woodrow: 412
Bryant: 413
Laurence: 414
Maxwell: 420
Terence: 456
Billie: 462
Stewart: 481
Sebastian: 482
Wilfred: 492
Quentin: 497
Nickolas: 500
Trenton: 505
Demetrius: 513
August: 522
Timmy: 523
Eldon: 525
Stephan: 534
Cary: 538
Malik: 543
Zachery: 548
Avery: 551
Guadalupe: 557
Edmond: 558
Antoine: 569
Burton: 573
Rodger: 575
Lamont: 581
Tomas: 588
Sammie: 589
Solomon: 590
Elwood: 604
