# US Baby Names
- NationalNames.csv
- Id: 행 구분자
- Name: 유아 이름
- Year: 출생연도
- Gender: 성별
- Count: 해당 (이름, 연도, 성별)의 출생횟수

In [2]:
%matplotlib nbagg
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

- 1. 각 (연도, 성별) 그룹의 총 출생횟수 산출하기

In [3]:
df_names = pd.read_csv("data/us-baby-names/NationalNames.csv", sep=",",
                     header=0, names=["id", "name", "year", "sex", "births"])
df_names.head()

Unnamed: 0,id,name,year,sex,births
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746


In [4]:
df_total_births = df_names.pivot_table("births", index="year", columns="sex", aggfunc=sum)

In [5]:
df_total_births.head()

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,90993,110491
1881,91954,100745
1882,107850,113688
1883,112321,104629
1884,129022,114445


In [6]:
gp_ax = df_total_births.plot()
gp_ax.set_title("Total births by sex and year")

<IPython.core.display.Javascript object>

Text(0.5,1,'Total births by sex and year')

- 2.각 (연도, 성별) 그룹 내에서 각 이름의 출생횟수가 전체에서 차지하는 비중(%)을 나타내는 열 추가하기

In [27]:
df_names.head(10)

Unnamed: 0,id,name,year,sex,births
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
5,6,Margaret,1880,F,1578
6,7,Ida,1880,F,1472
7,8,Alice,1880,F,1414
8,9,Bertha,1880,F,1320
9,10,Sarah,1880,F,1288


In [8]:
dict_grouped_names = dict(list(df_names.groupby(["year", "sex"])))

In [10]:
dict_grouped_names.keys()

dict_keys([(1880, 'F'), (1880, 'M'), (1881, 'F'), (1881, 'M'), (1882, 'F'), (1882, 'M'), (1883, 'F'), (1883, 'M'), (1884, 'F'), (1884, 'M'), (1885, 'F'), (1885, 'M'), (1886, 'F'), (1886, 'M'), (1887, 'F'), (1887, 'M'), (1888, 'F'), (1888, 'M'), (1889, 'F'), (1889, 'M'), (1890, 'F'), (1890, 'M'), (1891, 'F'), (1891, 'M'), (1892, 'F'), (1892, 'M'), (1893, 'F'), (1893, 'M'), (1894, 'F'), (1894, 'M'), (1895, 'F'), (1895, 'M'), (1896, 'F'), (1896, 'M'), (1897, 'F'), (1897, 'M'), (1898, 'F'), (1898, 'M'), (1899, 'F'), (1899, 'M'), (1900, 'F'), (1900, 'M'), (1901, 'F'), (1901, 'M'), (1902, 'F'), (1902, 'M'), (1903, 'F'), (1903, 'M'), (1904, 'F'), (1904, 'M'), (1905, 'F'), (1905, 'M'), (1906, 'F'), (1906, 'M'), (1907, 'F'), (1907, 'M'), (1908, 'F'), (1908, 'M'), (1909, 'F'), (1909, 'M'), (1910, 'F'), (1910, 'M'), (1911, 'F'), (1911, 'M'), (1912, 'F'), (1912, 'M'), (1913, 'F'), (1913, 'M'), (1914, 'F'), (1914, 'M'), (1915, 'F'), (1915, 'M'), (1916, 'F'), (1916, 'M'), (1917, 'F'), (1917, 'M'), (

In [11]:
df_grouped_sample = dict_grouped_names[(2011, 'M')]
df_grouped_sample.head(10)

Unnamed: 0,id,name,year,sex,births
1711173,1711174,Jacob,2011,M,20331
1711174,1711175,Mason,2011,M,19488
1711175,1711176,William,2011,M,17314
1711176,1711177,Jayden,2011,M,16954
1711177,1711178,Noah,2011,M,16838
1711178,1711179,Michael,2011,M,16744
1711179,1711180,Ethan,2011,M,16665
1711180,1711181,Alexander,2011,M,15681
1711181,1711182,Aiden,2011,M,15469
1711182,1711183,Daniel,2011,M,15249


In [12]:
def add_prop(agg_df):
    agg_births = agg_df["births"]
    agg_df["prop"] = agg_births / agg_births.sum()
    return agg_df

In [13]:
df_names_with_prop = df_names.groupby(["year", "sex"]).apply(add_prop)
df_names_with_prop.head()

Unnamed: 0,id,name,year,sex,births,prop
0,1,Mary,1880,F,7065,0.077643
1,2,Anna,1880,F,2604,0.028618
2,3,Emma,1880,F,2003,0.022013
3,4,Elizabeth,1880,F,1939,0.021309
4,5,Minnie,1880,F,1746,0.019188


- 3.각 (연도, 성별) 그룹 내 출생횟수 기준 TOP 1000 이름 추출하기

In [14]:
dict_grouped_names_with_prop = dict(list(df_names_with_prop.groupby(["year", "sex"])))

In [17]:
df_grouped_sample = dict_grouped_names_with_prop[(2011, 'M')]

In [18]:
df_grouped_sample.sort_values(by="births", ascending=False).iloc[:10]

Unnamed: 0,id,name,year,sex,births,prop
1711173,1711174,Jacob,2011,M,20331,0.010739
1711174,1711175,Mason,2011,M,19488,0.010294
1711175,1711176,William,2011,M,17314,0.009145
1711176,1711177,Jayden,2011,M,16954,0.008955
1711177,1711178,Noah,2011,M,16838,0.008894
1711178,1711179,Michael,2011,M,16744,0.008844
1711179,1711180,Ethan,2011,M,16665,0.008802
1711180,1711181,Alexander,2011,M,15681,0.008283
1711181,1711182,Aiden,2011,M,15469,0.008171
1711182,1711183,Daniel,2011,M,15249,0.008054


In [19]:
def get_top1000(agg_df):
    df_top1000 = agg_df.sort_values(by="births", ascending=False).iloc[:1000]
    return df_top1000

In [21]:
df_top1000_names = df_names_with_prop.groupby(["year", "sex"]).apply(get_top1000)

In [23]:
df_top1000_names.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,name,year,sex,births,prop
year,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1880,F,0,1,Mary,1880,F,7065,0.077643
1880,F,1,2,Anna,1880,F,2604,0.028618
1880,F,2,3,Emma,1880,F,2003,0.022013
1880,F,3,4,Elizabeth,1880,F,1939,0.021309
1880,F,4,5,Minnie,1880,F,1746,0.019188


In [25]:
df_names_with_prop.loc[(df_names_with_prop['year'] == 2011) & (df_names_with_prop['sex'] == 'M')].shape

(14329, 6)

In [26]:
df_top1000_names.loc[(2011,'M')].shape

(1000, 6)