In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook
# Load the 1880 file; the column labels are supplied explicitly through `names=`.
names1880 = pd.read_csv('datasets/babynames/yob1880.txt', names = ['name', 'sex', 'births'])
names1880.head()

Unnamed: 0,name,sex,births
0,Mary,F,7065
1,Anna,F,2604
2,Emma,F,2003
3,Elizabeth,F,1939
4,Minnie,F,1746


In [15]:
names1880.groupby('sex').size()   # group by sex and count the rows in each group

sex
F     942
M    1058
dtype: int64

In [17]:
names1880.groupby('sex').births.sum()   # total births recorded for each sex

sex
F     90993
M    110493
Name: births, dtype: int64

In [19]:
# The data set is split into one file per year. Load every file, tag its rows
# with the year, and stack everything into a single DataFrame with pandas.concat.
columns = ['name', 'sex', 'births']
years = range(1880, 2011)  # half-open interval [1880, 2011), i.e. 1880 through 2010
pieces = [
    pd.read_csv('datasets/babynames/yob%d.txt' % year, names=columns).assign(year=year)
    for year in years
]

# Concatenate row-wise; ignore_index=True discards the per-file row numbers.
names = pd.concat(pieces, ignore_index=True)
names.head()

Unnamed: 0,name,sex,births,year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880


In [26]:
# Total births per year (rows) and sex (columns).
total_births = names.pivot_table(
    values='births',
    index=['year'],
    columns='sex',
    aggfunc='sum',
)
total_births.tail()  # the last five rows (tail defaults to n=5)

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2006,1896468,2050234
2007,1916888,2069242
2008,1883645,2032310
2009,1827643,1973359
2010,1759010,1898382


In [27]:
total_births.plot(title='Total births by sex and year')  # a single call is enough to draw the chart
# DataFrame.plot uses the index as the x axis and draws one line per column; the y range is chosen automatically.

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x126bf34a8>

In [32]:
# Add a `prop` column: each name's share of the total births in its group.
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``births`` column (one group handed over by
        ``groupby(...).apply``).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``group`` itself is left unmodified. Mutating the object
        passed in by ``groupby.apply`` is not supported by pandas, so the
        column is added with ``assign`` instead of item assignment.
    """
    return group.assign(prop=group.births / group.births.sum())
# Share of each name within its (year, sex) group.
# This is the vectorised equivalent of applying add_prop to every group: transform('sum')
# broadcasts each group's total back onto its rows, so the original row index is kept
# (a transform-style groupby.apply prepends the group keys to the index on pandas >= 2.0,
# which would make the later groupby(['year', 'sex']) ambiguous).
group_totals = names.groupby(['year', 'sex'])['births'].transform('sum')
names = names.assign(prop=names['births'] / group_totals)
names[:10]

Unnamed: 0,name,sex,births,year,prop
0,Mary,F,7065,1880,0.077643
1,Anna,F,2604,1880,0.028618
2,Emma,F,2003,1880,0.022013
3,Elizabeth,F,1939,1880,0.021309
4,Minnie,F,1746,1880,0.019188
5,Margaret,F,1578,1880,0.017342
6,Ida,F,1472,1880,0.016177
7,Alice,F,1414,1880,0.01554
8,Bertha,F,1320,1880,0.014507
9,Sarah,F,1288,1880,0.014155


In [33]:
names.groupby(['year', 'sex']).prop.sum()   # sanity check: prop should sum to 1 within every (year, sex) group

year  sex
1880  F      1.0
      M      1.0
1881  F      1.0
      M      1.0
1882  F      1.0
      M      1.0
1883  F      1.0
      M      1.0
1884  F      1.0
      M      1.0
1885  F      1.0
      M      1.0
1886  F      1.0
      M      1.0
1887  F      1.0
      M      1.0
1888  F      1.0
      M      1.0
1889  F      1.0
      M      1.0
1890  F      1.0
      M      1.0
1891  F      1.0
      M      1.0
1892  F      1.0
      M      1.0
1893  F      1.0
      M      1.0
1894  F      1.0
      M      1.0
            ... 
1996  F      1.0
      M      1.0
1997  F      1.0
      M      1.0
1998  F      1.0
      M      1.0
1999  F      1.0
      M      1.0
2000  F      1.0
      M      1.0
2001  F      1.0
      M      1.0
2002  F      1.0
      M      1.0
2003  F      1.0
      M      1.0
2004  F      1.0
      M      1.0
2005  F      1.0
      M      1.0
2006  F      1.0
      M      1.0
2007  F      1.0
      M      1.0
2008  F      1.0
      M      1.0
2009  F      1.0
    

In [37]:
def get_top1000(group):
    """Return the (at most) 1000 rows of ``group`` with the largest ``births``.

    Rows come back ordered by ``births`` in descending order and keep their
    original index labels.
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:1000]
# Take the top 1000 names of every (year, sex) group, then replace the resulting
# index with a fresh 0..n-1 range (drop=True discards the old index).
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000).reset_index(drop=True)
top1000.head()

Unnamed: 0,name,sex,births,year,prop
0,Mary,F,7065,1880,0.077643
1,Anna,F,2604,1880,0.028618
2,Emma,F,2003,1880,0.022013
3,Elizabeth,F,1939,1880,0.021309
4,Minnie,F,1746,1880,0.019188


In [39]:
# Split the top-1000 table by sex.
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
# Births per year (rows) and name (columns). The aggregation is named as the string 'sum',
# as in the earlier pivot by sex; passing the builtin `sum` is deprecated in recent pandas.
total_births = top1000.pivot_table('births', index=['year'], columns='name', aggfunc='sum')
total_births.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6868 entries, Aaden to Zuri
dtypes: float64(6868)
memory usage: 6.9 MB


In [43]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
# Plot disabled for now because the figure did not render completely:
#subset.plot(subplots=True, figsize=(12, 10), grid=False, title='Number of births per year')

In [45]:
# The names above have all become less common in recent years, so measure the share of
# births covered by the 1000 most popular names per year and sex.
# The aggregation is named as the string 'sum'; passing the builtin `sum` is deprecated in recent pandas.
table = top1000.pivot_table('prop', index=['year'], columns='sex', aggfunc='sum')
# The title is in Chinese: "share of total births covered by the top 1000 names".
table.plot(title='前1000个名字在总出生人数中的比例', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x12c4cabe0>

In [46]:
table  # the share covered by the top 1000 names generally shrinks as the years go on

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,1.000000,0.997375
1881,1.000000,1.000000
1882,0.998702,0.995646
1883,0.997596,0.998566
1884,0.993156,0.994539
1885,0.992251,0.995501
1886,0.989504,0.995035
1887,0.988279,0.996697
1888,0.984241,0.992429
1889,0.984061,0.994981


In [48]:
# Count the distinct names that make up the top 50% of births; start with boys' names in 2010 only.
df = boys[boys.year==2010]
df

Unnamed: 0,name,sex,births,year,prop
260877,Jacob,M,21875,2010,0.011523
260878,Ethan,M,17866,2010,0.009411
260879,Michael,M,17133,2010,0.009025
260880,Jayden,M,17030,2010,0.008971
260881,William,M,16870,2010,0.008887
260882,Alexander,M,16634,2010,0.008762
260883,Noah,M,16281,2010,0.008576
260884,Daniel,M,15679,2010,0.008259
260885,Aiden,M,15403,2010,0.008114
260886,Anthony,M,15364,2010,0.008093


In [53]:
# How many of these names, counted from the most popular downwards, does it take to cover 50% of births?
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()   # running total of prop in descending order
prop_cumsum.head()

260877    0.011523
260878    0.020934
260879    0.029959
260880    0.038930
260881    0.047817
Name: prop, dtype: float64

In [54]:
prop_cumsum.values.searchsorted(0.5)   # zero-based position where the running total reaches 0.5, so position + 1 names (117 for 2010) cover 50%

116

In [57]:
df = boys[boys.year == 1900]
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()
in1900.values.searchsorted(0.5)      # zero-based position, so position + 1 names (25 for 1900) cover 50%

24

In [64]:
# The two checks above show that in 1900 the top 25 names already covered 50% of
# births, whereas by 2010 greater variety meant it took the top 117 names.
# Repeat that calculation for every year and sex so the trend can be plotted.
def get_quantile_count(group, q=0.5):
    """Return how many of the most popular names are needed to reach share ``q``.

    The rows of ``group`` are ordered by ``prop`` from largest to smallest and
    accumulated; the result is the zero-based position at which the running
    total first reaches ``q``, plus one (so it is a count of names).
    """
    ranked = group.sort_values(by='prop', ascending=False)
    cumulative_share = ranked.prop.cumsum().values
    position = cumulative_share.searchsorted(q)
    return position + 1
# One count per (year, sex) group, pivoted so that each sex becomes a column.
diversity = (
    top1000.groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)
diversity.head(10)

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,38,14
1881,38,14
1882,38,15
1883,39,15
1884,39,16
1885,40,16
1886,41,16
1887,41,17
1888,42,17
1889,43,18


In [65]:
diversity.plot(title='Number of popular names in top 50%')  # author's reading of the chart: girls' names are more diverse than boys', with a sharp rise in the 1990s

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x132864828>

In [67]:
# The last-letter shift: the distribution of the final letter of boys' names
# has changed markedly over time.
names.head()

Unnamed: 0,name,sex,births,year,prop
0,Mary,F,7065,1880,0.077643
1,Anna,F,2604,1880,0.028618
2,Emma,F,2003,1880,0.022013
3,Elizabeth,F,1939,1880,0.021309
4,Minnie,F,1746,1880,0.019188


In [87]:
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]


# Final letter of every name, as a Series labelled 'last letter'.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last letter'
# Births by last letter (rows) and by sex and year (column levels). The aggregation is named
# as the string 'sum'; passing the builtin `sum` is deprecated in recent pandas.
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')
# Keep only three years from the 'year' column level.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

sex,F,F,F,M,M,M
year,1910,1960,2010,1910,1960,2010
last letter,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,108376.0,691247.0,670605.0,977.0,5204.0,28438.0
b,,694.0,450.0,411.0,3912.0,38859.0
c,5.0,49.0,946.0,482.0,15476.0,23125.0
d,6750.0,3729.0,2607.0,22111.0,262112.0,44398.0
e,133569.0,435013.0,313833.0,28655.0,178823.0,129012.0


In [69]:
# Column totals: total births for each (sex, year) column, used to turn the last-letter counts into proportions.
subtable.sum()

sex  year
F    1910     396416.0
     1960    2022062.0
     2010    1759010.0
M    1910     194198.0
     1960    2132588.0
     2010    1898382.0
dtype: float64

In [70]:
# Divide every column by its own total so that each (sex, year) column sums to 1.
letter_prop = subtable / subtable.sum()
letter_prop

sex,F,F,F,M,M,M
year,1910,1960,2010,1910,1960,2010
last letter,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,0.27339,0.341853,0.38124,0.005031,0.00244,0.01498
b,,0.000343,0.000256,0.002116,0.001834,0.02047
c,1.3e-05,2.4e-05,0.000538,0.002482,0.007257,0.012181
d,0.017028,0.001844,0.001482,0.113858,0.122908,0.023387
e,0.336941,0.215133,0.178415,0.147556,0.083853,0.067959
f,,1e-05,5.5e-05,0.000783,0.004325,0.001188
g,0.000144,0.000157,0.000374,0.00225,0.009488,0.001404
h,0.051529,0.036224,0.075852,0.045562,0.037907,0.05167
i,0.001526,0.039965,0.031734,0.000844,0.000603,0.022628
j,,,9e-05,,,0.000769


In [72]:
# Bar charts of the last-letter proportions: boys in the upper panel, girls in the lower one.
# `plt` (matplotlib.pyplot) is imported with the other imports in the first cell of the notebook.
fig, axes = plt.subplots(2, 1, figsize=(10,8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x12f2f47b8>

In [73]:
# Normalise each column of the full table by its own total.
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T  # select rows d/n/y and the 'M' columns, then transpose so the years become rows
dny_ts.head()

last letter,d,n,y
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1880,0.083055,0.153213,0.07576
1881,0.083247,0.153214,0.077451
1882,0.08534,0.14956,0.077537
1883,0.084066,0.151646,0.079144
1884,0.08612,0.149915,0.080405


In [78]:
# One line per last letter (d, n, y), plotted against year.
dny_ts.plot(title='Male last letter end with d, n, y')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x139d03ba8>

In [89]:
# Boys' names that became girls' names, e.g. Lesley or Leslie.
# Note: str.contains('lesl') matches 'lesl' anywhere in the lower-cased name, not only at the start.
all_names = pd.Series(top1000.name.unique())
lesley_like = all_names[all_names.str.lower().str.contains('lesl')]
lesley_like

632     Leslie
2294    Lesley
4262    Leslee
4728     Lesli
6103     Lesly
dtype: object

In [81]:
filtered = top1000[top1000.name.isin(lesley_like)]   # isin() keeps the rows whose name is one of the lesley_like values
filtered.groupby('name').births.sum()

name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64

In [91]:
# Aggregate births by year (rows) and sex (columns) for the Lesley-like names.
# The aggregation is named as the string 'sum'; passing the builtin `sum` is deprecated in recent pandas.
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
# Normalise each year: sum(axis=1) gives one total per row, and div(..., axis=0) aligns
# those totals with the row index so every row is divided by its own total.
table = table.div(table.sum(axis=1), axis=0)
table.head()

sex,F,M
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,0.091954,0.908046
1881,0.106796,0.893204
1882,0.065693,0.934307
1883,0.05303,0.94697
1884,0.107143,0.892857


In [85]:
# Plot both shares over time; the style dict draws M as a solid black line ('k-') and F as a dashed black line ('k--').
table.plot(title='rate of names like \'Lesley\' used by boy or girl', style={'M':'k-', 'F':'k--'})

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x139f797b8>

In [None]:
# 纵观全部代码  似乎全都认识  但是不知道怎么转化  不熟悉  到底该怎么处理数据  