需求：
    导入文件，查看原始数据
    将人口数据和各州简称数据进行合并
    将合并的数据中重复的abbreviation列进行删除
    查看存在缺失数据的列
    找到有哪些state/region使得state的值为NaN，进行去重操作
    为找到的这些state/region的state项补上正确的值，从而去除掉state这一列的所有NaN
    合并各州面积数据areas
    我们会发现area(sq.mi)这一列有缺失数据，找出是哪些行
    去除含有缺失数据的行
    找出2010年的全民人口数据
    计算各州的人口密度
    排序，并找出人口密度最高的五个州 df.sort_values()

In [132]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [133]:
# Load the population table from CSV and preview its first five rows.
pl = pd.read_csv("./state-population.csv")
pl.head()

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [152]:
# Load the state-areas table from CSV; it is merged in by a later cell.
ar = pd.read_csv("./state-areas.csv")

In [135]:
# Load the state-name/abbreviation table from CSV and preview its first
# five rows.
abb = pd.read_csv("./state-abbrevs.csv")
abb.head()

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [136]:
# Outer-join the population rows with the abbreviation table, matching the
# population "state/region" code against the "abbreviation" column.
pl_abb = pl.merge(
    abb,
    how="outer",
    left_on="state/region",
    right_on="abbreviation",
)
pl_abb.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489.0,Alabama,AL
1,AL,total,2012,4817528.0,Alabama,AL
2,AL,under18,2010,1130966.0,Alabama,AL
3,AL,total,2010,4785570.0,Alabama,AL
4,AL,under18,2011,1125763.0,Alabama,AL


In [137]:
# After the merge, "abbreviation" duplicates "state/region"; drop it in place.
pl_abb.drop(columns=["abbreviation"], inplace=True)

In [138]:
# Preview the first five rows after the "abbreviation" column was dropped.
pl_abb.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [139]:
# Build the element-wise missing-value mask and preview its first five rows.
pl_abb.isna().head()

Unnamed: 0,state/region,ages,year,population,state
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False


In [140]:
# Flag the rows whose "state" is missing and preview the first five flags.
pl_abb["state"].isna().head()

0    False
1    False
2    False
3    False
4    False
Name: state, dtype: bool

In [141]:
# Show the first five rows whose "state" value is missing.
pl_abb[pl_abb["state"].isna()].head()

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,


In [143]:
# Series.unique() removes duplicates: list each distinct "state/region"
# code that appears on a row with a missing "state".
pl_abb.loc[pl_abb["state"].isna(), "state/region"].unique()

array(['PR', 'USA'], dtype=object)

In [199]:
# Inspect the "state" values of the rows whose code is "PR".
pl_abb.loc[pl_abb["state/region"] == "PR", "state"]

2448    Puerto Rico
2449    Puerto Rico
2450    Puerto Rico
2451    Puerto Rico
2452    Puerto Rico
2453    Puerto Rico
2454    Puerto Rico
2455    Puerto Rico
2456    Puerto Rico
2457    Puerto Rico
2458    Puerto Rico
2459    Puerto Rico
2460    Puerto Rico
2461    Puerto Rico
2462    Puerto Rico
2463    Puerto Rico
2464    Puerto Rico
2465    Puerto Rico
2466    Puerto Rico
2467    Puerto Rico
2468    Puerto Rico
2469    Puerto Rico
2470    Puerto Rico
2471    Puerto Rico
2472    Puerto Rico
2473    Puerto Rico
2474    Puerto Rico
2475    Puerto Rico
2476    Puerto Rico
2477    Puerto Rico
2478    Puerto Rico
2479    Puerto Rico
2480    Puerto Rico
2481    Puerto Rico
2482    Puerto Rico
2483    Puerto Rico
2484    Puerto Rico
2485    Puerto Rico
2486    Puerto Rico
2487    Puerto Rico
2488    Puerto Rico
2489    Puerto Rico
2490    Puerto Rico
2491    Puerto Rico
2492    Puerto Rico
2493    Puerto Rico
2494    Puerto Rico
2495    Puerto Rico
Name: state, dtype: object

In [200]:
# Fill in the full name for every code that had no match in the
# abbreviation table. Both "PR" and "USA" were found with a missing
# "state", so both are set here; otherwise the "USA" rows would keep NaN.
pl_abb.loc[pl_abb["state/region"] == "PR", "state"] = "Puerto Rico"
pl_abb.loc[pl_abb["state/region"] == "USA", "state"] = "United States"

In [172]:
# Outer-join the per-state area table onto the population data. The join
# key is named explicitly so the merge cannot silently widen to any other
# column name the two frames might happen to share.
pl_abb_area = pd.merge(pl_abb, ar, on="state", how="outer")

In [173]:
# The "area (sq. mi)" column has gaps after the merge; build a boolean mask
# that is True on the rows where the area is missing.
null_area = pl_abb_area["area (sq. mi)"].isna()

In [175]:
# Collect the index labels of the rows flagged by null_area.
indexs = pl_abb_area[null_area].index

In [176]:
# Drop, in place, the rows whose labels were collected in indexs.
pl_abb_area.drop(index=indexs, inplace=True)

In [183]:
# Preview the first five rows for year 2010 with ages == 'total'.
# The filtered result is only displayed; it is not assigned to a variable.
pl_abb_area.query("year == 2010 & ages=='total'").head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423.0
91,AK,total,2010,713868.0,Alaska,656425.0
101,AZ,total,2010,6408790.0,Arizona,114006.0
189,AR,total,2010,2922280.0,Arkansas,53182.0
197,CA,total,2010,37333601.0,California,163707.0


In [187]:
# Population density: population divided by area, computed row by row and
# stored in a new "density" column.
pl_abb_area["density"] = pl_abb_area["population"].div(
    pl_abb_area["area (sq. mi)"]
)

In [189]:
# Remove a stray "[density" column if one exists (presumably left over from
# a mistyped assignment in an earlier run). errors="ignore" keeps this cell
# from raising KeyError when the frame has no column with that label.
pl_abb_area.drop(columns="[density", errors="ignore", inplace=True)

In [190]:
# Display the frame, which now includes the "density" column.
pl_abb_area

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi),density
0,AL,under18,2012,1117489.0,Alabama,52423.0,21.316769
1,AL,total,2012,4817528.0,Alabama,52423.0,91.897221
2,AL,under18,2010,1130966.0,Alabama,52423.0,21.573851
3,AL,total,2010,4785570.0,Alabama,52423.0,91.287603
4,AL,under18,2011,1125763.0,Alabama,52423.0,21.474601
5,AL,total,2011,4801627.0,Alabama,52423.0,91.593900
6,AL,total,2009,4757938.0,Alabama,52423.0,90.760506
7,AL,under18,2009,1134192.0,Alabama,52423.0,21.635389
8,AL,under18,2013,1111481.0,Alabama,52423.0,21.202163
9,AL,total,2013,4833722.0,Alabama,52423.0,92.206131


In [193]:
# First step toward removing rows with NaN: flag the rows whose
# "population" is missing and preview the first five flags.
pl_abb_area["population"].isna().head()

0    False
1    False
2    False
3    False
4    False
Name: population, dtype: bool

In [197]:
# Collect the labels of the rows with a missing "population", then drop
# those rows in place.
indexs = pl_abb_area[pl_abb_area["population"].isna()].index
pl_abb_area.drop(index=indexs, inplace=True)

In [198]:
# Display the frame after the rows with a missing "population" were dropped.
pl_abb_area

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi),density
0,AL,under18,2012,1117489.0,Alabama,52423.0,21.316769
1,AL,total,2012,4817528.0,Alabama,52423.0,91.897221
2,AL,under18,2010,1130966.0,Alabama,52423.0,21.573851
3,AL,total,2010,4785570.0,Alabama,52423.0,91.287603
4,AL,under18,2011,1125763.0,Alabama,52423.0,21.474601
5,AL,total,2011,4801627.0,Alabama,52423.0,91.593900
6,AL,total,2009,4757938.0,Alabama,52423.0,90.760506
7,AL,under18,2009,1134192.0,Alabama,52423.0,21.635389
8,AL,under18,2013,1111481.0,Alabama,52423.0,21.202163
9,AL,total,2013,4833722.0,Alabama,52423.0,92.206131
