In [1]:
# Rank US states and territories by their 2010 population density

# repo = "https://raw.githubusercontent.com/jakevdp/data-USstates/master"
# !cd data && curl -O {repo}/state-population.csv
# !cd data && curl -O {repo}/state-areas.csv
# !cd data && curl -O {repo}/state-abbrevs.csv

import pandas as pd 
import numpy as np 

In [2]:
# Load the three source tables (population, areas, abbreviations) from disk.
# NOTE(review): this is an absolute, machine-specific directory — consider
# making the data location configurable.
path = "/__PROJECTS/aa_CommonData/ml/"
pop, areas, abbrevs = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "state-population.csv",
        "state-areas.csv",
        "state-abbrevs.csv",
    )
)

In [3]:
# Show the first rows of each input table, in loading order, as one tuple.
tuple(frame.head() for frame in (pop, areas, abbrevs))

(  state/region     ages  year  population
 0           AL  under18  2012   1117489.0
 1           AL    total  2012   4817528.0
 2           AL  under18  2010   1130966.0
 3           AL    total  2010   4785570.0
 4           AL  under18  2011   1125763.0,
         state  area (sq. mi)
 0     Alabama          52423
 1      Alaska         656425
 2     Arizona         114006
 3    Arkansas          53182
 4  California         163707,
         state abbreviation
 0     Alabama           AL
 1      Alaska           AK
 2     Arizona           AZ
 3    Arkansas           AR
 4  California           CA)

In [None]:
# we'll start with a many-to-one merge that will give us the full 
# state names within the population DataFrame 

# we want to merge based on the state/region column of pop and abbreviation 
# column of abbrevs
# we'll use how="outer" to make sure no data is thrown away

In [4]:
# Outer-join the population rows with the abbreviation table: the
# "state/region" code on the left is matched against "abbreviation" on the
# right, so every population row gains a full state name where one exists.
merged = pop.merge(
    abbrevs,
    how="outer",
    left_on="state/region",
    right_on="abbreviation",
).drop("abbreviation", axis="columns")  # duplicates the "state/region" codes
merged.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [5]:
# Per-column flag: True if the column holds at least one missing value.
merged.isnull().any(axis=0)

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

In [6]:
# Look at the rows whose population value is missing.
merged.loc[merged["population"].isnull()].head()

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,


In [7]:
# Same kind of check for the "state" column: list the region codes that
# received no full state name from the merge.
state_missing = merged["state"].isnull()
merged.loc[state_missing, "state/region"].unique()

array(['PR', 'USA'], dtype=object)

In [8]:
# Repair the data: supply the full names for the two region codes that the
# abbreviation table did not cover.
missing_state_names = {"PR": "Puerto Rico", "USA": "United States"}
for region_code, full_name in missing_state_names.items():
    merged.loc[merged["state/region"] == region_code, "state"] = full_name

# Re-run the null check after the fix to confirm "state" is now complete.
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

In [9]:
# Continue merging: attach each state's area by full state name, keeping
# every row of `merged` (left join).
final = merged.merge(areas, how="left", on="state")
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [10]:
# Per-column flag: True if the column holds at least one missing value.
final.isnull().any(axis=0)

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [12]:
# Ideally the column names would live in an enum (or at least in constants);
# selecting by raw string literals like this is ugly.
# List the states that have no area value after the merge.
area_missing = final['area (sq. mi)'].isnull()
final.loc[area_missing, 'state'].unique()

array(['United States'], dtype=object)

In [13]:
# Discard every row that still contains a missing value; the name is rebound
# to the filtered frame rather than mutating it in place.
final = final.dropna()
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [14]:
# First, cut out the 2010 rows for the "total" age group.
# Original note: query() requires the NumExpr package to be installed.
rows_2010_total = "year == 2010 & ages == 'total'"
data2010 = final.query(rows_2010_total)
data2010.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423.0
91,AK,total,2010,713868.0,Alaska,656425.0
101,AZ,total,2010,6408790.0,Arizona,114006.0
189,AR,total,2010,2922280.0,Arkansas,53182.0
197,CA,total,2010,37333601.0,California,163707.0


In [15]:
# Index the 2010 rows by state name so that values derived from them are
# labelled by state. The guarded rebinding (instead of inplace=True) keeps
# this cell safe to re-run: a second in-place set_index("state") raises
# KeyError because "state" has already been moved out of the columns.
if data2010.index.name != "state":
    data2010 = data2010.set_index("state")

In [17]:
# Population density: residents per square mile, aligned on the frame's index.
density = data2010['population'].div(data2010['area (sq. mi)'])

In [18]:
# Order from densest to sparsest, then show the top of the ranking.
density = density.sort_values(ascending=False)
density.head()

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [19]:
# The five least dense entries sit at the end of the descending ranking.
density.tail(5)

state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64