In [1]:
import pandas as pd
import numpy as np
from uszipcode import SearchEngine



In [2]:
# Instantiate a Zipcode object.
search = SearchEngine(simple_zipcode=True)

In [4]:
# Read in the file containing the zip codes.
df = pd.read_csv('tj_zips.csv')
df.head()

Unnamed: 0,Zip Code,City,State,Population (2010 Census),Unnamed: 4,Macro Test,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,99546,Adak,AK,326,,326.0,,,,,
1,99571,Adak,AK,160,,,,,,,
2,99615,Akhiok,AK,12899,,,,,,,https://factfinder.census.gov/faces/nav/jsf/pa...
3,99551,Akiachak,AK,627,,,,,,,
4,99552,Akiak,AK,346,,,,,,,


In [6]:
## Rename columns to make things a little easier.
df.rename(columns={'Zip Code': 'Zip_Code', 'Population (2010 Census)': 'Pop_2010' }, inplace=True)
df.Pop_2010.replace('--', np.nan, inplace=True)
df = df.iloc[:, :4]
df.head()

Unnamed: 0,Zip_Code,City,State,Pop_2010
0,99546,Adak,AK,326
1,99571,Adak,AK,160
2,99615,Akhiok,AK,12899
3,99551,Akiachak,AK,627
4,99552,Akiak,AK,346


In [9]:
# Remove all rows that contain nan Zip_Code values
df.dropna(subset=['Zip_Code'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41319 entries, 0 to 41318
Data columns (total 4 columns):
Zip_Code    41319 non-null object
City        41319 non-null object
State       41319 non-null object
Pop_2010    33498 non-null object
dtypes: object(4)
memory usage: 1.6+ MB


In [10]:
# In the data, a row contains an invalid zip code.
df[df['Zip_Code'].str.contains(',')]

Unnamed: 0,Zip_Code,City,State,Pop_2010
5488,10190,Littleton,CO,--


In [12]:
# Remove that row.
df.drop(df[df['Zip_Code'].str.contains(',')].index, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 4 columns):
Zip_Code    41318 non-null object
City        41318 non-null object
State       41318 non-null object
Pop_2010    33497 non-null object
dtypes: object(4)
memory usage: 1.6+ MB


In [13]:
# Convert column of zip codes to integers.
df['Zip_Code'] = df.Zip_Code.astype(np.int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 4 columns):
Zip_Code    41318 non-null int64
City        41318 non-null object
State       41318 non-null object
Pop_2010    33497 non-null object
dtypes: int64(1), object(3)
memory usage: 1.6+ MB


In [45]:
search.by_zipcode('92106')

SimpleZipcode(zipcode='92106', zipcode_type='Standard', major_city='San Diego', post_office_city='San Diego, CA', common_city_list=['San Diego'], county='San Diego County', state='CA', lat=32.73, lng=-117.23, timezone='Pacific', radius_in_miles=2.0, area_code_list=['619'], population=19330, population_density=3426.0, land_area_in_sqmi=5.64, water_area_in_sqmi=0.71, housing_units=8322, occupied_housing_units=7807, median_home_value=834800, median_household_income=82075, bounds_west=-117.25896, bounds_east=-117.208362, bounds_north=32.752296, bounds_south=32.66296)

In [28]:
'3' + '4'

'34'

In [30]:
## Using uszipcode library
df['Pop_2010_uszipcode'] = [search.by_zipcode(str(z)).population for z in df['Zip_Code']]

In [31]:
df.head()

Unnamed: 0,Zip_Code,City,State,Pop_2010,Pop_2010_uszipcode
0,99546,Adak,AK,326,326.0
1,99571,Adak,AK,160,160.0
2,99615,Akhiok,AK,12899,12899.0
3,99551,Akiachak,AK,627,627.0
4,99552,Akiak,AK,346,346.0


In [32]:
## Using outside data source
zf = pd.read_csv('https://query.data.world/s/op67ehfhbfiugjxpzpr6rk3vfouzxp')
zf.columns = [col.replace('-', '_') for col in zf.columns]
common = set(df.Zip_Code) & set(zf.zip_code)
zf.set_index('zip_code', inplace=True)

In [33]:
zf.head()

Unnamed: 0_level_0,y_2016,y_2015,y_2014,y_2013,y_2012,y_2011,y_2010,aggregate
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
601,17800,17982,18088,18450,18544,18533,18570,127967
602,39716,40260,40859,41302,41640,41930,41520,287227
603,51565,52408,53162,53683,54540,54475,54689,374522
606,6320,6331,6415,6591,6593,6386,6615,45251
610,27976,28328,28805,28963,29141,29111,29016,201340


In [34]:
## Using dataworld link

years = range(2010, 2017)
na = np.nan

for year in years:
    name, yr = 'Pop_{}_dataworld'.format(year), 'y_{}'.format(year)
    df[name] = [zf.at[x, yr] if x in common else na for x in df.Zip_Code]

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 12 columns):
Zip_Code              41318 non-null int64
City                  41318 non-null object
State                 41318 non-null object
Pop_2010              33497 non-null object
Pop_2010_uszipcode    32987 non-null float64
Pop_2010_dataworld    32987 non-null float64
Pop_2011_dataworld    32987 non-null float64
Pop_2012_dataworld    32987 non-null float64
Pop_2013_dataworld    32987 non-null float64
Pop_2014_dataworld    32987 non-null float64
Pop_2015_dataworld    32987 non-null float64
Pop_2016_dataworld    32987 non-null float64
dtypes: float64(8), int64(1), object(3)
memory usage: 4.1+ MB


In [40]:
df.head()

Unnamed: 0,Zip_Code,City,State,Pop_2010,Pop_2010_uszipcode,Pop_2010_dataworld,Pop_2011_dataworld,Pop_2012_dataworld,Pop_2013_dataworld,Pop_2014_dataworld,Pop_2015_dataworld,Pop_2016_dataworld
0,99546,Adak,AK,326,326.0,326.0,124.0,127.0,108.0,114.0,122.0,122.0
1,99571,Adak,AK,160,160.0,160.0,55.0,82.0,101.0,83.0,88.0,106.0
2,99615,Akhiok,AK,12899,12899.0,12899.0,12609.0,12770.0,13009.0,13096.0,13197.0,13135.0
3,99551,Akiachak,AK,627,627.0,627.0,644.0,638.0,606.0,590.0,562.0,584.0
4,99552,Akiak,AK,346,346.0,346.0,331.0,364.0,392.0,407.0,399.0,407.0


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 12 columns):
Zip_Code              41318 non-null int64
City                  41318 non-null object
State                 41318 non-null object
Pop_2010              33497 non-null object
Pop_2010_uszipcode    32987 non-null float64
Pop_2010_dataworld    32987 non-null float64
Pop_2011_dataworld    32987 non-null float64
Pop_2012_dataworld    32987 non-null float64
Pop_2013_dataworld    32987 non-null float64
Pop_2014_dataworld    32987 non-null float64
Pop_2015_dataworld    32987 non-null float64
Pop_2016_dataworld    32987 non-null float64
dtypes: float64(8), int64(1), object(3)
memory usage: 4.1+ MB


In [41]:
## Write the dataframe to a csv file.
df.to_csv('tjs_populations_by_zip.csv')

In [47]:
df[df['Pop_2010_dataworld'] != df['Pop_2010_uszipcode']].loc[30, 'Pop_2010']

'--'

In [378]:
df[df.pop_2010_uszipcode != df.Pop_2010_dataworld][:10]

Unnamed: 0,Zip_Code,City,State,Pop_2010,Pop_2010_dataworld,pop_2010_uszipcode
20,99509,Anchorage,AK,,,
22,99511,Anchorage,AK,,,
24,99514,Anchorage,AK,0,,
30,99520,Anchorage,AK,--,,
31,99521,Anchorage,AK,--,,
32,99522,Anchorage,AK,--,,
33,99523,Anchorage,AK,--,,
34,99524,Anchorage,AK,--,,
35,99529,Anchorage,AK,--,,
36,99530,Anchorage,AK,--,,


In [57]:
## Write out to csv file.
df.to_csv('tjs_zipcodes_out.csv')

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 12 columns):
Zip_Code              41318 non-null int64
City                  41318 non-null object
State                 41318 non-null object
Pop_2010              26841 non-null object
Pop_2010_uszipcode    32987 non-null float64
Pop_2010_dataworld    32987 non-null float64
Pop_2011_dataworld    32987 non-null float64
Pop_2012_dataworld    32987 non-null float64
Pop_2013_dataworld    32987 non-null float64
Pop_2014_dataworld    32987 non-null float64
Pop_2015_dataworld    32987 non-null float64
Pop_2016_dataworld    32987 non-null float64
dtypes: float64(8), int64(1), object(3)
memory usage: 4.1+ MB
