In [1]:
import pandas as pd
import numpy as np
from uszipcode import SearchEngine

In [2]:
# Create the uszipcode SearchEngine used for every ZIP code lookup below.
# Lookups made through this engine return SimpleZipcode records.
search = SearchEngine(simple_zipcode=True)

In [3]:
# Read in the file containing the zip codes.
# - dtype: ZIP codes are identifiers, not quantities, so load that column as
#   strings; the later `.str` checks then apply uniformly to every row.
# - low_memory=False: infer each column's dtype from the whole file instead of
#   chunk by chunk, which is what produces pandas' mixed-type DtypeWarning.
df = pd.read_csv('tj_zips.csv', dtype={'Zip Code': str}, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Zip Code,City,State,Population (2010 Census),Unnamed: 4,Macro Test,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,99546,Adak,AK,326,,326.0,,,,,
1,99571,Adak,AK,160,,,,,,,
2,99615,Akhiok,AK,12899,,,,,,,https://factfinder.census.gov/faces/nav/jsf/pa...
3,99551,Akiachak,AK,627,,,,,,,
4,99552,Akiak,AK,346,,,,,,,


In [4]:
## Rename columns to make things a little easier.
df = df.rename(columns={'Zip Code': 'Zip_Code', 'Population (2010 Census)': 'Pop_2010'})
## Keep only the first four columns (Zip_Code, City, State, Pop_2010).
## .copy() gives an independent frame so the assignment below is not made on a
## slice of the raw data.
df = df.iloc[:, :4].copy()
## Replace the '--' placeholder in the population column with NaN.
## The result is assigned back to the column: calling replace(inplace=True) on
## `df.Pop_2010` is chained assignment and is not guaranteed to update `df`.
df['Pop_2010'] = df['Pop_2010'].replace('--', np.nan)
df.head()

Unnamed: 0,Zip_Code,City,State,Pop_2010
0,99546,Adak,AK,326
1,99571,Adak,AK,160
2,99615,Akhiok,AK,12899
3,99551,Akiachak,AK,627
4,99552,Akiak,AK,346


In [5]:
# Keep only the rows that actually have a Zip_Code (drop the NaN ones).
has_zip = df['Zip_Code'].notna()
df = df[has_zip]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41319 entries, 0 to 41318
Data columns (total 4 columns):
Zip_Code    41319 non-null object
City        41319 non-null object
State       41319 non-null object
Pop_2010    26841 non-null object
dtypes: object(4)
memory usage: 1.6+ MB


In [6]:
# One row holds an invalid zip code: its value contains a comma. Show it.
has_comma = df['Zip_Code'].str.contains(',')
df[has_comma]

Unnamed: 0,Zip_Code,City,State,Pop_2010
5488,10190,Littleton,CO,


In [7]:
# Remove that row: collect the index labels of the comma-containing zip codes
# and drop them from the frame.
bad_rows = df.index[df['Zip_Code'].str.contains(',')]
df = df.drop(bad_rows)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 4 columns):
Zip_Code    41318 non-null object
City        41318 non-null object
State       41318 non-null object
Pop_2010    26841 non-null object
dtypes: object(4)
memory usage: 1.6+ MB


In [8]:
# Convert column of zip codes to integers.
# Use the builtin `int`: `np.int` was only an alias for it, was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so it fails on current NumPy.
df['Zip_Code'] = df['Zip_Code'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 4 columns):
Zip_Code    41318 non-null int64
City        41318 non-null object
State       41318 non-null object
Pop_2010    26841 non-null object
dtypes: int64(1), object(3)
memory usage: 1.6+ MB


In [9]:
# Sanity check: look up one ZIP code to see which fields a record exposes
# (population, county, major_city, post_office_city, state, ...).
search.by_zipcode('92106')

SimpleZipcode(zipcode='92106', zipcode_type='Standard', major_city='San Diego', post_office_city='San Diego, CA', common_city_list=['San Diego'], county='San Diego County', state='CA', lat=32.73, lng=-117.23, timezone='Pacific', radius_in_miles=2.0, area_code_list=['619'], population=19330, population_density=3426.0, land_area_in_sqmi=5.64, water_area_in_sqmi=0.71, housing_units=8322, occupied_housing_units=7807, median_home_value=834800, median_household_income=82075, bounds_west=-117.25896, bounds_east=-117.208362, bounds_north=32.752296, bounds_south=32.66296)

In [10]:
# Scratch check: `+` on two strings concatenates them instead of adding numbers.
'3' + '4'

'34'

In [14]:
## Using uszipcode library
# Look every ZIP code up once and reuse the record for all derived columns,
# rather than repeating the same lookup for each of the five columns.
zip_records = [search.by_zipcode(str(z)) for z in df['Zip_Code']]

# Pop_2010 and State overwrite the values loaded from the CSV; County,
# Major_City and PO_City are new columns.
df['Pop_2010'] = [rec.population for rec in zip_records]
df['County'] = [rec.county for rec in zip_records]
df['Major_City'] = [rec.major_city for rec in zip_records]
df['PO_City'] = [rec.post_office_city for rec in zip_records]
df['State'] = [rec.state for rec in zip_records]

In [15]:
# Preview the frame after filling in the uszipcode-derived columns.
df.head()

Unnamed: 0,Zip_Code,City,State,Pop_2010,County,Major_City,PO_City
0,99546,Adak,AK,326.0,Aleutians West Census Area,Adak,"Adak, AK"
1,99571,Adak,AK,160.0,Aleutians East Borough,Cold Bay,"Cold Bay, AK"
2,99615,Akhiok,AK,12899.0,Kodiak Island Borough,Kodiak,"Kodiak, AK"
3,99551,Akiachak,AK,627.0,Bethel Census Area,Akiachak,"Akiachak, AK"
4,99552,Akiak,AK,346.0,Bethel Census Area,Akiak,"Akiak, AK"


In [16]:
## Using outside data source
# Load the per-zip-code yearly table from data.world.
zf = pd.read_csv('https://query.data.world/s/op67ehfhbfiugjxpzpr6rk3vfouzxp')
# Swap '-' for '_' in the column names so columns can be reached as
# attributes (e.g. zf.zip_code).
zf = zf.rename(columns=lambda col: col.replace('-', '_'))
# Zip codes present in both tables; computed while zip_code is still a column.
common = set(df['Zip_Code']).intersection(zf['zip_code'])
# Index by zip code so rows can be looked up by zip code.
zf = zf.set_index('zip_code')

In [17]:
# Preview the outside table, now indexed by zip_code.
zf.head()

Unnamed: 0_level_0,y_2016,y_2015,y_2014,y_2013,y_2012,y_2011,y_2010,aggregate
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
601,17800,17982,18088,18450,18544,18533,18570,127967
602,39716,40260,40859,41302,41640,41930,41520,287227
603,51565,52408,53162,53683,54540,54475,54689,374522
606,6320,6331,6415,6591,6593,6386,6615,45251
610,27976,28328,28805,28963,29141,29111,29016,201340


In [18]:
## Using dataworld link
# Add one Pop_<year> column per year, taken from the matching y_<year> column
# of `zf` (which is indexed by zip code).

years = range(2011, 2017)

for year in years:
    name, yr = 'Pop_{}'.format(year), 'y_{}'.format(year)
    # Index-aligned lookup: each Zip_Code is matched against zf's index, and
    # zip codes that are not in zf come back as NaN. This replaces a per-row
    # `zf.at[...]` loop with a single vectorized call per year.
    # NOTE(review): assumes zip codes are unique in zf's index -- confirm.
    df[name] = df['Zip_Code'].map(zf[yr])

In [19]:
# Check column dtypes and non-null counts after adding the Pop_<year> columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 13 columns):
Zip_Code      41318 non-null int64
City          41318 non-null object
State         41318 non-null object
Pop_2010      32987 non-null float64
County        41318 non-null object
Major_City    41318 non-null object
PO_City       32987 non-null object
Pop_2011      32987 non-null float64
Pop_2012      32987 non-null float64
Pop_2013      32987 non-null float64
Pop_2014      32987 non-null float64
Pop_2015      32987 non-null float64
Pop_2016      32987 non-null float64
dtypes: float64(7), int64(1), object(5)
memory usage: 4.4+ MB


In [20]:
# Preview the first rows with the yearly Pop_<year> columns in place.
df.head()

Unnamed: 0,Zip_Code,City,State,Pop_2010,County,Major_City,PO_City,Pop_2011,Pop_2012,Pop_2013,Pop_2014,Pop_2015,Pop_2016
0,99546,Adak,AK,326.0,Aleutians West Census Area,Adak,"Adak, AK",124.0,127.0,108.0,114.0,122.0,122.0
1,99571,Adak,AK,160.0,Aleutians East Borough,Cold Bay,"Cold Bay, AK",55.0,82.0,101.0,83.0,88.0,106.0
2,99615,Akhiok,AK,12899.0,Kodiak Island Borough,Kodiak,"Kodiak, AK",12609.0,12770.0,13009.0,13096.0,13197.0,13135.0
3,99551,Akiachak,AK,627.0,Bethel Census Area,Akiachak,"Akiachak, AK",644.0,638.0,606.0,590.0,562.0,584.0
4,99552,Akiak,AK,346.0,Bethel Census Area,Akiak,"Akiak, AK",331.0,364.0,392.0,407.0,399.0,407.0


In [21]:
# Final look at dtypes and non-null counts before the frame is written out.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41318 entries, 0 to 41318
Data columns (total 13 columns):
Zip_Code      41318 non-null int64
City          41318 non-null object
State         41318 non-null object
Pop_2010      32987 non-null float64
County        41318 non-null object
Major_City    41318 non-null object
PO_City       32987 non-null object
Pop_2011      32987 non-null float64
Pop_2012      32987 non-null float64
Pop_2013      32987 non-null float64
Pop_2014      32987 non-null float64
Pop_2015      32987 non-null float64
Pop_2016      32987 non-null float64
dtypes: float64(7), int64(1), object(5)
memory usage: 4.4+ MB


In [22]:
## Write the dataframe to a csv file.
## The row index is written as well (index=True is the pandas default).
df.to_csv('tjs_zipcodes_out.csv', index=True)