In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
from tqdm import tqdm_notebook
# Census API Key
from config import api_key
c = Census(api_key, year=2013)

In [2]:
# Run Census Search to retrieve data on all zip codes (2013 ACS5 Census)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
census_data = c.acs5.get(("NAME", "B19013_001E", "B01003_001E", "B01002_001E",
                          "B19301_001E",
                          "B17001_002E",'B15003_002E', 'B15003_003E', "B15003_021E", "B15003_022E", "B15003_023E", "B15003_024E", "B15003_025E", 'B15003_001E'), {'for': 'zip code tabulation area:*'})
#B15003_001E - Total included in educational attainment over 25
#B15003_002E - No School

# Convert to DataFrame
census_df = pd.DataFrame(census_data)

# Column Reordering
census_df = census_df.rename(columns={'B01003_001E': 'Population',
                                      'B01002_001E': 'Median Age',
                                      'B19013_001E': 'Household Income',
                                      'B19301_001E': 'Per Capita Income',
                                      'B17001_002E': 'Poverty Count',
                                      'NAME': 'Name', 
                                      'zip code tabulation area': 'Zip Code',
                                      'B15003_021E' : 'Associate\'s Degree Count',
                                      'B15003_022E' : 'Bachelor\'s Degree Count',
                                      'B15003_023E' : 'Master\'s Degree Count',
                                      'B15003_024E' : 'Professional School Degree Count',
                                      'B15003_025E' : 'Doctorate Degree Count'})

# Add computed data colums
census_df["Poverty Rate"] = (100 * census_df["Poverty Count"].astype(int)
                            / census_df["Population"].astype(int))
census_df["Total Degrees"] = (census_df['Associate\'s Degree Count'].astype(int)+
                             census_df['Bachelor\'s Degree Count'].astype(int)+
                             census_df['Master\'s Degree Count'].astype(int)+
                             census_df['Professional School Degree Count'].astype(int)+
                             census_df['Doctorate Degree Count'].astype(int))
# Final DataFrame
census_df = census_df[['Zip Code', 'Population', 'Median Age', 'Household Income',
                       'Per Capita Income', 'Poverty Count', 'Poverty Rate', 'Associate\'s Degree Count', 'Bachelor\'s Degree Count',
                      'Master\'s Degree Count', 'Professional School Degree Count', 'Doctorate Degree Count', 'Total Degrees']]

# Visualize
print(len(census_df))

33120


In [21]:
census_df.loc[census_df['Household Income'] < 0, 'Household Income'] = 0
census_df.loc[census_df['Median Age'] < 0, 'Median Age'] = 0
census_df.loc[census_df['Per Capita Income'] < 0, 'Per Capita Income'] = 0

In [22]:
census_df=census_df.sort_values(by='Zip Code', ascending=True)
census_df.reset_index(drop=True)
census_df.head()

Unnamed: 0,Zip Code,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Associate's Degree Count,Bachelor's Degree Count,Master's Degree Count,Professional School Degree Count,Doctorate Degree Count,Total Degrees
984,601,18450.0,36.6,12041.0,7380.0,10816.0,58.623306,1027.0,1630.0,174.0,58.0,39.0,2928
985,602,41302.0,38.6,15663.0,8463.0,22409.0,54.256452,3677.0,4225.0,987.0,167.0,170.0,9226
986,603,53683.0,38.9,15485.0,9176.0,26220.0,48.842278,3101.0,5449.0,1664.0,229.0,274.0,10717
987,606,6591.0,37.3,15019.0,6383.0,3721.0,56.455773,79.0,466.0,63.0,16.0,42.0,666
988,610,28963.0,39.2,16707.0,7892.0,14569.0,50.30211,1062.0,2191.0,296.0,33.0,162.0,3744


In [23]:
census_df.describe()

Unnamed: 0,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Associate's Degree Count,Bachelor's Degree Count,Master's Degree Count,Professional School Degree Count,Doctorate Degree Count,Total Degrees
count,33120.0,33120.0,33120.0,32883.0,33120.0,32784.0,33120.0,33120.0,33120.0,33120.0,33120.0,33120.0
mean,9516.959994,40.817557,50203.762077,25746.124715,1458.514915,14.753802,493.738768,1138.135719,476.699185,122.716787,78.419052,2309.709511
std,13939.177211,9.950192,23947.975877,12062.117195,2840.714482,11.598479,738.766898,2022.361466,921.31772,296.485232,203.308322,3927.063723
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,721.0,36.0,36944.0,19240.5,80.0,6.934705,35.0,47.0,14.0,0.0,0.0,110.0
50%,2801.5,40.9,46926.5,23616.0,337.0,12.337662,150.0,209.5,79.0,13.0,7.0,474.5
75%,13000.0,46.0,59938.5,29578.5,1482.0,19.664953,651.0,1261.0,491.0,100.0,61.0,2696.0
max,114734.0,91.5,250001.0,286534.0,43450.0,100.0,7075.0,24256.0,17914.0,6154.0,4620.0,52272.0


In [24]:
zip_geo = "Project1_Geo.csv"
zip_geo_df = pd.read_csv(zip_geo, dtype=str, low_memory=False)

In [25]:
zip_geo_df.head(10)

Unnamed: 0,Zip Code,Lat,Lng
0,601,,
1,602,18.36,-67.18
2,603,18.45,-67.11
3,606,18.2,-66.9
4,610,18.28,-67.13
5,612,18.4,-66.68
6,616,18.4,-66.68
7,617,18.44,-66.56
8,622,18.0,-67.16
9,623,18.1,-67.2


In [26]:
Project_census_df = census_df.merge(zip_geo_df, on="Zip Code", how='right')
#census_df = census_df.sort_values(by='Zip Code', ascending=True)
Project_census_df.head()

Unnamed: 0,Zip Code,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Associate's Degree Count,Bachelor's Degree Count,Master's Degree Count,Professional School Degree Count,Doctorate Degree Count,Total Degrees,Lat,Lng
0,601,18450.0,36.6,12041.0,7380.0,10816.0,58.623306,1027.0,1630.0,174.0,58.0,39.0,2928,,
1,602,41302.0,38.6,15663.0,8463.0,22409.0,54.256452,3677.0,4225.0,987.0,167.0,170.0,9226,18.36,-67.18
2,603,53683.0,38.9,15485.0,9176.0,26220.0,48.842278,3101.0,5449.0,1664.0,229.0,274.0,10717,18.45,-67.11
3,606,6591.0,37.3,15019.0,6383.0,3721.0,56.455773,79.0,466.0,63.0,16.0,42.0,666,18.2,-66.9
4,610,28963.0,39.2,16707.0,7892.0,14569.0,50.30211,1062.0,2191.0,296.0,33.0,162.0,3744,18.28,-67.13


In [28]:
Project_census_df.to_csv('Project1_Zipcode_Data.csv',encoding="utf-8", index=False)