In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
from tqdm import tqdm_notebook
# Census API Key
from config import api_key
c = Census(api_key, year=2013)

In [2]:
# Run Census Search to retrieve data on all zip codes (2013 ACS5 Census)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
census_data = c.acs5.get(("NAME", "B19013_001E", "B01003_001E", "B01002_001E",
                          "B19301_001E",
                          "B17001_002E",'B15003_002E', 'B15003_003E', 
                          'B15003_004E', 'B15003_005E', 'B15003_006E', 'B15003_007E', 'B15003_008E',
                          'B15003_009E', 'B15003_010E', 'B15003_011E', 'B15003_012E', 'B15003_013E',
                          'B15003_014E', 'B15003_015E', 'B15003_016E', 'B15003_017E', 'B15003_018E',
                          'B15003_019E', 'B15003_020E', "B15003_021E", "B15003_022E", "B15003_023E",
                          "B15003_024E", "B15003_025E", 'B15003_001E'), {'for': 'zip code tabulation area:*'})


# Convert to DataFrame
census_df = pd.DataFrame(census_data)

# Column Reordering
census_df = census_df.rename(columns={'B01003_001E': 'Population',
                                      'B01002_001E': 'Median Age',
                                      'B19013_001E': 'Household Income',
                                      'B19301_001E': 'Per Capita Income',
                                      'B17001_002E': 'Poverty Count',
                                      'NAME': 'Name', 
                                      'zip code tabulation area': 'Zip Code',
                                      'B15003_021E' : 'Associate\'s Degree Count',
                                      'B15003_022E' : 'Bachelor\'s Degree Count',
                                      'B15003_023E' : 'Master\'s Degree Count',
                                      'B15003_024E' : 'Professional School Degree Count',
                                      'B15003_025E' : 'Doctorate Degree Count',
                                      'B15003_001E' : 'Total educational attainment over 25',
                                      'B15003_002E' : 'No School',
                                      'B15003_003E' : 'Nursery school',
                                      'B15003_004E' : 'Kindergarten',
                                      'B15003_005E' : '1st grade',
                                      'B15003_006E' : '2nd grade',
                                      'B15003_007E' : '3rd grade',
                                      'B15003_008E' : '4th grade',
                                      'B15003_009E' : '5th grade',
                                      'B15003_010E' : '6th grade',
                                      'B15003_011E' : '7th grade',
                                      'B15003_012E' : '8th grade',
                                      'B15003_013E' : '9th grade',
                                      'B15003_014E' : '10th grade',
                                      'B15003_015E' : '11th grade',
                                      'B15003_016E' : '12th grade No Diploma',
                                      'B15003_017E' : 'HS Diploma',
                                      'B15003_018E' : 'GED',
                                      'B15003_019E' : 'Some College less than 1 year',
                                      'B15003_020E' : 'Some College More than 1 year'})

census_df["Poverty Rate"] = (100 * census_df["Poverty Count"].astype(int)
                            / census_df["Population"].astype(int))
census_df["Total College Degrees"] = (census_df['Associate\'s Degree Count'].astype(int)+
                             census_df['Bachelor\'s Degree Count'].astype(int)+
                             census_df['Master\'s Degree Count'].astype(int)+
                             census_df['Professional School Degree Count'].astype(int)+
                             census_df['Doctorate Degree Count'].astype(int))
census_df['Total No HS Diploma'] = (census_df['No School'].astype(int)+
                                    census_df['Nursery school'].astype(int)+
                                    census_df['Kindergarten'].astype(int)+
                                    census_df['1st grade'].astype(int)+
                                    census_df['2nd grade'].astype(int)+
                                    census_df['3rd grade'].astype(int)+
                                    census_df['4th grade'].astype(int)+
                                    census_df['5th grade'].astype(int)+
                                    census_df['6th grade'].astype(int)+
                                    census_df['7th grade'].astype(int)+
                                    census_df['8th grade'].astype(int)+
                                    census_df['9th grade'].astype(int)+
                                    census_df['10th grade'].astype(int)+
                                    census_df['11th grade'].astype(int)+
                                    census_df['12th grade No Diploma'].astype(int))
census_df['Total HS Diploma'] = (census_df['HS Diploma'].astype(int)+
                                    census_df['GED'].astype(int))

census_df['Total Some College'] = (census_df['Some College less than 1 year'].astype(int)+
                                    census_df['Some College More than 1 year'].astype(int))

census_df['Education Check'] = (census_df['Total College Degrees'].astype(int)+
                             census_df['Total No HS Diploma'].astype(int)+
                             census_df['Total HS Diploma'].astype(int)+
                             census_df['Total Some College'].astype(int))
                                    
# Final DataFrame
census_df = census_df[['Zip Code', 'Population', 'Median Age', 'Household Income',
                       'Per Capita Income', 'Poverty Count', 'Poverty Rate', 'Total No HS Diploma',
                       'Total HS Diploma', 'Total Some College', 'Total College Degrees',
                       'Total educational attainment']]

# Visualize
print(len(census_df))

33120


In [3]:
census_df.loc[census_df['Household Income'] < 0, 'Household Income'] = 0
census_df.loc[census_df['Median Age'] < 0, 'Median Age'] = 0
census_df.loc[census_df['Per Capita Income'] < 0, 'Per Capita Income'] = 0

In [4]:
census_df=census_df.sort_values(by='Zip Code', ascending=True)
census_df.reset_index(drop=True)
census_df.head()

Unnamed: 0,Zip Code,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Total No HS Diploma,Total HS Diploma,Total Some College,Total College Degrees,Total educational attainment over 25
984,601,18450.0,36.6,12041.0,7380.0,10816.0,58.623306,4821,2930,1302,2928,11981.0
985,602,41302.0,38.6,15663.0,8463.0,22409.0,54.256452,10845,5883,1673,9226,27627.0
986,603,53683.0,38.9,15485.0,9176.0,26220.0,48.842278,12220,9769,3726,10717,36432.0
987,606,6591.0,37.3,15019.0,6383.0,3721.0,56.455773,1731,1657,278,666,4332.0
988,610,28963.0,39.2,16707.0,7892.0,14569.0,50.30211,7245,6733,1747,3744,19469.0


In [5]:
census_df.describe()

Unnamed: 0,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Total No HS Diploma,Total HS Diploma,Total Some College,Total College Degrees,Total educational attainment over 25
count,33120.0,33120.0,33120.0,32883.0,33120.0,32784.0,33120.0,33120.0,33120.0,33120.0,33120.0
mean,9516.959994,40.817557,50203.762077,25746.124715,1458.514915,14.753802,893.540187,1773.031612,1334.613255,2309.709511,6310.894565
std,13939.177211,9.950192,23947.975877,12062.117195,2840.714482,11.598479,1813.272096,2476.488927,1976.245042,3927.063723,9088.804187
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,721.0,36.0,36944.0,19240.5,80.0,6.934705,56.0,179.0,101.0,110.0,492.0
50%,2801.5,40.9,46926.5,23616.0,337.0,12.337662,234.0,669.0,395.0,474.5,1903.0
75%,13000.0,46.0,59938.5,29578.5,1482.0,19.664953,933.0,2399.25,1761.0,2696.0,8754.25
max,114734.0,91.5,250001.0,286534.0,43450.0,100.0,35160.0,24509.0,16996.0,52272.0,74784.0


In [6]:
zip_geo = "Project1_Geo.csv"
zip_geo_df = pd.read_csv(zip_geo, dtype=str, low_memory=False)

In [7]:
zip_geo_df.head()

Unnamed: 0,Zip Code,Lat,Lng
0,601,,
1,602,18.36,-67.18
2,603,18.45,-67.11
3,606,18.2,-66.9
4,610,18.28,-67.13
5,612,18.4,-66.68
6,616,18.4,-66.68
7,617,18.44,-66.56
8,622,18.0,-67.16
9,623,18.1,-67.2


In [8]:
Project_census_df = census_df.merge(zip_geo_df, on="Zip Code", how='right')
#census_df = census_df.sort_values(by='Zip Code', ascending=True)
Project_census_df.head()

Unnamed: 0,Zip Code,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Total No HS Diploma,Total HS Diploma,Total Some College,Total College Degrees,Total educational attainment over 25,Lat,Lng
0,601,18450.0,36.6,12041.0,7380.0,10816.0,58.623306,4821,2930,1302,2928,11981.0,,
1,602,41302.0,38.6,15663.0,8463.0,22409.0,54.256452,10845,5883,1673,9226,27627.0,18.36,-67.18
2,603,53683.0,38.9,15485.0,9176.0,26220.0,48.842278,12220,9769,3726,10717,36432.0,18.45,-67.11
3,606,6591.0,37.3,15019.0,6383.0,3721.0,56.455773,1731,1657,278,666,4332.0,18.2,-66.9
4,610,28963.0,39.2,16707.0,7892.0,14569.0,50.30211,7245,6733,1747,3744,19469.0,18.28,-67.13


In [9]:
Project_census_df.to_csv('Project1_Zipcode_Data.csv',encoding="utf-8", index=False)