In [1]:
import os
import pandas as pd
import zipfile
import geopandas as  gpd
from fiona.crs import from_epsg
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import seaborn
import datetime
import pylab as pl
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
#puidata = os.getenv('PUIDATA')
#open zipcode file stored in PUIDATA
nycshape = gpd.read_file("nyc-zip-code-tabulation-areas-polygons.geojson")
nycshape.head()

Unnamed: 0,@id,BLDGpostalCode,CTY_FIPS,OBJECTID,PO_NAME,STATE,ST_FIPS,Shape_Area,Shape_Leng,borough,geometry,postalCode
0,http://nyc.pediacities.com/Resource/PostalCode...,0,81,1,Jackson Heights,NY,36,20163280.0,20624.692317,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,11372
1,http://nyc.pediacities.com/Resource/PostalCode...,0,81,2,Glen Oaks,NY,36,22606530.0,23002.816039,Queens,POLYGON ((-73.71068374074007 40.75004039450917...,11004
2,http://nyc.pediacities.com/Resource/PostalCode...,0,81,3,New Hyde Park,NY,36,6269333.0,15749.161511,Queens,POLYGON ((-73.70098278625547 40.73889569923034...,11040
3,http://nyc.pediacities.com/Resource/PostalCode...,0,81,4,Bellerose,NY,36,49418360.0,35932.810639,Queens,POLYGON ((-73.72270447144122 40.75373371438336...,11426
4,http://nyc.pediacities.com/Resource/PostalCode...,0,81,5,Fresh Meadows,NY,36,69385870.0,38693.565676,Queens,POLYGON ((-73.81088634744755 40.72717187575918...,11365


In [3]:
# rename postalcode column name to zipcode and remove columns deemed unnecessary
nycshape = nycshape.rename(columns={'postalCode': "zipcode"})
nycshape = nycshape[['zipcode', 'borough', 'geometry', 'PO_NAME']]
#datatype for all columns is 'object', changing zipcode to 'numeric'
nycshape.zipcode = pd.to_numeric(nycshape.zipcode)
nycshape.head()

Unnamed: 0,zipcode,borough,geometry,PO_NAME
0,11372,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,Jackson Heights
1,11004,Queens,POLYGON ((-73.71068374074007 40.75004039450917...,Glen Oaks
2,11040,Queens,POLYGON ((-73.70098278625547 40.73889569923034...,New Hyde Park
3,11426,Queens,POLYGON ((-73.72270447144122 40.75373371438336...,Bellerose
4,11365,Queens,POLYGON ((-73.81088634744755 40.72717187575918...,Fresh Meadows


In [6]:
#create a new dataframe with values from all census business data files, 
years = ['94', '95', '96', '97', '98', '99', '00', '01', '02', '03', '04', '05', '06', '07',
        '08', '09', '10', '11', '12', '13', '14']
CombinedValues = pd.DataFrame()
for year in years:
    fname = 'zbp' + year + 'totals.zip'
    zf = zipfile.ZipFile(fname)
    each_year = pd.read_csv(zf.open(fname.replace('.zip','.txt')))
    each_year['year'] = year
    CombinedValues = pd.concat([CombinedValues, each_year], axis=0)
    
CombinedValues.head()

Unnamed: 0,AP,EMP,EMPFLAG,EST,NAME,QP1,ZIP,ap,ap_nf,city,...,emp,emp_nf,empflag,est,name,qp1,qp1_nf,stabbr,year,zip
0,,,,,,,,155158.0,,,...,6198.0,,,439.0,"Agawam, MA",33601.0,,,94,1001.0
1,,,,,,,,127367.0,,,...,6073.0,,,450.0,"Amherst, MA",28924.0,,,94,1002.0
2,,,,,,,,1604.0,,,...,68.0,,,10.0,"Amherst, MA",367.0,,,94,1003.0
3,,,,,,,,4302.0,,,...,210.0,,,47.0,"Amherst, MA",844.0,,,94,1004.0
4,,,,,,,,13521.0,,,...,678.0,,,92.0,"Barre, MA",3010.0,,,94,1005.0


In [7]:
CombinedValues_reduced = CombinedValues
CombinedValues_reduced.drop(CombinedValues_reduced.columns[[0,1,2,3,4,
                                                            5,6,7,8,9,10,11,12,13,15,
                                                            16,17,18]], axis=1, inplace=True)
CombinedValues_reduced.head()

Unnamed: 0,est,year,zip
0,439.0,94,1001.0
1,450.0,94,1002.0
2,10.0,94,1003.0
3,47.0,94,1004.0
4,92.0,94,1005.0


In [10]:
#convert zipcode to numeric and rename zip column to zipcode for the merge
CombinedValues_reduced["zip"] = CombinedValues_reduced["zip"].apply(pd.to_numeric)
CombinedValues_reduced = CombinedValues_reduced.rename(columns={'zip': 'zipcode'})

#merge shapefile and census information together on zipcode column
combinedtest = nycshape.merge(CombinedValues_reduced, on = 'zipcode')
combinedtest.head()

Unnamed: 0,zipcode,borough,geometry,PO_NAME,est,year
0,11372,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,Jackson Heights,1229.0,94
1,11372,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,Jackson Heights,1236.0,95
2,11372,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,Jackson Heights,1292.0,96
3,11372,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,Jackson Heights,1309.0,97
4,11372,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,Jackson Heights,1521.0,3
