# 2014-2018 ACS Data by Neighborhood Tabulation Area (NTA)

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from pandas.api.types import CategoricalDtype

In [2]:
def colpercent(df):
    print("Total NaN in Dataframe: " , df.isnull().sum().sum())
    print("Percent Missingness in Dataframe: ", 100*df.isnull().sum().sum()/(len(df.index)*len(df.columns)))
    print('-'*55)
    percentnulldf = df.isnull().sum()/(df.isnull().sum()+df.notna().sum())
    print("Percent Missingness by Columns:")
    print(100*percentnulldf[percentnulldf>0].sort_values(ascending=False))
    
def data_eval(df):
    for i in range(len(df.columns)):
        print('-'*50)
        print('Column Name: ', df.columns[i])
        if (df[df.columns[i]].dtypes == 'float64' or df[df.columns[i]].dtypes == 'int64') and df[df.columns[i]][df[df.columns[i]]<0].count()>0:
            print('Number of negatives: ', df[df.columns[i]][df[df.columns[i]]<0].count())
        if df[df.columns[i]][df[df.columns[i]]=='None'].count() > 0:
            print('Number of None strings: ', df[df.columns[i]][df[df.columns[i]]=='None'].count())
        if df[df.columns[i]][df[df.columns[i]]==''].count() > 0:
            print('Number of empty strings: ', df[df.columns[i]][df[df.columns[i]]==''].count())
        else:
            print('Column ' + str(i) + ' has no negatives, empty strings or Nones')

## Import and load data

In [3]:
acsnta_raw = pd.read_excel('../../data/00_raw/econ_2018_acs5yr_nta.xlsx',
                          usecols = 'B:D, J, O, T, Y, AD, AI, CL, CQ, CV, DA, DF, DK, DP, DU, DZ, EE, EJ, EO, ET, EY, HV, IA, IF, IK, IP, IU, IZ, IE, JJ, JO, JT, JY, KD, KI, KN, KS, KX, LC, PD, PI, PN, UI, UN, US, UX')

In [4]:
acsnta = acsnta_raw.copy()

Rename columns:

In [12]:
acsnta.columns = ['nta_name', 'nta_code', 'borough', 'in_labor_force',
                 'civilian_labor_force', 'employed', 'unemployed', 'armed_forces',
                 'not_in_labor_force', 'workers_16+', 'vehicle', 'carpool',
                 'public_transit', 'walking', 'other_commuting_means', 'work_at_home',
                 'mean_commuting_time_(min)', 'occ_civilian_employed_16+', 'mng_biz_sci_arts',
                 'service', 'sales_office','natres_construct_maint', 'prod_transport_moving',
                 'class_civilian_employed_16+', 'salary_workers', 'govt_workers', 'self-employed',
                 'unpaid_family_workers', 'total_households','hincome_10K_under',
                 'hincome_10K_15K','hincome_15K_25K', 'hincome_25K_35K', 
                 'hincome_35K_50K', 'hincome_50K_75K', 'hincome_75K_100K',
                 'hincome_100K_150K', 'hincome_150K_200K', 'hincome_200K_more',
                 'median_hincome', 'mean_hincome', 'median_earnings', 'median_earnings_male',
                 'median_earnings_female','all_families','all_families_below_poverty',
                 'pop_poverty_status_determ', 'pop_below_poverty']

## Inspect data frame

In [13]:
acsnta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 176 entries, 0 to 175
Data columns (total 48 columns):
nta_name                       176 non-null object
nta_code                       176 non-null object
borough                        176 non-null object
in_labor_force                 176 non-null int64
civilian_labor_force           176 non-null int64
employed                       176 non-null int64
unemployed                     176 non-null int64
armed_forces                   176 non-null int64
not_in_labor_force             176 non-null int64
workers_16+                    176 non-null int64
vehicle                        176 non-null int64
carpool                        176 non-null int64
public_transit                 176 non-null int64
walking                        176 non-null int64
other_commuting_means          176 non-null int64
work_at_home                   176 non-null int64
mean_commuting_time_(min)      174 non-null float64
occ_civilian_employed_16+      176 non-n

In [14]:
acsnta.sample(n=5)

Unnamed: 0,nta_name,nta_code,borough,in_labor_force,civilian_labor_force,employed,unemployed,armed_forces,not_in_labor_force,workers_16+,...,hincome_200K_more,median_hincome,mean_hincome,median_earnings,median_earnings_male,median_earnings_female,all_families,all_families_below_poverty,pop_poverty_status_determ,pop_below_poverty
134,Far Rockaway-Bayswater,QN15,Queens,24512,24481,22450,2031,31,16360,21811,...,760,50213.0,67475.0,32964.0,46366.0,40444.0,11736,2038,54523,10810
115,Washington Heights South,MN36,Manhattan,53735,53735,48838,4897,0,26877,47537,...,957,50880.0,67649.0,27978.0,38920.0,38662.0,17082,3072,92277,20966
62,Flatbush,BK42,Brooklyn,55781,55781,51890,3891,0,28524,50533,...,2108,55804.0,74156.0,34777.0,48332.0,44716.0,24284,3293,106409,19518
156,Old Astoria,QN71,Queens,14535,14535,13624,911,0,6879,12886,...,826,61029.0,85973.0,44848.0,54383.0,57325.0,5260,865,24449,4562
46,Brownsville,BK81,Brooklyn,20606,20606,16733,3873,0,21049,16021,...,210,20295.0,37269.0,26386.0,38944.0,35624.0,10501,3373,50853,21848


Exclude Staten Island:

In [19]:
acsnta = acsnta[acsnta.borough != 'Staten Island']

Check for missing values or incorrectly entered data:

In [15]:
colpercent(acsnta)

Total NaN in Dataframe:  13
Percent Missingness in Dataframe:  0.15388257575757575
-------------------------------------------------------
Percent Missingness by Columns:
mean_hincome                 1.704545
median_hincome               1.704545
govt_workers                 1.136364
mean_commuting_time_(min)    1.136364
median_earnings_female       0.568182
median_earnings_male         0.568182
median_earnings              0.568182
dtype: float64


In [16]:
data_eval(acsnta)

--------------------------------------------------
Column Name:  nta_name
Column 0 has no negatives, empty strings or Nones
--------------------------------------------------
Column Name:  nta_code
Column 1 has no negatives, empty strings or Nones
--------------------------------------------------
Column Name:  borough
Column 2 has no negatives, empty strings or Nones
--------------------------------------------------
Column Name:  in_labor_force
Column 3 has no negatives, empty strings or Nones
--------------------------------------------------
Column Name:  civilian_labor_force
Column 4 has no negatives, empty strings or Nones
--------------------------------------------------
Column Name:  employed
Column 5 has no negatives, empty strings or Nones
--------------------------------------------------
Column Name:  unemployed
Column 6 has no negatives, empty strings or Nones
--------------------------------------------------
Column Name:  armed_forces
Column 7 has no negatives, empty st

## Feature engineering

Create dictionary with NTA codes and names:

In [23]:
pd.Series(acsnta.nta_name.values,index=acsnta.nta_code).to_dict()

{'BX31': 'Allerton-Pelham Gardens',
 'BX05': 'Bedford Park-Fordham North',
 'BX06': 'Belmont',
 'BX07': 'Bronxdale',
 'BX01': 'Claremont-Bathgate',
 'BX13': 'Co-op City',
 'BX75': 'Crotona Park East',
 'BX14': 'East Concourse-Concourse Village',
 'BX17': 'East Tremont',
 'BX03': 'Eastchester-Edenwald-Baychester',
 'BX40': 'Fordham South',
 'BX26': 'Highbridge',
 'BX27': 'Hunts Point',
 'BX30': 'Kingsbridge Heights',
 'BX33': 'Longwood',
 'BX34': 'Melrose South-Mott Haven North',
 'BX35': 'Morrisania-Melrose',
 'BX39': 'Mott Haven-Port Morris',
 'BX41': 'Mount Hope',
 'BX22': 'North Riverdale-Fieldston-Riverdale',
 'BX43': 'Norwood',
 'BX99': 'park-cemetery-etc-Bronx',
 'BX46': 'Parkchester',
 'BX10': 'Pelham Bay-Country Club-City Island',
 'BX49': 'Pelham Parkway',
 'BX98': 'Rikers Island',
 'BX52': 'Schuylerville-Throgs Neck-Edgewater Park',
 'BX55': 'Soundview-Bruckner',
 'BX09': 'Soundview-Castle Hill-Clason Point-Harding Park',
 'BX29': 'Spuyten Duyvil-Kingsbridge',
 'BX36': 'Unive