In [1]:
import pandas as pd
import geopandas as gp
import numpy as np
import pickle
import os

In [2]:
# Function to make 4 classifications based on balance due and the portion of the penality paid
def make_classification(penality, paid, due):
    if due == 0:
        if penality > paid:
            classification = 0
            desc = 'NO BALANCE - PARITALLY PAID'
        else:
            classification = 1
            desc = 'NO BALANCE - PAID IN FULL'
    else:
        if paid == 0:
            classification = 2
            desc = 'BALANCE DUE - UNPAID'
        else:
            classification = 3
            desc = 'BALANCE DUE - PARITALLY PAID'
    return classification, desc

# convert values to float
def make_float(expected_float):
    try:
        if type(expected_float) == str:
            expected_float = expected_float.replace('+', '').replace(',', '')
        return float(expected_float)
    except:
        # print expected_float
        return np.nan
    


In [3]:
# Trump owned buildings identified from:
# https://www.bisnow.com/new-york/news/economy/how-big-is-trumps-nyc-empire-63995?single-page

# BIN NUMBER
_40_WALL_ST = '1001018' # TRUMP_BUILDING
_1290_AVENUE_OF_THE_AMERICAS = '1034510' # _1290_AVENUE_OF_THE_AMERICAS 30% Stake in Vornado owned building
_1_CENTRAL_PARK_WEST = '1027191' # TRUMP_INTERNATIONAL_HOTEL_AND_TOWER
_725_5TH_AVE = '1035794' # TRUMP_TOWER
_246_SPRING_ST = '1088431' # TRUMP_SOHO
_502_PARK_AVE = '1040756' # TRUMP_PARK_AVE
_327_E_47TH_ST = '1038908' # TRUMP_WORLD_TOWER
_200_E_69TH_ST = '1043902' # TRUMP_PALACE
_106_CENTRAL_PARK_S = '1069595' # TRUMP_PARC
_610_PARK_AVE = '1041086' # _610_PARK_AVE

TRUMP_BUILDINGS = [_106_CENTRAL_PARK_S, _1290_AVENUE_OF_THE_AMERICAS, 
                   _1_CENTRAL_PARK_WEST, _200_E_69TH_ST, _246_SPRING_ST, 
                   _327_E_47TH_ST, _40_WALL_ST, _502_PARK_AVE,
                   _610_PARK_AVE, _725_5TH_AVE]
# Wollman Rink - Central Park (Not a building)

In [32]:
# DOB ECB BUILDING VIOLATION DATA
DOB_ECB = 'data/20170322_DOB_ECB_Violations.csv'
# CENSUS ACS 2015 5-YEAR ESTIMATES (CENSUS TRACT LEVEL)
ACS_5YR_RACE = 'data/CENSUS_TRACT_RACE_INCOME/ACS_15_5YR_DP05_with_ann.csv'
ACS_5YR_INCOME = 'data/CENSUS_TRACT_RACE_INCOME/ACS_15_5YR_S1901_with_ann.csv'
# PLUTO SHAPE FILES - BBL DATA FOR ALL NYC
PLUTO_BX = 'data/PLUTO/Bronx/BXMapPLUTO.shp'
PLUTO_BK = 'data/PLUTO/Brooklyn/BKMapPLUTO.shp'
PLUTO_QN = 'data/PLUTO/Queens/QNMapPLUTO.shp'
PLUTO_MN = 'data/PLUTO/Manhattan/MNMapPLUTO.shp'
PLUTO_SI = 'data/PLUTO/Staten_Island/SIMapPLUTO.shp'
# CENSUS TRACT SHAPE FILES
CENSUS_TRACT_SHAPEFILE = 'data/CENSUS_TRACT_SHAPEFILE/cb_2015_36_tract_500k.shp'
# FILE NAME OF PROCESSED PLUTO DATA - IMPROVES PERFORMANCE OF NOTEBOOK AFTER INITIAL RUN
MASTER_PLUTO_PICKLE = 'processed_data/master_pluto.pickle'


In [5]:
DF_DOB_ECB = pd.read_csv(DOB_ECB, usecols=['BIN', 'ISSUE_DATE', 'SEVERITY', 
                                           'PENALITY_IMPOSED', 'AMOUNT_PAID', 
                                           'BALANCE_DUE', 'ECB_VIOLATION_STATUS', 
                                           'BORO', 'BLOCK', 'LOT', 'VIOLATION_DESCRIPTION',
                                           'INFRACTION_CODE1'],
                         dtype={'BIN': str, 'PENALITY_IMPOSED': float,
                                'AMOUNT_PAID': float, 'BALANCE_DUE': float}) 

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Only 181 FINES associated with trump buildings

print '"TRUMP PROPERTIES" WITH FINES', len(DF_DOB_ECB[(DF_DOB_ECB['PENALITY_IMPOSED'] > 0) & (DF_DOB_ECB['BIN'].isin(TRUMP_BUILDINGS))])
print 'PENALITY IMPOSED', DF_DOB_ECB[(DF_DOB_ECB['PENALITY_IMPOSED'] > 0) & (DF_DOB_ECB['BIN'].isin(TRUMP_BUILDINGS))]['PENALITY_IMPOSED'].sum()
print 'AMOUNT_PAID', DF_DOB_ECB[(DF_DOB_ECB['PENALITY_IMPOSED'] > 0) & (DF_DOB_ECB['BIN'].isin(TRUMP_BUILDINGS))]['AMOUNT_PAID'].sum()
print 'OUTSTANDING BALANCE', DF_DOB_ECB[(DF_DOB_ECB['PENALITY_IMPOSED'] > 0) & (DF_DOB_ECB['BIN'].isin(TRUMP_BUILDINGS))]['BALANCE_DUE'].sum()

"TRUMP PROPERTIES" WITH FINES 181
PENALITY IMPOSED 183425.0
AMOUNT_PAID 177357.27
OUTSTANDING BALANCE 765.34


## Perhaps fines associated with trump owned buildings may not hold enough data

In [7]:
# VIOLATIONS WITH A BALANCE DUE
DF_DOB_ECB[DF_DOB_ECB['BALANCE_DUE'] > 0]['BALANCE_DUE'].sum()

748325710.25

In [8]:
# TOTAL PENALITIES VALUE
DF_DOB_ECB['PENALITY_IMPOSED'].sum()

1669745657.52

In [9]:
# FINES PAID
DF_DOB_ECB['AMOUNT_PAID'].sum()

616422508.6099999

In [10]:
DF_DOB_ECB_FINES = DF_DOB_ECB[DF_DOB_ECB['PENALITY_IMPOSED'] > 0].copy()

In [11]:
DF_DOB_ECB_FINES['CLASSIFICATION'], DF_DOB_ECB_FINES['CLASS_DESC'] = np.vectorize(make_classification)(DF_DOB_ECB_FINES['PENALITY_IMPOSED'],
                                                                                                       DF_DOB_ECB_FINES['AMOUNT_PAID'],
                                                                                                       DF_DOB_ECB_FINES['BALANCE_DUE'])

In [12]:
# Breakdown of different classes
DF_DOB_ECB_FINES[['CLASS_DESC', 'ECB_VIOLATION_STATUS']].groupby('CLASS_DESC').count()

Unnamed: 0_level_0,ECB_VIOLATION_STATUS
CLASS_DESC,Unnamed: 1_level_1
BALANCE DUE - PARITALLY PAID,31699
BALANCE DUE - UNPAID,104709
NO BALANCE - PAID IN FULL,479941
NO BALANCE - PARITALLY PAID,136943


In [13]:
# race data
ACS_5YR_RACE_DF = pd.read_csv(ACS_5YR_RACE,skiprows=[1], usecols=['GEO.id2', 'HC01_VC03', 'HC01_VC49', 'HC01_VC50', 'HC01_VC51','HC01_VC56', 'HC01_VC64', 'HC01_VC69', 'HC01_VC23'])

# rename columns
ACS_5YR_RACE_DF.rename(columns={'HC01_VC03': 'TOTAL_POPULATION', 'HC01_VC49': 'WHITE',
                        'HC01_VC50': 'BLACK_AFRICAN_AMERICAN', 'HC01_VC51': 'AMERICAN_INDIAN_AND_ALASKA_NATIVE',
                        'HC01_VC56': 'ASIAN', 'HC01_VC64': 'NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER',
                        'HC01_VC69': 'SOME_OTHER_RACE', 'HC01_VC23': 'MEDIAN_AGE', 'GEO.id2': 'GEOID'}, inplace=True)

for i in ACS_5YR_RACE_DF.columns[ACS_5YR_RACE_DF.columns!='GEOID']:
    ACS_5YR_RACE_DF[i] = ACS_5YR_RACE_DF[i].apply(lambda x: make_float(x))

ACS_5YR_RACE_DF['GEOID'] = ACS_5YR_RACE_DF['GEOID'].astype(str)

# income data
ACS_5YR_INCOME_DF = pd.read_csv(ACS_5YR_INCOME,skiprows=[1],usecols=['GEO.id2', 'HC01_EST_VC01', 'HC01_EST_VC15'])

# rename columns
ACS_5YR_INCOME_DF.rename(columns={'HC01_EST_VC01': 'TOTAL_HOUSEHOLDS', 
                       'HC01_EST_VC15': 'MEAN_INCOME', 'GEO.id2': 'GEOID'}, inplace=True)

#convert values to float
for i in ACS_5YR_INCOME_DF.columns[ACS_5YR_INCOME_DF.columns!='GEOID']:
    ACS_5YR_INCOME_DF[i] = ACS_5YR_INCOME_DF[i].apply(lambda x: make_float(x))

ACS_5YR_INCOME_DF['GEOID'] = ACS_5YR_RACE_DF['GEOID'].astype(str)


In [14]:
def import_filter_pluto():
    
    # import PLUTO for 5 boros
    BK = gp.read_file(PLUTO_BK)
    BX = gp.read_file(PLUTO_BX)
    MN = gp.read_file(PLUTO_MN)
    QN = gp.read_file(PLUTO_QN)
    SI = gp.read_file(PLUTO_SI)
    
    # merge 5 boro PLUTO datasets 
    pluto_agg = BK.append(BX)
    pluto_agg = pluto_agg.append(MN)
    pluto_agg = pluto_agg.append(QN)
    pluto_agg = pluto_agg.append(SI)
    
    pluto_select = pluto_agg[['BBL','YearBuilt','Tract2010','UnitsRes',
    'BldgClass','LandUse','BldgArea',
    'ComArea',
    'ResArea',
    'UnitsTotal',
    'AssessTot',
    'BuiltFAR','LotArea','OwnerType']]
    with open(MASTER_PLUTO_PICKLE, 'wb') as handle:
        pickle.dump(pluto_select, handle, protocol=pickle.HIGHEST_PROTOCOL)

### CLEANING PLUTO

In [15]:
if os.path.exists(MASTER_PLUTO_PICKLE):
    print "File exists. Loading pickle..."
    # load pickle of PLUTO data
    with open(MASTER_PLUTO_PICKLE, 'rb') as handle:
        master_pluto = pickle.load(handle)
    print "File loaded!"
    
else:
    print "File does not yet exist. Importing and filtering PLUTO. This could take several minutes..."
    # first time only, import, filter, and save processed PLUTO as a pickle for future use
    import_filter_pluto()
    
    # load pickle of PLUTO data
    with open(MASTER_PLUTO_PICKLE, 'rb') as handle:
        master_pluto = pickle.load(handle)

File exists. Loading pickle...


In [16]:
boro_to_ct = {'1':'36061','2':'36005','3':'36047','4':'36081','5':'36085'}

master_pluto['ST_CT_FIPS'] = master_pluto['BBL'].apply(lambda x: boro_to_ct[str(x)[0]])
master_pluto['Tract2010'] = master_pluto['Tract2010'].apply(lambda x: x + '00' if len(x) == 4 else x)
master_pluto['GEOID'] = master_pluto['ST_CT_FIPS'] + master_pluto['Tract2010']

In [17]:
master_pluto['BBL'] = master_pluto['BBL'].astype(int)
master_pluto['BBL'] = master_pluto['BBL'].astype(str)

In [18]:
master_pluto.reset_index()['BBL'][0]

'3000060010'

In [19]:
DF_DOB_ECB_FINES.fillna('', inplace=True)

In [20]:
DF_DOB_ECB_FINES['BORO'] = DF_DOB_ECB_FINES['BORO'].astype(str)
DF_DOB_ECB_FINES['BORO'] = DF_DOB_ECB_FINES['BORO'].apply(lambda x: x.split('.')[0])
DF_DOB_ECB_FINES['BLOCK'] = DF_DOB_ECB_FINES['BLOCK'].astype(str)
DF_DOB_ECB_FINES['BLOCK'] = DF_DOB_ECB_FINES['BLOCK'].apply(lambda x: x.split('.')[0])
DF_DOB_ECB_FINES['BLOCK'] = DF_DOB_ECB_FINES['BLOCK'].apply(lambda x: x if len(x) == 5 else ((5-len(x))*'0') + x)
DF_DOB_ECB_FINES['LOT'] = DF_DOB_ECB_FINES['LOT'].astype(str)
DF_DOB_ECB_FINES['LOT'] = DF_DOB_ECB_FINES['LOT'].apply(lambda x: x.split('.')[0])
DF_DOB_ECB_FINES['LOT'] = DF_DOB_ECB_FINES['LOT'].apply(lambda x: x if len(x) == 5 else ((4-len(x))*'0') + x)

In [21]:
DF_DOB_ECB_FINES['BBL'] = DF_DOB_ECB_FINES['BORO'] + DF_DOB_ECB_FINES['BLOCK'] + DF_DOB_ECB_FINES['LOT']

In [22]:
MERGE_DF = DF_DOB_ECB_FINES.merge(master_pluto, how='left', on='BBL').merge(ACS_5YR_INCOME_DF, how='left', on='GEOID').merge(ACS_5YR_RACE_DF, how='left', on='GEOID')

In [23]:
col_select = ['CLASSIFICATION', 'CLASS_DESC', 'BIN', 'BBL', 'GEOID', 'PENALITY_IMPOSED', 
              'YearBuilt', 'UnitsRes', 'BldgArea', 'ComArea',
              'ResArea', 'UnitsTotal', 'AssessTot', 'LotArea',
              'TOTAL_HOUSEHOLDS', 'MEAN_INCOME', 'TOTAL_POPULATION', 'MEDIAN_AGE', 'WHITE',
              'BLACK_AFRICAN_AMERICAN', 'AMERICAN_INDIAN_AND_ALASKA_NATIVE', 'ASIAN',
              'NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER', 'SOME_OTHER_RACE']

In [24]:
MASTER_DF = MERGE_DF[col_select]

In [25]:
MASTER_DF['YearBuilt'] = 2017. - MASTER_DF['YearBuilt']
MASTER_DF['UnitsNonRes'] = (MASTER_DF['UnitsTotal'] - MASTER_DF['UnitsRes']) / (MASTER_DF['UnitsTotal'] * 1.0) 
MASTER_DF['UnitsRes'] = MASTER_DF['UnitsRes'] / (MASTER_DF['UnitsTotal'] * 1.0) 
MASTER_DF['HOUSEHOLD_SIZE'] = MASTER_DF['TOTAL_POPULATION'] / (MASTER_DF['TOTAL_HOUSEHOLDS'] * 1.0)
MASTER_DF['WHITE'] = MASTER_DF['WHITE'] / (MASTER_DF['TOTAL_POPULATION'] * 1.0)
MASTER_DF['BLACK_AFRICAN_AMERICAN'] = MASTER_DF['BLACK_AFRICAN_AMERICAN'] / (MASTER_DF['TOTAL_POPULATION'] * 1.0)
MASTER_DF['AMERICAN_INDIAN_AND_ALASKA_NATIVE'] = MASTER_DF['AMERICAN_INDIAN_AND_ALASKA_NATIVE'] / (MASTER_DF['TOTAL_POPULATION'] * 1.0)
MASTER_DF['ASIAN'] = MASTER_DF['ASIAN'] / (MASTER_DF['TOTAL_POPULATION'] * 1.0)
MASTER_DF['NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER'] = MASTER_DF['NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER'] / (MASTER_DF['TOTAL_POPULATION'] * 1.0)
MASTER_DF['SOME_OTHER_RACE'] = MASTER_DF['SOME_OTHER_RACE'] / (MASTER_DF['TOTAL_POPULATION'] * 1.0)
MASTER_DF['MORE_THAN_ONE_RACE'] = 1.0 - (MASTER_DF['WHITE'] + MASTER_DF['BLACK_AFRICAN_AMERICAN'] +
                                         MASTER_DF['AMERICAN_INDIAN_AND_ALASKA_NATIVE'] + MASTER_DF['ASIAN'] +
                                         MASTER_DF['NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER'] + MASTER_DF['SOME_OTHER_RACE'])
                                         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [26]:
MASTER_DF.rename(columns={'YearBuilt': 'YEARS_OLD'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [45]:
CTS = gp.GeoDataFrame.from_file(CENSUS_TRACT_SHAPEFILE)
# SQ METERS TO SQ MILES
CTS['LAND_SQ_MI'] = CTS['ALAND'] / 2589988.1
CTS['GEOID'] = CTS['GEOID'].astype(str)
CTS_SELECT = CTS[['GEOID', 'LAND_SQ_MI']]

In [48]:
MASTER_DF = MASTER_DF.merge(CTS_SELECT, how='left', on='GEOID')

In [50]:
MASTER_DF['POPULATION_DENSITY'] = MASTER_DF['TOTAL_POPULATION'] / MASTER_DF['LAND_SQ_MI']

In [58]:
MASTER_DF = MASTER_DF[['CLASSIFICATION', 'CLASS_DESC', 'BIN', 'BBL', 'GEOID', 
                       'PENALITY_IMPOSED', 'YEARS_OLD', 'UnitsRes', 'BldgArea', 
                       'AssessTot', 'MEAN_INCOME', 'MEDIAN_AGE', 'WHITE', 
                       'BLACK_AFRICAN_AMERICAN', 'AMERICAN_INDIAN_AND_ALASKA_NATIVE', 
                       'ASIAN', 'NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER',
                       'SOME_OTHER_RACE', 'UnitsNonRes', 'HOUSEHOLD_SIZE', 
                       'MORE_THAN_ONE_RACE', 'POPULATION_DENSITY']]

MASTER_DF.rename(columns={'UnitsRes': 'PERC_RES', 'BldgArea': 'BLDG_AREA',
                          'AssessTot': 'BLDG_ASSESSMENT', 'MEDIAN_AGE': 'MEDIAN_PERSON_AGE',
                          'UnitsNonRes': 'PERC_NON_RES'}, inplace=True)

In [60]:
MASTER_DF.to_csv('processed_data/FINAL_DATA.csv', index=False)
