In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read Data

In [2]:
# https://nycmaps-nyc.hub.arcgis.com/datasets/nyc::building/about
buildings_raw = pd.read_csv('BUILDING_view_7607496916235021567.csv')

# https://data.cityofnewyork.us/City-Government/Property-Address-Directory/bc8t-ecyu/about_data
zipcodes_raw = pd.read_csv('bobaadr.txt', low_memory=False)

# https://www.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page
pluto_raw = pd.read_csv('pluto_25v1.csv', low_memory=False)

# https://data.cityofnewyork.us/Public-Safety/Risk-Based-Inspections-RBIS-/itd7-gx3g/about_data
inspection_raw = pd.read_csv('Risk_Based_Inspections__RBIS__20250308.csv', low_memory=False)

# https://simplemaps.com/data/us-zips
zip_area_raw = pd.read_csv('uszips.csv')

# Clean Data
### building data

In [3]:
buildings_raw.columns

Index(['OBJECTID', 'Base BBL', 'BIN', 'Construction Year', 'DOITT ID',
       'Feature Code', 'Geometry Source', 'Ground Elevation', 'Height Roof',
       'LAST_EDITED_DATE', 'LAST_STATUS_TYPE', 'Map Pluto BBL', 'Name', 'Area',
       'Length'],
      dtype='object')

In [4]:
buildings = buildings_raw[['OBJECTID', 'BIN', 'Construction Year', 'DOITT ID', 'Feature Code', 'Height Roof',
       'LAST_STATUS_TYPE','Map Pluto BBL', 'Area', 'Length']]

In [5]:
# Keep only LAST_STATUS_TYPE == "Constructed"
buildings = buildings[buildings['LAST_STATUS_TYPE'] == "Constructed"]
buildings.drop('LAST_STATUS_TYPE', axis=1, inplace=True)

### zipcode data

In [6]:
zipcodes = zipcodes_raw[['bin','zipcode']]
zipcodes.head()

Unnamed: 0,bin,zipcode
0,1000000,10004
1,1000000,10004
2,1000000,10004
3,1000000,10004
4,1000000,10004


### pluto data

In [7]:
pluto = pluto_raw[['borough', 'block', 'lot', 'bbl', 'zipcode', 'latitude', 'longitude']]
pluto.head()

Unnamed: 0,borough,block,lot,bbl,zipcode,latitude,longitude
0,BK,5852,1,3058520000.0,11220.0,40.638298,-74.030598
1,BK,5852,13,3058520000.0,11220.0,40.638575,-74.030126
2,BK,5852,6,3058520000.0,11220.0,40.638567,-74.03049
3,BK,5852,58,3058520000.0,11220.0,40.638142,-74.029704
4,BK,5848,77,3058480000.0,11220.0,40.639039,-74.030115


In [8]:
# Convert bbl column from float to int
pluto['bbl'] = pluto['bbl'].apply(lambda x: int(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pluto['bbl'] = pluto['bbl'].apply(lambda x: int(x))


### inspection data

In [9]:
inspection = inspection_raw[['INSP_INSPECT_DT', 'BBL', 'BLDG_CURRENT_BIN_FK']]
inspection.head()

Unnamed: 0,INSP_INSPECT_DT,BBL,BLDG_CURRENT_BIN_FK
0,08/14/2018,4000510000.0,4000431
1,01/29/2016,3050480000.0,3115797
2,05/21/2014,5024508000.0,5120386
3,05/17/2016,1022480000.0,1065067
4,06/01/2018,3009170000.0,3018012


### zip area data

In [10]:
zip_area = zip_area_raw[['zip', 'population', 'density']]
# Density in population per km^2 (1 km^2 = 0.386102 mile^2)
zip_area['area_mile2'] = zip_area['population'] / zip_area['density'] * 0.386102
zip_area = zip_area[['zip', 'area_mile2']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zip_area['area_mile2'] = zip_area['population'] / zip_area['density'] * 0.386102


# Merge data
## Find zipcodes
### buildings with zipcodes & pluto

In [11]:
# Buildings with zipcodes
num_empty = buildings.merge(zipcodes, left_on='BIN', right_on='bin', how='left').value_counts('zipcode').max()
print(f'There are {num_empty} rows with empty zipcode')

There are 77488 rows with empty zipcode


In [12]:
# Buildings with pluto
data = buildings.merge(pluto, left_on='Map Pluto BBL', right_on='bbl', how='left').drop('Map Pluto BBL', axis=1)

# Merge with zip_area
data = data.merge(zip_area, left_on = 'zipcode', right_on='zip', how='left')
data.head()

Unnamed: 0,OBJECTID,BIN,Construction Year,DOITT ID,Feature Code,Height Roof,Area,Length,borough,block,lot,bbl,zipcode,latitude,longitude,zip,area_mile2
0,1,3170958,1925.0,96807,2100,29.749853,264.464844,74.28149,BK,6522.0,21.0,3065220000.0,11230.0,40.626025,-73.966724,11230.0,1.842909
1,2,5028452,1965.0,326368,2100,22.63,121.714844,44.163804,SI,1264.0,36.0,5012640000.0,10303.0,40.63931,-74.167926,10303.0,3.169214
2,3,5078368,1970.0,746627,2100,35.76,115.828125,43.920822,SI,6019.0,91.0,5060190000.0,10312.0,40.556102,-74.195182,10312.0,6.871665
3,4,3245111,1928.0,786626,2100,37.5,188.238281,73.872226,BK,8691.0,48.0,3086910000.0,11235.0,40.577413,-73.961165,11235.0,2.457111
4,5,4161096,1950.0,746409,2100,18.015113,284.390625,72.749024,QN,7502.0,5.0,4075020000.0,11364.0,40.75584,-73.754259,11364.0,2.508998


In [None]:
# Create building age column
data['age'] = data.apply(lambda x: 2025-x['Construction Year'], axis=1)

In [None]:
data.sample(5)

## Find inspection risks

In [None]:
insp_hist = data.merge(inspection, left_on=['bbl', 'BIN'], right_on=['BBL', 'BLDG_CURRENT_BIN_FK'], how='right')\
            .drop(['OBJECTID', 'BBL', 'BLDG_CURRENT_BIN_FK', 'Area', 'Length', 'block', 'lot', 'Construction Year'], axis=1)\
            .dropna()
insp_hist['zipcode'] = insp_hist['zipcode'].apply(lambda x:int(x))
insp_hist

In [None]:
# Create inspection count using DOITT ID
insp_count = pd.DataFrame(insp_hist['DOITT ID'].value_counts()).reset_index()

In [None]:
# Merge data with insp_count
df = data.merge(insp_count, on='DOITT ID', how='left').rename({'count':'insp_count'}, axis=1)
df.insp_count = df.insp_count.fillna(0)

In [None]:
df.insp_count.quantile(0.94), df.insp_count.quantile(0.98)

# Classify Fire Risks

- **High risks**: Less than 2% of the buildings had been **inspected more than 3 times** since 2014, and they will be classified as with high fire risk. Also, buildings **over 87 years old** will also be classified as with high fire risk (fire code was introduced to NYC in 1938).
- **Medium risks**: Approximately 4% of the buildings had been **inspected 2 or 3 times** since 2014, and they will be classified as with medium fire risk. Also, buildings **between 37 and 87 years old** will also be classified as with medium fire risk (major building code overhaul occurred in 1968).
- **Low risks**: Buildings with **1 or less inspections** and **less than 37 years old** will be classified as with the lowest fire risk (major building code overhaul occurred in 1968).

source: https://digitalcommons.njit.edu/dissertations/77/

In [None]:
def risk(insp_count, age):
    if insp_count > 3 or age > 87:
        return 'high'
    elif 2 <= insp_count <= 3 or 37 <= age <= 87:
        return 'medium'
    elif insp_count <= 1 or age < 37:
        return 'low'

In [None]:
df['risk'] = df.apply(lambda x: risk(x['insp_count'], x['age']), axis=1)
df.risk.value_counts()

In [None]:
df = df.groupby(['zipcode', 'risk'], as_index=False).count()[['zipcode', 'risk', 'lot']].rename({'lot':'count'}, axis=1)
df = df.merge(zip_area, left_on = 'zipcode', right_on='zip', how='left').drop('zip', axis=1)
df['zipcode'] = df['zipcode'].astype(int)

In [None]:
df.to_csv('building_fire_risk.csv', index=False)