In [1]:
import pandas as pd
import dask.dataframe as dd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Load data into dataframes
trees = pd.read_csv('../data/2015_Street_Tree_Census_-_Tree_Data.csv')
temps = pd.read_csv('../data/Hyperlocal_Temperature_Monitoring.csv')

###  Convert Datatypes for all columns in `temps` and `trees` to reduce memory usage and enable comparisons

In [3]:
temps.columns

Index(['Sensor.ID', 'AirTemp', 'Day', 'Hour', 'Latitude', 'Longitude', 'Year', 'Install.Type', 'Borough', 'ntacode'], dtype='object')

In [4]:
dtypes = {
    'Sensor.ID': str,
    'AirTemp': float,
    'Latitude': float,
    'Longitude': float,
    'Day': 'datetime64[ns]',
    'Hour': int,
    'Year': int,
    'Install.Type': str,
    'Borough': str,
    'ntacode': str
}

temps = temps.astype(dtypes)

In [5]:
trees.columns

Index(['tree_id', 'block_id', 'created_at', 'tree_dbh', 'stump_diam', 'curb_loc', 'status', 'health', 'spc_latin', 'spc_common', 'steward', 'guards', 'sidewalk', 'user_type', 'problems', 'root_stone', 'root_grate', 'root_other', 'trunk_wire', 'trnk_light', 'trnk_other', 'brch_light', 'brch_shoe', 'brch_other', 'address', 'postcode', 'zip_city', 'community board', 'borocode', 'borough', 'cncldist', 'st_assem', 'st_senate', 'nta', 'nta_name', 'boro_ct', 'state', 'latitude', 'longitude', 'x_sp', 'y_sp', 'council district', 'census tract', 'bin', 'bbl'], dtype='object')

In [6]:
dtypes = {
    'created_at': 'datetime64[ns]',
    'tree_id': 'int32',
    'block_id': 'int32',
    'tree_dbh': 'int32',
    'stump_diam': 'int32',
    'curb_loc': 'str',
    'status': 'str',
    'health': 'str',
    'spc_latin': 'str',
    'spc_common': 'str',
    'steward': 'str',
    'guards': 'str',
    'sidewalk': 'str',
    'user_type': 'str',
    'problems': 'str',
    'root_stone': 'str',
    'root_grate': 'str',
    'root_other': 'str',
    'trunk_wire': 'str',
    'trnk_light': 'str',
    'trnk_other': 'str',
    'brch_light': 'str',
    'brch_shoe': 'str',
    'brch_other': 'str',
    'address': 'str',
    'postcode': 'int32',
    'zip_city': 'str',
    'community board': 'int32',
    'borocode': 'int32',
    'borough': 'str',
    'cncldist': 'int32',
    'st_assem': 'int32',
    'st_senate': 'int32',
    'nta': 'str',
    'nta_name': 'str',
    'boro_ct': 'int32',
    'state': 'str',
    'latitude': 'float32',
    'longitude': 'float32',
    'x_sp': 'float32',
    'y_sp': 'float32',
    'census tract': 'float32',
    'bin': 'float32',
    'bbl': 'float32',
}

trees = trees.astype(dtypes)

In [7]:
trees.head()

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,steward,guards,sidewalk,user_type,problems,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,address,postcode,zip_city,community board,borocode,borough,cncldist,st_assem,st_senate,nta,nta_name,boro_ct,state,latitude,longitude,x_sp,y_sp,council district,census tract,bin,bbl
0,180683,348711,2015-08-27,3,0,OnCurb,Alive,Fair,Acer rubrum,red maple,,,NoDamage,TreesCount Staff,,No,No,No,No,No,No,No,No,No,108-005 70 AVENUE,11375,Forest Hills,406,4,Queens,29,28,16,QN17,Forest Hills,4073900,New York,40.723091,-73.844215,1027431.0,202756.765625,29.0,739.0,4052307.0,4022210000.0
1,200540,315986,2015-09-03,21,0,OnCurb,Alive,Fair,Quercus palustris,pin oak,,,Damage,TreesCount Staff,Stones,Yes,No,No,No,No,No,No,No,No,147-074 7 AVENUE,11357,Whitestone,407,4,Queens,19,27,11,QN49,Whitestone,4097300,New York,40.794109,-73.81868,1034456.0,228644.84375,19.0,973.0,4101931.0,4044750000.0
2,204026,218365,2015-09-05,3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,1or2,,Damage,Volunteer,,No,No,No,No,No,No,No,No,No,390 MORGAN AVENUE,11211,Brooklyn,301,3,Brooklyn,34,50,18,BK90,East Williamsburg,3044900,New York,40.717579,-73.936607,1001823.0,200716.890625,34.0,449.0,3338310.0,3028870000.0
3,204337,217969,2015-09-05,10,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,,,Damage,Volunteer,Stones,Yes,No,No,No,No,No,No,No,No,1027 GRAND STREET,11211,Brooklyn,301,3,Brooklyn,34,53,18,BK90,East Williamsburg,3044900,New York,40.713539,-73.934456,1002420.0,199244.25,34.0,449.0,3338342.0,3029250000.0
4,189565,223043,2015-08-30,21,0,OnCurb,Alive,Good,Tilia americana,American linden,,,Damage,Volunteer,Stones,Yes,No,No,No,No,No,No,No,No,603 6 STREET,11215,Brooklyn,306,3,Brooklyn,39,44,21,BK37,Park Slope-Gowanus,3016500,New York,40.666779,-73.975983,990913.8,182202.421875,39.0,165.0,3025654.0,3010850000.0


### Filter columns of datasets before merging

In [8]:
# Drop columns that are not relevant to the analysis. This includes:
# - spatial columns (since we are not mapping using GIS)
# - columns that contain redundant information (e.g. borocode encodes same info as borough)
cols_to_drop = [
    'block_id',
    'x_sp',
    'y_sp',
    'zip_city',
    'census tract',
    'borocode',
    'boro_ct',
    'nta_name',
    'cncldist',
    'st_assem',
    'st_senate',
    'community board',
    'council district',
    'census tract',
    'bin',
    'bbl',
    'state'
]

[trees.drop(columns=col, inplace=True) for col in cols_to_drop if col in trees.columns]
trees.head()

Unnamed: 0,tree_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,steward,guards,sidewalk,user_type,problems,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,address,postcode,borough,nta,latitude,longitude
0,180683,2015-08-27,3,0,OnCurb,Alive,Fair,Acer rubrum,red maple,,,NoDamage,TreesCount Staff,,No,No,No,No,No,No,No,No,No,108-005 70 AVENUE,11375,Queens,QN17,40.723091,-73.844215
1,200540,2015-09-03,21,0,OnCurb,Alive,Fair,Quercus palustris,pin oak,,,Damage,TreesCount Staff,Stones,Yes,No,No,No,No,No,No,No,No,147-074 7 AVENUE,11357,Queens,QN49,40.794109,-73.81868
2,204026,2015-09-05,3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,1or2,,Damage,Volunteer,,No,No,No,No,No,No,No,No,No,390 MORGAN AVENUE,11211,Brooklyn,BK90,40.717579,-73.936607
3,204337,2015-09-05,10,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,,,Damage,Volunteer,Stones,Yes,No,No,No,No,No,No,No,No,1027 GRAND STREET,11211,Brooklyn,BK90,40.713539,-73.934456
4,189565,2015-08-30,21,0,OnCurb,Alive,Good,Tilia americana,American linden,,,Damage,Volunteer,Stones,Yes,No,No,No,No,No,No,No,No,603 6 STREET,11215,Brooklyn,BK37,40.666779,-73.975983


In [9]:
temps.columns

Index(['Sensor.ID', 'AirTemp', 'Day', 'Hour', 'Latitude', 'Longitude', 'Year', 'Install.Type', 'Borough', 'ntacode'], dtype='object')

In [10]:
# Sampling the temp data
# Keeping only hour 0, 6, 12, 18 for each day for each location
# daily_avg_temps = temps[temps['Hour'].isin([0, 6, 12, 18])]
aggregation_functions = {
                        'AirTemp': 'mean',
                        'Hour': 'first',
                        'Latitude': 'first',
                        'Longitude': 'first',
                        'Year': 'first',
                        'Install.Type': 'first',
                        'Borough': 'first',
                        'ntacode': 'first'
                    }
daily_avg_temps = temps.groupby(['Sensor.ID', 'Day']).agg(aggregation_functions).reset_index()

# Dropping columns that are not relevant to the analysis.
cols_to_drop = ['Year']
[daily_avg_temps.drop(columns=col, inplace=True) for col in cols_to_drop if col in trees.columns]
daily_avg_temps.head()

Unnamed: 0,Sensor.ID,Day,AirTemp,Hour,Latitude,Longitude,Year,Install.Type,Borough,ntacode
0,Bk-BR_01,2018-06-15,72.018986,1,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81
1,Bk-BR_01,2018-06-16,75.564931,0,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81
2,Bk-BR_01,2018-06-17,78.812097,0,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81
3,Bk-BR_01,2018-06-18,80.050965,0,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81
4,Bk-BR_01,2018-06-19,82.986972,0,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81


In [11]:
# Rename column to match trees dataset for joining.
if 'ntacode' in daily_avg_temps.columns:
    daily_avg_temps['nta'] = daily_avg_temps['ntacode']
    daily_avg_temps.drop(columns=['ntacode'], inplace=True)

In [12]:
# Trees has more NTA codes than temps, so we will filter temps to only include the NTA codes that are relevant to trees.
relevant_nta_codes = daily_avg_temps['nta'].unique()
trees_nta_filtered = trees[trees['nta'].isin(relevant_nta_codes)]

In [13]:
# Filter out trees that are stumps
trees_nta_filtered = trees_nta_filtered[trees_nta_filtered['stump_diam'] == 0]
trees_nta_filtered.drop(columns=['stump_diam'], inplace=True)

In [14]:
# Randomly sample trees so there are 91696 -> 50000 rows.
trees_nta_filtered = trees_nta_filtered.sample(n=50000, random_state=42)

In [15]:
trees_nta_filtered.head()

Unnamed: 0,tree_id,created_at,tree_dbh,curb_loc,status,health,spc_latin,spc_common,steward,guards,sidewalk,user_type,problems,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,address,postcode,borough,nta,latitude,longitude
540508,22107,2015-06-16,11,OnCurb,Alive,Good,Quercus rubra,northern red oak,,,NoDamage,TreesCount Staff,,No,No,No,No,No,No,No,No,No,1241 EASTERN PARKWAY,11213,Brooklyn,BK61,40.668407,-73.927231
483274,702924,2016-09-08,14,OnCurb,Alive,Good,Platanus x acerifolia,London planetree,,,Damage,NYC Parks Staff,"Stones,RootOther,TrunkOther",Yes,No,Yes,No,No,Yes,No,No,No,75 GRAFTON STREET,11212,Brooklyn,BK81,40.666672,-73.918663
245955,432020,2015-11-05,5,OnCurb,Alive,Fair,Quercus palustris,pin oak,,,Damage,TreesCount Staff,BranchLights,No,No,No,No,No,No,Yes,No,No,444 EAST 105 STREET,11236,Brooklyn,BK82,40.653198,-73.902328
239500,434259,2015-11-05,5,OnCurb,Alive,Good,Quercus palustris,pin oak,1or2,,NoDamage,NYC Parks Staff,,No,No,No,No,No,No,No,No,No,1127 BELMONT AVENUE,11208,Brooklyn,BK82,40.674789,-73.867348
262630,468239,2015-11-18,5,OnCurb,Alive,Good,Ulmus americana,American elm,,,NoDamage,NYC Parks Staff,,No,No,No,No,No,No,No,No,No,1053 EAST 227 STREET,10466,Bronx,BX44,40.885883,-73.849083


In [16]:
cols_to_drop = [
    'root_stone',	
    'root_grate',	
    'root_other',
    'trunk_wire',
    'trnk_light',
    'trnk_other',
    'brch_light',
    'brch_shoe',	
    'brch_other',
    'borough',
]
trees_nta_filtered.drop(columns=cols_to_drop, inplace=True)

In [17]:
# To make the join faster, index on the merging columns and sort the dataframes by the index.
daily_avg_temps.set_index('nta', inplace=True)
trees_nta_filtered.set_index('nta', inplace=True)

In [18]:
# Further filter temps to only include every third day.
daily_avg_temps_filtered = daily_avg_temps[daily_avg_temps['Day'].dt.day % 7 == 0]

In [19]:
# Should be (11382, 9).
daily_avg_temps_filtered.shape

(11382, 9)

In [20]:
# Use dask to merge the dataframes in parallel.
dd_daily_avg_temps = dd.from_pandas(daily_avg_temps_filtered, npartitions=3)
dd_trees_nta_filtered = dd.from_pandas(trees_nta_filtered, npartitions=3)
integrated = dd.merge(dd_daily_avg_temps, dd_trees_nta_filtered, on='nta').compute()

In [21]:
integrated.head()

Unnamed: 0_level_0,Sensor.ID,Day,AirTemp,Hour,Latitude,Longitude,Year,Install.Type,Borough,tree_id,created_at,tree_dbh,curb_loc,status,health,spc_latin,spc_common,steward,guards,sidewalk,user_type,problems,address,postcode,latitude,longitude
nta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
BK33,Bk-RH_13,2019-08-14,76.531111,0,40.678376,-73.991677,2019,Street Tree,Brooklyn,177398,2015-08-26,11,OnCurb,Alive,Good,Acer platanoides,Norway maple,,,Damage,Volunteer,,236 WARREN STREET,11201,40.687092,-73.994965
BK33,Bk-RH_13,2019-08-14,76.531111,0,40.678376,-73.991677,2019,Street Tree,Brooklyn,183494,2015-08-28,16,OnCurb,Alive,Good,Pyrus calleryana,Callery pear,,,Damage,Volunteer,Stones,182 WARREN STREET,11201,40.687778,-73.99704
BK33,Bk-RH_13,2019-08-14,76.531111,0,40.678376,-73.991677,2019,Street Tree,Brooklyn,176592,2015-08-26,4,OnCurb,Alive,Good,Gymnocladus dioicus,Kentucky coffeetree,,Harmful,NoDamage,Volunteer,TrunkLights,158 BALTIC STREET,11201,40.687393,-73.998138
BK33,Bk-RH_13,2019-08-14,76.531111,0,40.678376,-73.991677,2019,Street Tree,Brooklyn,549914,2016-03-07,15,OnCurb,Alive,Good,Quercus phellos,willow oak,1or2,,NoDamage,Volunteer,"Stones,RootOther,BranchOther",75 1 PLACE,11231,40.680725,-73.99823
BK33,Bk-RH_13,2019-08-14,76.531111,0,40.678376,-73.991677,2019,Street Tree,Brooklyn,240800,2015-09-20,6,OnCurb,Alive,Good,Zelkova serrata,Japanese zelkova,1or2,,NoDamage,Volunteer,,168 DOUGLASS STREET,11217,40.682014,-73.988846


In [22]:
# Adding labels for latitude and longitude columns from trees and temps in merged
# to differentiate between the two

integrated.rename({'Latitude': 'latidude_temp', 'Longitude': 'longitude_temp'}, axis=1, inplace=True)

In [23]:
integrated.shape

(20158056, 26)

In [24]:
# Sample integrated dataset from 2M rows to 1M rows.
# We need to sample the data because the dataset is too large to read.
integrated = integrated.sample(n=1000000, random_state=42)
integrated.shape

(1000000, 26)

## Bin the numerical data

In [39]:
# Separate temperatures so 5% of the data is used for each bin.
integrated['AirTempBinned'] = pd.qcut(integrated['AirTemp'], q=20)
integrated['AirTempBinned'].value_counts()

(74.787, 75.553]                49914
(72.47, 72.995]                 49881
(70.864, 71.412]                49853
(81.342, 82.759]                49828
(74.104, 74.787]                49812
(61.458, 67.373]                49800
(52.443000000000005, 61.458]    49792
(67.373, 70.864]                49776
(82.759, 84.188]                49773
(76.283, 77.182]                49770
(73.547, 74.104]                49764
(79.41, 81.342]                 49763
(71.899, 72.47]                 49752
(84.188, 85.459]                49751
(77.182, 79.41]                 49741
(85.459, 88.411]                49726
(88.411, 95.695]                49719
(71.412, 71.899]                49666
(72.995, 73.547]                49573
(75.553, 76.283]                49572
Name: AirTempBinned, dtype: int64

In [41]:
integrated.drop(columns=['AirTemp'], inplace=True)
integrated.head()

Unnamed: 0_level_0,Sensor.ID,Day,Hour,latidude_temp,longitude_temp,Year,Install.Type,Borough,tree_id,created_at,tree_dbh,curb_loc,status,health,spc_latin,spc_common,steward,guards,sidewalk,user_type,problems,address,postcode,latitude,longitude,AirTempBinned
nta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
BX39,Bx-MT_01,2019-08-28,0,40.806479,-73.914458,2019,Street Tree,Bronx,92562,2015-07-25,7,OnCurb,Alive,Fair,Prunus,cherry,,,NoDamage,Volunteer,,536 EAST 142 STREET,10454,40.809856,-73.91629,"(71.412, 71.899]"
BK35,Bk-SH_30,2018-08-07,0,40.684987,-73.932392,2018,Street Tree,Brooklyn,76100,2015-07-18,8,OnCurb,Alive,Good,Quercus palustris,pin oak,4orMore,Harmful,NoDamage,Volunteer,"RootOther,WiresRope",699 HANCOCK STREET,11221,40.685215,-73.927109,"(81.342, 82.759]"
BX03,Bx-EC_33,2018-07-21,0,40.876405,-73.847207,2018,Light Pole,Bronx,336839,2015-10-16,29,OnCurb,Alive,Good,Platanus x acerifolia,London planetree,,,NoDamage,NYC Parks Staff,,3321 FENTON AVENUE,10469,40.874771,-73.847336,"(71.412, 71.899]"
BK81,NYCHA-BR_05,2019-10-14,0,40.663994,-73.906047,2019,Light Pole,Brooklyn,701636,2016-09-07,2,OnCurb,Alive,Fair,Gleditsia triacanthos var. inermis,honeylocust,1or2,,Damage,NYC Parks Staff,,1488 PITKIN AVENUE,11212,40.668411,-73.918816,"(61.458, 67.373]"
MN03,M-CH_23,2018-06-21,0,40.807807,-73.943346,2018,Street Tree,Manhattan,86334,2015-07-22,12,OnCurb,Alive,Good,Styphnolobium japonicum,Sophora,,Harmful,Damage,Volunteer,,91 EDGECOMBE AVENUE,10030,40.819283,-73.946411,"(76.283, 77.182]"


In [42]:
integrated.to_csv('../INTEGRATED-DATASET.csv')