This iPython Notebook calculates the distance from each facility to the monitoring data points. Only facilities that were categorized as high emissions are used in the analysis. Facilities of the same category, located close to each other are combined into one single point based on their centroid to avoid multi-colinearity issues. Grouping of facilities was done in ArcGIS.

<b> Two Input Files: </b>
    
     PM_Facilities_High_Centroid.csv 
     NO2_Facilities_High_Centroid.csv

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
from shapely.ops import nearest_points

import seaborn as sns

from mpl_toolkits.axes_grid1 import make_axes_locatable

import math

import time

from matplotlib import cm

import matplotlib.lines as mlines

%matplotlib inline

## Monitoring Data

In [15]:
df = pd.read_csv('EDF_Data.csv', header = 1)
df.tail()

Unnamed: 0,Longitude,Latitude,NO Value,NO2 Value,BC Value
21483,-122.034943,37.560076,129.999995,44.77822,3.923761
21484,-122.034724,37.560164,60.799998,39.027545,1.408693
21485,-122.034681,37.55983,34.622951,28.816797,2.659885
21486,-122.034504,37.559958,74.764705,35.735434,1.776353
21487,-122.034503,37.559957,78.754782,41.062757,2.014664


### Split into BC and NO2

In [16]:
BC_df = df[['Longitude', 'Latitude', 'BC Value']]

In [17]:
NO2_df = df[['Longitude', 'Latitude', 'NO2 Value']]

## Facility Level Data with Source Type

In [18]:
Facility_PM = pd.read_csv("Data/PM_Facilities_High_Centroid.csv")

In [19]:
Facility_PM.head()

Unnamed: 0,latitude,longitude,source_typ,ems_type,source_cat,Grouping,EIS,Centroid_X,Centroid_Y
0,37.695818,-122.164205,FoodPlant,high,Food-Products-Processing-Plant,0,15756011,37.695818,-122.164205
1,37.70902,-122.14191,FoodPlant,high,Food-Products-Processing-Plant,14,340611,37.704465,-122.14023
2,37.70556,-122.15209,Manufacturing,high,Manufacturing,0,10708111,37.70556,-122.15209
3,37.70459,-122.17404,Manufacturing,high,Manufacturing,8,445511,37.706633,-122.179205
4,37.704586,-122.161181,Retail-Res,high,Wholesale,2,17244511,37.708139,-122.175454


In [20]:
Facility_NO2 = pd.read_csv("Data/NO2_Facilities_High_Centroid.csv")

In [21]:
Facility_NO2.head()

Unnamed: 0,latitude,longitude,source-typ,ems-type,source_cat,eis facili,Grouping,Centroid_X,Centroid_Y
0,37.6971,-122.17306,FoodPlant,high,Food-Products-Processing-Plant,804911,17,37.700845,-122.17355
1,37.70459,-122.17404,Manufacturing,high,Manufacturing,445511,8,37.706805,-122.157975
2,37.70902,-122.14191,FoodPlant,high,Food-Products-Processing-Plant,340611,14,37.707945,-122.159205
3,37.70687,-122.1765,Retail-Res,high,Wholesale,18135811,2,37.70775,-122.182175
4,37.70863,-122.18785,Manufacturing,high,Manufacturing,14071611,18,37.710605,-122.18925


## Calculate distance from monitoring data to each Facility - PM

In [22]:
Facility_PM_All = Facility_PM.copy()

In [23]:
Facility_PM.head()

Unnamed: 0,latitude,longitude,source_typ,ems_type,source_cat,Grouping,EIS,Centroid_X,Centroid_Y
0,37.695818,-122.164205,FoodPlant,high,Food-Products-Processing-Plant,0,15756011,37.695818,-122.164205
1,37.70902,-122.14191,FoodPlant,high,Food-Products-Processing-Plant,14,340611,37.704465,-122.14023
2,37.70556,-122.15209,Manufacturing,high,Manufacturing,0,10708111,37.70556,-122.15209
3,37.70459,-122.17404,Manufacturing,high,Manufacturing,8,445511,37.706633,-122.179205
4,37.704586,-122.161181,Retail-Res,high,Wholesale,2,17244511,37.708139,-122.175454


In [24]:
### Create a columns as eis-source-type
Facility_PM['eis-source'] =    Facility_PM['EIS'].apply(str) + '-' +  Facility_PM['source_typ']   

In [25]:
### Create a columns as eis-source-ems-type
Facility_PM['eis-source-ems'] =    Facility_PM['eis-source']   + '-' + Facility_PM['ems_type']

In [26]:
Facility_PM.head(100)

Unnamed: 0,latitude,longitude,source_typ,ems_type,source_cat,Grouping,EIS,Centroid_X,Centroid_Y,eis-source,eis-source-ems
0,37.695818,-122.164205,FoodPlant,high,Food-Products-Processing-Plant,0,15756011,37.695818,-122.164205,15756011-FoodPlant,15756011-FoodPlant-high
1,37.709020,-122.141910,FoodPlant,high,Food-Products-Processing-Plant,14,340611,37.704465,-122.140230,340611-FoodPlant,340611-FoodPlant-high
2,37.705560,-122.152090,Manufacturing,high,Manufacturing,0,10708111,37.705560,-122.152090,10708111-Manufacturing,10708111-Manufacturing-high
3,37.704590,-122.174040,Manufacturing,high,Manufacturing,8,445511,37.706633,-122.179205,445511-Manufacturing,445511-Manufacturing-high
4,37.704586,-122.161181,Retail-Res,high,Wholesale,2,17244511,37.708139,-122.175454,17244511-Retail-Res,17244511-Retail-Res-high
...,...,...,...,...,...,...,...,...,...,...,...
56,37.877360,-122.301290,ConcretePlant,high,Concrete-Batch-Plant,0,477811,37.877360,-122.301290,477811-ConcretePlant,477811-ConcretePlant-high
57,37.877430,-122.250880,Retail-Res,high,Institution,0,448011,37.877430,-122.250880,448011-Retail-Res,448011-Retail-Res-high
58,37.878390,-122.303300,AutoRepair,high,Auto-Body-Painting-or-Repair-Shop,0,138911,37.878390,-122.303300,138911-AutoRepair,138911-AutoRepair-high
59,37.877850,-122.305270,Retail-Res,high,Institution,7,203711,37.878429,-122.302847,203711-Retail-Res,203711-Retail-Res-high


In [27]:
### Add an empty column for distance
Facility_PM['dist'] = 0
Facility_PM['dist'].astype(float)

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
     ... 
56    0.0
57    0.0
58    0.0
59    0.0
60    0.0
Name: dist, Length: 61, dtype: float64

### Create a dataframe where each column is one of the industrial facilities. Add one column for latitude, longitude and PM2.5 emissions value. Dropping PM10 emissions since Black Carbon is a form on PM2.5 emissions

In [28]:
# Create individual dataframes
Oak_PM_lat = Facility_PM[['eis-source-ems', 'Centroid_X']]
Oak_PM_long = Facility_PM[['eis-source-ems', 'Centroid_Y']]
Oak_PM_dist = Facility_PM[['eis-source-ems', 'dist']]


In [29]:
# Transpose all the dataframes
Oak_PM_lat = Oak_PM_lat.T
Oak_PM_long = Oak_PM_long.T
Oak_PM_dist = Oak_PM_dist.T




In [30]:
## Make the header as the first row in each transposed dataframe
Oak_PM_lat = Oak_PM_lat.rename(columns=Oak_PM_lat.iloc[0]).drop(Oak_PM_lat.index[0])
Oak_PM_long = Oak_PM_long.rename(columns=Oak_PM_long.iloc[0]).drop(Oak_PM_long.index[0])
Oak_PM_dist = Oak_PM_dist.rename(columns=Oak_PM_dist.iloc[0]).drop(Oak_PM_dist.index[0])

#Oak_PM_PM25 = Oak_PM_PM25.rename(columns=Oak_PM_PM25.iloc[0]).drop(Oak_PM_PM25.index[0])
#Oak_PM_PM10 = Oak_PM_PM10.rename(columns=Oak_PM_PM10.iloc[0].astype(int)).drop(Oak_PM_PM10.index[0])
#Oak_PM_emsdist = Oak_PM_emsdist.rename(columns=Oak_PM_emsdist.iloc[0]).drop(Oak_PM_emsdist.index[0])
#Oak_PM_geo = Oak_PM_geo.rename(columns=Oak_PM_geo.iloc[0]).drop(Oak_PM_geo.index[0])

In [31]:
## Add suffix to column header based on the dataframe type
Oak_PM_lat.columns = [str(col) + '_latitude' for col in Oak_PM_lat.columns]
Oak_PM_long.columns = [str(col) + '_longitude' for col in Oak_PM_long.columns]
Oak_PM_dist.columns = [str(col) + '_dist' for col in Oak_PM_dist.columns]

#Oak_PM_PM25.columns = [str(col) + '_PM25' for col in Oak_PM_PM25.columns]
#Oak_PM_PM10.columns = [str(col) + '_PM10' for col in Oak_PM_PM10.columns]
#Oak_PM_emsdist.columns = [str(col) + '_emsdist' for col in Oak_PM_emsdist.columns]
#Oak_PM_geo.columns = [str(col) + '_geo' for col in Oak_PM_geo.columns]

In [32]:
## Remove index for each dataframe
Oak_PM_lat.reset_index(drop=True, inplace=True)
Oak_PM_long.reset_index(drop=True, inplace=True)
Oak_PM_dist.reset_index(drop=True, inplace=True)

#Oak_PM_PM25.reset_index(drop=True, inplace=True)
#Oak_PM_PM10.reset_index(drop=True, inplace=True)
#Oak_PM_emsdist.reset_index(drop=True, inplace=True)
#Oak_PM_geo.reset_index(drop=True, inplace=True)

In [33]:
### Combine individual dataframes into one
Oak_PM_combined = Oak_PM_lat.join(Oak_PM_long).join(Oak_PM_dist)





In [34]:
### Sort based on column names
Oak_PM_combined = Oak_PM_combined.reindex(columns=sorted(Oak_PM_combined.columns))

In [35]:
Oak_PM_combined

Unnamed: 0,10457411-Manufacturing-high_dist,10457411-Manufacturing-high_latitude,10457411-Manufacturing-high_longitude,10457711-Foundaries-high_dist,10457711-Foundaries-high_latitude,10457711-Foundaries-high_longitude,10460511-Retail-Res-high_dist,10460511-Retail-Res-high_latitude,10460511-Retail-Res-high_longitude,10469511-Retail-Res-high_dist,...,771711-Eateries-high_longitude,772011-EGen-high_dist,772011-EGen-high_latitude,772011-EGen-high_longitude,808611-AsphaltPlant-high_dist,808611-AsphaltPlant-high_latitude,808611-AsphaltPlant-high_longitude,808811-FoodPlant-high_dist,808811-FoodPlant-high_latitude,808811-FoodPlant-high_longitude
0,0,37.8583,-122.297,0,37.7104,-122.183,0,37.8357,-122.268,0,...,-122.209,0,37.8019,-122.278,0,37.8698,-122.302,0,37.7936,-122.268


In [36]:
#Create a datafram where each row contains emissions of PM2.5 for each facility
Oak_PM_combined = Oak_PM_combined.loc[Oak_PM_combined.index.repeat(21488)].reset_index(drop=True)

In [37]:
combined_BC_Facility = BC_df.join(Oak_PM_combined)

In [38]:
combined_BC_Facility.head()

Unnamed: 0,Longitude,Latitude,BC Value,10457411-Manufacturing-high_dist,10457411-Manufacturing-high_latitude,10457411-Manufacturing-high_longitude,10457711-Foundaries-high_dist,10457711-Foundaries-high_latitude,10457711-Foundaries-high_longitude,10460511-Retail-Res-high_dist,...,771711-Eateries-high_longitude,772011-EGen-high_dist,772011-EGen-high_latitude,772011-EGen-high_longitude,808611-AsphaltPlant-high_dist,808611-AsphaltPlant-high_latitude,808611-AsphaltPlant-high_longitude,808811-FoodPlant-high_dist,808811-FoodPlant-high_latitude,808811-FoodPlant-high_longitude
0,-122.322594,37.806781,0.818032,0,37.8583,-122.297,0,37.7104,-122.183,0,...,-122.209,0,37.8019,-122.278,0,37.8698,-122.302,0,37.7936,-122.268
1,-122.32231,37.80615,0.551475,0,37.8583,-122.297,0,37.7104,-122.183,0,...,-122.209,0,37.8019,-122.278,0,37.8698,-122.302,0,37.7936,-122.268
2,-122.322301,37.80642,0.593712,0,37.8583,-122.297,0,37.7104,-122.183,0,...,-122.209,0,37.8019,-122.278,0,37.8698,-122.302,0,37.7936,-122.268
3,-122.322299,37.80588,0.489898,0,37.8583,-122.297,0,37.7104,-122.183,0,...,-122.209,0,37.8019,-122.278,0,37.8698,-122.302,0,37.7936,-122.268
4,-122.322267,37.806689,0.739341,0,37.8583,-122.297,0,37.7104,-122.183,0,...,-122.209,0,37.8019,-122.278,0,37.8698,-122.302,0,37.7936,-122.268


In [39]:
# Convert distance or emissions distance column to float type
for idx, col in enumerate(combined_BC_Facility.columns):
        if "_dist" in col:
            combined_BC_Facility[col] = pd.to_numeric(combined_BC_Facility[col], downcast="float")



## Calculate distance between point of measurement and each facility and add it to the '_dist' column

In [40]:
### Defining a function to calculate the distance between two GPS coordinates (latitude and longitude)
def distance(origin, destination):
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371 # km

    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c

    return d


In [41]:
time1 = time.time()
for index, row in combined_BC_Facility.iterrows():
    for idx, col in enumerate(combined_BC_Facility.columns):
        if "_dist" in col:
            combined_BC_Facility.at[index,col] = float(distance((row.iloc[1], row.iloc[0]), (row.iloc[idx+1], row.iloc[idx+2])))*0.621
            #BC_Facility.at[index,col] = float(row.iloc[idx])
time2 = time.time()            
    
print(time2 - time1)

49.23340082168579


In [42]:
### Write this to a dataframe
combined_BC_Facility.to_csv("Data/BC_PM_Facilities_High_Dist.csv")



In [43]:
combined_BC_Facility.shape

(21488, 186)

## Calculate distance from monitoring data to each Facility - NO2

In [44]:
Facility_NO2_All = Facility_NO2.copy()

In [45]:
Facility_NO2.head()

Unnamed: 0,latitude,longitude,source-typ,ems-type,source_cat,eis facili,Grouping,Centroid_X,Centroid_Y
0,37.6971,-122.17306,FoodPlant,high,Food-Products-Processing-Plant,804911,17,37.700845,-122.17355
1,37.70459,-122.17404,Manufacturing,high,Manufacturing,445511,8,37.706805,-122.157975
2,37.70902,-122.14191,FoodPlant,high,Food-Products-Processing-Plant,340611,14,37.707945,-122.159205
3,37.70687,-122.1765,Retail-Res,high,Wholesale,18135811,2,37.70775,-122.182175
4,37.70863,-122.18785,Manufacturing,high,Manufacturing,14071611,18,37.710605,-122.18925


In [47]:
### Create a columns as eis-source-type
Facility_NO2['eis-source'] =    Facility_NO2['eis facili'].apply(str) + '-' +  Facility_NO2['source-typ']   

In [48]:
### Create a columns as eis-source-ems-type
Facility_NO2['eis-source-ems'] =    Facility_NO2['eis-source']   + '-' + Facility_NO2['ems-type']

In [49]:
### Add an empty column for distance
Facility_NO2['dist'] = 0
Facility_NO2['dist'].astype(float)

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    0.0
20    0.0
21    0.0
22    0.0
23    0.0
24    0.0
25    0.0
26    0.0
27    0.0
28    0.0
29    0.0
30    0.0
31    0.0
32    0.0
33    0.0
34    0.0
35    0.0
36    0.0
37    0.0
38    0.0
39    0.0
40    0.0
41    0.0
42    0.0
43    0.0
44    0.0
45    0.0
46    0.0
47    0.0
48    0.0
49    0.0
50    0.0
51    0.0
Name: dist, dtype: float64

In [50]:
# Create individual dataframes
Oak_NO2_lat = Facility_NO2[['eis-source-ems', 'Centroid_X']]
Oak_NO2_long = Facility_NO2[['eis-source-ems', 'Centroid_Y']]
Oak_NO2_dist = Facility_NO2[['eis-source-ems', 'dist']]


In [51]:
# Transpose all the dataframes
Oak_NO2_lat = Oak_NO2_lat.T
Oak_NO2_long = Oak_NO2_long.T
Oak_NO2_dist = Oak_NO2_dist.T


In [52]:
## Make the header as the first row in each transposed dataframe
Oak_NO2_lat = Oak_NO2_lat.rename(columns=Oak_NO2_lat.iloc[0]).drop(Oak_NO2_lat.index[0])
Oak_NO2_long = Oak_NO2_long.rename(columns=Oak_NO2_long.iloc[0]).drop(Oak_NO2_long.index[0])
Oak_NO2_dist = Oak_NO2_dist.rename(columns=Oak_NO2_dist.iloc[0]).drop(Oak_NO2_dist.index[0])


In [53]:
## Add suffix to column header based on the dataframe type
Oak_NO2_lat.columns = [str(col) + '_latitude' for col in Oak_NO2_lat.columns]
Oak_NO2_long.columns = [str(col) + '_longitude' for col in Oak_NO2_long.columns]
Oak_NO2_dist.columns = [str(col) + '_dist' for col in Oak_NO2_dist.columns]


In [54]:
## Remove index for each dataframe
Oak_NO2_lat.reset_index(drop=True, inplace=True)
Oak_NO2_long.reset_index(drop=True, inplace=True)
Oak_NO2_dist.reset_index(drop=True, inplace=True)

In [55]:
### Combine individual dataframes into one
Oak_NO2_combined = Oak_NO2_lat.join(Oak_NO2_long).join(Oak_NO2_dist)

In [56]:
### Sort based on column names
Oak_NO2_combined = Oak_NO2_combined.reindex(columns=sorted(Oak_NO2_combined.columns))

In [57]:
Oak_NO2_combined

Unnamed: 0,10457511-Manufacturing-high_dist,10457511-Manufacturing-high_latitude,10457511-Manufacturing-high_longitude,10460511-Retail-Res-high_dist,10460511-Retail-Res-high_latitude,10460511-Retail-Res-high_longitude,10510811-AsphaltPlant-high_dist,10510811-AsphaltPlant-high_latitude,10510811-AsphaltPlant-high_longitude,10517111-Retail-Res-high_dist,...,7845611-Retail-Res-high_longitude,804911-FoodPlant-high_dist,804911-FoodPlant-high_latitude,804911-FoodPlant-high_longitude,808611-AsphaltPlant-high_dist,808611-AsphaltPlant-high_latitude,808611-AsphaltPlant-high_longitude,808811-FoodPlant-high_dist,808811-FoodPlant-high_latitude,808811-FoodPlant-high_longitude
0,0,37.7999,-122.287,0,37.8357,-122.268,0,37.7627,-122.222,0,...,-122.305,0,37.7008,-122.174,0,37.8698,-122.302,0,37.7936,-122.268


In [58]:
#Create a datafram where each row contains emissions of PM2.5 for each facility
Oak_NO2_combined = Oak_NO2_combined.loc[Oak_NO2_combined.index.repeat(21488)].reset_index(drop=True)

In [59]:
combined_NO2_Facility = NO2_df.join(Oak_NO2_combined)

In [60]:
combined_NO2_Facility.head()

Unnamed: 0,Longitude,Latitude,NO2 Value,10457511-Manufacturing-high_dist,10457511-Manufacturing-high_latitude,10457511-Manufacturing-high_longitude,10460511-Retail-Res-high_dist,10460511-Retail-Res-high_latitude,10460511-Retail-Res-high_longitude,10510811-AsphaltPlant-high_dist,...,7845611-Retail-Res-high_longitude,804911-FoodPlant-high_dist,804911-FoodPlant-high_latitude,804911-FoodPlant-high_longitude,808611-AsphaltPlant-high_dist,808611-AsphaltPlant-high_latitude,808611-AsphaltPlant-high_longitude,808811-FoodPlant-high_dist,808811-FoodPlant-high_latitude,808811-FoodPlant-high_longitude
0,-122.322594,37.806781,17.539762,0,37.7999,-122.287,0,37.8357,-122.268,0,...,-122.305,0,37.7008,-122.174,0,37.8698,-122.302,0,37.7936,-122.268
1,-122.32231,37.80615,19.95675,0,37.7999,-122.287,0,37.8357,-122.268,0,...,-122.305,0,37.7008,-122.174,0,37.8698,-122.302,0,37.7936,-122.268
2,-122.322301,37.80642,23.967768,0,37.7999,-122.287,0,37.8357,-122.268,0,...,-122.305,0,37.7008,-122.174,0,37.8698,-122.302,0,37.7936,-122.268
3,-122.322299,37.80588,18.435184,0,37.7999,-122.287,0,37.8357,-122.268,0,...,-122.305,0,37.7008,-122.174,0,37.8698,-122.302,0,37.7936,-122.268
4,-122.322267,37.806689,25.797037,0,37.7999,-122.287,0,37.8357,-122.268,0,...,-122.305,0,37.7008,-122.174,0,37.8698,-122.302,0,37.7936,-122.268


In [61]:
# Convert distance or emissions distance column to float type
for idx, col in enumerate(combined_NO2_Facility.columns):
        if "_dist" in col:
            combined_NO2_Facility[col] = pd.to_numeric(combined_NO2_Facility[col], downcast="float")



In [62]:
time1 = time.time()
for index, row in combined_NO2_Facility.iterrows():
    for idx, col in enumerate(combined_NO2_Facility.columns):
        if "_dist" in col:
            combined_NO2_Facility.at[index,col] = float(distance((row.iloc[1], row.iloc[0]), (row.iloc[idx+1], row.iloc[idx+2])))*0.621
            #BC_Facility.at[index,col] = float(row.iloc[idx])
time2 = time.time()            
    
print(time2 - time1)

43.142014503479004


In [63]:
### Write this to a dataframe
combined_NO2_Facility.to_csv("Data/NO2_NO2_Facilities_High_Dist.csv")

