In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
from shapely.ops import nearest_points

import seaborn as sns

from mpl_toolkits.axes_grid1 import make_axes_locatable

import math

import time

from matplotlib import cm

import matplotlib.lines as mlines

%matplotlib inline

### AIR POLLUTION MONITORING DATA FROM EDF

In [111]:
df = pd.read_csv('EDF_Data.csv', header = 1)
df['TimePeriod'] = 'Jun2015-May2016'
df.tail()

Unnamed: 0,Longitude,Latitude,NO Value,NO2 Value,BC Value,TimePeriod
21483,-122.034943,37.560076,129.999995,44.77822,3.923761,Jun2015-May2016
21484,-122.034724,37.560164,60.799998,39.027545,1.408693,Jun2015-May2016
21485,-122.034681,37.55983,34.622951,28.816797,2.659885,Jun2015-May2016
21486,-122.034504,37.559958,74.764705,35.735434,1.776353,Jun2015-May2016
21487,-122.034503,37.559957,78.754782,41.062757,2.014664,Jun2015-May2016


In [112]:
df.shape

(21488, 6)

In [113]:
geometry = [Point(xy) for xy in zip(df['Longitude'], df['Latitude'])]

### Split the dataset into BC and NO2 since we are interested only in those two pollutants

In [114]:
BC_df = df[['Longitude', 'Latitude', 'BC Value', 'TimePeriod']]

In [115]:
NO2_df = df[['Longitude', 'Latitude', 'NO2 Value', 'TimePeriod']]

In [116]:
crs = {'init': 'epsg:4326'}
geo_df = gpd.GeoDataFrame(df, crs = crs, geometry = geometry)

### FACILITY LEVEL DATA

In [117]:
Facility_PM = pd.read_csv("Data/All_PM_Facilities_Final.csv")

In [118]:
Facility_PM.drop(columns = ['Unnamed: 0'], inplace=True)

In [119]:
Facility_PM.tail()

Unnamed: 0,eis facility id,company,source-category,latitude,longitude,city,PM25,PM10,units,source-type
343,18776511,Unknown,Manufacturing,37.853018,-122.29271,BERKELEY,0.0105,0.0105,TON,Manufacturing
344,18778911,Unknown,Wholesale,37.71296,-122.18868,SAN-LEANDRO,22.124544,22.971518,TON,Wholesale
345,18779811,Unknown,Manufacturing,37.70782,-122.1855,SAN-LEANDRO,1.5e-05,1.5e-05,TON,Manufacturing
346,18782611,Unknown,Waste-Recycling,37.759959,-122.207782,OAKLAND,0.992654,1.030213,TON,Waste-Recycling
347,18787011,Unknown,Manufacturing,37.766939,-122.216336,OAKLAND,0.079014,0.13169,TON,Manufacturing


## Converting facility and traffic dataframe into a geopandas dataframe for plotting

In [120]:
# Create a list of x and y coordinates for the PM facility data in Oakland using geopandas
geometry_facility_PM = [Point(xy) for xy in zip(Facility_PM['longitude'], Facility_PM['latitude'])]
geo_df_facility_PM = gpd.GeoDataFrame(Facility_PM, crs = crs, geometry = geometry_facility_PM)

In [121]:
# Create a list of x and y coordinates for the Black Carbon concentration data using geopandas
geometry_df_BC = [Point(xy) for xy in zip(BC_df['Longitude'], BC_df['Latitude'])]
geo_df_BC = gpd.GeoDataFrame(BC_df, crs = crs, geometry = geometry_df_BC)

### Combining Air Pollution Monitoring Data and Facility level data

In [122]:
Facility_PM_All = Facility_PM.copy()

In [123]:
Facility_PM.shape

(348, 11)

In [124]:
Facility_PM.describe()

Unnamed: 0,eis facility id,latitude,longitude,PM25,PM10
count,348.0,348.0,348.0,348.0,348.0
mean,11661560.0,37.785381,-122.239371,0.4811451,0.636884
std,6447159.0,0.053832,0.052052,2.289948,2.978462
min,126911.0,37.6745,-122.319375,4e-09,5e-09
25%,10460490.0,37.742283,-122.278356,0.0001342943,0.0001438983
50%,13818610.0,37.795385,-122.25905,0.000692444,0.000759442
75%,17247140.0,37.81787,-122.19672,0.0234496,0.02608391
max,18787010.0,37.888535,-122.10774,22.12454,26.26146


In [125]:
Facility_PM.head()

Unnamed: 0,eis facility id,company,source-category,latitude,longitude,city,PM25,PM10,units,source-type,geometry
0,126911,PACIFIC-BELL,Institution,37.76685,-122.24972,ALAMEDA,0.000731,0.000749,TON,Institution,POINT (-122.24972 37.76685)
1,127111,PACIFIC-BELL,Institution,37.86771,-122.26784,BERKELEY,0.002173,0.002226,TON,Institution,POINT (-122.26784 37.86771)
2,127511,PACIFIC-BELL,Institution,37.80603,-122.26946,OAKLAND,0.008291,0.008494,TON,Institution,POINT (-122.26946 37.80603)
3,127611,PACIFIC-BELL,Institution,37.78422,-122.22241,OAKLAND,0.000706,0.000724,TON,Institution,POINT (-122.22241 37.78422)
4,127711,PACIFIC-BELL,Institution,37.83336,-122.26307,OAKLAND,0.001554,0.001592,TON,Institution,POINT (-122.26307 37.83336)


In [126]:
### Create a columns as eis-source-type
Facility_PM['eis-source'] = Facility_PM['eis facility id'].apply(str) + '-' + Facility_PM['source-type']

In [127]:
### Add an empty column for distance
Facility_PM['dist'] = 0
Facility_PM['dist'].astype(float)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
343    0.0
344    0.0
345    0.0
346    0.0
347    0.0
Name: dist, Length: 348, dtype: float64

### Add a column for emissions/distance since concentration is directly proportional to emissions and inversly proportional to distance from soure

In [128]:
Facility_PM['emsdist'] = 0
Facility_PM['emsdist'].astype(float)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
343    0.0
344    0.0
345    0.0
346    0.0
347    0.0
Name: emsdist, Length: 348, dtype: float64

#### Create a dataframe where each column is one of the industrial facilities, with columns as latitude, longitude, PM2.5 emissions value and geometry. Dropping PM10 emissions since Black Carbon is a form on PM2.5 emissions

In [129]:
Oak_PM_lat = Facility_PM[['eis-source', 'latitude']]
Oak_PM_long = Facility_PM[['eis-source', 'longitude']]
Oak_PM_PM25 = Facility_PM[['eis-source', 'PM25']]
Oak_PM_dist = Facility_PM[['eis-source', 'dist']]
Oak_PM_emsdist = Facility_PM[['eis-source', 'emsdist']]
Oak_PM_geo = Facility_PM[['eis-source', 'geometry']]


In [130]:
# Transpose all the dataframes
Oak_PM_lat = Oak_PM_lat.T
Oak_PM_long = Oak_PM_long.T
Oak_PM_PM25 = Oak_PM_PM25.T
#Oak_PM_PM10 = Oak_PM_PM10.T
Oak_PM_dist = Oak_PM_dist.T
Oak_PM_emsdist = Oak_PM_emsdist.T
Oak_PM_geo = Oak_PM_geo.T

In [131]:
## Make the header as the first row in each transposed dataframe
Oak_PM_lat = Oak_PM_lat.rename(columns=Oak_PM_lat.iloc[0]).drop(Oak_PM_lat.index[0])
Oak_PM_long = Oak_PM_long.rename(columns=Oak_PM_long.iloc[0]).drop(Oak_PM_long.index[0])
Oak_PM_PM25 = Oak_PM_PM25.rename(columns=Oak_PM_PM25.iloc[0]).drop(Oak_PM_PM25.index[0])
#Oak_PM_PM10 = Oak_PM_PM10.rename(columns=Oak_PM_PM10.iloc[0].astype(int)).drop(Oak_PM_PM10.index[0])
Oak_PM_dist = Oak_PM_dist.rename(columns=Oak_PM_dist.iloc[0]).drop(Oak_PM_dist.index[0])
Oak_PM_emsdist = Oak_PM_emsdist.rename(columns=Oak_PM_emsdist.iloc[0]).drop(Oak_PM_emsdist.index[0])
Oak_PM_geo = Oak_PM_geo.rename(columns=Oak_PM_geo.iloc[0]).drop(Oak_PM_geo.index[0])

In [132]:
## Add suffix to column header based on the dataframe type
Oak_PM_lat.columns = [str(col) + '_latitude' for col in Oak_PM_lat.columns]
Oak_PM_long.columns = [str(col) + '_longitude' for col in Oak_PM_long.columns]
Oak_PM_PM25.columns = [str(col) + '_PM25' for col in Oak_PM_PM25.columns]
#Oak_PM_PM10.columns = [str(col) + '_PM10' for col in Oak_PM_PM10.columns]
Oak_PM_dist.columns = [str(col) + '_dist' for col in Oak_PM_dist.columns]
Oak_PM_emsdist.columns = [str(col) + '_emsdist' for col in Oak_PM_emsdist.columns]
Oak_PM_geo.columns = [str(col) + '_geo' for col in Oak_PM_geo.columns]

In [133]:
## Remove index for each dataframe
Oak_PM_lat.reset_index(drop=True, inplace=True)
Oak_PM_long.reset_index(drop=True, inplace=True)
Oak_PM_PM25.reset_index(drop=True, inplace=True)
#Oak_PM_PM10.reset_index(drop=True, inplace=True)
Oak_PM_dist.reset_index(drop=True, inplace=True)
Oak_PM_emsdist.reset_index(drop=True, inplace=True)
Oak_PM_geo.reset_index(drop=True, inplace=True)

In [134]:
Oak_PM_combined = Oak_PM_lat.join(Oak_PM_long).join(Oak_PM_PM25).join(Oak_PM_dist).join(Oak_PM_emsdist).join(Oak_PM_geo)


In [135]:
Oak_PM_combined

Unnamed: 0,126911-Institution_latitude,127111-Institution_latitude,127511-Institution_latitude,127611-Institution_latitude,127711-Institution_latitude,127811-Institution_latitude,128211-Institution_latitude,130511-Institution_latitude,136511-EGen_latitude,137211-Institution_latitude,...,18697311-Retail_geo,18697911-Wholesale_geo,18698311-Metals_geo,18705211-Waste-Recycling_geo,18776211-Manufacturing_geo,18776511-Manufacturing_geo,18778911-Wholesale_geo,18779811-Manufacturing_geo,18782611-Waste-Recycling_geo,18787011-Manufacturing_geo
0,37.7668,37.8677,37.806,37.7842,37.8334,37.8119,37.7001,37.7605,37.7868,37.8227,...,POINT (-122.178222 37.714556),POINT (-122.190515 37.744311),POINT (-122.18159 37.71716),POINT (-122.208693 37.760758),POINT (-122.17814 37.707579),POINT (-122.29271 37.853018),POINT (-122.18868 37.71296),POINT (-122.1855 37.70782),POINT (-122.207782 37.759959),POINT (-122.216336 37.766939)


In [136]:
Oak_PM_combined = Oak_PM_combined.reindex(columns=sorted(Oak_PM_combined.columns))

In [137]:
Oak_PM_combined

Unnamed: 0,10452611-Metals_PM25,10452611-Metals_dist,10452611-Metals_emsdist,10452611-Metals_geo,10452611-Metals_latitude,10452611-Metals_longitude,10452911-Institution_PM25,10452911-Institution_dist,10452911-Institution_emsdist,10452911-Institution_geo,...,808611-AsphaltPlant_emsdist,808611-AsphaltPlant_geo,808611-AsphaltPlant_latitude,808611-AsphaltPlant_longitude,808811-FoodPlant_PM25,808811-FoodPlant_dist,808811-FoodPlant_emsdist,808811-FoodPlant_geo,808811-FoodPlant_latitude,808811-FoodPlant_longitude
0,4e-09,0,0,POINT (-122.26765 37.81329),37.8133,-122.268,0.051645,0,0,POINT (-122.23476 37.74567),...,0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,0,0,POINT (-122.26824 37.79355),37.7936,-122.268


In [138]:
#Create a datafram where each row contains emissions of PM2.5 for each facility
Oak_PM_combined = Oak_PM_combined.loc[Oak_PM_combined.index.repeat(21488)].reset_index(drop=True)

In [139]:
BC_Facility = BC_df.join(Oak_PM_combined)

In [143]:
# Convert distance or emissions distance column to float type
for idx, col in enumerate(BC_Facility.columns):
        if "_dist" in col:
            BC_Facility[col] = pd.to_numeric(BC_Facility[col], downcast="float")


In [144]:
# Convert distance or emissions distance column to float type
for idx, col in enumerate(BC_Facility.columns):
        if "_emsdist" in col:
            BC_Facility[col] = pd.to_numeric(BC_Facility[col], downcast="float")


### Calculate distance between point of measurement and each facility and add it to the _dist column

In [145]:
### Defining a function to calculate the distance between two GPS coordinates (latitude and longitude)
def distance(origin, destination):
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371 # km

    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c

    return d

In [146]:
BC_Facility.head()

Unnamed: 0,Longitude,Latitude,BC Value,TimePeriod,geometry,10452611-Metals_PM25,10452611-Metals_dist,10452611-Metals_emsdist,10452611-Metals_geo,10452611-Metals_latitude,...,808611-AsphaltPlant_emsdist,808611-AsphaltPlant_geo,808611-AsphaltPlant_latitude,808611-AsphaltPlant_longitude,808811-FoodPlant_PM25,808811-FoodPlant_dist,808811-FoodPlant_emsdist,808811-FoodPlant_geo,808811-FoodPlant_latitude,808811-FoodPlant_longitude
0,-122.322594,37.806781,0.818032,Jun2015-May2016,POINT (-122.32259 37.80678),4e-09,0.0,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,0.0,0.0,POINT (-122.26824 37.79355),37.7936,-122.268
1,-122.32231,37.80615,0.551475,Jun2015-May2016,POINT (-122.32231 37.80615),4e-09,0.0,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,0.0,0.0,POINT (-122.26824 37.79355),37.7936,-122.268
2,-122.322301,37.80642,0.593712,Jun2015-May2016,POINT (-122.32230 37.80642),4e-09,0.0,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,0.0,0.0,POINT (-122.26824 37.79355),37.7936,-122.268
3,-122.322299,37.80588,0.489898,Jun2015-May2016,POINT (-122.32230 37.80588),4e-09,0.0,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,0.0,0.0,POINT (-122.26824 37.79355),37.7936,-122.268
4,-122.322267,37.806689,0.739341,Jun2015-May2016,POINT (-122.32227 37.80669),4e-09,0.0,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,0.0,0.0,POINT (-122.26824 37.79355),37.7936,-122.268


### Calculating distance between each industrial facility and the point of measurement. 


##### The first method I tried was to iterate through all the columns, identify the columns with "_dist" as suffix, then iterate through rows to calcualte distance using the dist function. But this method took 47 seconds just for 1,000 rows. 

In [148]:
time3 = time.time()
for idx, col in enumerate(BC_Facility.columns):
    if "_dist" in col:
        for index, row in BC_Facility.head(1000).iterrows():
            BC_Facility.at[index,col] = float(distance((row.iloc[1], row.iloc[0]), (row.iloc[idx+3], row.iloc[idx+4])))*0.621
time4 = time.time()

print(time4 - time3)

KeyboardInterrupt: 

##### In the second method, I iterate through all rows, identify the rows with "_dist" as column headers, and then calculate distance using distance function

In [149]:
time1 = time.time()
for index, row in BC_Facility.iterrows():
    for idx, col in enumerate(BC_Facility.columns):
        if "_dist" in col:
            BC_Facility.at[index,col] = float(distance((row.iloc[1], row.iloc[0]), (row.iloc[idx+3], row.iloc[idx+4])))*0.621
            #BC_Facility.at[index,col] = float(row.iloc[idx])
time2 = time.time()            
    
print(time2 - time1)


270.35911536216736


In [151]:
BC_Facility.head()

Unnamed: 0,Longitude,Latitude,BC Value,TimePeriod,geometry,10452611-Metals_PM25,10452611-Metals_dist,10452611-Metals_emsdist,10452611-Metals_geo,10452611-Metals_latitude,...,808611-AsphaltPlant_emsdist,808611-AsphaltPlant_geo,808611-AsphaltPlant_latitude,808611-AsphaltPlant_longitude,808811-FoodPlant_PM25,808811-FoodPlant_dist,808811-FoodPlant_emsdist,808811-FoodPlant_geo,808811-FoodPlant_latitude,808811-FoodPlant_longitude
0,-122.322594,37.806781,0.818032,Jun2015-May2016,POINT (-122.32259 37.80678),4e-09,3.030941,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,3.10317,0.0,POINT (-122.26824 37.79355),37.7936,-122.268
1,-122.32231,37.80615,0.551475,Jun2015-May2016,POINT (-122.32231 37.80615),4e-09,3.022442,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,3.075784,0.0,POINT (-122.26824 37.79355),37.7936,-122.268
2,-122.322301,37.80642,0.593712,Jun2015-May2016,POINT (-122.32230 37.80642),4e-09,3.018953,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,3.080629,0.0,POINT (-122.26824 37.79355),37.7936,-122.268
3,-122.322299,37.80588,0.489898,Jun2015-May2016,POINT (-122.32230 37.80588),4e-09,3.024943,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,3.069979,0.0,POINT (-122.26824 37.79355),37.7936,-122.268
4,-122.322267,37.806689,0.739341,Jun2015-May2016,POINT (-122.32227 37.80669),4e-09,3.014269,0.0,POINT (-122.26765 37.81329),37.8133,...,0.0,POINT (-122.30178 37.86982),37.8698,-122.302,0.590966,3.084273,0.0,POINT (-122.26824 37.79355),37.7936,-122.268


In [152]:
### Write this to a dataframe
BC_Facility.to_csv("Data/BC_Facility_Dist_ALL.csv")

### Calculate a column as emission PM25 / Dist to use as the feature in the model

In [154]:
time1 = time.time()
for index, row in BC_Facility.iterrows():
    for idx, col in enumerate(BC_Facility.columns):
        if "_emsdist" in col:
            try:
                BC_Facility.at[index,col] = (row.iloc[idx-2]/row.iloc[idx-1])
            except ZeroDivisionError:
                BC_Facility.at[index,col] = 0
time2 = time.time()            
    
print(time2 - time1)


157.45495772361755


In [155]:
### Write this to a dataframe
BC_Facility.to_csv("Data/BC_Facility_EmsDist_ALL.csv")

##### Drop latitude, longitude and emissions since we don't need those columns. Latitude and longitude values are captured in the distance column. Since all the emissions values are the same, that feature would not be important

In [158]:
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('_dist', case=False)] 
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('_PM25', case=False)] 
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('_latitude', case=False)]
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('_longitude', case=False)] 
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('_geo', case=False)] 
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('geometry', case=False)] 
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('Longitude', case=False)] 
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('Latitude', case=False)] 
BC_Facility = BC_Facility.loc[:,~BC_Facility.columns.str.contains('TimePeriod', case=False)] 

In [159]:
BC_Facility.head()

Unnamed: 0,BC Value,10452611-Metals_emsdist,10452911-Institution_emsdist,10453011-WWTP_emsdist,10457011-Metals_emsdist,10457411-Metals_emsdist,10457511-Solvent_emsdist,10457611-Metals_emsdist,10457711-Foundaries_emsdist,10457911-ConcretePlant_emsdist,...,608411-Institution_emsdist,771711-Bakeries_emsdist,772011-EGen_emsdist,772611-Manufacturing_emsdist,773811-Manufacturing_emsdist,7845611-Institution_emsdist,7845711-Crematory_emsdist,804911-FoodPlant_emsdist,808611-AsphaltPlant_emsdist,808811-FoodPlant_emsdist
0,0.818032,1.319722e-09,0.008086,5.3e-05,0.000105,0.010047,0.020865,0.105508,0.004339,0.002241,...,2e-06,0.010207,0.63897,1.6e-05,0.101794,0.109436,0.000564,0.001326,0.47944,0.190439
1,0.551475,1.323433e-09,0.008138,5.3e-05,0.000105,0.00996,0.02113,0.107204,0.004356,0.002253,...,2e-06,0.010257,0.646614,1.6e-05,0.102343,0.108554,0.000564,0.001331,0.475391,0.192135
2,0.593712,1.324963e-09,0.008123,5.3e-05,0.000105,0.010003,0.02109,0.106851,0.004351,0.00225,...,2e-06,0.010245,0.645273,1.6e-05,0.102182,0.10896,0.000565,0.00133,0.477307,0.191833
3,0.489898,1.322339e-09,0.008154,5.3e-05,0.000106,0.009918,0.02118,0.10761,0.004362,0.002256,...,2e-06,0.010271,0.648225,1.6e-05,0.102515,0.108155,0.000563,0.001333,0.47352,0.192498
4,0.739341,1.327022e-09,0.008109,5.3e-05,0.000105,0.010047,0.021063,0.106562,0.004347,0.002248,...,2e-06,0.010234,0.644267,1.6e-05,0.102037,0.109374,0.000566,0.001328,0.479263,0.191606


In [160]:
### Write this to a dataframe
BC_Facility.to_csv("Data/BC_Facility_EmsDist_Inputs.csv")

In [161]:
corr_BC_Facility = BC_Facility.corr()
arr_corr= corr_BC_Facility.as_matrix()

  


In [162]:
corr_BC_Facility_pearson = BC_Facility.corr(method = 'pearson')


In [163]:
corr_BC_Facility_pearson

Unnamed: 0,BC Value,10452611-Metals_emsdist,10452911-Institution_emsdist,10453011-WWTP_emsdist,10457011-Metals_emsdist,10457411-Metals_emsdist,10457511-Solvent_emsdist,10457611-Metals_emsdist,10457711-Foundaries_emsdist,10457911-ConcretePlant_emsdist,...,608411-Institution_emsdist,771711-Bakeries_emsdist,772011-EGen_emsdist,772611-Manufacturing_emsdist,773811-Manufacturing_emsdist,7845611-Institution_emsdist,7845711-Crematory_emsdist,804911-FoodPlant_emsdist,808611-AsphaltPlant_emsdist,808811-FoodPlant_emsdist
BC Value,1.000000,-0.085243,0.007048,0.096268,-0.134468,-0.077580,0.019817,0.043046,0.039387,0.105770,...,0.149518,0.060618,0.000273,0.128021,-0.076436,-0.078714,-0.078848,0.108598,-0.077821,0.012942
10452611-Metals_emsdist,-0.085243,1.000000,-0.159283,-0.091052,-0.166504,0.400432,0.127205,0.226100,-0.271773,-0.047344,...,-0.259033,-0.105060,0.176782,-0.200779,-0.245040,0.411286,0.509304,-0.297784,0.407046,0.112033
10452911-Institution_emsdist,0.007048,-0.159283,1.000000,0.556048,0.347228,-0.337336,-0.124054,-0.203517,0.487600,0.391973,...,0.231601,0.561407,-0.116786,0.257698,0.900605,-0.314277,-0.247554,0.353616,-0.325143,-0.012827
10453011-WWTP_emsdist,0.096268,-0.091052,0.556048,1.000000,0.094366,-0.172592,-0.085591,-0.136863,0.141364,0.300426,...,0.070185,0.681708,-0.096509,0.069599,0.410392,-0.165394,-0.135464,0.111010,-0.168957,-0.045372
10457011-Metals_emsdist,-0.134468,-0.166504,0.347228,0.094366,1.000000,-0.330199,-0.155619,-0.256688,0.496294,0.030852,...,0.315317,0.115968,-0.189131,0.307031,0.457133,-0.332192,-0.298426,0.412351,-0.331910,-0.114100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7845611-Institution_emsdist,-0.078714,0.411286,-0.314277,-0.165394,-0.332192,0.997249,0.329093,0.568873,-0.560627,-0.080091,...,-0.556321,-0.193261,0.381147,-0.422058,-0.480639,1.000000,0.822556,-0.632222,0.999527,0.197840
7845711-Crematory_emsdist,-0.078848,0.509304,-0.247554,-0.135464,-0.298426,0.812707,0.214824,0.382554,-0.497827,-0.055791,...,-0.490091,-0.162936,0.286230,-0.374695,-0.416752,0.822556,1.000000,-0.557039,0.818663,0.192631
804911-FoodPlant_emsdist,0.108598,-0.297784,0.353616,0.111010,0.412351,-0.622323,-0.276860,-0.461509,0.892935,0.029685,...,0.963878,0.139323,-0.334601,0.854576,0.538460,-0.632222,-0.557039,1.000000,-0.628800,-0.198563
808611-AsphaltPlant_emsdist,-0.077821,0.407046,-0.325143,-0.168957,-0.331910,0.999050,0.320966,0.556291,-0.559131,-0.083680,...,-0.552392,-0.196398,0.370791,-0.419853,-0.485090,0.999527,0.818663,-0.628800,1.000000,0.190539


In [164]:
print(plt.get_backend())

# close any existing plots
plt.close("all")

# mask out the top triangle
arr_corr[np.triu_indices_from(arr_corr)] = np.nan

fig, ax = plt.subplots(figsize=(50, 50))

hm = sns.heatmap(arr_corr, cbar=True, vmin = -1, vmax = 1, center = 0,
                 fmt='.2f', annot_kws={'size': 8}, annot=True, 
                 square=False, cmap = 'coolwarm')
#cmap=plt.cm.Blues

ticks = np.arange(corr_BC_Facility.shape[0]) + 0.5
ax.set_xticks(ticks)
ax.set_xticklabels(corr_BC_Facility.columns, rotation=90, fontsize=8)
ax.set_yticks(ticks)
ax.set_yticklabels(corr_BC_Facility.index, rotation=360, fontsize=8)

ax.set_title('correlation matrix')
plt.tight_layout()
#plt.savefig("corr_matrix_incl_anno_double.png", dpi=300)

module://ipykernel.pylab.backend_inline
Error in callback <function flush_figures at 0x7f51bf1209e0> (for post_execute):


KeyboardInterrupt: 

#### Based on the above heatmap, there seems to be only a small correlation between distance to industrial facilities and the BC concentartion measured at a given point. A small correlation indicates that there could be other factors that are contributing to BC measurements at a given point. This could also indicate that most of the meseasurement are from traffic, thus looking into location of all traffic intersections would be helpful.  