In [23]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

import numpy as np

from spreg import OLS
from libpysal.weights import Kernel
from esda.moran import Moran
import statsmodels.api as sm

In [2]:
# Read in all the data

AQMS_df=pd.read_csv('data/daily.csv')
Rd_gdf=gpd.read_file('data/london_Road.shp')
Gsp_gdf=gpd.read_file('data/LD_GreenSpace.shp')
loc_gdf=gpd.read_file('data/AQMS_loc.shp')
cond=pd.read_csv('data/cond_daily.csv')

In [3]:
AQMS_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7665 entries, 0 to 7664
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DateTime  7665 non-null   object 
 1   Value     7665 non-null   float64
 2   Site      7665 non-null   object 
dtypes: float64(1), object(2)
memory usage: 179.8+ KB


In [4]:
Rd_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 209201 entries, 0 to 209200
Data columns (total 20 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   fictitious  209201 non-null  object  
 1   identifier  209201 non-null  object  
 2   class       209201 non-null  object  
 3   roadNumber  32375 non-null   object  
 4   name1       172339 non-null  object  
 5   name1_lang  0 non-null       object  
 6   name2       0 non-null       object  
 7   name2_lang  0 non-null       object  
 8   formOfWay   209201 non-null  object  
 9   length      209201 non-null  int64   
 10  primary     209201 non-null  object  
 11  trunkRoad   209201 non-null  object  
 12  loop        209201 non-null  object  
 13  startNode   209201 non-null  object  
 14  endNode     209201 non-null  object  
 15  structure   93 non-null      object  
 16  nameTOID    172339 non-null  object  
 17  numberTOID  32375 non-null   object  
 18  function    2092

In [5]:
Gsp_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 10021 entries, 0 to 10020
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   function   10021 non-null  object  
 1   distName1  2679 non-null   object  
 2   distName2  11 non-null     object  
 3   distName3  0 non-null      object  
 4   distName4  0 non-null      object  
 5   geometry   10021 non-null  geometry
dtypes: geometry(1), object(5)
memory usage: 469.9+ KB


In [6]:
loc_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   siteid    21 non-null     object  
 1   sitename  21 non-null     object  
 2   geometry  21 non-null     geometry
dtypes: geometry(1), object(2)
memory usage: 632.0+ bytes


In [7]:
cond.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DateTime  365 non-null    object 
 1   bp_mean   365 non-null    float64
 2   tmp_mean  365 non-null    float64
 3   rh_mean   365 non-null    float64
dtypes: float64(3), object(1)
memory usage: 11.5+ KB


In [None]:
site_name=AQMS_df['Site'].unique().tolist()

In [8]:
# check correlation between PM data and the conditional variables

AQMS_df.merge(cond,how='left',on='DateTime').corr()

Unnamed: 0,Value,bp_mean,tmp_mean,rh_mean
Value,1.0,0.240444,-0.139696,-0.050411
bp_mean,0.240444,1.0,0.052443,-0.362295
tmp_mean,-0.139696,0.052443,1.0,-0.41619
rh_mean,-0.050411,-0.362295,-0.41619,1.0


In [9]:
Y=AQMS_df['Value'].values
X=np.tile(cond[['bp_mean','tmp_mean','rh_mean']].values,(21,1))
print(X.shape,Y.shape)

(7665, 3) (7665,)


In [11]:
# fit a OLS model
m_multi=sm.OLS(Y,X)
m_multi_fit=m_multi.fit()
print(m_multi_fit.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.674
Model:                            OLS   Adj. R-squared (uncentered):              0.674
Method:                 Least Squares   F-statistic:                              5277.
Date:                Wed, 02 Mar 2022   Prob (F-statistic):                        0.00
Time:                        07:40:37   Log-Likelihood:                         -27052.
No. Observations:                7665   AIC:                                  5.411e+04
Df Residuals:                    7662   BIC:                                  5.413e+04
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [13]:
weight=Kernel.from_dataframe(loc_gdf,function='gaussian')

In [19]:
AQMS_df['Date']=pd.to_datetime(AQMS_df['DateTime'])

In [22]:
type(AQMS_df['Date'].values[0])

numpy.datetime64

In [26]:
moran=[]
for date in AQMS_df['Date'].values:
    mi=Moran(AQMS_df[AQMS_df['Date']==date].Value.values,weight)
    moran.append(mi.I)
moran

[0.036448373290137843,
 0.05133791825211746,
 0.06620287626761011,
 0.029148892240181863,
 0.07105415712062906,
 -0.009483229027273174,
 -0.029286036471369556,
 0.024869005255235905,
 -0.002564728839679633,
 -0.0019760975277556656,
 0.06539853203741847,
 0.05403253278966145,
 -0.019632323483785193,
 -0.012844323696860272,
 0.009027009872557973,
 -0.006158883742440713,
 0.021483219555856557,
 0.04759006215905462,
 0.008749151993178722,
 0.02498937300803994,
 0.05648512139262148,
 0.05225069778400054,
 0.01883585323194943,
 0.050423890952931996,
 0.04734225770628097,
 0.021734102616220333,
 0.04156451166033041,
 -0.003919419391526301,
 -0.0036092547749057044,
 0.04503666256952569,
 0.016333618998160375,
 0.03871311806167887,
 0.0968533041261891,
 0.023054252811617812,
 0.042062912836642756,
 -0.040654821822698785,
 0.0011508244043209998,
 -0.03382482415969873,
 0.049561076081319135,
 0.034561331477550465,
 0.03065970543203157,
 0.0951879459717388,
 0.17515053685807305,
 0.023686351618464

In [30]:
# mean global moran's I of the PM readings
sum(moran)/len(moran)

0.033165887514608874

In [None]:
# reindex loc_gdf and set buffer zones around each site (1km)

loc_gdf=loc_gdf.set_index('siteid')
loc_gdf['buffer_1km']=loc_gdf['geometry'].buffer(1000)
loc_gdf.head()

In [None]:
# Get all green spaces

Gsp=Gsp_gdf['geometry'].unary_union

In [None]:
# Set buffer zones for every road (50m)

Rd_gdf['buffer_50m']=Rd_gdf['geometry'].buffer(50)

In [None]:
# Get all roads
Rd=Rd_gdf['geometry'].unary_union

In [None]:
# Get all road buffer zones

Rd_buffer=Rd_gdf['buffer_50m'].unary_union

In [None]:
# Get all near-road green spaces

nRd_Gsp=Gsp.intersection(Rd_buffer)

In [None]:
AQMS_shp=[]
for site in loc_gdf.index:
    AQMS_shp.append(loc_gdf.loc[site,'buffer_1km'])

In [None]:
Rd_shp=[]
Gsp_shp=[]
nRd_Gsp_shp=[]
for poly in AQMS_shp:
    Rd_shp.append(poly.intersection(Rd))
    Gsp_shp.append(poly.intersection(Gsp))
    nRd_Gsp_shp.append(poly.intersection(nRd_Gsp))
print(len(Rd_shp),len(Gsp_shp),len(nRd_Gsp_shp))

In [None]:
loc_gdf['Road']=Rd_shp
loc_gdf['GreenSpace']=Gsp_shp
loc_gdf['NR_GreenSpace']=nRd_Gsp_shp
loc_gdf=loc_gdf.set_geometry('Road').set_geometry('GreenSpace').set_geometry('NR_GreenSpace')
loc_gdf

In [None]:
loc_gdf.dtypes

In [None]:
london=gpd.read_file('data/london_boundary.shp')

In [None]:
fig,ax=plt.subplots(1,figsize=(15,13))

london.plot(color='lightgrey',ax=ax)
loc_gdf['buffer_1km'].plot(color='silver',ax=ax)
loc_gdf['GreenSpace'].plot(label='Green space',color='limegreen',ax=ax)
loc_gdf['NR_GreenSpace'].plot(label='Near_road green space',color='darkgreen',ax=ax)
loc_gdf['Road'].plot(color='black',label='Road',linewidth=0.1,ax=ax)
loc_gdf['geometry'].plot(markersize=5,marker='^',color='blue',label='Air quality monitoring site',ax=ax)

ax.axis('off')

legend=ax.legend(loc='best',shadow=True,fontsize=15)

#plt.savefig('sample1.png',facecolor='black',dpi=500)
plt.show()

In [None]:
loc_gdf['min_dis']=pd.Series(dtype='float64')
for index, row in loc_gdf.iterrows():
    dis=[]
    for i, v in loc_gdf['geometry'].iteritems():
        dis.append(row['geometry'].distance(v))
    dis.remove(0)
    loc_gdf.loc[index,'min_dis']=min(dis)

In [None]:
loc_gdf[loc_gdf['min_dis']<=1500]

In [None]:
AQMS_df[AQMS_df['Site'].isin(['BL0','CD9','GR4','GB0'])].groupby('Site').describe()

In [None]:
loc_gdf.drop(['BL0','GR4'],inplace=True)

In [None]:
loc_gdf['buffer_area']=loc_gdf['buffer_1km'].area

In [None]:
loc_gdf['pct_Gsp_area']=loc_gdf['GreenSpace'].area/loc_gdf['buffer_area'] # percentage of gsp area
loc_gdf['pct_nR_Gsp_area']=loc_gdf['NR_GreenSpace'].area/loc_gdf['buffer_area'] # percentage of near road gsp area

In [None]:
loc_gdf.info()

In [None]:
df=AQMS_df.merge(loc_gdf,left_on='Site',right_index=True)
df.drop(['sitename','geometry','buffer_1km','Road','GreenSpace','NR_GreenSpace','min_dis']
        ,axis=1,inplace=True)
df.info()

In [None]:
df=df.merge(cond,on='DateTime')
df.info()

In [None]:
df['DateTime']=pd.to_datetime(df['DateTime'])
df.info()

In [None]:
df['month']=df['DateTime'].dt.month

In [None]:
df.dtypes

In [None]:
df['month'].isin([2,3,4])

In [None]:
high=df[df['month'].isin([2,3,4])].copy()
high.info()

In [None]:
vars=['pct_Gsp_area','pct_nR_Gsp_area','bp_mean','tmp_mean','rh_mean']
Y=high['Value'].values
X=high[vars].values
m_con=sm.OLS(Y,X)
m_con_fit=m_con.fit()
print(m_con_fit.summary())

In [None]:
from sklearn.linear_model import LinearRegression

reg=LinearRegression().fit(X,Y)
reg.score(X,Y)

In [None]:
high.corr().round(3)

In [None]:
df.info()

In [None]:
df=AQMS_df.drop(AQMS_df[AQMS_df['Site'].isin(['BL0','GR4'])].index).reset_index(drop=True)
df.info()

In [None]:
df.groupby('Site').describe()

In [None]:
df=df.merge(loc_gdf,left_on='Site',right_index=True)
df

In [None]:
fig,ax=plt.subplots(1,figsize=(15,13))

london.plot(color='lightgrey',ax=ax)
loc_gdf['buffer_1km'].plot(color='silver',ax=ax)
loc_gdf['GreenSpace'].plot(label='Green space',color='limegreen',ax=ax)
loc_gdf['NR_GreenSpace'].plot(label='Near_road green space',color='darkgreen',ax=ax)
loc_gdf['Road'].plot(color='black',label='Road',linewidth=0.1,ax=ax)
loc_gdf['geometry'].plot(markersize=5,marker='^',color='blue',label='Air quality monitoring site',ax=ax)

ax.axis('off')

legend=ax.legend(loc='best',shadow=True,fontsize=15)

#plt.savefig('sample1.png',facecolor='black',dpi=500)
plt.show()

In [None]:
loc_gdf.head()

In [None]:
df.head()

In [None]:
cond.head()

In [None]:
df=df.merge(cond,on='DateTime')
df.head()

In [None]:
df.info()

In [None]:
loc_gdf.info()

In [None]:
df=df.merge(loc_gdf,left_on='Site',right_index=True)
df.info()

In [None]:
var=['pct_Gsp_area','pct_nR_Gsp_area','BP','Tmp','RH']