In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import glob
import matplotlib.pyplot as plt

In [None]:
# Load every AQMS CSV export into a single frame.
# sorted(): glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# row order of the concatenated frame reproducible (later cells index rows by position).
csv_files=sorted(glob.glob('data/AQMS'+'/*.csv'))
# ignore_index=True gives one continuous RangeIndex instead of repeating each file's 0..n-1 labels.
df=pd.concat((pd.read_csv(f) for f in csv_files),ignore_index=True)

In [None]:
# Column dtypes and non-null counts of the combined readings frame
df.info()

In [None]:
# Discard the metadata columns that the rest of the notebook does not use
unused_columns=['Species','Units','Provisional or Ratified']
df=df.drop(columns=unused_columns)

In [None]:
# Per-site summary statistics; the `count` column shows how many non-null readings each site has
df.groupby('Site').describe()

GN0, LW5, RB7, RD0 and WM0 have too few readings (less than half of the total amount), so they are removed below.

In [None]:
# Site codes that still have at least one row after rows containing nulls are dropped,
# i.e. sites with PM data
rows_without_nulls=df.dropna()
valid_AQMS=rows_without_nulls['Site'].unique().tolist()
print(valid_AQMS)

In [None]:
# Drop the sites that have too few readings.
# The list is rebuilt with a membership test rather than list.remove() in a loop, so a code
# that is not (or no longer) in valid_AQMS does not raise ValueError and the cell can be re-run.
sparse_sites=['GN0','LW5','RB7','RD0','WM0']
valid_AQMS=[site for site in valid_AQMS if site not in sparse_sites]
# One membership filter keeps exactly the rows whose site is still in valid_AQMS,
# which covers the sparse sites as well as any site that never made it into the list.
df=df[df['Site'].isin(valid_AQMS)]
df=df.reset_index(drop=True)
df.info()

In [None]:
# KF1 and KC1 are very similar: draw both series on one axis to compare them
fig,ax=plt.subplots()
for code in ('KC1','KF1'):
    site_rows=df[df['Site']==code]
    site_rows.plot(x='ReadingDateTime',y='Value',ax=ax,label=code,linewidth=0.5)
plt.show()

In [None]:
# Remove KF1 (it is very similar to KC1, so only KC1 is kept)
df=df[df['Site']!='KF1']
# Rebuild the list instead of calling list.remove(), which raises ValueError
# once 'KF1' is gone (e.g. when this cell is run a second time).
valid_AQMS=[site for site in valid_AQMS if site!='KF1']

In [None]:
# Number of distinct site codes left in the readings (dropna=False counts a null code too, like unique())
df['Site'].nunique(dropna=False)

In [None]:
# Per-site summary statistics again, now for the remaining sites only
df.groupby('Site').describe()

In [None]:
# read in AQMS geometry
gdf=gpd.read_file('data/AQMS/AQMS.gpkg')
# Preview the first rows to see which columns the GeoPackage provides
gdf.head()

In [None]:
# Keep only the coordinate and identifier columns
wanted_columns=['latitude','longitude','siteid','sitename']
gdf=gdf.loc[:,wanted_columns]

gdf.info()

In [None]:
# check if all sites with data are within the geometry dataframe:
# print every site code that has readings but no row in gdf
known_siteids=gdf['siteid'].unique().tolist()
for code in valid_AQMS:
    if code in known_siteids:
        continue
    print(code)

**TK3**: Thurrock - Stanford-le-Hope

      51.518162000000, 0.4395480000000

**Thurrock** is not in London, so this site is ignored.

In [None]:
# Drop TK3 from both the site list and the readings.
# The list is rebuilt instead of calling list.remove(), which raises ValueError
# once 'TK3' is gone (e.g. when this cell is run a second time).
valid_AQMS=[site for site in valid_AQMS if site!='TK3']
df=df[df['Site']!='TK3']

In [None]:
# Number of site codes still in the valid list
len(valid_AQMS)

In [None]:
# Number of distinct site codes in the readings; should agree with len(valid_AQMS)
df['Site'].nunique(dropna=False)

In [None]:
# Keep only the geometry rows whose site id is in the valid list
has_readings=gdf['siteid'].isin(valid_AQMS)
AQMS_gdf=gdf[has_readings]
AQMS_gdf.info()

In [None]:
# Cast coordinates to floats and identifiers to pandas strings
column_types={
    'latitude':'float64',
    'longitude':'float64',
    'siteid':'string',
    'sitename':'string',
}
AQMS_gdf=AQMS_gdf.astype(column_types)
AQMS_gdf.dtypes

In [None]:
# Read the London boundary and declare its coordinate reference system as EPSG:27700.
# set_crs only labels the existing coordinates; it does not reproject them.
london_gdf=gpd.read_file('data/london_boundary.shp').set_crs(27700)
london_gdf.crs

In [None]:
# Turn the longitude/latitude columns into point geometries (EPSG:4326)
site_points=gpd.points_from_xy(AQMS_gdf.longitude,AQMS_gdf.latitude)
AQMS_gdf=gpd.GeoDataFrame(AQMS_gdf,geometry=site_points,crs='EPSG:4326')

In [None]:
# Reproject the points to EPSG:27700 (the CRS set on london_gdf), then drop the
# raw coordinate columns now that the geometry column carries the location
AQMS_gdf=(
    AQMS_gdf
    .to_crs(27700)
    .drop(columns=['latitude','longitude'])
)

In [None]:
# Save the site points (reprojected to EPSG:27700 in the previous cell) as a shapefile
AQMS_gdf.to_file('data/AQMS_loc.shp')

------
**Geometry of sites saved.**

In [None]:
# Back to the readings: dtypes and non-null counts after the site filtering above
df.info()

In [None]:
# First rows of the filtered readings
df.head()

In [None]:
# Compare the non-null count of `Value` with the total row count to see how many readings are missing
df.info()

There are many null values.

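According to [this paper](https://www.researchgate.net/publication/237537115_Estimation_of_missing_values_in_air_pollution_data_using_single_imputation_techniques), mean-before-after imputation (replacing each missing value with the mean of the nearest valid readings before and after it) is a suitable approach.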

In [None]:
# Raw array of the readings, used for the positional checks and the imputation below.
# NOTE(review): a later cell writes into `val` in place; those writes only reach `df`
# if `.values` hands back a writable view of the column rather than a copy — confirm
# for the pandas version in use.
val=df['Value'].values
val.shape

In [None]:
# Print the value at the start of each 8760-row block and the value just before it
# (for the first block, position -1 is the last element of the array)
for block in range(21):
    first_row=8760*block
    print(val[first_row])
    print(val[first_row-1])

In [None]:
# Everything from position 183940 to the end of the array
# (presumably the last 20 readings, if val holds 21 * 8760 = 183960 values — check val.shape)
val[183940:]

In [None]:
def _fill_mean_before_after(values,max_gap=12):
    """Return a copy of `values` with short NaN runs replaced by the mean of their neighbours.

    A run of consecutive NaNs is filled only when it is at most `max_gap` samples long and
    has a non-NaN value directly before and directly after it. Every position in such a
    run receives (value_before + value_after) / 2. All other NaNs are left untouched, and
    valid readings are never modified.

    Parameters
    ----------
    values : 1-D array-like of float
        Readings of a single series, in time order.
    max_gap : int, default 12
        Longest NaN run (in samples) that is filled.

    Returns
    -------
    numpy.ndarray
        float64 array of the same length as `values`.
    """
    filled=np.array(values,dtype='float64')  # np.array copies, so the input is not modified
    missing=np.isnan(filled)
    n=filled.size
    pos=0
    while pos<n:
        if not missing[pos]:
            pos+=1
            continue
        # [pos, stop) is one maximal run of NaNs
        stop=pos
        while stop<n and missing[stop]:
            stop+=1
        # Needs a valid neighbour on both sides; a run touching either end of the series is skipped
        if pos>0 and stop<n and stop-pos<=max_gap:
            filled[pos:stop]=(filled[pos-1]+filled[stop])/2
        pos=stop
    return filled


# Impute each site on its own so a gap never borrows a neighbour from a different site,
# and without assuming a fixed number of sites or rows per site.
# Assumes the rows of each site are already in time order — TODO confirm against the CSVs.
val=df['Value'].to_numpy(dtype='float64',copy=True)
site_codes=df['Site'].to_numpy()
for code in pd.unique(site_codes):
    site_rows=np.flatnonzero(site_codes==code)
    val[site_rows]=_fill_mean_before_after(val[site_rows])
# Write the result back explicitly instead of relying on `val` being a view of the column
df=df.assign(Value=val)

# nan here means that gaps longer than 12 readings (or gaps at the edge of a site) remain
val.min()

In [None]:
# Parse ReadingDateTime into a datetime column for the daily grouping below.
# NOTE(review): no `format`/`dayfirst` is passed, so pandas infers the layout; if the CSVs
# write dates day-first (dd/mm/yyyy), ambiguous dates could be read month-first — confirm.
df['DateTime']=pd.to_datetime(df['ReadingDateTime'])

In [None]:
# Daily mean reading per site, stacked into one frame (index: DateTime; columns: Value, Site).
# The per-site frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it on every iteration.
daily_frames=[]
for site in valid_AQMS:
    # Select only DateTime and Value so the mean is not attempted on the text columns,
    # which raises TypeError in pandas >= 2.0.
    site_daily=(
        df.loc[df['Site']==site,['DateTime','Value']]
        .groupby(pd.Grouper(key='DateTime',freq='D'))
        .mean()
    )
    site_daily['Site']=site
    daily_frames.append(site_daily)
# pd.concat rejects an empty list; fall back to an empty frame when there are no sites
dm_PM=pd.concat(daily_frames) if daily_frames else pd.DataFrame()
dm_PM

In [None]:
# Write the daily means to CSV; the DateTime index is written as the first column
dm_PM.to_csv('data/AQMS_readings.csv')


In [None]:
# Map of the monitoring sites (green triangles) over the London boundary (grey)
fig,ax=plt.subplots(figsize=(10,8))
london_gdf.plot(ax=ax,color='lightgrey')
AQMS_gdf.plot(ax=ax,color='green',marker='^')
ax.set_axis_off()
plt.show()