In [1]:
import os
import scipy
import datetime
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from shapely.geometry import Point
from sklearn.impute import KNNImputer

In [2]:
# Short-term study window as 'YYYY-MM-DD' strings; both ends are used inclusively
# by the date filters in this notebook.
start_date = '2020-03-01'
end_date = '2022-03-31'
# Number of years before start_date that make up the long-term exposure window.
long_term_exposure_years = 5
# n_neighbors handed to KNNImputer for the spatial imputation steps.
N_neighbors_spatial_interpolation = 3
# `limit` handed to Series.interpolate for the temporal gap filling.
N_neighbors_temporal_interpolation = 3

In [3]:
# Start of the long-term exposure window: `long_term_exposure_years` years
# before `start_date`, kept as a 'YYYY-MM-DD' string like the other dates.
_short_term_start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
_long_term_start = _short_term_start - relativedelta(years=long_term_exposure_years)
long_term_start_date = _long_term_start.strftime('%Y-%m-%d')

In [4]:
# Build a lookup from site name to its (latitude, longitude) pair.
# If a site name appears more than once, the last row wins.
site_information = pd.read_csv(os.path.join('air_pollution', 'all_site_information.csv'))
location = dict(zip(
    site_information['Site Name'],
    zip(site_information['Latitude'], site_information['Longitude']),
))

In [5]:
def get_pollution_df(pollution_name):
    """Load one pollutant export and reshape it to one row per station and date.

    Reads ``air_pollution/<pollution_name>.csv``. The 4th line of the file is
    taken as the station header: every non-empty cell marks the first column
    of a station's column group. The 5th line is the column header (it must
    contain a ``Date`` column), ``"No data"`` cells become NaN and the last
    line of the file is skipped as a footer.

    Parameters
    ----------
    pollution_name : str
        Base name of the CSV file, e.g. ``"PM2.5"``.

    Returns
    -------
    pandas.DataFrame
        All stations stacked vertically with the measurement column(s)
        renamed to ``pm10`` / ``pm2_5`` / ``o3`` / ``no2`` where recognised,
        plus ``station_name``, ``date``, ``lat`` and ``lon``. The index is not
        reset, so it repeats for every station.

    Raises
    ------
    ValueError
        If the file has fewer than four lines, i.e. no station header line.
    KeyError
        If a station name is missing from the module-level ``location`` map.
    """
    csv_path = os.path.join("air_pollution", "%s.csv" % pollution_name)

    # Only the station header line is needed here; the context manager makes
    # sure the handle is closed again before pandas reopens the file.
    stations = None
    with open(csv_path) as csv_file:
        for idx, line in enumerate(csv_file):
            if idx == 3:
                stations = line.strip().split(",")
                break
    if stations is None:
        raise ValueError(
            "%s has no station header line (expected on line 4)" % csv_path)

    # Column position at which each station's group of columns starts.
    station_start_idx = [j for j, station in enumerate(stations) if station != '']

    # The non-empty cells are quoted names; gluing them together yields
    # '"A""B""C"', which is split on the doubled quotes between names.
    station_names = [name.replace('"', '') for name in "".join(stations).split('""')]

    df = pd.read_csv(csv_path, skiprows=4, skipfooter=1, engine='python',
                     na_values="No data")

    date = df['Date']

    def rename_column(x):
        """Map a verbose pollutant column header to its short name."""
        if "PM10" in x:
            return "pm10"
        elif "PM2.5" in x:
            return "pm2_5"
        elif "Ozone" in x:
            return "o3"
        elif "Nitrogen dioxide" in x:
            return "no2"
        else:
            return x

    station_df = []
    last_station = len(station_names) - 1
    for i, station_name in enumerate(station_names):
        station_idx = station_start_idx[i]
        # The last station's group runs up to, but excludes, the final column
        # (presumably a trailing status column -- TODO confirm against the export).
        next_station_idx = -1 if i == last_station else station_start_idx[i + 1]

        # Step 2 keeps every other column of the group, starting with the first.
        one_station_data = pd.DataFrame(
            df.iloc[:, station_idx:next_station_idx:2].values)
        one_station_data.columns = df.columns[station_idx:next_station_idx:2]
        one_station_data['station_name'] = station_name
        one_station_data['date'] = date
        one_station_data = one_station_data.rename(columns=rename_column)

        station_df.append(one_station_data)

    station_df = pd.concat(station_df, axis=0, sort=True)

    # Attach coordinates with a per-name lookup instead of a row-wise apply.
    station_df['lat'] = station_df['station_name'].map(lambda name: location[name][0])
    station_df['lon'] = station_df['station_name'].map(lambda name: location[name][1])

    return station_df

In [6]:
# Load every pollutant and outer-merge them into one table keyed by station and date.
pm2_5_df = get_pollution_df("PM2.5")
pm10_df = get_pollution_df("PM10")
no2_df = get_pollution_df("NO2")
o3_df = get_pollution_df("O3")

merge_keys = ["date", "station_name", "lat", "lon"]
df = pm2_5_df
for pollutant_df in (pm10_df, no2_df, o3_df):
    df = pd.merge(df, pollutant_df, on=merge_keys, how="outer")

# Keep only observations inside [long_term_start_date, end_date].
observation_dates = pd.to_datetime(df['date'])
in_window = (observation_dates >= long_term_start_date) & (observation_dates <= end_date)

# Take an explicit copy of the filtered selection so the column assignments
# below write to an independent frame rather than to a slice of the merge
# result (avoids SettingWithCopyWarning / chained assignment).
output_columns = ['date', 'station_name', 'lat', 'lon', 'pm2_5', 'pm10', 'no2', 'o3']
df = df.loc[in_window, output_columns].copy()
df['date'] = observation_dates[in_window].dt.strftime('%Y-%m-%d')

# Build the station geometry only for the rows that survived the date filter.
# shapely points are (x, y), i.e. (lon, lat).
df['coords'] = [Point(lon, lat) for lon, lat in zip(df['lon'], df['lat'])]

In [None]:
# Load the district polygons from gis/lad19.geojson.
ltla_boundary_path = os.path.join("gis", 'lad19.geojson')
ltlas_gdf = gpd.read_file(ltla_boundary_path)
ltlas_gdf.head()

In [None]:
# The 'coords' points are lon/lat, so declare them as EPSG:4326 (WGS84) and
# reproject to whatever CRS the district polygons use.
pollution_gdf = gpd.GeoDataFrame(df, geometry='coords', crs=4326).to_crs(ltlas_gdf.crs)
pollution_gdf.head()

In [9]:
# Spatial join: pair each station observation with the district polygon that
# contains it. how='right' keeps every district row, so districts without any
# station still appear (with missing station columns).
pointInPolys = gpd.sjoin(pollution_gdf, ltlas_gdf, how='right', predicate="within")

# Remove non-UK stations, identified as stations on rows without a district id.
# NOTE(review): with how='right' the district columns come from the right-hand
# frame, so this null check may never match -- confirm the intended join side.
rows_without_district = pointInPolys['district_id'].isnull()
non_UK_stations = pointInPolys.loc[rows_without_district, 'station_name'].unique()
pollution_gdf = pollution_gdf[~pollution_gdf['station_name'].isin(non_UK_stations)]
pointInPolys = pointInPolys[~pointInPolys['station_name'].isin(non_UK_stations)]

In [None]:
# Keep the columns used downstream and make the station coordinates explicit.
ltla_columns = ['date', 'station_name', 'district_id', 'district_name',
                'lat', 'lon', 'district_lat', 'district_lon',
                'pm2_5', 'pm10', 'no2', 'o3']
pollution_ltla_gdf = pointInPolys[ltla_columns].rename(
    columns={'lat': 'station_lat', 'lon': 'station_lon'})
pollution_ltla_gdf['date'] = pd.to_datetime(pollution_ltla_gdf['date'])

# Note: `stations` still contains a NaN entry when some district has no station.
stations = pollution_ltla_gdf['station_name'].unique()
districts = pollution_ltla_gdf['district_name'].unique()

# nunique() ignores NaN, so districts without a station no longer inflate the
# reported station count by one.
print("# of air pollution stations: %s" % pollution_ltla_gdf['station_name'].nunique())
print("# of districts: %s" % len(ltlas_gdf['district_name'].unique()))

In [None]:
# Districts without stations only have rows whose station columns (and date)
# are missing. Expand each of those rows to one row per calendar day of the
# full window so that every district covers every date.
districts_without_stations = pollution_ltla_gdf[pollution_ltla_gdf['station_name'].isnull()]
dates = pd.date_range(start=long_term_start_date, end=end_date)

# Repeat every placeholder row len(dates) times (row-major), then lay the date
# range over each repetition. This replaces a Python loop over rows x dates.
n_placeholder_rows = len(districts_without_stations)
repeat_positions = [pos for pos in range(n_placeholder_rows) for _ in range(len(dates))]
districts_without_stations_filled_dates = (
    districts_without_stations.iloc[repeat_positions].reset_index(drop=True)
)
districts_without_stations_filled_dates['date'] = pd.DatetimeIndex(
    list(dates) * n_placeholder_rows)

has_station = pollution_ltla_gdf['station_name'].notnull()
new_pollution_ltla_gdf = pd.concat(
    [pollution_ltla_gdf[has_station], districts_without_stations_filled_dates], axis=0)
new_pollution_ltla_gdf = new_pollution_ltla_gdf.sort_values(['district_name', 'station_name', 'date'])

# make sure no date is missing for each district
for district in districts:
    district_dates = new_pollution_ltla_gdf.loc[
        new_pollution_ltla_gdf['district_name'] == district, 'date']
    assert len(dates) == len(district_dates.unique()), \
        "district %s does not cover every date" % district

# make sure no district is missing for each date
expected_district_count = len(ltlas_gdf['district_name'].unique())
for date in dates:
    one_day_df = new_pollution_ltla_gdf[new_pollution_ltla_gdf['date'] == date]
    assert len(one_day_df['district_name'].unique()) == expected_district_count, \
        "some district has no row on %s" % date

# Discard the old (repeating) index labels in one step.
new_pollution_ltla_gdf = new_pollution_ltla_gdf.reset_index(drop=True)
new_pollution_ltla_gdf.head()

In [None]:
# Rows of districts without a monitoring station have no station coordinates;
# fall back to the district's own coordinates so the rows still carry a
# location for the spatial interpolation.
for station_col, district_col in (('station_lat', 'district_lat'),
                                  ('station_lon', 'district_lon')):
    new_pollution_ltla_gdf[station_col] = new_pollution_ltla_gdf[station_col].fillna(
        value=new_pollution_ltla_gdf[district_col])

new_pollution_ltla_gdf = new_pollution_ltla_gdf.sort_values(
    by=['district_name', 'station_name', 'date'])
new_pollution_ltla_gdf.head()

In [13]:
# Collapse the station-level rows to one row per date and LTLA by taking the
# mean of each pollutant over the district's stations.
_ltla_day_keys = ['date', 'district_name', 'district_id', 'district_lon', 'district_lat']
new_pollution_ltla_gdf = (
    new_pollution_ltla_gdf
    .groupby(_ltla_day_keys)[['pm2_5', 'pm10', 'no2', 'o3']]
    .mean()
    .reset_index()
)

In [14]:
new_pollution_ltla_gdf['date'] = pd.to_datetime(new_pollution_ltla_gdf['date'])
ltla_dates = new_pollution_ltla_gdf['date']

# Short-term window: start_date <= date <= end_date.
# The copy is taken AFTER filtering so that the 'date' assignment below writes
# to an independent frame instead of a filtered slice (SettingWithCopyWarning).
in_short_term = (ltla_dates >= start_date) & (ltla_dates <= end_date)
air_pollution_short_term = new_pollution_ltla_gdf[in_short_term].copy()
air_pollution_short_term['date'] = air_pollution_short_term['date'].dt.strftime('%Y-%m-%d')

# Long-term window: long_term_start_date <= date < start_date.
in_long_term = (ltla_dates >= long_term_start_date) & (ltla_dates < start_date)
air_pollution_long_term = new_pollution_ltla_gdf[in_long_term].copy()
air_pollution_long_term['date'] = air_pollution_long_term['date'].dt.strftime('%Y-%m-%d')

In [None]:
# Short-term exposure

# Temporal interpolation: within each district, linearly fill gaps along the
# date axis, filling at most N_neighbors_temporal_interpolation consecutive
# missing days.
#
# The interpolated series must keep the frame's own index labels. Interpolating
# a reset_index() copy and passing the result to fillna() aligns on labels
# 0..n-1, which do not match the rows being filled, so nothing would be filled.
pollutants = ['pm2_5', 'pm10', 'no2', 'o3']
for district in air_pollution_short_term['district_name'].unique():
    district_rows = air_pollution_short_term['district_name'] == district
    # 'YYYY-MM-DD' strings sort chronologically; sorting makes the date order
    # explicit instead of relying on the incoming row order.
    district_df = air_pollution_short_term.loc[district_rows].sort_values('date')
    for pollutant in pollutants:
        pollutant_values = district_df[pollutant].interpolate(
            method="linear", limit=N_neighbors_temporal_interpolation)
        # Assignment aligns on the preserved index labels; observed values are
        # untouched because interpolate() only fills missing entries.
        air_pollution_short_term.loc[district_rows, pollutant] = pollutant_values

# Spatial interpolation
def spatial_distance(X1, X2, missing_values=None):
    """Euclidean distance between two rows based on their coordinates only.

    Each row is laid out as
    ('pm2_5', 'pm10', 'no2', 'o3', 'district_lon', 'district_lat'); only the
    last two entries (positions 4 and 5) take part in the distance.
    ``missing_values`` is accepted but not used.
    """
    lon_lat_1 = X1[4:6]
    lon_lat_2 = X2[4:6]
    return scipy.spatial.distance.euclidean(lon_lat_1, lon_lat_2)

# For every date, impute pollutant values that are still missing from the
# N_neighbors_spatial_interpolation nearest districts (distance-weighted),
# where "nearest" is measured by spatial_distance on the district coordinates.
pollutant_columns = ['pm2_5', 'pm10', 'no2', 'o3']
imputer_columns = pollutant_columns + ['district_lon', 'district_lat']

unique_dates = pd.unique(air_pollution_short_term['date'])
for date in tqdm(unique_dates, desc="Short-term air pollution data spatial interpolation"):
    on_this_date = air_pollution_short_term['date'] == date
    pollutant_matrix = air_pollution_short_term.loc[on_this_date, imputer_columns]

    imputer = KNNImputer(n_neighbors=N_neighbors_spatial_interpolation,
                         weights='distance', metric=spatial_distance)
    imputed_matrix = imputer.fit_transform(pollutant_matrix)

    # Only the first four columns are pollutants; the coordinates are dropped.
    imputed_df = pd.DataFrame(data=imputed_matrix[:, 0:4],
                              index=pollutant_matrix.index,
                              columns=pollutant_columns)
    observed_values = air_pollution_short_term.loc[on_this_date, pollutant_columns]
    air_pollution_short_term.loc[on_this_date, pollutant_columns] = observed_values.fillna(value=imputed_df)

# The coordinates were only needed for the imputation.
air_pollution_short_term = air_pollution_short_term.drop(['district_lon', 'district_lat'], axis=1)

air_pollution_short_term[pollutant_columns] = air_pollution_short_term[pollutant_columns].astype(float)
air_pollution_short_term.to_csv("../air_pollution_short_term.csv", float_format="%.1f", na_rep="N/A", index=False)

In [16]:
# Long-term exposure

# One row per district: the mean of each pollutant over the long-term window.
pollutant_variables = ['pm2_5', 'pm10', 'no2', 'o3']
_district_keys = ['district_name', 'district_id', 'district_lon', 'district_lat']
air_pollution_long_term = (
    air_pollution_long_term
    .groupby(_district_keys)[pollutant_variables]
    .mean()
    .reset_index()
)

# Spatial interpolation
def spatial_distance(X1, X2, missing_values=None):
    """Euclidean distance between two rows based on their coordinates only.

    Each row is laid out as
    ('pm2_5', 'pm10', 'no2', 'o3', 'district_lon', 'district_lat'); only the
    last two entries (positions 4 and 5) take part in the distance.
    ``missing_values`` is accepted but not used.
    """
    lon_lat_1 = X1[4:6]
    lon_lat_2 = X2[4:6]
    return scipy.spatial.distance.euclidean(lon_lat_1, lon_lat_2)

# Impute district means that are still missing from the
# N_neighbors_spatial_interpolation nearest districts (distance-weighted),
# where "nearest" is measured by spatial_distance on the district coordinates.
pollutant_columns = ['pm2_5', 'pm10', 'no2', 'o3']
coordinate_columns = ['district_lon', 'district_lat']

pollutant_matrix = air_pollution_long_term[pollutant_columns + coordinate_columns]
imputer = KNNImputer(n_neighbors=N_neighbors_spatial_interpolation,
                     weights='distance', metric=spatial_distance)
imputed_matrix = imputer.fit_transform(pollutant_matrix)

# Only the first four columns are pollutants; the coordinates are dropped.
imputed_df = pd.DataFrame(data=imputed_matrix[:, 0:4],
                          index=pollutant_matrix.index,
                          columns=pollutant_columns)
observed_values = air_pollution_long_term.loc[:, pollutant_columns]
air_pollution_long_term.loc[:, pollutant_columns] = observed_values.fillna(value=imputed_df)

# The coordinates were only needed for the imputation.
air_pollution_long_term = air_pollution_long_term.drop(coordinate_columns, axis=1)

air_pollution_long_term[pollutant_columns] = air_pollution_long_term[pollutant_columns].astype(float)
air_pollution_long_term.to_csv("../air_pollution_long_term.csv", float_format="%.1f", na_rep="N/A", index=False)