In [None]:
import numpy as np
import xarray as xr
import pandas as pd
import geopandas as gpd
import seaborn as sns
import functools
import itertools
import shapely
import datetime
import os
import copy
from pysal.lib import weights
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler

In [None]:
#  spatial time series features
# Open the four gridded time-series datasets in one pass. The order of the
# path tuple matches the order of the names on the left-hand side.
daymet_interp, tropomi_interp, goes_16_interp, rtma_interp = (
    xr.open_dataset(nc_path)
    for nc_path in (
        "/content/drive/MyDrive/Spatial Time Series interpolation/daymet_fusion_xr_interp.nc",
        "/content/drive/MyDrive/Spatial Time Series interpolation/tropomi_fusion_xr_interp.nc",
        "/content/drive/MyDrive/Spatial Time Series interpolation/goes_16_bandst_fusion_xr_interp.nc",
        "/content/drive/MyDrive/Spatial Time Series interpolation/rtma_fusion_xr_interp.nc",
    )
)

In [None]:
#  spatial raster features
# Descriptive names for the attribute columns of the raster-derived shapefile,
# listed in the same order as the columns appear in the file.
raster_feature_names = [
    'dem_mean', 'dem_std', 'bh_mean', 'bh_std', 'pop_den_mean', 'pop_den_std',
    'water', 'evergreen_needleleaf_vegetation', 'evergree_broadleaf_vegetation',
    'deciduous_needleleaf_vegetation', 'deciduou_broadleaf_vegetation', 'grass', 'urban',
]

grid_raster = gpd.read_file('/content/drive/MyDrive/spatial_features.shp')

# Positional rename: the first column becomes the grid id and the last one stays
# 'geometry'; the file must therefore contain exactly 15 columns.
grid_raster.columns = ['id'] + raster_feature_names + ['geometry']

In [None]:
#  spatial vector features
# Descriptive names for the attribute columns of the vector-derived shapefile,
# listed in the same order as the columns appear in the file.
vector_feature_names = [
    'bus_stop_count', 'bus_route_count', 'bus_route_length', 'railroad_length', 'trail_length',
    'zone_commercial', 'zone_community_facility', 'zone_industrial', 'zone_others', 'zone_residential',
    'osm_road_length', 'osm_lanes_count', 'max_speed',
]

grid_vector = gpd.read_file('/content/drive/MyDrive/spatial_vector_features.shp')

# Positional rename: the first column becomes the grid id and the last one stays
# 'geometry'; the file must therefore contain exactly 15 columns.
grid_vector.columns = ['id'] + vector_feature_names + ['geometry']

In [None]:
##  coordinates
# NOTE(review): unlike every other input, this path has no "MyDrive" component --
# confirm that is intended.
grids_coords = gpd.read_file('/content/drive/reference_grids.shp')

# Centroid of every grid polygon, computed once and reused for both axes.
cell_centroids = grids_coords['geometry'].centroid

# The rounded centroids are later used as join keys against the time-series
# frames, so they are rounded with the same built-in round(value, 6) used there.
for axis_name, axis_values in (('lon', cell_centroids.x), ('lat', cell_centroids.y)):
    grids_coords[axis_name] = axis_values.map(lambda value: round(value, 6))

In [None]:
def _dataset_to_grid_frame(dataset):
    """Flatten a dataset into a long DataFrame and attach the grid id.

    The dataset's variables are converted to one row per index combination,
    'lon'/'lat' are rounded to 6 decimals (matching `grids_coords`), and the
    grid 'id' is looked up with a left join on the rounded coordinates, so
    rows without a matching grid cell keep a missing id.
    """
    frame = dataset[list(dataset.keys())].to_dataframe().reset_index()
    for axis_name in ('lon', 'lat'):
        frame[axis_name] = frame[axis_name].apply(lambda value: round(value, 6))
    return frame.merge(grids_coords[['lon','lat','id']], how='left', on=['lon','lat'])


# daymet
daymet_interp_df = _dataset_to_grid_frame(daymet_interp)
# tropomi
tropomi_interp_df = _dataset_to_grid_frame(tropomi_interp)
# goes_16
goes_16_interp_df = _dataset_to_grid_frame(goes_16_interp)
# rtma
rtma_interp_df = _dataset_to_grid_frame(rtma_interp)

In [None]:
# Combine the four time-series frames on grid cell and time step (pandas'
# default inner join), then attach the static raster and vector attributes of
# each grid cell with left joins on 'id'. Geometry columns are left out.
sts_frames = [
    daymet_interp_df,
    tropomi_interp_df,
    goes_16_interp_df,
    rtma_interp_df,
]

training_data = sts_frames[0]
for sts_frame in sts_frames[1:]:
    training_data = pd.merge(training_data, sts_frame, on = ['lon','lat','time','id'])

for static_grid in (grid_raster, grid_vector):
    static_columns = [name for name in static_grid.columns if name != "geometry"]
    training_data = training_data.merge(static_grid[static_columns], how = "left", on = "id")

In [None]:
###  call new brunswick air quality monitoring station data
nb_stations = pd.read_excel("/content/drive/MyDrive/NB_Air_Quality_stations_metadata.xlsx")

# exclude stations that are not included or inactive
# Every mask is built on the unfiltered table and the masks are combined in a
# single selection.
in_study_cities = nb_stations['city'].isin(['Saint John','Colson Cove'])
has_target_measurement = nb_stations['measurements'].isin(["SO2","TRS"])
provincial_forest_hills = (
    (nb_stations['station_name'] == "Forest Hills")
    & (nb_stations['owner'] == "Government of New Brunswick")
)
excluded_by_name = nb_stations['station_name'].isin(["Musquash", "Milford", "Saint John Street"])

nb_stations = nb_stations[
    in_study_cities & has_target_measurement & ~provincial_forest_hills & ~excluded_by_name
]

# projection
# Build one point per station from (longitude, latitude) and register the
# table as a GeoDataFrame in EPSG:4326.
station_points = [
    shapely.Point(lon_lat)
    for lon_lat in zip(nb_stations['longitude'], nb_stations['latitude'])
]
nb_stations = gpd.GeoDataFrame(
    nb_stations.assign(geometry=station_points), geometry='geometry', crs = 'epsg:4326'
)

##  get grids with stations : for training
# Left spatial join: each station keeps its row and gains the attributes of the
# grid polygon it falls within (suffix "_right" on clashing column names).
grid_stations = gpd.tools.sjoin(nb_stations, grids_coords, predicate="within", how='left')

In [None]:
# Display the station/grid spatial-join result for visual inspection.
grid_stations

Unnamed: 0,id_left,station_name,province,city,latitude,longitude,owner,measurements,geometry,index_right,id_right,lon,lat
39,3,West Side,New Brunswick,Saint John,45.25315,-66.079931,Government of New Brunswick,SO2,POINT (-66.07993 45.25315),1481,31_24,-66.082065,45.254983
40,8,Expansion Ave,New Brunswick,Saint John,45.275,-66.002222,Irving Oil Ltd,SO2,POINT (-66.00222 45.27500),2229,47_20,-66.002065,45.274983
41,9,Silver Falls Site,New Brunswick,Saint John,45.297944,-66.00725,Irving Oil Ltd,SO2,POINT (-66.00725 45.29794),2177,46_15,-66.007065,45.299983
43,19,Champlain Heights,New Brunswick,Saint John,45.291222,-66.002917,Irving Oil Ltd,SO2,POINT (-66.00292 45.29122),2226,47_17,-66.002065,45.289983
46,28,Midwood Ave.,New Brunswick,Saint John,45.267736,-66.017017,Irving Oil Ltd,SO2,POINT (-66.01702 45.26774),2089,44_21,-66.017065,45.269983
47,30,Castle Street,New Brunswick,Saint John,45.280278,-66.056111,Government of New Brunswick,SO2,POINT (-66.05611 45.28028),1711,36_19,-66.057065,45.279983
49,76,Grandview west site,New Brunswick,Saint John,45.273342,-66.013944,Irving Oil Ltd,SO2,POINT (-66.01394 45.27334),2135,45_20,-66.012065,45.274983
51,11,Sherbrook St.,New Brunswick,Saint John,45.252778,-66.095278,Irving Pulp and Paper Ltd,SO2,POINT (-66.09528 45.25278),1340,28_24,-66.097065,45.254983
53,13,Bridge Street,New Brunswick,Saint John,45.275556,-66.088611,Irving Pulp and Paper Ltd,SO2,POINT (-66.08861 45.27556),1430,30_20,-66.087065,45.274983
55,31,Lorneville cemetary,New Brunswick,Colson Cove,45.174903,-66.166046,&#201;nergie NB Power,SO2,POINT (-66.16605 45.17490),698,14_40,-66.167065,45.174983


In [None]:
###   extract training pixels
# Keep only the rows of grid cells that contain a monitoring station
# ('id_right' is the grid id brought in by the spatial join).
station_grid_ids = grid_stations['id_right']
grid_training_data = training_data.loc[training_data['id'].isin(station_grid_ids)]

In [None]:
# Display the station-cell subset of the training table for visual inspection.
grid_training_data

Unnamed: 0,id,lon,lat,time,dayl,prcp,srad,tmax,tmin,vp,...,railroad_length,trail_length,zone_commercial,zone_community_facility,zone_industrial,zone_others,zone_residential,osm_road_length,osm_lanes_count,max_speed
531178,14_40,-66.167065,45.174983,2021-12-01,31622.119141,0.000000,156.824640,3.358348,-5.599938,402.515904,...,0.000000,0.006915,0.0,0.000284,0.0,0.94463,0.055086,0.005172,2.0,
531179,14_40,-66.167065,45.174983,2021-12-02,31541.019531,13.712173,145.450810,9.937601,-5.044690,419.832087,...,0.000000,0.006915,0.0,0.000284,0.0,0.94463,0.055086,0.005172,2.0,
531180,14_40,-66.167065,45.174983,2021-12-03,31463.750000,0.000000,177.297360,6.609003,-5.176153,415.732090,...,0.000000,0.006915,0.0,0.000284,0.0,0.94463,0.055086,0.005172,2.0,
531181,14_40,-66.167065,45.174983,2021-12-04,31390.369141,0.000000,133.391192,-1.880342,-9.124002,306.386582,...,0.000000,0.006915,0.0,0.000284,0.0,0.94463,0.055086,0.005172,2.0,
531182,14_40,-66.167065,45.174983,2021-12-05,31320.919922,0.000000,163.932311,0.681379,-9.308628,301.977123,...,0.000000,0.006915,0.0,0.000284,0.0,0.94463,0.055086,0.005172,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1697025,47_20,-66.002065,45.274983,2023-12-27,30813.529297,6.809342,52.391535,4.952975,1.295010,670.519798,...,0.009147,0.000000,0.0,0.000000,1.0,0.00000,0.000000,0.004401,0.0,
1697026,47_20,-66.002065,45.274983,2023-12-28,30843.380859,10.234112,74.031475,5.916368,0.608096,638.360699,...,0.009147,0.000000,0.0,0.000000,1.0,0.00000,0.000000,0.004401,0.0,
1697027,47_20,-66.002065,45.274983,2023-12-29,30877.580078,8.694092,31.994636,1.884968,-0.101750,606.194419,...,0.009147,0.000000,0.0,0.000000,1.0,0.00000,0.000000,0.004401,0.0,
1697028,47_20,-66.002065,45.274983,2023-12-30,30916.109375,1.947879,45.677716,1.811728,-0.308118,597.075452,...,0.009147,0.000000,0.0,0.000000,1.0,0.00000,0.000000,0.004401,0.0,


In [None]:
##  assess correlation between lags of time series features and SO2 outcome
#
# For every spatio-temporal feature, compute the Spearman rank correlation
# between `mean_so2` and the feature at lag 0 .. lag_window time steps. Lags are
# taken within each grid cell via groupby('id').shift, so values never leak from
# one cell into the next.
# NOTE(review): `mean_so2` is read from grid_training_data but is not added by
# any of the cells shown above -- confirm where the outcome is merged in.

lag_window = 16

# |rho| cut-off counted in the 'corr>0.25' column. The original code compared
# against 0.025, which contradicts the column label. If 0.025 was the intended
# cut-off, rename the column instead of changing this value back.
corr_threshold = 0.25

spts_features = [
 'dayl',
 'prcp',
 'srad',
 'tmax',
 'tmin',
 'vp',
 'cloud_top_height_mean',
 'cloud_base_height_mean',
 'cloud_top_pressure_mean',
 'cloud_base_pressure_mean',
 'surface_albedo_mean',
 'SO2_column_number_density_15km_mean',
 'absorbing_aerosol_index_mean',
 'cloud_fraction_mean',
 'CMI_C01_mean',
 'CMI_C02_mean',
 'CMI_C03_mean',
 'CMI_C08_mean',
 'CMI_C09_mean',
 'CMI_C10_mean',
 'CMI_C11_mean',
 'CMI_C12_mean',
 'UGRD_mean',
 'VGRD_mean',
 'WDIR_mean',
 'WIND_mean',
 'GUST_mean',
 'VIS_mean',
]

# Manually chosen number of lags per feature. These are the same values that
# were previously supplied as a positional list aligned with the name-sorted
# rows of the pivot table; keying them by feature name removes the silent
# dependency on that sort order.
chosen_lag_by_feature = {
    'CMI_C01_mean': 1,
    'CMI_C02_mean': 1,
    'CMI_C03_mean': 1,
    'CMI_C08_mean': 3,
    'CMI_C09_mean': 3,
    'CMI_C10_mean': 3,
    'CMI_C11_mean': 3,
    'CMI_C12_mean': 3,
    'GUST_mean': 2,
    'SO2_column_number_density_15km_mean': 5,
    'UGRD_mean': 1,
    'VGRD_mean': 1,
    'VIS_mean': 1,
    'WDIR_mean': 2,
    'WIND_mean': 2,
    'absorbing_aerosol_index_mean': 2,
    'cloud_base_height_mean': 1,
    'cloud_base_pressure_mean': 3,
    'cloud_fraction_mean': 0,
    'cloud_top_height_mean': 1,
    'cloud_top_pressure_mean': 3,
    'dayl': 0,
    'prcp': 0,
    'srad': 0,
    'surface_albedo_mean': 3,
    'tmax': 1,
    'tmin': 1,
    'vp': 1,
}

# Scratch frame that ends up holding, per feature, the largest lag computed.
grid_training_data_place_holder_time_lags = grid_training_data[['id','lon','lat','time']].copy()

sts_names = []
sts_names_lags = []
sts_corr_stat = []
sts_corr_p = []

so2_outcome = grid_training_data['mean_so2']

for sts_f in spts_features:
    feature_by_grid = grid_training_data.groupby(['id'])[sts_f]
    for lag in range(lag_window + 1):
        if lag == 0:
            lagged_feature = grid_training_data[sts_f]
        else:
            lagged_feature = feature_by_grid.shift(lag)
            grid_training_data_place_holder_time_lags[sts_f] = lagged_feature

        # One spearmanr call yields both the statistic and the p-value. The NaNs
        # that shift() puts at the start of every grid cell are dropped pairwise
        # by nan_policy='omit', so no positional slicing is needed.
        lag_corr = spearmanr(lagged_feature, so2_outcome, nan_policy='omit')

        sts_names.append(sts_f)
        sts_names_lags.append(lag)
        sts_corr_stat.append(lag_corr.statistic)
        sts_corr_p.append(lag_corr.pvalue)

check_corr_temporal_effects = pd.DataFrame({
    "name": sts_names,
    "lag": sts_names_lags,
    "corr": sts_corr_stat,
    "pvalue": sts_corr_p
})

# Wide layout: one row per feature (sorted by name), one correlation column per lag.
check_corr_temporal_effects = check_corr_temporal_effects.pivot_table(index=["name"], columns=["lag"], values=["corr"]).reset_index()

# Column label of the lag with the largest absolute correlation.
check_corr_temporal_effects['max_lag'] = check_corr_temporal_effects.iloc[:,1:].abs().idxmax(axis=1)

# Column 0 holds the feature name after reset_index().
check_corr_temporal_effects['chosen_lag'] = check_corr_temporal_effects.iloc[:, 0].map(chosen_lag_by_feature)

# Number of lags whose absolute correlation reaches the threshold
# (iloc[:, 1:-2] selects the per-lag correlation columns only).
check_corr_temporal_effects['corr>0.25'] = (
    check_corr_temporal_effects.iloc[:,1:-2].abs() >= corr_threshold
).sum(axis=1)

check_corr_temporal_effects.columns = (
    ["name"]
    + ["lag_" + str(x) for x in range(lag_window + 1)]
    + ["max_lag", "chosen_lag", "corr>0.25"]
)

In [None]:
##  create time lag dataframe for the training dataset
#
# For each feature, add `<feature>_lag_1 .. <feature>_lag_k` columns, where k is
# the feature's 'chosen_lag' in check_corr_temporal_effects. Lags are taken
# within each grid cell so values never cross from one cell into another.

grid_training_data_lags = grid_training_data.copy()

for sts_f in spts_features:
    lag_num = int(
        check_corr_temporal_effects.loc[
            check_corr_temporal_effects['name'] == sts_f, 'chosen_lag'
        ].iloc[0]
    )

    # Group the untouched source frame once per feature; its index is identical
    # to that of the copy being extended, so the shifted values align.
    feature_by_grid = grid_training_data.groupby(['id'])[sts_f]
    for lag in range(1, lag_num + 1):
        grid_training_data_lags[sts_f + "_lag_" + str(lag)] = feature_by_grid.shift(lag)

# Keep rows from 2022-01-01 onwards. The cut-off is a pandas Timestamp because
# ordering a datetime64 column against a plain datetime.date raises TypeError on
# pandas >= 2.0. to_datetime() also accepts a column of date objects.
training_start = pd.Timestamp('2022-01-01')
grid_training_data_lags = grid_training_data_lags[
    pd.to_datetime(grid_training_data_lags['time']) >= training_start
]

In [None]:
# Display the lag-augmented training table for visual inspection.
grid_training_data_lags

Unnamed: 0,id,lon,lat,time,dayl,prcp,srad,tmax,tmin,vp,...,CMI_C12_mean_lag_3,UGRD_mean_lag_1,VGRD_mean_lag_1,WDIR_mean_lag_1,WDIR_mean_lag_2,WIND_mean_lag_1,WIND_mean_lag_2,GUST_mean_lag_1,GUST_mean_lag_2,VIS_mean_lag_1
31,14_40,-66.167065,45.174983,2022-01-01,31058.339844,3.078496,71.404086,5.806877,-0.813123,575.572571,...,2607.690196,-0.434240,-3.168987,66.043739,214.731399,3.423341,2.832630,5.162143,5.160947,10126.966949
32,14_40,-66.167065,45.174983,2022-01-02,31109.419922,4.655179,78.547347,4.036223,-3.215499,481.804755,...,2585.232512,-2.384828,0.603799,92.609969,66.043739,4.279731,3.423341,5.601187,5.162143,13801.115710
33,14_40,-66.167065,45.174983,2022-01-03,31164.650391,0.000000,104.149027,-5.948529,-12.904969,226.460513,...,2585.300263,-0.922555,-3.899955,56.987571,92.609969,4.904693,4.279731,7.497472,5.601187,6741.196412
34,14_40,-66.167065,45.174983,2022-01-04,31223.990234,0.000000,160.193705,-5.751938,-17.842593,150.296401,...,2584.762901,3.391147,-5.751408,312.644484,56.987571,6.788105,4.904693,10.138053,7.497472,13388.340915
35,14_40,-66.167065,45.174983,2022-01-05,31287.410156,14.987514,159.928036,6.936341,-11.159033,260.711121,...,2346.805712,2.550958,-5.360283,215.624444,312.644484,6.803008,6.788105,8.903457,10.138053,15843.258667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9127,47_20,-66.002065,45.274983,2023-12-27,30813.529297,6.809342,52.391535,4.952975,1.295010,670.519798,...,2578.641563,2.117842,2.738792,231.327076,163.235249,3.625900,1.557898,5.535815,2.649163,14574.373808
9128,47_20,-66.002065,45.274983,2023-12-28,30843.380859,10.234112,74.031475,5.916368,0.608096,638.360699,...,2622.016620,-0.048200,0.170883,153.912369,231.327076,0.768074,3.625900,2.414619,5.535815,13618.583674
9129,47_20,-66.002065,45.274983,2023-12-29,30877.580078,8.694092,31.994636,1.884968,-0.101750,606.194419,...,2574.034823,-1.225095,-1.729374,104.233845,153.912369,2.582216,0.768074,4.897467,2.414619,11875.080161
9130,47_20,-66.002065,45.274983,2023-12-30,30916.109375,1.947879,45.677716,1.811728,-0.308118,597.075452,...,2474.125792,-0.995014,-4.386865,40.488894,104.233845,4.868534,2.582216,7.774146,4.897467,9684.157388


In [None]:
###  get neighbour grid indices for each grid
# For every grid polygon, collect the ids of all polygons that touch it and
# store them as one ", "-separated string, in the row order of grids_coords.
grids_coords["neighbors"] = [
    ", ".join(grids_coords.loc[grids_coords.geometry.touches(cell_geometry), "id"].tolist())
    for cell_geometry in grids_coords["geometry"]
]

In [None]:
##  extract neighbourhood ids
# Split the ", "-separated 'neighbors' string into neighbour_1 .. neighbour_8.
# A row with fewer than eight neighbours raises IndexError, exactly as before.
# NOTE(review): grid_training_data_lags needs a 'neighbors' column here, but none
# of the cells shown merges grids_coords into it -- confirm where that happens.
for neighbour_position in range(8):
    grid_training_data_lags['neighbour_' + str(neighbour_position + 1)] = grid_training_data_lags.apply(
        lambda row, pos=neighbour_position: row['neighbors'].split(",")[pos].strip(),
        axis=1,
    )

In [None]:
# Display the reference grid with its centroid coordinates and neighbour lists.
grids_coords

Unnamed: 0,id,geometry,lon,lat,neighbors
0,0_0,"POLYGON ((-66.23956 45.37748, -66.23456 45.377...",-66.237065,45.374983,"0_1, 1_0, 1_1"
1,0_1,"POLYGON ((-66.23956 45.37248, -66.23456 45.372...",-66.237065,45.369983,"0_0, 0_2, 1_0, 1_1, 1_2"
2,0_2,"POLYGON ((-66.23956 45.36748, -66.23456 45.367...",-66.237065,45.364983,"0_1, 0_3, 1_1, 1_2, 1_3"
3,0_3,"POLYGON ((-66.23956 45.36248, -66.23456 45.362...",-66.237065,45.359983,"0_2, 0_4, 1_2, 1_3, 1_4"
4,0_4,"POLYGON ((-66.23956 45.35748, -66.23456 45.357...",-66.237065,45.354983,"0_3, 0_5, 1_3, 1_4, 1_5"
...,...,...,...,...,...
3614,76_42,"POLYGON ((-65.85956 45.16748, -65.85456 45.167...",-65.857065,45.164983,"75_41, 75_42, 75_43, 76_41, 76_43"
3615,76_43,"POLYGON ((-65.85956 45.16248, -65.85456 45.162...",-65.857065,45.159983,"75_42, 75_43, 75_44, 76_42, 76_44"
3616,76_44,"POLYGON ((-65.85956 45.15748, -65.85456 45.157...",-65.857065,45.154983,"75_43, 75_44, 75_45, 76_43, 76_45"
3617,76_45,"POLYGON ((-65.85956 45.15248, -65.85456 45.152...",-65.857065,45.149983,"75_44, 75_45, 75_46, 76_44, 76_46"


In [None]:
###  store a list of neighbour features
# For each of the eight neighbour slots, look up the neighbour cell's features at
# the same time step and prefix them with "nn<k>_". The eight feature tables are
# then placed side by side.
# NOTE(review): the displayed result of this cell contains "*_lag_*" and
# "is_edge" columns, which a merge against `training_data` as built above would
# not produce -- confirm which frame was actually merged.

nn_features = []

for neighbour_number in range(1, 9):
    neighbour_col = 'neighbour_' + str(neighbour_number)
    feature_prefix = "nn" + str(neighbour_number) + "_"

    neighbour_frame = (
        grid_training_data_lags[[neighbour_col, 'time']]
        .merge(training_data, left_on=[neighbour_col, 'time'], right_on=['id','time'], how="left")
        .drop(["id","lon","lat"], axis=1)
    )
    # The first two columns are the join keys; every remaining column is a
    # feature of the neighbour and receives the slot prefix.
    neighbour_frame.columns = [neighbour_col, 'time'] + [
        feature_prefix + feature for feature in neighbour_frame.columns[2:]
    ]

    nn_features.append(neighbour_frame)

# Drop the two key columns of every table and concatenate column-wise.
grid_training_data_lags_nn = pd.concat([frame.iloc[:, 2:] for frame in nn_features], axis=1)

In [None]:
# Display the combined neighbour-feature table for visual inspection.
grid_training_data_lags_nn

Unnamed: 0,nn1_dayl,nn1_prcp,nn1_srad,nn1_tmax,nn1_tmin,nn1_vp,nn1_SO2_column_number_density_15km_mean,nn1_cloud_top_height_mean,nn1_cloud_base_height_mean,nn1_cloud_top_pressure_mean,...,nn8_UGRD_mean_lag_1,nn8_VGRD_mean_lag_1,nn8_WDIR_mean_lag_1,nn8_WDIR_mean_lag_2,nn8_WIND_mean_lag_1,nn8_WIND_mean_lag_2,nn8_GUST_mean_lag_1,nn8_GUST_mean_lag_2,nn8_VIS_mean_lag_1,nn8_is_edge
0,31058.339844,3.086212,71.420054,5.799699,-0.820301,575.275425,-0.000362,727.649779,29.621426,93106.570102,...,-0.474092,-3.201869,65.137207,215.374503,3.479132,2.906294,5.197589,5.170059,10186.576847,0
1,31109.419922,4.675640,78.573370,4.022940,-3.228781,481.306833,0.000016,5419.749023,4419.749023,51014.652344,...,-2.444098,0.630502,91.322784,65.137207,4.370503,3.479132,5.647206,5.197589,13846.402120,0
2,31164.650391,0.000000,104.048840,-5.974019,-12.924892,226.145153,0.000122,7771.303221,6771.303225,36259.410531,...,-0.965189,-3.989025,57.937766,91.322784,5.031250,4.370503,7.519479,5.647206,6802.664383,0
3,31223.990234,0.000000,160.256700,-5.759654,-17.856413,150.126072,-0.000749,,,,...,3.434808,-5.897815,311.666104,57.937766,6.939020,5.031250,10.150039,7.519479,13437.284021,0
4,31287.410156,14.914809,159.950989,6.928624,-11.167287,260.517137,0.000236,,,,...,2.580625,-5.522941,211.589231,311.666104,6.983966,6.939020,8.923939,10.150039,15823.186907,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,30813.529297,6.783496,52.567623,4.942736,1.267902,669.360001,-0.000044,2003.640031,1003.640143,80145.517435,...,2.288472,2.729505,234.248892,168.156267,3.717127,1.573556,5.585123,2.755984,14493.918463,0
8756,30843.380859,10.218635,74.198167,5.916809,0.592290,637.714650,0.000662,,,,...,0.006710,0.210575,157.334942,234.248892,0.793917,3.717127,2.517767,5.585123,13508.786130,0
8757,30877.580078,8.692737,32.008866,1.875459,-0.111093,605.731797,0.000014,6312.113620,5312.113639,44888.866757,...,-1.371500,-1.666249,88.733349,157.334942,2.601825,0.793917,5.047617,2.517767,11750.515248,0
8758,30916.109375,1.982723,46.190379,1.795917,-0.321194,596.575797,0.000183,2574.883077,1574.883014,72722.976324,...,-1.305164,-4.292547,44.447757,88.733349,4.936901,2.601825,8.062611,5.047617,9628.985971,0


In [None]:
# Put the station-cell features and their neighbour features side by side
# (both re-indexed from 0 so the rows pair up positionally), discard the helper
# columns, and move the identifying and outcome columns to the front.
leading_columns = ['id','lon','lat','time','mean_so2','max_so2']

grid_train = pd.concat(
    [
        grid_training_data_lags.reset_index(drop=True),
        grid_training_data_lags_nn.reset_index(drop=True),
    ],
    axis=1,
).drop(columns=["geometry","neighbors"])

remaining_columns = [name for name in grid_train.columns if name not in leading_columns]
grid_train = grid_train[leading_columns + remaining_columns]

grid_train.to_pickle("/content/drive/MyDrive/training_data_grids_final.pkl")

In [None]:
# (rows, columns) of the assembled training table.
grid_train.shape

(8760, 964)

In [None]:
# Directional neighbour summaries: for every base feature, the row-wise mean over
# neighbour slots 1, 3, 6, 8 ("dir_diagonal_*") and over slots 2, 4, 5, 7
# ("dir_cardinal_*").
#
# Neighbour columns are selected by their exact name "nn<k>_<feature>". The
# previous substring test (`feature_name in x`) also picked up longer names, so
# e.g. "tmax" was averaged together with "nn<k>_tmax_lag_1". It also iterated
# over the "nn<k>_*" columns themselves, producing meaningless outputs; those are
# no longer emitted.
diagonal_slots = (1, 3, 6, 8)
cardinal_slots = (2, 4, 5, 7)
neighbour_prefixes = tuple("nn" + str(slot) + "_" for slot in range(1, 9))

grid_train_columns = set(grid_train.columns)

# Same starting point as before (everything after the first four columns),
# minus the neighbour columns.
base_feature_names = [
    name for name in list(grid_train.columns)[4:]
    if not name.startswith(neighbour_prefixes)
]


def _neighbour_mean(feature_name, slots):
    """Row-wise mean of `feature_name` over the given neighbour slots.

    Only the "nn<slot>_<feature_name>" columns that exist in grid_train are
    used; if none exists the result is an all-NaN Series.
    """
    slot_columns = [
        "nn" + str(slot) + "_" + feature_name
        for slot in slots
        if "nn" + str(slot) + "_" + feature_name in grid_train_columns
    ]
    return grid_train[slot_columns].mean(axis=1)


nn_queen_less_rook = {}
nn_rook = {}

for feature_name in base_feature_names:
    nn_queen_less_rook["dir_diagonal_" + feature_name] = _neighbour_mean(feature_name, diagonal_slots)
    nn_rook["dir_cardinal_" + feature_name] = _neighbour_mean(feature_name, cardinal_slots)

grid_training_data_lags_nn_queen_less_rook = pd.DataFrame(nn_queen_less_rook)
grid_training_data_lags_nn_rook = pd.DataFrame(nn_rook)

In [None]:
# Persist the diagonal and cardinal neighbour summaries next to the main
# training table.
for neighbour_summary, pickle_path in (
    (grid_training_data_lags_nn_queen_less_rook, "/content/drive/MyDrive/training_data_grids_nn_queen_less_rook.pkl"),
    (grid_training_data_lags_nn_rook, "/content/drive/MyDrive/training_data_grids_nn_rook.pkl"),
):
    neighbour_summary.to_pickle(pickle_path)