# A1. Preprocessing

**INPUTS DIFFER FROM PREVIOUS VERSIONS:** Some files are renamed from what O2 sent originally. The input rasters must now have lowercase titles to be consistent with previous files. What was initially sent as `_va.tif` should be renamed to `_vaL.tif` to signify that this contains variance on the linear scale. This document computes new files denoted `_va.tif` which contain the variance on the probability scale.

In [20]:
import os
import tqdm
import pickle
import pyreadr
import rasterio
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from torch.distributions import Normal
import torch
from scipy.stats import norm as scipy_norm

Add basic features to `XData`.

In [21]:
# Root data directory: it must hold meta.RData plus one folder per species
# with that species' prior predictions, a_maps, and va_maps.
data_path = "/scratch/project_2003104/gtikhono/bird_app/single_species/data/"
# Alternative location that was used for a single species:
#data_path = "/scratch/project_2003104/gtikhono/bird_app/data/acanthis_flammea/"

meta = pyreadr.read_r(data_path + "meta.RData")
XData = meta["XData"]

# Log-transformed duration; the small offset keeps log() finite at zero.
XData["log_duration"] = np.log(XData["duration"] + 1e-6)

# Recording class: "point" recordings are "fixed"; all other recordings are
# "short" (duration <= 300) or "long" (duration > 300). Rows that match none
# of the rules keep the empty-string placeholder.
is_point = XData["rec_type"] == "point"
is_short_duration = XData["duration"] <= 300
is_long_duration = XData["duration"] > 300
XData["rec_class"] = ""
XData.loc[is_point, "rec_class"] = "fixed"
XData.loc[is_short_duration & ~is_point, "rec_class"] = "short"
XData.loc[is_long_duration & ~is_point, "rec_class"] = "long"

# Report the number of missing entries in each column.
print("Missing values:")
print(XData.isna().sum())

PyreadrError: File b'/scratch/project_2003104/gtikhono/bird_app/single_species/data/meta.RData' does not exist!

Extract prior migration parameters.

In [None]:
# Take the prior migration parameters from the R metadata, rewrite the row
# labels as lowercase with underscores instead of spaces, and pickle the table.
prior_params = meta["migration.pars"]
index_style = {
    label: label.lower().replace(" ", "_") for label in prior_params.index
}
prior_params.rename(index=index_style, inplace=True)

params_file = data_path + "migration_prior_params.pickle"
with open(params_file, "wb") as handle:
    pickle.dump(prior_params, handle, protocol=pickle.HIGHEST_PROTOCOL)

Load a single species and perform basic feature engineering.

In [None]:
# Species to process; the folder name is the lowercase, underscore-separated
# form of the species name (anything after a "." is dropped).
sp = "Turdus merula"
sp_lower = sp.lower().replace(" ", "_").partition(".")[0]
sp_dir = f"{data_path}{sp_lower}/"

# The RData file holds several tables; put them side by side in one frame.
species_raw = pyreadr.read_r(sp_dir + sp + "_prior.RData")
species = pd.concat(list(species_raw.values()), axis=1)

# A row is "complete" when none of its columns is missing.
species["complete"] = species.notna().all(axis=1)

Extract different subsets of the training data: all data from 2023, and all 2023 data from Helsinki. Make sure each extracted XData subset is in the same order as the corresponding species data.

In [None]:
# Row masks: first-year records (j.date <= 365) and records inside the
# Helsinki bounding box (lat in [60, 60.5], lon in [24.5, 25.5], inclusive).
year1 = XData["j.date"].le(365)
helsinki = XData["lat"].between(60, 60.5) & XData["lon"].between(24.5, 25.5)
year1_helsinki = year1 & helsinki

# Apply the same masks to XData and to the species table.
XData1 = XData.loc[year1]
species1 = species.loc[year1]
XData1_helsinki = XData.loc[year1_helsinki]
species1_helsinki = species.loc[year1_helsinki]

# Sanity check: each XData subset must line up row-for-row with its species subset.
same_order_2023 = (XData1.index == species1.index).all()
same_order_helsinki = (XData1_helsinki.index == species1_helsinki.index).all()
print("XData in same order as species data:")
print("2023:", same_order_2023)
print("2023 in Helsinki:", same_order_helsinki)

Save the final training XData to `data_path` and the training species data to `data_path/spname`.

In [None]:
# Pickle the three XData subsets directly under data_path.
xdata_outputs = {
    "XData.pickle": XData,
    "XData_2023.pickle": XData1,
    "XData_2023_helsinki.pickle": XData1_helsinki,
}
for file_name, frame in xdata_outputs.items():
    with open(data_path + file_name, "wb") as handle:
        pickle.dump(frame, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Pickle the matching species tables inside the species' own folder.
sp_lower = sp.lower().replace(" ", "_").split(".")[0]
sp_dir = data_path + sp_lower + "/"
species_outputs = {
    "_prior.pickle": species,
    "_2023_prior.pickle": species1,
    "_2023_helsinki_prior.pickle": species1_helsinki,
}
for suffix, frame in species_outputs.items():
    with open(sp_dir + sp_lower + suffix, "wb") as handle:
        pickle.dump(frame, handle, protocol=pickle.HIGHEST_PROTOCOL)

Convert the variance map from the linear scale to the probability scale. This uses a slow Monte Carlo approximation.

**@Gleb - I will try to derive a fast and accurate approximation, but failing that we probably need to calculate these with many samples.**

GT: for whatever reason this uses only one core.

In [None]:
# Scratch check: standard normal CDF evaluated at the integers 0..9.
m = Normal(loc=torch.zeros(1), scale=torch.ones(1))
m.cdf(torch.arange(10)).numpy()

In [None]:
# Convert the prior variance raster from the linear (probit) scale to the
# probability scale by Monte Carlo, and write the result to <species>_va.tif.

# Mean map on the probability scale; its profile is reused for the output file.
with rasterio.open(sp_dir + sp_lower + "_a.tif") as src:
    a_map = src.read(1)
    profile = src.profile

# Variance map on the linear scale.
with rasterio.open(sp_dir + sp_lower + "_vaL.tif") as src:
    vaL_map = src.read(1)
vaL_map[(np.isnan(vaL_map)) & (~np.isnan(a_map))] = 1.0  # ensure a_map != nan implies va_map != nan
vaL_map[np.isnan(a_map)] = np.nan

# Mean on the linear scale (inverse standard-normal CDF of the probability).
aL_map = scipy_norm.ppf(a_map)
idx = ~np.isnan(vaL_map)
va_map = np.full_like(vaL_map, np.nan)

# Work on the valid cells only. The 2-D aL_map / vaL_map are deliberately NOT
# overwritten, so they remain usable as rasters (e.g. for plotting) afterwards.
aL_valid = aL_map[idx]
sdL_valid = np.sqrt(vaL_map[idx])  # loop-invariant, so computed once

# sharing randomness across cells to save computation
# running in loop and using moment formula because broadcasting kept crashing
n_mc = 100
n_valid = int(idx.sum())
E_Phi = np.zeros(n_valid)
E_Phi_squared = np.zeros(n_valid)
for _ in tqdm.tqdm(range(n_mc)):
    # One standard-normal draw per iteration, shared by every cell.
    L_sample = aL_valid + sdL_valid * np.random.normal(0, 1)
    p_sample = scipy_norm.cdf(L_sample)
    E_Phi += p_sample
    E_Phi_squared += np.square(p_sample)
E_Phi /= n_mc
E_Phi_squared /= n_mc

# Var = E[p^2] - E[p]^2; clamp at zero because floating-point round-off can
# push the difference marginally below zero.
va_map[idx] = np.maximum(E_Phi_squared - np.square(E_Phi), 0.0)

with rasterio.open(sp_dir + sp_lower + "_va.tif", "w", **profile) as dst:
    dst.write(va_map.astype(np.float32), 1)

In [None]:
# Show the linear-scale variance with a colour scale.
# NOTE(review): imshow needs 2-D image data, so vaL_map must still be the full
# raster (not a flattened selection) when this cell runs.
vaL_image = plt.imshow(vaL_map)
plt.colorbar(vaL_image);

In [None]:
# Show the probability-scale variance with a colour scale.
va_image = plt.imshow(va_map)
plt.colorbar(va_image);