In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics

In [4]:
def comp_distance(X1, X2):
    """
    Compute distance of locations in X1 and X2. 
    Use earth radius R = 6,371 km.
    
    Args:
    X1: lat and lon, shape (m1, 2)
    X2: lat and lon, shape (m2, 2)
    
    Returns:
    ouput: distance matrix, shape (m1, m2)
    """
    R = 6371.
    X1 = X1 * np.pi/180.
    X2 = X2 * np.pi/180.
    
    A = np.cos(X1[:,[1]]) @ np.cos(X2[:,[1]]).T
    A = A * np.cos(X1[:,[0]] - X2[:,[0]].T)
    A += np.sin(X1[:,[1]]) @ np.sin(X2[:,[1]]).T
    A = np.where(A > 1., 1., A)  # for stability    
    
    return R * np.arccos(A)


def interpolate_pm(
        unknown_loc, known_loc, known_pm, num_neighbors):
    """
    Interpolate the PM2.5 values of unknown locations, 
    using k nearest known stations. 
    
    unkown_pm: array (batch, 1, H, W) if reshape=True,
    else shape (batch, num_unknown)
    """
    distance = comp_distance(unknown_loc, known_loc)
    distance = np.where(distance < 1e-6, 1e-6, distance)     
    bound = np.partition(
        distance, num_neighbors - 1, axis=1
    )[:, [num_neighbors - 1]]
    neighbor_mask = np.where(distance <= bound, 1., np.nan)
        
    neighbor_dist = distance * neighbor_mask
    R = 1 / neighbor_dist
    weight = R / np.nansum(R, axis=1, keepdims=True)
    weight = np.nan_to_num(weight, nan=0.)
    
    unknown_pm = known_pm @ weight.T
    return unknown_pm


def split_dataset(filepath):
    """
    Implement a 60:20:20 contiguous split.
    """
    pm_df = pd.read_csv(
        filepath, header=None, skiprows=1)
    length = len(pm_df)
    train_df = pm_df.loc[: int(0.6 * length)]
    valid_df = pm_df.loc[int(0.6 * length): int(0.8 * length)]
    test_df = pm_df.loc[int(0.8 * length):]
    return train_df, valid_df, test_df


In [5]:
data_path = ("/content/drive/MyDrive/pm2.5/"
            "data/test_data/long_pm2.5.csv")
loc_path = ("/content/drive/MyDrive/pm2.5/"
           "data/test_data/long_locations.csv")
test_mode = True
if test_mode:
    data_path = "../data/test_data/long_pm2.5.csv"
    loc_path = "../data/test_data/long_locations.csv"

# train/target/test indices
test_idx = [0,1,4,11,15,24,25,27,32,33,37,39]   
train_idx = list(set(range(40)) - set(test_idx))

# load dataset
loc_df = pd.read_csv(loc_path, header=None, skiprows=1)
train_loc = loc_df.iloc[train_idx].to_numpy()
test_loc = loc_df.iloc[test_idx].to_numpy()

pm_df = pd.read_csv(data_path, header=None, skiprows=1)
train_pm = pm_df.iloc[:, train_idx].to_numpy()
test_pm = pm_df.iloc[:, test_idx].to_numpy()

# compute loss
pred_pm = interpolate_pm(
    test_loc, train_loc, train_pm, num_neighbors=10)

rmse = np.sqrt(metrics.mean_squared_error(pred_pm, test_pm))
mae = metrics.mean_absolute_error(pred_pm, test_pm)
mape = metrics.mean_absolute_percentage_error(pred_pm, test_pm)

print(f'val loss  : {rmse:>.4f} | {mae:>.4f} | {mape:>.4f}')

val loss  : 4.6801 | 3.2924 | 0.3387
