# Compute KNN similarities

Computes similarities between each pair of dates based on how skillfully the history of one date predicts the history of the other.

In [1]:
## Package loading

# Autoreload packages that are modified
%load_ext autoreload
%autoreload 2

# Plotting magic
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Load relevant packages
import numpy as np
import pandas as pd
from sklearn import *
import sys
import subprocess
from datetime import datetime, timedelta
import netCDF4
import time
from functools import partial
import os

# When the notebook is launched from inside an "experiments" directory, move
# two levels up so that the relative paths used below ('src/experiments',
# 'results') resolve.
# NOTE(review): presumably two levels up is the repository root — confirm.
if os.path.basename(os.getcwd()) == "experiments":
    os.chdir(os.path.join("..",".."))

# Adds 'experiments' folder to path to load experiments_util
sys.path.insert(0, 'src/experiments')
# NOTE(review): the star imports below hide where helper names used later in
# this notebook come from; explicit imports would make the dependencies visible.
# Load general utility functions
from experiments_util import *
# Load functionality for fitting and predicting
from fit_and_predict import *
# Load functionality for evaluation
from skill import *

## Prepare experimental results directory structure

# Set hindcast_year to None to obtain forecasts and to a specific year to obtain hindcasts
hindcast_year = None

# Choose the name of this experiment; hindcast runs get a year-specific name
# so their cached files live in a separate results directory
experiment = "knn"
if hindcast_year is not None:
    experiment = "knn-hindcast_{}".format(hindcast_year) ### For hindcasts

# Name of cache directory for storing non-submission-date specific
# intermediate files
cache_dir = os.path.join('results', experiment, 'shared')
# Create cache_dir (and any missing parents); exist_ok=True makes this a no-op
# when the directory already exists and avoids a separate check-then-create step
os.makedirs(cache_dir, exist_ok=True)

## Select target variable

In [2]:
# the variable to be predicted (switch the string to change the target)
gt_id = "contest_tmp2m" # "contest_precip" or "contest_tmp2m"

## Process inputs

# Identify measurement variable name
measurement_variable = get_measurement_variable(gt_id) # 'tmp2m' or 'prate'

# column names for gt_col, clim_col and anom_col, all derived from the same
# measurement variable name so the three stay in sync
gt_col = measurement_variable
clim_col = measurement_variable+"_clim"
anom_col = measurement_variable+"_anom" # 'tmp2m_anom' or 'prate_anom'

## Compute ground truth cosine similarities between pairs of dates

In [3]:
# Build `anoms`, a long-format frame with lat, lon, start_date and an anomaly
# column named anom_col. Two paths, selected by the experiment name:
#   "knn"         -> request anomalies directly (anom_ids)
#   anything else -> hindcast: load raw ground truth (gt_ids) and recompute
#                    anomalies against a climatology adjusted for the holdout year
if experiment == "knn":
    # Non-hindcast version
    # Load ground truth anomalies
    anoms = get_lat_lon_date_features(anom_ids = [gt_id], first_year=get_first_year(gt_id))
else:
    # Hindcast version
    tic()
    # Load ground truth data
    anoms = get_lat_lon_date_features(gt_ids = [gt_id], first_year=get_first_year(gt_id))
    # Load ground truth data climatology
    climatology = get_climatology(gt_id)
    # Identify ground truth data from this hold out year
    # (holdout window: April 18 of hindcast_year through April 17 of the next year, inclusive)
    first_holdout_date = datetime(month=4, day=18, year=hindcast_year)
    last_holdout_date = datetime(month=4, day=17, year=hindcast_year+1)
    # Same helper call that produced measurement_variable / gt_col earlier
    gt_col = get_measurement_variable(gt_id)
    holdout = anoms.loc[(anoms.start_date >= first_holdout_date)
                        &(anoms.start_date <= last_holdout_date), 
                        ['lat','lon','start_date',gt_col]]
    # Merge the hindcast year ground truth data into climatology dataframe
    # Keys are passed as arrays, so rows are matched on (lat, lon, calendar month,
    # calendar day) irrespective of year. With suffixes=('', '_clim') an
    # overlapping gt_col keeps its name on the holdout side and becomes
    # gt_col+"_clim" on the climatology side.
    # NOTE(review): presumably climatology carries a gt_col column — confirm.
    climatology = pd.merge(
        holdout[[gt_col]], climatology,
        left_on=[holdout.lat, holdout.lon, holdout.start_date.dt.month,
                 holdout.start_date.dt.day],
        right_on=[climatology.lat, climatology.lon,
                  climatology.start_date.dt.month,
                  climatology.start_date.dt.day],
        how='left', suffixes=('', '_clim'))
    clim_col = gt_col+"_clim"
    # Remove the influence of hindcast year from 30-year climatology average
    # i.e. (mean * 30 - holdout value) / 29.
    # NOTE(review): assumes the climatology is a plain mean over 30 years that
    # includes the hindcast year — confirm for the chosen hindcast_year.
    years_in_clim = 30
    climatology[clim_col] = (climatology[clim_col]*years_in_clim - climatology[gt_col])/(years_in_clim-1)
    # Merge modified climatology into dataset
    # Left keys mix column labels ('lat', 'lon') with arrays (month, day); the
    # right frame contributes only clim_col.
    anoms = pd.merge(anoms, climatology[[clim_col]],
                      left_on=['lat', 'lon', anoms.start_date.dt.month,
                               anoms.start_date.dt.day],
                      right_on=[climatology.lat, climatology.lon,
                                climatology.start_date.dt.month,
                                climatology.start_date.dt.day],
                      how='left', suffixes=('', '_clim'))
    # Compute ground-truth anomalies using new climatology
    anom_col = gt_col+"_anom"
    anoms[anom_col] = anoms[gt_col] - anoms[clim_col]
    toc()
    
# Keep only the grid-point coordinates, the date and the anomaly value
anoms = anoms[['lat', 'lon', 'start_date', anom_col]]
# Reshape to wide form: one row per start_date, one column per (lat, lon) pair
tic()
anoms = anoms.set_index(['lat', 'lon', 'start_date'])
anoms = anoms.unstack(level=['lat', 'lon'])
toc()
# Discard start dates with no measurement at any grid point
# (e.g., leap days, which have no climatology)
anoms = anoms.dropna(axis=0, how='all')
# Scale every start_date's row to unit Euclidean length
tic()
norms = np.sqrt((anoms * anoms).sum(axis='columns'))
anoms = anoms.div(norms, axis='index')
toc()
# With unit-length rows, the matrix of all pairwise inner products is the
# cosine similarity between each pair of dates
tic()
gt_cosines = anoms.dot(anoms.T)
toc()

Getting contest_tmp2m_shiftNone with anomalies
Elapsed: 11.048771619796753s
Elapsed time: 3.671401 seconds.

Elapsed time: 0.152220 seconds.

Elapsed time: 6.189561 seconds.



## Define similarity measure

In [4]:
# Each date is represented by its past_days most recent observed measurements (i.e., 
# the past_days most recent measurements at least start_delta days before the date).
# The similarity of two dates is the average cosine similarity of their past_days
# associated measurements.

# The number of past days that should contribute to the measure of similarity
past_days = 60

## Compute similarity measure between pairs of target dates assuming start_delta = 0
That is, assuming that we have access to the ground truth measurement with start date equal to the target date.
Later we will shift by start_delta.

In [6]:
# Compute (or load from cache) similarities0: for every pair of dates, the
# average of the gt_cosines entries over the past_days most recent date pairs.
# NOTE: regen_similarities0 = True forces recomputation even if a cached file exists
regen_similarities0 = True
similarities0_file = os.path.join(
    cache_dir,'similarities0-{}-days{}.h5'.format(gt_id,past_days))
if regen_similarities0 or not os.path.isfile(similarities0_file):
    # Initially incorporate unshifted cosine similarities 
    # (representing the cosine similarity of the first past day)
    tic()
    similarities0 = gt_cosines.copy()
    toc()

    # Now, for each remaining past day, sum over additionally shifted measurements
    # NOTE: this has the effect of ignoring (i.e., skipping over) dates that don't 
    # exist in gt_cosines like leap days
    # The shifts are positional (no freq), so entry (i, j) receives the value at
    # (i-m, j-m); the first m rows/columns of each shifted frame are NaN, which
    # leaves the first past_days-1 rows/columns of the sum as NaN.
    tic()
    for m in range(1,past_days):
        similarities0 += gt_cosines.shift(m, axis='rows').shift(m, axis='columns')
        sys.stdout.write(str(m)+' ')
    toc()

    # Normalize similarities by number of past days
    similarities0 /= past_days
    # Write similarities0 to file
    print("Saving similarities0 to ", similarities0_file); tic()
    similarities0.to_hdf(similarities0_file, key="data", mode="w"); toc()
else:
    # Read base similarities from disk
    # (tic() is a separate statement so the timer starts after the message and
    # its return value is not printed as part of the message)
    print("Reading similarities0 from ", similarities0_file); tic()
    similarities0 = pd.read_hdf(similarities0_file); toc()

Elapsed time: 3.593677 seconds.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 Elapsed time: 100.443004 seconds.

Saving similarities0 to  results/knn/shared/similarities0-contest_tmp2m-days60.h5 None
Elapsed time: 24.187716 seconds.



## Define prediction horizon

In [7]:
# Prediction horizon
target_horizon = "56w" # "34w" or "56w"

# Number of days needed to observe a complete measurement (2-week aggregation).
# Defined first so both quantities below share the same value instead of
# repeating the literal.
aggregation_days = 14

# Only use measurements available this many days prior to 
# official contest submission date
# NOTE(review): presumably chosen so that start_delta below comes out to 365
# days — confirm against get_forecast_delta.
days_early = 365 - (aggregation_days + get_forecast_delta(target_horizon, days_early = 0))

## Process inputs

# Number of days between start date of most recently observed measurement
# (2 weeks to observe complete measurement) and start date of target period 
# (2 or 4 weeks plus days early days ahead)
start_delta = (aggregation_days + 
               get_forecast_delta(target_horizon, days_early = days_early))

## Shift similarities by start_delta
The rows and columns of similarities represent target dates, and the similarities are now based on ground truth measurements from start_delta days prior to each target date.

In [8]:
# Measurements only become available start_delta days before a target date, so
# move the labels of both axes forward by start_delta days (freq='D' shifts the
# index labels rather than the data, extending the index accordingly).
# NOTE: a column-wise shift does not extend the column index, hence the
# shift-rows / transpose / shift-rows sequence
tic()
similarities = (
    similarities0
    .shift(periods=start_delta, freq='D', axis='rows')
    .T
    .shift(periods=start_delta, freq='D', axis='rows')
)
toc()
# Index extension has the side effect of creating leap days (e.g., 2012-02-29) and removing 
# the date start_delta days later (e.g., datetime.date(2012,2,29) + timedelta(start_delta))
# Add one day to each date in the range [datetime.date(2012,2,29), 
# datetime.date(2012,2,29) + timedelta(start_delta)) to remove leap days
def fix_date(date, delta=None):
    """Return `date` moved forward one day if it lies in a leap-day-affected window.

    The affected window is the half-open interval
    [Feb 29 of leap year Y, Feb 29 of Y + delta days), where Y is the year of
    `date` if that is a leap year, otherwise the previous year if that one is.
    Dates outside such a window are returned unchanged.

    Args:
        date: pandas Timestamp (uses `.is_leap_year` and `.replace`); expected
            to be a midnight-normalized daily timestamp.
        delta: window length in days. Defaults to the notebook-level
            `start_delta`, so existing one-argument calls behave as before.

    Returns:
        `date + 1 day` if `date` is inside the affected window, else `date`.
    """
    if delta is None:
        # Resolve at call time so later changes to start_delta are picked up
        delta = start_delta
    if date.is_leap_year:
        # The window may start in this date's own year
        leap_year = date.year
    elif date.replace(year=date.year-1).is_leap_year:
        # The window may have started in the prior year
        # (replace is safe here: a date in a non-leap year is never Feb 29)
        leap_year = date.year - 1
    else:
        # Only modify leap year dates and dates following leap year
        return date
    window_start = pd.Timestamp(year=leap_year, month=2, day=29)
    window_end = window_start + timedelta(days=int(delta))
    # Constant-time interval test instead of materialising `delta` dates per call
    if window_start <= date < window_end:
        return date + timedelta(1)
    return date
# Compute the corrected label for every (shifted) target date
tic()
new_index = [fix_date(date) for date in similarities.index]
toc()
# Relabel BOTH axes with the corrected dates. Plain assignment renames the
# labels while leaving the data in place. (reindex() would instead select rows
# by label: it would drop the leap-day row, insert an all-NaN row for the
# restored date, and leave rows inconsistent with the relabelled columns.)
tic()
similarities.index = new_index
similarities.columns = new_index
toc()

Elapsed time: 4.042113 seconds.

Elapsed time: 2.154238 seconds.

Elapsed time: 0.710397 seconds.



## Restrict similarities to viable neighbors
Viable neighbors are those with available ground truth data (as evidenced by anoms or gt_cosines)

In [10]:
# Reuse previously stored viable similarities unless regeneration is forced
regen_viable_similarities = True
viable_similarities_file = os.path.join(
    cache_dir,
    'viable_similarities-{}-{}-days{}-early{}.h5'.format(
        gt_id, target_horizon, past_days, days_early))
if not regen_viable_similarities and os.path.isfile(viable_similarities_file):
    # Load the cached result from disk
    print("Reading viable similarities from ", viable_similarities_file)
    tic()
    viable_similarities = pd.read_hdf(viable_similarities_file)
    toc()
else:
    # Keep only rows whose date also appears in gt_cosines' index,
    # i.e. dates for which ground truth is available
    has_ground_truth = similarities.index.isin(gt_cosines.index)
    viable_similarities = similarities.loc[has_ground_truth]
    print("Saving viable_similarities to ", viable_similarities_file)
    tic()
    viable_similarities.to_hdf(viable_similarities_file, key="data", mode="w")
    toc()

Saving viable_similarities to  results/knn/shared/viable_similarities-contest_tmp2m-56w-days60-early323.h5
Elapsed time: 25.193596 seconds.

