# Get KNN neighbor predictions

Generates the predictions of the most similar viable neighbors for all dates based on saved KNN similarities (generated by knn_step_1-compute_similarities.ipynb) and saves the predictions to disk.

In [1]:
## Package loading

# Autoreload packages that are modified
%load_ext autoreload
%autoreload 2

# Plotting magic
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Load relevant packages
import numpy as np

import pandas as pd
from sklearn import *
import sys
import subprocess
from datetime import datetime, timedelta
import netCDF4
import time
from functools import partial
import os

# When the notebook is launched from a directory named "experiments", move two
# levels up so that the relative paths used below ('src/experiments',
# 'results/...') resolve. NOTE(review): presumably this lands in the repository
# root -- confirm against the project layout.
if os.path.basename(os.getcwd()) == "experiments":
    os.chdir(os.path.join("..",".."))

# Adds 'experiments' folder to path to load experiments_util
# (inserted at position 0 so it takes precedence over other entries on sys.path)
sys.path.insert(0, 'src/experiments')
# Load general utility functions
from experiments_util import *
# Load functionality for fitting and predicting
from fit_and_predict import *
# Load functionality for evaluation
from skill import *
# Load functionality for knn
from knn_util import *

## User inputs

In [2]:
# Experiment name; used below to build the results directory paths
experiment = "knn"
# The variable to be predicted
gt_id = "contest_tmp2m" # "contest_precip" or "contest_tmp2m"
# Prediction horizon
target_horizon = "56w" # "34w" or "56w"
# The number of past days that should contribute to measure of similarity
past_days = 60
# Number of days over which a measurement is aggregated
# (2 weeks to observe a complete measurement)
aggregation_days = 14
# Only use measurements available this many days prior to
# official contest submission date.
# NOTE(review): presumably chosen so that nbr_start_delta below comes out to
# one year (365 days) -- confirm against get_forecast_delta.
days_early = 365 - (aggregation_days +
                    get_forecast_delta(target_horizon, days_early = 0))
print(days_early)
# Maximum number of neighbors
max_num_nbrs = 20

## Process inputs

# Identify measurement variable name
measurement_variable = get_measurement_variable(gt_id) # 'tmp2m' or 'prate'

# Column names for ground truth, climatology and anomaly values,
# all derived from the single measurement variable name
gt_col = measurement_variable
clim_col = measurement_variable+"_clim"
anom_col = measurement_variable+"_anom" # 'tmp2m_anom' or 'prate_anom'

# nbr_start_delta = minimum number of days between start date of most recent
# neighbor to consider (aggregation_days = 2 weeks to observe complete
# measurement) and start date of target period
# (2 or 4 weeks plus days early days ahead)
nbr_start_delta = (aggregation_days +
                   get_forecast_delta(target_horizon, days_early = days_early))

323


In [3]:
#
# Load ground truth anomalies (using complete climatology)
#
anoms = get_lat_lon_date_features(anom_ids = [gt_id], first_year=get_first_year(gt_id))
# Drop unnecessary columns
anoms = anoms.loc[:,['lat','lon','start_date',gt_col,anom_col,clim_col]]
# Pivot dataframe to have one row per start date (timed)
tic()
anoms = anoms.set_index(['lat','lon','start_date']).unstack(['lat','lon'])
toc()
# Drop start dates that have no measurements (e.g., leap days, which have no climatology)
anoms = anoms.dropna(axis='index', how='all')
# Determine which neighbor start_dates are viable: the dates that survived
# the all-NaN filter above
viable_neighbors = anoms.index
# Stack anoms dataframe to have lat, lon, start_date columns (timed; every
# tic() is paired with a toc() so no timer is left dangling)
tic()
anoms = anoms.stack(['lat','lon']).reset_index()
toc()

Getting contest_tmp2m_shiftNone with anomalies
Elapsed: 10.837227821350098s
Elapsed time: 9.075320 seconds.



In [4]:
# Load the cached viable similarities from the shared results directory
cache_dir = os.path.join("results", experiment, "shared")
# File name encodes the inputs the similarities were computed for
similarities_name = 'viable_similarities-{}-{}-days{}-early{}.h5'.format(
    gt_id, target_horizon, past_days, days_early)
viable_similarities_file = os.path.join(cache_dir, similarities_name)
print("Reading viable similarities from ", viable_similarities_file)
# Time the read
tic()
viable_similarities = pd.read_hdf(viable_similarities_file)
toc()

Reading viable similarities from  results/knn/shared/viable_similarities-contest_tmp2m-56w-days60-early323.h5
Elapsed time: 0.609106 seconds.



## Form and save neighbor predictions

In [6]:
#
# Form predictions
#

# Output column layout: grid point, target date, then one column per neighbor
# rank (knn1 = most similar neighbor, ..., knn<max_num_nbrs> = least similar)
knn_cols = ['knn'+str(i+1) for i in range(max_num_nbrs)]
pred_cols = ['lat','lon','start_date'] + knn_cols

# Target dates are dates for which viable similarities are not all NaN
all_target_dates = viable_similarities.loc[~viable_similarities.isnull().all(axis=1)].index

# Pivot the anomalies once, outside the loop: one row per (lat, lon) grid point
# and one column per start_date. The per-target work then reduces to selecting
# the neighbor columns instead of re-filtering and re-pivoting the full
# long-format frame for every target date.
anoms_wide = anoms.pivot_table(index=['lat','lon'], columns='start_date', values=anom_col)

# Per-target prediction frames are collected in a list and concatenated once
# at the end; growing a DataFrame with .append inside the loop is quadratic
# and DataFrame.append no longer exists in recent pandas releases.
pred_frames = []

# Process each target date
for target in all_target_dates:
    # Find the neighbors, ordered from most to least similar
    nbrs = get_target_neighbors(
        target, target_horizon, gt_id,
        nbr_start_delta, past_days, viable_similarities, False,
        False)[0:max_num_nbrs]

    # Skip targets that do not have a full set of neighbors
    if nbrs.size != max_num_nbrs:
        continue

    # Get predictions of each neighbor, with columns in order of most similar
    # to least similar neighbor. Grid points with no anomaly for any of the
    # selected neighbors are dropped, as a per-target pivot_table would do.
    nbr_preds_wide = anoms_wide.loc[:, nbrs.tolist()].dropna(axis='index', how='all')
    nbr_preds_wide.columns = knn_cols
    nbr_preds_wide = nbr_preds_wide.reset_index()

    # Associate with target date
    nbr_preds_wide['start_date'] = target

    # Store predictions with a fixed column order so that all frames align
    pred_frames.append(nbr_preds_wide[pred_cols])

    # Progress indicator: report once per year
    if target.month == 1 and target.day == 1:
        print(target)

# Combine all targets; fall back to an empty frame with the expected columns
# when no target had enough neighbors (pd.concat rejects an empty list)
if pred_frames:
    preds = pd.concat(pred_frames)
else:
    preds = pd.DataFrame(columns = pred_cols)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


1981-01-01 00:00:00
1982-01-01 00:00:00
1983-01-01 00:00:00
1984-01-01 00:00:00
1985-01-01 00:00:00
1986-01-01 00:00:00
1987-01-01 00:00:00
1988-01-01 00:00:00
1989-01-01 00:00:00
1990-01-01 00:00:00
1991-01-01 00:00:00
1992-01-01 00:00:00
1993-01-01 00:00:00
1994-01-01 00:00:00
1995-01-01 00:00:00
1996-01-01 00:00:00
1997-01-01 00:00:00
1998-01-01 00:00:00
1999-01-01 00:00:00
2000-01-01 00:00:00
2001-01-01 00:00:00
2002-01-01 00:00:00
2003-01-01 00:00:00
2004-01-01 00:00:00
2005-01-01 00:00:00
2006-01-01 00:00:00
2007-01-01 00:00:00
2008-01-01 00:00:00
2009-01-01 00:00:00
2010-01-01 00:00:00
2011-01-01 00:00:00
2012-01-01 00:00:00
2013-01-01 00:00:00
2014-01-01 00:00:00
2015-01-01 00:00:00
2016-01-01 00:00:00
2017-01-01 00:00:00
2018-01-01 00:00:00


In [8]:
# Write the neighbor predictions to the experiment's results directory
cache_dir = os.path.join('results', experiment)
# Create the output directory when it is missing
os.makedirs(cache_dir, exist_ok=True)
# File name encodes the inputs the predictions were generated for
preds_name = 'knn-{}-{}-days{}-early{}-maxnbrs{}.h5'.format(
    gt_id, target_horizon, past_days, days_early, max_num_nbrs)
preds_file = os.path.join(cache_dir, preds_name)
print("Saving predictions to ", preds_file)
# Time the write
tic()
preds.to_hdf(preds_file, key="data", mode="w")
toc()

Saving predictions to  results/knn/knn-contest_tmp2m-56w-days60-early323-maxnbrs20.h5
Elapsed time: 15.193359 seconds.

