### import libraries

In [1]:
#
# basic libraries
#
import numpy as np
import pandas as pd
from sklearn.covariance import EmpiricalCovariance, MinCovDet
import sklearn.covariance
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import math
# plotting stuff
import matplotlib.pyplot as plt
import matplotlib as mpl
import collections
import seaborn as sb
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 8, 6
from timeit import default_timer as timer
import warnings

# Function Definitions here

In [2]:
def find_ALL_location_NNs(future_df, analog_pool, numeric_feat, NN_count):
    local_sites = future_df.location.unique()
    #
    # initiate dataframes to speed up
    # 
    row_count = future_df.shape[0]
    needed_col_count = NN_count * 2

    all_NNs_df   = future_df[['year', 'location']].copy() # data frame containing (year, location)
    all_dists_df = future_df[['year', 'location']].copy() # data frame containing distances


    ## concatenate new data frame to above ones, to speed up
    NNs_new_cols, dists_new_cols = create_colnames(NN_count)

    NNs_df_help = pd.DataFrame('-999', index = all_NNs_df.index,  columns = NNs_new_cols)
    dists_df_help = pd.DataFrame('-999', index = all_dists_df.index,  columns = dists_new_cols)

    all_NNs_df = pd.concat([all_NNs_df, NNs_df_help], axis=1)
    all_dists_df = pd.concat([all_dists_df, dists_df_help], axis=1)

    del(NNs_df_help, dists_df_help, NNs_new_cols, dists_new_cols)

    for loc in local_sites:
        # pick up one location data
        curr_loc_df = future_df[future_df.location == loc].copy()
        complete_hist_df = analog_pool.copy()
        
        output = find_1_location_NNs_builtin(curr_location_df = curr_loc_df, 
                                             complete_hist_df = analog_pool, 
                                             numeric_feat=numeric_feat, 
                                             NN_count=NN_count)
        
        all_NNs_df[all_NNs_df.index.isin(list(output[0].index))] = output[0]
        all_dists_df[all_NNs_df.index.isin(list(output[1].index))] = output[1]
        del(output)
    return(all_NNs_df, all_dists_df)

def find_1_location_NNs_builtin(curr_location_df, complete_hist_df, numeric_feat, NN_count):
    future_yr_count = curr_location_df.shape[0]
    needed_col_count = NN_count * 2
    
    NNs_df   = curr_location_df[['year', 'location']].copy() # data frame containing (year, location)
    dists_df = curr_location_df[['year', 'location']].copy() # data frame containing distances
    
    ## concatenate new data frame to above ones, to speed up
    NNs_df_new_cols, dists_df_new_cols = create_colnames(NN_count)
    
    NNs_df_helper = pd.DataFrame('-999', index=NNs_df.index,  columns=NNs_df_new_cols)
    dists_df_helper = pd.DataFrame('-999', index=dists_df.index,  columns=dists_df_new_cols)
    
    NNs_df = pd.concat([NNs_df, NNs_df_helper], axis=1)
    dists_df = pd.concat([dists_df, dists_df_helper], axis=1)
    
    del(NNs_df_helper, dists_df_helper, NNs_df_new_cols, dists_df_new_cols)
    
    
    # form the ICV to compute its covariance to remove inter-annual variability
    ICV = complete_hist_df.copy()
    ICV = ICV.loc[ICV['location'] == curr_location_df.location.unique()[0]] # filter corresponding location
    #############################################################################
    #
    #          Normalize before doing anything
    #
    #############################################################################
    ICV_means = ICV.loc[:, numeric_feat].mean()
    ICV_stds = ICV.loc[:, numeric_feat].std()
    ICV_stds[ICV_stds.le(10**(-10))] = 1
    
    ICV = (ICV.loc[:, numeric_feat] - ICV_means) / ICV_stds
    curr_location_df.loc[:, numeric_feat] = (curr_location_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
    complete_hist_df.loc[:, numeric_feat] = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
    #
    # pick numerical part of the data frame to do the operations:
    #
    complete_hist_df_numeric = complete_hist_df.loc[:, numeric_feat].copy()
    future_numeric = curr_location_df.loc[:, numeric_feat].copy()
    ICV = ICV.loc[:, numeric_feat]
    
    ### Apply PCA here and use those to find analogs
    pca = PCA(n_components = detect_effective_compon(ICV))
    pca.fit(ICV);
    #
    # transform data into PCA space to compute analogs
    ICV_pca = pca.transform(ICV)
    hist_pca = pca.transform(complete_hist_df_numeric)
    future_pca = pca.transform(future_numeric)

    # the following is the same as [(1/N) * np.matmul(M.transpose(), M)]. which is not even divided by N-1
    cov = sklearn.covariance.empirical_covariance(ICV_pca, assume_centered=False)
    
    # there is no difference between the following line and adding metric_params={'V': cov} to it
    neigh = NearestNeighbors(n_neighbors=NN_count, metric = "mahalanobis", algorithm="brute")
    neigh.fit(hist_pca);
    for yr in np.arange(curr_location_df.shape[0]):
        result = neigh.kneighbors([future_pca[yr, ]])

        NNs_distances = result[0][0]
        NNs_idx = result[1][0]

        # find and reshape the NNs
        # reshape the nearest neighbros from long to wide, so, every other column is (year, location) of ith NN
        #
        curr_NNs = complete_hist_df.loc[NNs_idx, ['year', 'location']].copy()
        curr_NNs = list(np.hstack(np.split(curr_NNs, NN_count))[0])
        NNs_df.iloc[yr, 2:] = curr_NNs
        
        dists_df.iloc[yr, 2:] = NNs_distances
    return(NNs_df, dists_df)

def filter_locations(all_dt, local_dt):
    # list of unique locations in the data
    local_sites = local_dt.location.unique()
    all_sites = all_dt.location.unique()

    # find the local sites that exist in all_usa_data
    local_sites = np.intersect1d(local_sites, all_sites)

    # select the rows corresponding to existing sites
    local_dt = local_dt.loc[local_dt['location'].isin(local_sites)]
    return (local_dt)

def detect_effective_compon(matriks):
    n_comp = matriks.shape[1]
    pca = PCA(n_components = n_comp)
    pca.fit(matriks)
    return (len(pca.explained_variance_[pca.explained_variance_ > 0.01])) 

def create_colnames(NN_count):
    year_loc_cols = pd.Series(['year_NN_', 'location_NN_'] * NN_count)
    numbers = pd.Series(np.arange(1, NN_count+1).repeat(2))
    year_loc_cols = year_loc_cols.astype(str) + numbers.astype(str)
    year_loc_cols = list(year_loc_cols)
    
    dist_cols = pd.Series(['dist_NN_'] * NN_count)
    dist_cols = list(pd.Series(['dist_NN_'] * NN_count) + pd.Series(np.arange(1, NN_count+1)).astype(str))
    return (year_loc_cols, dist_cols)

### Define directories

In [3]:
in_dir = "/Users/hn/Desktop/Desktop/Kirti/check_point/analogs/"
out_dir = "/Users/hn/Desktop/"

## Determine needed columns
If we want to drop some columns like `Gen_4`, `preci`:

In [4]:
gen_3_drop = False
gen_4_drop = False
precip_drop = False

In [5]:
numeric_feat = ['medianDoY', 'NumLarvaGens_Aug', 
                'mean_escaped_Gen1', 'mean_escaped_Gen2', 'mean_escaped_Gen3', 'mean_escaped_Gen4', 
                'mean_precip', 'mean_gdd']

non_numeric_feat = ['year', 'location', 'ClimateScenario']

if gen_3_drop == True:
    numeric_feat.remove('mean_escaped_Gen3')

if gen_4_drop == True:
    numeric_feat.remove('mean_escaped_Gen4')

if precip_drop == True:
    numeric_feat.remove('mean_precip')

print(numeric_feat)

['medianDoY', 'NumLarvaGens_Aug', 'mean_escaped_Gen1', 'mean_escaped_Gen2', 'mean_escaped_Gen3', 'mean_escaped_Gen4', 'mean_precip', 'mean_gdd']


### Read data

In [6]:
hist_orig = pd.read_csv(in_dir + "all_data_usa.csv")
hist_orig = hist_orig.loc[:, non_numeric_feat + numeric_feat] # drop unwanted columns

In [7]:
future_orig = pd.read_csv(in_dir + "averaged_data_rcp45.csv")
future_orig = future_orig.loc[:, non_numeric_feat + numeric_feat] # drop unwanted columns

In [8]:
future_orig = future_orig.iloc[0:1000].copy()

# Driver


#### Filter the locations
Some locations in local data are not in all USA. So, here we choose the local (future) data in whose
sites do exist in all_usa_data

In [16]:
future_orig = filter_locations(hist_orig, future_orig)

In [17]:
# pick up one location data
# curr_location_df = future_orig[future_orig.location == future_orig.location.iloc[0]].copy()
# complete_hist_df = hist_orig.copy()
# curr_location_df.head(2)

In [18]:
# a, b = find_ALL_location_NNs(future_df=future_orig, analog_pool=hist_orig, numeric_feat=numeric_feat, NN_count=10)

In [19]:
future_df = future_orig.copy()
analog_pool = hist_orig.copy()
complete_hist_df = analog_pool.copy()
local_sites = future_df.location.unique()

In [20]:
loc = local_sites[0]
curr_loc_df_0 = future_df[future_df.location == loc].copy()

In [21]:
loc = "43.59375_-116.84375"
curr_loc_df = future_df[future_df.location == loc].copy()

In [22]:
curr_loc_df.head()

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
2,2026,43.59375_-116.84375,ensembe_mean,92,2.354509,23.614348,28.257612,3.129602,0.03505,250.316667,3655.09832
297,2027,43.59375_-116.84375,ensembe_mean,92,2.458505,26.068053,31.351712,3.084039,0.0,237.625,3743.843369
592,2028,43.59375_-116.84375,ensembe_mean,100,2.283814,28.689409,27.379969,1.420594,0.0,289.2625,3591.516519
887,2029,43.59375_-116.84375,ensembe_mean,98,2.62721,23.735109,34.692865,5.595354,0.006759,241.75,3958.690529


In [23]:
curr_loc_df_0.head()

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
1,2026,43.59375_-116.78125,ensembe_mean,92,2.336201,24.035184,27.50817,2.912981,0.027137,260.75,3623.723188
296,2027,43.59375_-116.78125,ensembe_mean,93,2.428946,26.711047,30.58016,2.788,0.0,245.4,3713.421219
591,2028,43.59375_-116.78125,ensembe_mean,100,2.25723,29.324188,26.550554,1.229009,0.0,299.3,3561.228505
886,2029,43.59375_-116.78125,ensembe_mean,99,2.60442,24.263662,34.112987,5.307106,0.004458,250.379167,3933.708741


In [24]:
ICV_0 = complete_hist_df.copy()
ICV_0 = ICV_0.loc[ICV_0['location'] == curr_loc_df_0.location.unique()[0]] # filter corresponding location
ICV_means_0 = ICV_0.loc[:, numeric_feat].mean()
ICV_stds_0 = ICV_0.loc[:, numeric_feat].std()
ICV_stds_0[ICV_stds_0.le(10**(-10))] = 1
ICV_0 = (ICV_0.loc[:, numeric_feat] - ICV_means_0) / ICV_stds_0

In [25]:
ICV = complete_hist_df.copy()
ICV = ICV.loc[ICV['location'] == curr_loc_df.location.unique()[0]] # filter corresponding location
ICV_means = ICV.loc[:, numeric_feat].mean()
ICV_stds = ICV.loc[:, numeric_feat].std()
ICV_stds[ICV_stds.le(10**(-10))] = 1
ICV = (ICV.loc[:, numeric_feat] - ICV_means) / ICV_stds

In [27]:
ICV.head(3)

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
851,0.409134,0.023446,0.057804,0.052812,-0.471743,0.0,0.420734,0.314193
2144,0.110359,-0.441195,0.024538,-0.616615,-0.858829,0.0,1.143168,-0.869296
3437,0.010767,-0.40645,0.482279,-0.851704,-0.761663,0.0,1.822352,-0.628517


In [28]:
ICV_0.head(3)

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
850,0.448902,0.069532,0.081023,0.036207,-0.493531,0.0,0.349742,0.308051
2143,0.154061,-0.357316,0.035862,-0.588812,-0.786956,0.0,1.234399,-0.85389
3436,0.154061,-0.340402,0.509677,-0.884514,-0.730017,0.0,1.708604,-0.610098


In [29]:
curr_loc_df_0.loc[:, numeric_feat] = (curr_loc_df_0.loc[:, numeric_feat] - ICV_means_0) / ICV_stds_0

In [31]:
curr_loc_df.loc[:, numeric_feat] = (curr_loc_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

In [32]:
curr_loc_df_0.head(2)

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
1,2026,43.59375_-116.78125,ensembe_mean,-1.811546,2.015791,-2.213596,1.28234,5.814381,0.027137,0.120708,1.907442
296,2027,43.59375_-116.78125,ensembe_mean,-1.713266,2.699969,-1.720775,1.741583,5.530739,0.0,-0.12092,2.279523


In [33]:
curr_loc_df.head(2)

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
2,2026,43.59375_-116.84375,ensembe_mean,-1.781887,2.06608,-2.118405,1.221179,5.557634,0.03505,0.12506,1.87734
297,2027,43.59375_-116.84375,ensembe_mean,-1.781887,2.856216,-1.678922,1.672883,5.463818,0.0,-0.078242,2.245446


In [35]:
complete_hist_df_0 = complete_hist_df.copy()

In [37]:
complete_hist_df_0.loc[:, numeric_feat]

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
0,68,2.939262,1.319697,7.080458,1.348167,0.0,300.525,4821.668922
1,68,2.94605,1.245847,7.104994,1.39698,0.0,303.925,4831.770062
2,62,3.563884,0.568308,10.111412,7.650414,0.63345,176.5,6068.648203


In [38]:
complete_hist_df_0.loc[:, numeric_feat]=(complete_hist_df_0.loc[:, numeric_feat] - ICV_means_0) / ICV_stds_0
complete_hist_df.loc[:, numeric_feat] = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

In [39]:
complete_hist_df_0.head(2)

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
0,1979,32.46875_-109.90625,observed,-4.170276,6.464557,-6.397165,-1.771473,2.263058,0.0,0.746815,6.876706
1,1979,32.46875_-109.96875,observed,-4.170276,6.514632,-6.410766,-1.767805,2.373839,0.0,0.800335,6.918607


In [40]:
complete_hist_df.head(2)

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
0,1979,32.46875_-109.90625,observed,-4.172091,6.508868,-6.111602,-1.870443,1.889636,0.0,0.929321,6.716166
1,1979,32.46875_-109.96875,observed,-4.172091,6.560442,-6.124829,-1.866861,1.990142,0.0,0.983784,6.758064


In [41]:
complete_hist_df_numeric_0 = complete_hist_df_0.loc[:, numeric_feat].copy()
complete_hist_df_numeric_0.head()

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
0,-4.170276,6.464557,-6.397165,-1.771473,2.263058,0.0,0.746815,6.876706
1,-4.170276,6.514632,-6.410766,-1.767805,2.373839,0.0,0.800335,6.918607
2,-4.759958,11.072382,-6.53555,-1.318365,16.565921,0.63345,-1.205491,12.049369
3,-2.106388,1.007092,-5.723346,-1.718302,-0.553012,0.0,38.060599,2.176845
4,-1.811546,0.207762,-5.469583,-1.953759,-0.779314,0.0,37.432524,1.021552


In [42]:
complete_hist_df_numeric = complete_hist_df.loc[:, numeric_feat].copy()
complete_hist_df_numeric.head(3)

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
0,-4.172091,6.508868,-6.111602,-1.870443,1.889636,0.0,0.929321,6.716166
1,-4.172091,6.560442,-6.124829,-1.866861,1.990142,0.0,0.983784,6.758064
2,-4.769642,11.254568,-6.246183,-1.427959,14.866047,0.63345,-1.057372,11.888519


In [43]:
future_numeric_0 = curr_loc_df_0.loc[:, numeric_feat].copy()
future_numeric_0.head(3)

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
1,-1.811546,2.015791,-2.213596,1.28234,5.814381,0.027137,0.120708,1.907442
296,-1.713266,2.699969,-1.720775,1.741583,5.530739,0.0,-0.12092,2.279523
591,-1.025303,1.433219,-1.239506,1.139182,1.99263,0.0,0.727532,1.648204


In [45]:
future_numeric = curr_loc_df.loc[:, numeric_feat].copy()
future_numeric.head(3)

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
2,-1.781887,2.06608,-2.118405,1.221179,5.557634,0.03505,0.12506,1.87734
297,-1.781887,2.856216,-1.678922,1.672883,5.463818,0.0,-0.078242,2.245446
592,-0.985152,1.528966,-1.20941,1.093054,2.038762,0.0,0.748913,1.613609


In [46]:
ICV_0 = ICV_0.loc[:, numeric_feat]
ICV_0.head()

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
850,0.448902,0.069532,0.081023,0.036207,-0.493531,0.0,0.349742,0.308051
2143,0.154061,-0.357316,0.035862,-0.588812,-0.786956,0.0,1.234399,-0.85389
3436,0.154061,-0.340402,0.509677,-0.884514,-0.730017,0.0,1.708604,-0.610098
4729,1.038585,-0.587205,1.160177,-1.152667,-0.782422,0.0,0.948302,-1.461776
6022,0.940304,-0.717674,0.872488,-1.168711,-0.796589,0.0,2.435061,-1.22601


In [47]:
ICV = ICV.loc[:, numeric_feat]
ICV.head()

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
851,0.409134,0.023446,0.057804,0.052812,-0.471743,0.0,0.420734,0.314193
2144,0.110359,-0.441195,0.024538,-0.616615,-0.858829,0.0,1.143168,-0.869296
3437,0.010767,-0.40645,0.482279,-0.851704,-0.761663,0.0,1.822352,-0.628517
4730,1.106277,-0.634346,1.204438,-1.152141,-0.845444,0.0,0.911701,-1.490716
6023,0.907093,-0.673209,0.866328,-1.13305,-0.870428,0.0,2.456684,-1.166166


In [50]:
pca_0 = PCA(n_components = detect_effective_compon(ICV_0))

In [51]:
pca = PCA(n_components = detect_effective_compon(ICV))

In [54]:
pca.fit(ICV_0);
pca.fit(ICV);

In [55]:
ICV_pca_0 = pca.transform(ICV_0)

In [56]:
ICV_pca = pca.transform(ICV)

In [62]:
hist_pca_0 = pca.transform(complete_hist_df_numeric_0)
hist_pca_0

array([[ 9.29893166, -3.50097669,  1.2753332 , ...,  2.48857566,
         2.3119056 , -6.60306711],
       [ 9.37354183, -3.55696931,  1.3432269 , ...,  2.43623368,
         2.33365802, -6.62283823],
       [19.80328452, -2.64987556,  6.54827063, ..., -5.93848186,
         6.31968076, -8.64550553],
       ...,
       [-6.81868953,  1.85251941,  3.02622422, ..., -4.88010935,
        -1.03413715,  3.90902293],
       [-1.43025983,  1.0666877 ,  4.13384865, ..., -4.58692341,
        -1.68485515,  6.03219722],
       [-2.1048631 ,  1.34183268,  3.98584765, ..., -4.45541463,
        -1.65284898,  5.69401947]])

In [63]:
hist_pca = pca.transform(complete_hist_df_numeric)
hist_pca

array([[ 8.90987347, -3.61255767,  1.32525239, ...,  2.5149001 ,
         2.14351143, -6.61119338],
       [ 8.98064648, -3.66890263,  1.39269069, ...,  2.46909956,
         2.16194205, -6.63187403],
       [18.95414473, -2.67459743,  6.44407585, ..., -5.05099305,
         5.75519999, -8.7702177 ],
       ...,
       [-7.0815901 ,  1.70023786,  2.89782493, ..., -4.75426348,
        -1.00749332,  3.87767834],
       [-1.78563448,  0.91517582,  3.99596759, ..., -4.39349115,
        -1.69078412,  5.89257411],
       [-2.43101858,  1.19020817,  3.85229926, ..., -4.29236359,
        -1.64878801,  5.56561818]])

In [64]:
future_pca_0 = pca.transform(future_numeric_0)
future_pca_0

In [65]:
future_pca = pca.transform(future_numeric)
future_pca

In [70]:
NN_count=10

In [71]:
neigh_0 = NearestNeighbors(n_neighbors=NN_count, metric = "mahalanobis", algorithm="brute")

In [72]:
neigh = NearestNeighbors(n_neighbors=NN_count, metric = "mahalanobis", algorithm="brute")

In [73]:
neigh_0.fit(hist_pca_0)
neigh.fit(hist_pca)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='mahalanobis',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=1.0)

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
1,2026,43.59375_-116.78125,ensembe_mean,-1.811546,2.015791,-2.213596,1.28234,5.814381,0.027137,0.120708,1.907442
296,2027,43.59375_-116.78125,ensembe_mean,-1.713266,2.699969,-1.720775,1.741583,5.530739,0.0,-0.12092,2.279523
591,2028,43.59375_-116.78125,ensembe_mean,-1.025303,1.433219,-1.239506,1.139182,1.99263,0.0,0.727532,1.648204
886,2029,43.59375_-116.78125,ensembe_mean,-1.123584,3.994433,-2.171516,2.269718,11.247815,0.004458,-0.042542,3.19331


In [77]:
future_yr_count_0 = curr_loc_df_0.shape[0]
needed_col_count = NN_count * 2

NNs_df_0   = curr_loc_df_0[['year', 'location']].copy() # data frame containing (year, location)
dists_df_0 = curr_loc_df_0[['year', 'location']].copy() # data frame containing distances

## concatenate new data frame to above ones, to speed up
NNs_df_new_cols_0, dists_df_new_cols_0 = create_colnames(NN_count)

NNs_df_helper_0 = pd.DataFrame('-999', index = NNs_df_0.index,  columns = NNs_df_new_cols_0)
dists_df_helper_0 = pd.DataFrame('-999', index = dists_df_0.index,  columns = dists_df_new_cols_0)

NNs_df_0 = pd.concat([NNs_df_0, NNs_df_helper_0], axis=1)
dists_df_0 = pd.concat([dists_df_0, dists_df_helper_0], axis=1)

del(NNs_df_helper_0, dists_df_helper_0, NNs_df_new_cols_0, dists_df_new_cols_0)

In [79]:
future_yr_count = curr_loc_df.shape[0]
needed_col_count = NN_count * 2

NNs_df   = curr_loc_df[['year', 'location']].copy() # data frame containing (year, location)
dists_df = curr_loc_df[['year', 'location']].copy() # data frame containing distances

## concatenate new data frame to above ones, to speed up
NNs_df_new_cols, dists_df_new_cols = create_colnames(NN_count)

NNs_df_helper = pd.DataFrame('-999', index=NNs_df.index,  columns=NNs_df_new_cols)
dists_df_helper = pd.DataFrame('-999', index=dists_df.index,  columns=dists_df_new_cols)

NNs_df = pd.concat([NNs_df, NNs_df_helper], axis=1)
dists_df = pd.concat([dists_df, dists_df_helper], axis=1)

del(NNs_df_helper, dists_df_helper, NNs_df_new_cols, dists_df_new_cols)

In [85]:
for yr in np.arange(curr_loc_df_0.shape[0]):
    result_0 = neigh_0.kneighbors([future_pca_0[yr, ]])
    NNs_distances_0 = result_0[0][0]
    NNs_idx_0 = result_0[1][0]

    curr_NNs_0 = complete_hist_df_0.loc[NNs_idx_0, ['year', 'location']].copy()
    curr_NNs_0 = list(np.hstack(np.split(curr_NNs_0, NN_count))[0])
    NNs_df_0.iloc[yr, 2:] = curr_NNs_0
      
    dists_df_0.iloc[yr, 2:] = NNs_distances_0

In [86]:
for yr in np.arange(curr_loc_df.shape[0]):
    result = neigh.kneighbors([future_pca_0[yr, ]])
    NNs_distances = result[0][0]
    NNs_idx = result[1][0]

    curr_NNs = complete_hist_df.loc[NNs_idx, ['year', 'location']].copy()
    curr_NNs = list(np.hstack(np.split(curr_NNs, NN_count))[0])
    NNs_df.iloc[yr, 2:] = curr_NNs
      
    dists_df.iloc[yr, 2:] = NNs_distances

In [87]:
NNs_df

Unnamed: 0,year,location,year_NN_1,location_NN_1,year_NN_2,location_NN_2,year_NN_3,location_NN_3,year_NN_4,location_NN_4,...,year_NN_6,location_NN_6,year_NN_7,location_NN_7,year_NN_8,location_NN_8,year_NN_9,location_NN_9,year_NN_10,location_NN_10
2,2026,43.59375_-116.84375,2007,43.96875_-116.90625,2007,43.59375_-116.78125,2007,43.65625_-116.84375,2007,43.53125_-116.78125,...,1994,43.53125_-116.78125,1994,43.59375_-116.78125,1994,43.59375_-116.84375,1994,43.65625_-116.84375,1994,43.96875_-116.90625
297,2027,43.59375_-116.84375,2007,43.96875_-116.90625,1994,43.96875_-116.90625,2007,43.53125_-116.78125,2007,43.59375_-116.78125,...,2007,43.65625_-116.84375,1994,43.53125_-116.78125,1994,43.59375_-116.78125,1994,43.59375_-116.84375,1994,43.65625_-116.84375
592,2028,43.59375_-116.84375,1994,43.96875_-116.90625,2015,43.65625_-116.84375,1987,46.28125_-119.78125,2015,43.59375_-116.78125,...,1988,43.59375_-116.84375,2015,43.59375_-116.84375,1988,43.65625_-116.84375,1988,43.53125_-116.78125,1987,46.28125_-119.84375
887,2029,43.59375_-116.84375,2013,40.40625_-122.21875,1994,45.71875_-120.21875,1990,40.40625_-122.21875,2004,46.15625_-119.03125,...,1994,46.15625_-119.03125,2004,46.65625_-119.90625,1989,39.65625_-122.28125,1989,39.59375_-122.09375,1989,39.71875_-122.15625


In [98]:
NNs_df_0.head()

Unnamed: 0,year,location,year_NN_1,location_NN_1,year_NN_2,location_NN_2,year_NN_3,location_NN_3,year_NN_4,location_NN_4,...,year_NN_6,location_NN_6,year_NN_7,location_NN_7,year_NN_8,location_NN_8,year_NN_9,location_NN_9,year_NN_10,location_NN_10
1,2026,43.59375_-116.78125,2007,43.59375_-116.78125,2007,43.96875_-116.90625,2007,43.65625_-116.84375,2007,43.59375_-116.84375,...,1994,43.53125_-116.78125,1994,43.59375_-116.78125,1994,43.59375_-116.84375,1994,43.65625_-116.84375,1994,43.96875_-116.90625
296,2027,43.59375_-116.78125,2007,43.96875_-116.90625,2007,43.59375_-116.78125,1994,43.96875_-116.90625,2007,43.65625_-116.84375,...,2007,43.59375_-116.84375,1994,43.53125_-116.78125,1994,43.59375_-116.78125,1994,43.65625_-116.84375,1994,43.59375_-116.84375
591,2028,43.59375_-116.78125,1994,43.96875_-116.90625,1988,43.65625_-116.84375,1987,46.28125_-119.78125,1988,43.59375_-116.84375,...,1988,43.59375_-116.78125,1988,43.53125_-116.78125,1988,43.96875_-116.90625,2015,43.59375_-116.78125,2014,43.59375_-116.84375
886,2029,43.59375_-116.78125,2007,43.96875_-116.90625,2007,43.53125_-116.78125,1989,39.65625_-122.28125,2013,40.40625_-122.21875,...,1989,39.59375_-122.09375,1989,39.53125_-122.09375,1989,39.46875_-122.15625,1989,39.59375_-122.28125,1989,39.40625_-122.15625


In [99]:
NNs_df.head()

Unnamed: 0,year,location,year_NN_1,location_NN_1,year_NN_2,location_NN_2,year_NN_3,location_NN_3,year_NN_4,location_NN_4,...,year_NN_6,location_NN_6,year_NN_7,location_NN_7,year_NN_8,location_NN_8,year_NN_9,location_NN_9,year_NN_10,location_NN_10
2,2026,43.59375_-116.84375,2007,43.96875_-116.90625,2007,43.59375_-116.78125,2007,43.65625_-116.84375,2007,43.53125_-116.78125,...,1994,43.53125_-116.78125,1994,43.59375_-116.78125,1994,43.59375_-116.84375,1994,43.65625_-116.84375,1994,43.96875_-116.90625
297,2027,43.59375_-116.84375,2007,43.96875_-116.90625,1994,43.96875_-116.90625,2007,43.53125_-116.78125,2007,43.59375_-116.78125,...,2007,43.65625_-116.84375,1994,43.53125_-116.78125,1994,43.59375_-116.78125,1994,43.59375_-116.84375,1994,43.65625_-116.84375
592,2028,43.59375_-116.84375,1994,43.96875_-116.90625,2015,43.65625_-116.84375,1987,46.28125_-119.78125,2015,43.59375_-116.78125,...,1988,43.59375_-116.84375,2015,43.59375_-116.84375,1988,43.65625_-116.84375,1988,43.53125_-116.78125,1987,46.28125_-119.84375
887,2029,43.59375_-116.84375,2013,40.40625_-122.21875,1994,45.71875_-120.21875,1990,40.40625_-122.21875,2004,46.15625_-119.03125,...,1994,46.15625_-119.03125,2004,46.65625_-119.90625,1989,39.65625_-122.28125,1989,39.59375_-122.09375,1989,39.71875_-122.15625


In [101]:
dists_df.head(2)

Unnamed: 0,year,location,dist_NN_1,dist_NN_2,dist_NN_3,dist_NN_4,dist_NN_5,dist_NN_6,dist_NN_7,dist_NN_8,dist_NN_9,dist_NN_10
2,2026,43.59375_-116.84375,0.872497,0.932375,0.934942,0.965893,0.974032,1.0142,1.0572,1.06732,1.12301,1.14416
297,2027,43.59375_-116.84375,0.715198,0.908402,0.943608,0.945686,0.960596,0.96124,0.965372,1.01383,1.05112,1.09329


In [100]:
dists_df_0.head(2)

Unnamed: 0,year,location,dist_NN_1,dist_NN_2,dist_NN_3,dist_NN_4,dist_NN_5,dist_NN_6,dist_NN_7,dist_NN_8,dist_NN_9,dist_NN_10
1,2026,43.59375_-116.78125,0.797874,0.804299,0.8108,0.886148,0.896228,0.939739,0.969887,0.985259,1.00034,1.05951
296,2027,43.59375_-116.78125,0.683017,0.849194,0.857648,0.875618,0.901377,0.901643,0.933502,0.968948,1.01154,1.01184


In [None]:
for yr in np.arange(curr_location_df.shape[0]):
    result = neigh.kneighbors([future_pca[yr, ]])
    NNs_distances = result[0][0]
    NNs_idx = result[1][0]
    
    curr_NNs = complete_hist_df.loc[NNs_idx, ['year', 'location']].copy()
    curr_NNs = list(np.hstack(np.split(curr_NNs, NN_count))[0])
    NNs_df.iloc[yr, 2:] = curr_NNs
        
    dists_df.iloc[yr, 2:] = NNs_distances

In [None]:
def find_1_location_NNs_builtin(curr_location_df, complete_hist_df, numeric_feat, NN_count):
    future_yr_count = curr_location_df.shape[0]
    needed_col_count = NN_count * 2
    
    NNs_df   = curr_location_df[['year', 'location']].copy() # data frame containing (year, location)
    dists_df = curr_location_df[['year', 'location']].copy() # data frame containing distances
    
    ## concatenate new data frame to above ones, to speed up
    NNs_df_new_cols, dists_df_new_cols = create_colnames(NN_count)
    
    NNs_df_helper = pd.DataFrame('-999', index=NNs_df.index,  columns=NNs_df_new_cols)
    dists_df_helper = pd.DataFrame('-999', index=dists_df.index,  columns=dists_df_new_cols)
    
    NNs_df = pd.concat([NNs_df, NNs_df_helper], axis=1)
    dists_df = pd.concat([dists_df, dists_df_helper], axis=1)
    
    del(NNs_df_helper, dists_df_helper, NNs_df_new_cols, dists_df_new_cols)
    
    
    # form the ICV to compute its covariance to remove inter-annual variability
    ICV = complete_hist_df.copy()
    ICV = ICV.loc[ICV['location'] == curr_location_df.location.unique()[0]] # filter corresponding location
    #############################################################################
    #
    #          Normalize before doing anything
    #
    #############################################################################
    ICV_means = ICV.loc[:, numeric_feat].mean()
    ICV_stds = ICV.loc[:, numeric_feat].std()
    ICV_stds[ICV_stds.le(10**(-10))] = 1
    
    ICV = (ICV.loc[:, numeric_feat] - ICV_means) / ICV_stds
    curr_location_df.loc[:, numeric_feat] = (curr_location_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
    complete_hist_df.loc[:, numeric_feat] = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
    #
    # pick numerical part of the data frame to do the operations:
    #
    complete_hist_df_numeric = complete_hist_df.loc[:, numeric_feat].copy()
    future_numeric = curr_location_df.loc[:, numeric_feat].copy()
    ICV = ICV.loc[:, numeric_feat]
    
    ### Apply PCA here and use those to find analogs
    pca = PCA(n_components = detect_effective_compon(ICV))
    pca.fit(ICV);
    ICV_pca = pca.transform(ICV)
    hist_pca = pca.transform(complete_hist_df_numeric)
    future_pca = pca.transform(future_numeric)

    # the following is the same as [(1/N) * np.matmul(M.transpose(), M)]. which is not even divided by N-1
    cov = sklearn.covariance.empirical_covariance(ICV_pca, assume_centered=False)
    
    # there is no difference between the following line and adding metric_params={'V': cov} to it
    neigh = NearestNeighbors(n_neighbors=NN_count, metric = "mahalanobis", algorithm="brute")
    neigh.fit(hist_pca);
    for yr in np.arange(curr_location_df.shape[0]):
        result = neigh.kneighbors([future_pca[yr, ]])

        NNs_distances = result[0][0]
        NNs_idx = result[1][0]

        # find and reshape the NNs
        # reshape the nearest neighbros from long to wide, so, every other column is (year, location) of ith NN
        #
        curr_NNs = complete_hist_df.loc[NNs_idx, ['year', 'location']].copy()
        curr_NNs = list(np.hstack(np.split(curr_NNs, NN_count))[0])
        NNs_df.iloc[yr, 2:] = curr_NNs
        
        dists_df.iloc[yr, 2:] = NNs_distances

In [None]:
curr_loc_df = future_df[future_df.location == loc].copy()

In [None]:
curr_loc_df.shape