### import libraries

In [1]:
# basic libraries

import numpy as np
import pandas as pd
from sklearn.covariance import EmpiricalCovariance, MinCovDet
import sklearn.covariance
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import math
# plotting stuff
import matplotlib.pyplot as plt
import matplotlib as mpl
import collections
import seaborn as sb
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 8, 6
from timeit import default_timer as timer
import warnings

In [2]:
# Feature-selection switches: when one is True, the matching column
# ('mean_escaped_Gen3', 'mean_escaped_Gen4', 'mean_precip') is removed from
# `numeric_feat` in the "Determine needed columns" cell.
gen_3_drop = False
gen_4_drop = False
precip_drop = False

### Define directories

In [3]:
# in_dir is the folder the CSV files are read from in the "Read data" cells.
# NOTE(review): both paths are machine-specific absolute paths and must be
# adjusted before running elsewhere; out_dir is not used in the cells shown here.
in_dir = "/Users/hn/Desktop/Desktop/Kirti/check_point/analogs/"
out_dir = "/Users/hn/Desktop/"

## Determine needed columns
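To drop optional columns such as `mean_escaped_Gen3`, `mean_escaped_Gen4` or `mean_precip`, set the corresponding flag (`gen_3_drop`, `gen_4_drop`, `precip_drop`) to `True` above: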

In [4]:
# Identifier columns: carried along with the data, never used as features.
non_numeric_feat = ['year', 'location', 'ClimateScenario']

# Candidate numeric features; the optional ones are pruned right below
# according to the drop switches.
numeric_feat = [
    'medianDoY',
    'NumLarvaGens_Aug',
    'mean_escaped_Gen1',
    'mean_escaped_Gen2',
    'mean_escaped_Gen3',
    'mean_escaped_Gen4',
    'mean_precip',
    'mean_gdd',
]

if gen_3_drop:
    numeric_feat.remove('mean_escaped_Gen3')
if gen_4_drop:
    numeric_feat.remove('mean_escaped_Gen4')
if precip_drop:
    numeric_feat.remove('mean_precip')

# show the features that will actually be used
print(numeric_feat)

['medianDoY', 'NumLarvaGens_Aug', 'mean_escaped_Gen1', 'mean_escaped_Gen2', 'mean_escaped_Gen3', 'mean_escaped_Gen4', 'mean_precip', 'mean_gdd']


### Read data

In [5]:
# Read the historical data and keep only the identifier columns followed by
# the selected numeric features.
hist_orig = pd.read_csv(in_dir + "all_data_usa.csv")
hist_orig = hist_orig.loc[:, non_numeric_feat + numeric_feat] # drop unwanted columns
# preview the first two rows
hist_orig.head(2)

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
0,1979,32.46875_-109.90625,observed,68,2.939262,1.319697,7.080458,1.348167,0.0,300.525,4821.668922
1,1979,32.46875_-109.96875,observed,68,2.94605,1.245847,7.104994,1.39698,0.0,303.925,4831.770062


In [6]:
# Read the future (rcp45) data and keep the same identifier + feature columns
# as for the historical frame.
future_orig = pd.read_csv(in_dir + "averaged_data_rcp45.csv")
future_orig = future_orig.loc[:, non_numeric_feat + numeric_feat] # drop unwanted columns

# Function Definitions here

In [7]:

def filter_locations(all_dt, local_dt):
    """
    Keep only the rows of `local_dt` whose location also occurs in `all_dt`.

    input: all_dt: data frame with a 'location' column (the reference set of sites)
           local_dt: data frame with a 'location' column to be filtered

    output: the rows of local_dt (original index and row order kept) whose
            'location' value is present in all_dt
    """
    # sites present in both frames
    shared_sites = np.intersect1d(local_dt['location'].unique(),
                                  all_dt['location'].unique())

    # boolean mask over the local rows that belong to a shared site
    keep_rows = local_dt['location'].isin(shared_sites)
    return local_dt.loc[keep_rows]

def detect_effective_compon(matriks, min_variance=0.01):
    """
    Count the principal components of `matriks` that carry a non-negligible
    amount of variance.

    input: matriks: 2-D data (rows are samples, columns are features)
           min_variance: a component is counted only when its explained
                         variance is strictly greater than this value
                         (default 0.01)

    output: number of components (a plain int) whose explained variance
            exceeds min_variance
    """
    # fit a PCA that keeps as many components as there are columns
    n_comp = matriks.shape[1]
    pca = PCA(n_components=n_comp)
    pca.fit(matriks)

    # count the components above the variance threshold
    return int(np.count_nonzero(pca.explained_variance_ > min_variance))

def create_colnames(NN_count):
    """
    Build the column names of the wide nearest-neighbor tables.

    input: NN_count: number of nearest neighbors

    output: a tuple of two lists of strings:
                ['year_NN_1', 'location_NN_1', ..., 'year_NN_k', 'location_NN_k']
                ['dist_NN_1', ..., 'dist_NN_k']
            where k = NN_count
    """
    ranks = range(1, NN_count + 1)

    # for every neighbor rank: its year column followed by its location column
    year_loc_cols = [prefix + str(rank)
                     for rank in ranks
                     for prefix in ('year_NN_', 'location_NN_')]

    # one distance column per neighbor rank
    dist_cols = ['dist_NN_' + str(rank) for rank in ranks]
    return (year_loc_cols, dist_cols)

# Driver


#### Filter the locations
Some locations in the local (future) data are not present in the all-USA data. So here we keep
only the local (future) rows whose sites also exist in the all-USA data.

In [8]:
# keep only the future rows whose location also appears in the historical data
future_orig = filter_locations(hist_orig, future_orig)

In [9]:
# pick up one location data: every future row of the first location in future_orig
curr_location_df = future_orig[future_orig.location == future_orig.location.iloc[0]].copy()
# work on a copy so hist_orig itself is left untouched by later cells
complete_hist_df = hist_orig.copy()
# preview the first two rows
curr_location_df.head(2)

Unnamed: 0,year,location,ClimateScenario,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_escaped_Gen4,mean_precip,mean_gdd
1,2026,43.59375_-116.78125,ensembe_mean,92,2.336201,24.035184,27.50817,2.912981,0.027137,260.75,3623.723188
296,2027,43.59375_-116.78125,ensembe_mean,93,2.428946,26.711047,30.58016,2.788,0.0,245.4,3713.421219


In [11]:
# number of nearest neighbors to look up for each future year
NN_count = 5

In [12]:
# Set up the two result tables for the selected location (one row per row of
# curr_location_df) and standardize the data with the statistics of that location.
# future_yr_count and needed_col_count are not referenced again in the cells shown here.
future_yr_count = curr_location_df.shape[0]
needed_col_count = NN_count * 2

NNs_df   = curr_location_df[['year', 'location']].copy() # data frame containing (year, location)
dists_df = curr_location_df[['year', 'location']].copy() # data frame containing distances

## concatenate new data frame to above ones, to speed up
NNs_df_new_cols, dists_df_new_cols = create_colnames(NN_count)

# every result cell starts as the string '-999' and is overwritten by later cells
NNs_df_helper = pd.DataFrame('-999', index=NNs_df.index,  columns=NNs_df_new_cols)
dists_df_helper = pd.DataFrame('-999', index=dists_df.index,  columns=dists_df_new_cols)

NNs_df = pd.concat([NNs_df, NNs_df_helper], axis=1)
dists_df = pd.concat([dists_df, dists_df_helper], axis=1)

del(NNs_df_helper, dists_df_helper, NNs_df_new_cols, dists_df_new_cols)


# form the ICV to compute its covariance to remove inter-annual variability
ICV = complete_hist_df.copy()
ICV = ICV.loc[ICV['location'] == curr_location_df.location.unique()[0]] # filter corresponding location

# standardization statistics come from the historical rows of this location only;
# a standard deviation of (almost) zero is replaced by 1 before it is used as a divisor
ICV_means = ICV.loc[:, numeric_feat].mean()
ICV_stds = ICV.loc[:, numeric_feat].std()
ICV_stds[ICV_stds.le(10**(-10))] = 1
ICV = (ICV.loc[:, numeric_feat] - ICV_means) / ICV_stds

# NOTE(review): both frames are overwritten in place, so re-running this cell
# standardizes data that has already been standardized.
curr_location_df.loc[:, numeric_feat] = (curr_location_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
complete_hist_df.loc[:, numeric_feat]  = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

In [19]:
#
# pick numerical part of the data frame to do the operations:
#
complete_hist_df_numeric = complete_hist_df.loc[:, numeric_feat].copy()
future_numeric = curr_location_df.loc[:, numeric_feat].copy()
ICV = ICV.loc[:, numeric_feat]

### Apply PCA here and use those to find analogs
# the PCA is fitted on the ICV only and keeps the number of components
# reported by detect_effective_compon
pca = PCA(n_components = detect_effective_compon(ICV))
pca.fit(ICV);
#
# transform data into PCA space to compute analogs
ICV_pca = pca.transform(ICV)
hist_pca = pca.transform(complete_hist_df_numeric)
future_pca = pca.transform(future_numeric)

# the following is the same as [(1/N) * np.matmul(M.transpose(), M)]. which is not even divided by N-1
cov = sklearn.covariance.empirical_covariance(ICV_pca, assume_centered=False)

# there is no difference between the following line and adding metric_params={'V': cov} to it
# NOTE(review): `cov` is computed above but not passed to NearestNeighbors here,
# so the metric uses whatever covariance the library picks by default — confirm
# that this matches the intended ICV covariance.
neigh = NearestNeighbors(n_neighbors=NN_count, metric = "mahalanobis", algorithm="brute")
neigh.fit(hist_pca);

In [20]:
# look up the neighbors of the first row of future_pca (row position 0)
yr = 0
result = neigh.kneighbors([future_pca[yr, ]])

In [21]:
# result is a pair; the first [0] / [1] picks its first / second element and
# the trailing [0] picks the row belonging to the single query point
NNs_distances = result[0][0]
NNs_idx = result[1][0]

In [22]:
# (year, location) of every neighbor, looked up in the historical frame
# NOTE(review): .loc selects by index label; this presumably relies on
# complete_hist_df having a default 0..n-1 index so that labels coincide with
# the values in NNs_idx — confirm.
curr_NNs = complete_hist_df.loc[NNs_idx, ['year', 'location']].copy()

In [25]:
# reshape the nearest neighbors from long to wide, so that every other entry is
# the (year, location) of the i-th neighbor; the frame is split into NN_count
# pieces, NN_count being the neighbor count the model was built with
curr_NNs = list(np.hstack(np.split(curr_NNs, NN_count))[0])
NNs_df.iloc[yr, 2:] = curr_NNs

# distances go into the matching row of the distance table
dists_df.iloc[yr, 2:] = NNs_distances



In [28]:
# same steps as for yr = 0, now for the second row of future_pca
yr = 1
result = neigh.kneighbors([future_pca[yr, ]])

# the trailing [0] picks the row belonging to the single query point
NNs_distances = result[0][0]
NNs_idx = result[1][0]

# find and reshape the NNs
# reshape the nearest neighbors from long to wide, so, every other column is (year, location) of ith NN
#
curr_NNs = complete_hist_df.loc[NNs_idx, ['year', 'location']].copy()
curr_NNs = list(np.hstack(np.split(curr_NNs, NN_count))[0])
NNs_df.iloc[yr, 2:] = curr_NNs

dists_df.iloc[yr, 2:] = NNs_distances



In [31]:
dists_df.head(5)

Unnamed: 0,year,location,dist_NN_1,dist_NN_2,dist_NN_3,dist_NN_4,dist_NN_5
1,2026,43.59375_-116.78125,0.797874,0.804299,0.8108,0.886148,0.896228
296,2027,43.59375_-116.78125,0.683017,0.849194,0.857648,0.875618,0.901377
591,2028,43.59375_-116.78125,-999.0,-999.0,-999.0,-999.0,-999.0
886,2029,43.59375_-116.78125,-999.0,-999.0,-999.0,-999.0,-999.0
1181,2030,43.59375_-116.78125,-999.0,-999.0,-999.0,-999.0,-999.0


In [None]:
# Variant of the search that passes the ICV covariance `cov` to the metric
# through metric_params.
# NOTE(review): the model is built with n_ngbrs, while the reshape at the end
# of this cell splits by NN_count; both are 5 here — confirm they are meant to
# be the same quantity.
n_ngbrs = 5
neigh = NearestNeighbors(n_neighbors=n_ngbrs, metric = "mahalanobis", metric_params={'V': cov}, algorithm="brute")
neigh.fit(hist_pca);
# query the second row of future_pca
result = neigh.kneighbors([future_pca[1, ]])

NNs_distances = result[0][0]

# find and reshape the NNs
# reshape the nearest neighbors from long to wide, so, every other column is (year, location) of ith NN
#
NNs_idx = result[1][0]
curr_NNs = complete_hist_df.loc[NNs_idx, ['year', 'location']].copy()
curr_NNs = pd.DataFrame(np.hstack(np.split(curr_NNs, NN_count)))


In [None]:
# store the reshaped neighbors in the first row; columns 2..11 are the ten
# neighbor columns that exist when NN_count is 5
# NOTE(review): curr_NNs was built from the query future_pca[1, ] but is
# written to row 0 here — confirm this is intended.
NNs_df.iloc[0, 2:12] = list(curr_NNs.iloc[0, ])
NNs_df.head(2)

In [34]:
def one_location_dist_builtin(curr_location_df, complete_hist_df, numeric_feat, NN_count):
    """
    Find the nearest historical neighbors of every future year of one location
    with scikit-learn's brute-force NearestNeighbors ("mahalanobis" metric).

    input: curr_location_df: data frame of current location, including one location,
                             all future years; needs 'year', 'location' and the
                             numeric_feat columns
           complete_hist_df: historical data frame (analog pool) with the same columns
           numeric_feat: list of column names that are numeric
           NN_count: number of NNs we want

    output: two data frames, indexed like curr_location_df and starting with its
            'year' and 'location' columns:
                NNs_df: year_NN_i / location_NN_i of the i-th neighbor, i = 1..NN_count
                dists_df: dist_NN_i, distance of the i-th neighbor to the query

    Neither input data frame is modified.
    """
    NNs_new_cols, dists_new_cols = create_colnames(NN_count)

    # result tables: key columns plus '-999' placeholders that the loop overwrites
    key_cols = curr_location_df[['year', 'location']]
    NNs_df = pd.concat(
        [key_cols, pd.DataFrame('-999', index=key_cols.index, columns=NNs_new_cols)], axis=1)
    dists_df = pd.concat(
        [key_cols, pd.DataFrame('-999', index=key_cols.index, columns=dists_new_cols)], axis=1)

    # form the ICV (historical rows of this location) to remove inter-annual variability
    target_location = curr_location_df.location.unique()[0]
    ICV = complete_hist_df.loc[complete_hist_df['location'] == target_location, numeric_feat]
    if ICV.empty:
        raise ValueError("location %r has no rows in the historical data" % (target_location,))

    #############################################################################
    #
    #          Normalize before doing anything
    #
    #############################################################################
    ICV_means = ICV.mean()
    ICV_stds = ICV.std()
    # (almost) constant features keep their scale instead of being divided by ~0
    ICV_stds[ICV_stds.le(10**(-10))] = 1

    # standardized copies are built here, so the caller's frames stay untouched
    ICV = (ICV - ICV_means) / ICV_stds
    future_numeric = (curr_location_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
    complete_hist_df_numeric = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

    ### Apply PCA here and use those to find analogs
    pca = PCA(n_components=detect_effective_compon(ICV))
    pca.fit(ICV)

    # transform data into PCA space to compute analogs
    hist_pca = pca.transform(complete_hist_df_numeric)
    future_pca = pca.transform(future_numeric)

    # NOTE(review): no covariance is handed to the estimator through metric_params,
    # so the metric uses the library's default choice; if the covariance of the
    # ICV (in PCA space) is the intended one, it has to be passed explicitly — confirm.
    neigh = NearestNeighbors(n_neighbors=NN_count, metric="mahalanobis", algorithm="brute")
    neigh.fit(hist_pca)

    hist_keys = complete_hist_df[['year', 'location']]
    for yr in range(future_pca.shape[0]):
        NNs_distances, NNs_idx = neigh.kneighbors([future_pca[yr]])

        # kneighbors reports row positions in hist_pca, hence positional (iloc)
        # selection; flattening row by row yields
        # [year_1, location_1, year_2, location_2, ...]
        curr_NNs = hist_keys.iloc[NNs_idx[0]].to_numpy().ravel()
        NNs_df.iloc[yr, 2:] = list(curr_NNs)

        dists_df.iloc[yr, 2:] = NNs_distances[0]

    return (NNs_df, dists_df)

In [35]:
# run the search for every row of the selected location:
# a receives the neighbor table (NNs_df), b the distance table (dists_df)
a, b = one_location_dist_builtin(curr_location_df, complete_hist_df, numeric_feat, NN_count)

In [37]:
b

Unnamed: 0,year,location,dist_NN_1,dist_NN_2,dist_NN_3,dist_NN_4,dist_NN_5
1,2026,43.59375_-116.78125,0.797874,0.804299,0.8108,0.886148,0.896228
296,2027,43.59375_-116.78125,0.683017,0.849194,0.857648,0.875618,0.901377
591,2028,43.59375_-116.78125,0.613924,0.691352,0.692246,0.726502,0.734785
886,2029,43.59375_-116.78125,1.9074,1.95769,1.9683,1.97954,1.98294
1181,2030,43.59375_-116.78125,0.954512,0.994383,1.0683,1.10056,1.10398
1476,2031,43.59375_-116.78125,0.671194,0.681865,0.690564,0.691943,0.696764
1771,2032,43.59375_-116.78125,0.816631,0.926952,0.963989,0.988894,1.00898
2066,2033,43.59375_-116.78125,2.01943,2.03389,2.07135,2.09749,2.1357
2361,2034,43.59375_-116.78125,1.02401,1.04326,1.06528,1.12587,1.14767
2656,2035,43.59375_-116.78125,0.742176,0.800303,0.878685,0.882782,0.886884


In [None]:
dists_df

In [None]:
def one_location_dist(curr_location_df, complete_hist_df, numeric_feat, NN_count):
    """
    Find the nearest historical neighbors of every future year of one location,
    using the Mahalanobis distance computed by one_sample_dist with the
    covariance of the ICV (historical rows of the same location) in PCA space.

    input: curr_location_df: data frame of current location, including one location, all future years
           complete_hist_df: historical data frame (analog pool)
           numeric_feat: list of column names that are numeric
           NN_count: number of NNs we want

    output: two data frames, indexed like curr_location_df and starting with its
            'year' and 'location' columns:
                includes list of years and locations of nearest neighbors
                (year_NN_i, location_NN_i)
                includes list of distances of NNs to the queries (dist_NN_i).

    Neither input data frame is modified.
    """
    NNs_new_cols, dists_new_cols = create_colnames(NN_count)

    # key columns of the result tables
    key_cols = curr_location_df[['year', 'location']]

    # Make the ICV: the historical rows of the current location, kept separate
    # from the analog pool. It is used to remove inter-annual variability and is
    # the data the covariance matrix is computed from.
    target_location = curr_location_df.location.unique()[0]
    ICV = complete_hist_df.loc[complete_hist_df['location'] == target_location, numeric_feat]
    if ICV.empty:
        raise ValueError("location %r has no rows in the historical data" % (target_location,))

    #############################################################################
    #
    #          Normalize before doing anything
    #
    #############################################################################
    # statistics come from the ICV; (almost) constant features keep their scale
    # instead of being divided by ~0
    ICV_means = ICV.mean()
    ICV_stds = ICV.std()
    ICV_stds[ICV_stds.le(10**(-10))] = 1

    # standardized numeric copies; the inputs keep their 'year' / 'location' columns
    ICV = (ICV - ICV_means) / ICV_stds
    future_numeric = (curr_location_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
    complete_hist_df_numeric = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

    ### Apply PCA here and use those to find analogs
    pca = PCA(n_components=detect_effective_compon(ICV))
    pca.fit(ICV)

    # transform data into PCA space to compute analogs
    ICV_pca = pca.transform(ICV)
    hist_pca = pca.transform(complete_hist_df_numeric)
    future_pca = pca.transform(future_numeric)

    # compute covariance of ICV_pca
    # (a robust MinCovDet estimate was tried as an alternative; it was observed
    #  to change from run to run, so the empirical covariance is used)
    # the following is the same as [(1/N) * np.matmul(M.transpose(), M)]. which is not even divided by N-1
    cov = sklearn.covariance.empirical_covariance(ICV_pca, assume_centered=False)
    cov_inv = np.linalg.inv(cov)

    ##
    ## Find nearest neighbors
    ##
    # (year, location) of every historical sample, row-aligned with hist_pca
    hist_keys = complete_hist_df[['year', 'location']].to_numpy()

    NN_rows = []
    dist_rows = []
    for yr in range(future_pca.shape[0]):
        curr_dists = one_sample_dist(curr_future=future_pca[yr], hist_dt=hist_pca, conar_inv=cov_inv)

        # positions of the NN_count closest historical samples, nearest first
        nearest = np.argsort(curr_dists, kind="stable")[:NN_count]

        # flatten to [year_1, location_1, year_2, location_2, ...]
        NN_rows.append(hist_keys[nearest].ravel())
        dist_rows.append(curr_dists[nearest])

    NNs_df = pd.concat(
        [key_cols, pd.DataFrame(NN_rows, index=key_cols.index, columns=NNs_new_cols)], axis=1)
    dists_df = pd.concat(
        [key_cols, pd.DataFrame(dist_rows, index=key_cols.index, columns=dists_new_cols)], axis=1)

    return (NNs_df, dists_df)
        
        
###################################################################################
#                                                                                 #
#        compute distance of one sample point to all points in analog pool        #
#                                                                                 #
###################################################################################
def one_sample_dist(curr_future, hist_dt, conar_inv):
    """
    Mahalanobis-type distance of one sample to every sample of the analog pool.

    inputs here are of np.ndarray type that are projections into PCA space

    inputs: curr_future: future data for one location, one year (a vector of length d)
            hist_dt: historical data to find analogs in, shape (n, d);
                     a single sample of shape (d,) is accepted as well
            conar_inv: inverse of covariance matrix for M. distance, shape (d, d)

    output: 1-D array of n distances of the given sample, (one_loc, one_year),
            from all historical samples:
            sqrt((x - h_i)^T * conar_inv * (x - h_i)) for every row h_i of hist_dt
    """
    # one difference vector per historical sample, always as an (n, d) matrix
    diff_matrix = np.atleast_2d(np.asarray(curr_future) - np.asarray(hist_dt))

    # quadratic form evaluated row by row; this yields exactly the diagonal of
    # diff * conar_inv * diff^T without ever building that n-by-n matrix
    weighted_diff = np.matmul(diff_matrix, np.asarray(conar_inv))
    square_dists = np.einsum('ij,ij->i', weighted_diff, diff_matrix)

    # rounding can push a squared distance of ~0 slightly below zero
    distances = np.sqrt(np.maximum(square_dists, 0))
    return (distances)