### import libraries

In [1]:
#
# basic libraries
#
import numpy as np
import pandas as pd
from sklearn.covariance import EmpiricalCovariance, MinCovDet
import sklearn.covariance
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import math
# plotting stuff
import matplotlib.pyplot as plt
import matplotlib as mpl
import collections
import seaborn as sb
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 8, 6
from timeit import default_timer as timer
import warnings

# Function Definitions here

In [2]:
def find_ALL_location_NNs(future_df, analog_pool, numeric_feat, NN_count):
    """
    Find the nearest historical analogs for every location present in future_df.

    The two output frames are allocated once, with one row per row of
    future_df, and are then filled location by location with the result of
    find_1_location_NNs_builtin().

    Parameters
    ----------
    future_df : pandas.DataFrame
        Future data; must contain 'year', 'location' and the numeric_feat columns.
    analog_pool : pandas.DataFrame
        Historical data searched for analogs. It is never modified: every
        location is handed its own copy.
    numeric_feat : list of str
        Names of the numeric feature columns used to measure similarity.
    NN_count : int
        Number of nearest neighbours to report per future row.

    Returns
    -------
    (all_NNs_df, all_dists_df) : tuple of pandas.DataFrame
        Both start with 'year' and 'location'. all_NNs_df then holds the
        (year, location) pair of each neighbour, all_dists_df the distances.
        Cells that were never filled keep the placeholder string '-999'.
    """
    local_sites = future_df.location.unique()

    # Allocate the full-size outputs up front instead of growing them per site.
    NNs_new_cols, dists_new_cols = create_colnames(NN_count)
    id_cols = future_df[['year', 'location']]

    all_NNs_df = pd.concat(
        [id_cols.copy(), pd.DataFrame('-999', index=id_cols.index, columns=NNs_new_cols)],
        axis=1)
    all_dists_df = pd.concat(
        [id_cols.copy(), pd.DataFrame('-999', index=id_cols.index, columns=dists_new_cols)],
        axis=1)

    for loc in local_sites:
        # pick up one location data
        curr_loc_df = future_df[future_df.location == loc].copy()

        # Pass a private copy of the pool: the per-location routine standardizes
        # the data with site-specific statistics, and that must never leak into
        # the pool used for the next site.
        loc_NNs, loc_dists = find_1_location_NNs_builtin(curr_location_df=curr_loc_df,
                                                         complete_hist_df=analog_pool.copy(),
                                                         numeric_feat=numeric_feat,
                                                         NN_count=NN_count)

        # Each frame is masked with its OWN index.
        all_NNs_df[all_NNs_df.index.isin(list(loc_NNs.index))] = loc_NNs
        all_dists_df[all_dists_df.index.isin(list(loc_dists.index))] = loc_dists
    return (all_NNs_df, all_dists_df)

def find_1_location_NNs_builtin(curr_location_df, complete_hist_df, numeric_feat, NN_count, meter="euclidean"):
    """
    Find the NN_count nearest historical analogs of every future year of ONE location.

    Both inputs are standardized with the mean / standard deviation of the
    historical rows of this location (the "ICV"), projected onto the principal
    components of that ICV, and searched with scikit-learn's brute-force
    NearestNeighbors.

    Parameters
    ----------
    curr_location_df : pandas.DataFrame
        Future rows of a single location. Must contain 'year', 'location' and
        every column in numeric_feat. Not modified.
    complete_hist_df : pandas.DataFrame
        Pool of candidate analogs with the same columns. It must contain rows
        for the location of curr_location_df. Not modified.
    numeric_feat : list of str
        Numeric feature columns used to measure similarity.
    NN_count : int
        Number of neighbours returned per future row.
    meter : str, default "euclidean"
        Passed as `metric` to sklearn.neighbors.NearestNeighbors.

    Returns
    -------
    (NNs_df, dists_df) : tuple of pandas.DataFrame
        One row per row of curr_location_df (same index). After 'year' and
        'location', NNs_df holds year_NN_i / location_NN_i for i = 1..NN_count
        and dists_df holds dist_NN_i.

    Raises
    ------
    ValueError
        If complete_hist_df has no row for the location of curr_location_df.
    """
    curr_location_df_cp = curr_location_df.copy()
    future_yr_count = curr_location_df_cp.shape[0]

    # Allocate the outputs once; '-999' is the placeholder for unfilled cells.
    NNs_df_new_cols, dists_df_new_cols = create_colnames(NN_count)
    id_cols = curr_location_df_cp[['year', 'location']]
    NNs_df = pd.concat(
        [id_cols.copy(), pd.DataFrame('-999', index=id_cols.index, columns=NNs_df_new_cols)],
        axis=1)
    dists_df = pd.concat(
        [id_cols.copy(), pd.DataFrame('-999', index=id_cols.index, columns=dists_df_new_cols)],
        axis=1)

    if future_yr_count == 0:
        # Nothing to look up (and no location to read the ICV for).
        return (NNs_df, dists_df)

    # form the ICV: the historical rows of this very location
    site = curr_location_df_cp.location.unique()[0]
    ICV = complete_hist_df.loc[complete_hist_df['location'] == site, numeric_feat]
    if ICV.empty:
        raise ValueError("no historical rows found for location %r" % (site,))

    #############################################################################
    #
    #          Normalize before doing anything
    #
    #############################################################################
    ICV_means = ICV.mean()
    ICV_stds = ICV.std()
    ICV_stds[ICV_stds.le(10**(-10))] = 1  # (near-)constant features: avoid dividing by ~0

    # Standardize into NEW frames so that neither argument is altered; writing
    # the result back into complete_hist_df would corrupt the caller's pool.
    ICV = (ICV - ICV_means) / ICV_stds
    future_numeric = (curr_location_df_cp.loc[:, numeric_feat] - ICV_means) / ICV_stds
    complete_hist_df_numeric = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

    ### Apply PCA here and use those to find analogs
    pca = PCA(n_components=detect_effective_compon(ICV))
    pca.fit(ICV)

    # transform data into PCA space to compute analogs
    hist_pca = pca.transform(complete_hist_df_numeric)
    future_pca = pca.transform(future_numeric)

    neigh = NearestNeighbors(n_neighbors=NN_count, metric=meter, algorithm="brute")
    neigh.fit(hist_pca)

    # (year, location) of every historical row, in the row order given to fit()
    hist_ids = complete_hist_df[['year', 'location']]

    # One query per future row (all of them, not just the first two).
    for yr in range(future_yr_count):
        distances, indices = neigh.kneighbors([future_pca[yr, ]])
        NNs_distances = distances[0]
        NNs_idx = indices[0]

        # kneighbors returns POSITIONS in the fitted array, hence iloc.
        # Flatten row by row so the columns read year_1, location_1, year_2, ...
        curr_NNs = hist_ids.iloc[NNs_idx].to_numpy().ravel().tolist()
        NNs_df.iloc[yr, 2:] = curr_NNs
        dists_df.iloc[yr, 2:] = NNs_distances
    return (NNs_df, dists_df)

def filter_locations(all_dt, local_dt):
    """
    Keep only the rows of local_dt whose location also occurs in all_dt.

    Both frames must have a 'location' column. The surviving rows of local_dt
    are returned with their original order and index.
    """
    # sites present in both data sets
    shared_sites = np.intersect1d(local_dt.location.unique(), all_dt.location.unique())

    # boolean row selector for the local data
    keep_row = local_dt['location'].isin(shared_sites)
    return (local_dt.loc[keep_row])

def detect_effective_compon(matriks, threshold=0.01):
    """
    Count the principal components of matriks that carry real variance.

    Parameters
    ----------
    matriks : array-like of shape (n_samples, n_features)
        Data on which a full PCA is fitted.
    threshold : float, default 0.01
        A component counts as effective when its explained variance is
        strictly greater than this value.

    Returns
    -------
    int
        Number of components whose explained variance exceeds threshold.
    """
    # PCA cannot extract more components than min(n_samples, n_features);
    # asking for n_features fails when there are fewer rows than columns.
    n_comp = min(matriks.shape)
    pca = PCA(n_components=n_comp)
    pca.fit(matriks)
    return (int(np.sum(pca.explained_variance_ > threshold)))

def create_colnames(NN_count):
    """
    Build the column names of the nearest-neighbour result tables.

    Parameters
    ----------
    NN_count : int
        Number of nearest neighbours.

    Returns
    -------
    (year_loc_cols, dist_cols) : tuple of list of str
        year_loc_cols interleaves the two identifiers of each neighbour:
        ['year_NN_1', 'location_NN_1', 'year_NN_2', 'location_NN_2', ...].
        dist_cols is ['dist_NN_1', 'dist_NN_2', ...].
    """
    ranks = range(1, NN_count + 1)
    year_loc_cols = [prefix + str(rank)
                     for rank in ranks
                     for prefix in ('year_NN_', 'location_NN_')]
    dist_cols = ['dist_NN_' + str(rank) for rank in ranks]
    return (year_loc_cols, dist_cols)

In [3]:
def find_ALL_location_NNs_not_efficient(future_df, analog_pool, numeric_feat, NN_count):
    """
    Find the nearest historical analogs for every location in future_df by
    stacking the per-location results.

    Parameters
    ----------
    future_df : pandas.DataFrame
        Future data; must contain 'year', 'location' and the numeric_feat columns.
    analog_pool : pandas.DataFrame
        Historical data searched for analogs. It is never modified: every
        location is handed its own copy.
    numeric_feat : list of str
        Names of the numeric feature columns used to measure similarity.
    NN_count : int
        Number of nearest neighbours to report per future row.

    Returns
    -------
    (all_NNs_df, all_dists_df) : tuple of pandas.DataFrame
        Row-wise concatenation, in order of first appearance of each location,
        of the frames returned by find_1_location_NNs_builtin(). Two empty
        DataFrames are returned when future_df has no location.
    """
    NNs_parts = []
    dists_parts = []

    for loc in future_df.location.unique():
        # pick up one location data
        curr_loc_df = future_df[future_df.location == loc].copy()

        # Private copy of the pool, so per-site standardization cannot leak
        # into the data used for the next site.
        loc_NNs, loc_dists = find_1_location_NNs_builtin(curr_location_df=curr_loc_df,
                                                         complete_hist_df=analog_pool.copy(),
                                                         numeric_feat=numeric_feat,
                                                         NN_count=NN_count)
        NNs_parts.append(loc_NNs)
        dists_parts.append(loc_dists)

    if not NNs_parts:
        # pd.concat() refuses an empty list
        return (pd.DataFrame(), pd.DataFrame())

    # Concatenate once at the end instead of re-copying the result per site.
    return (pd.concat(NNs_parts), pd.concat(dists_parts))

### Define directories

In [4]:
# Machine-specific absolute paths: edit these before running on another computer.
in_dir = "/Users/hn/Desktop/Desktop/Kirti/check_point/analogs/"  # folder the input CSV files are read from
out_dir = "/Users/hn/Desktop/"  # NOTE(review): not used by any cell visible here; presumably where results are saved

## Determine needed columns
Set the flags below to `True` to drop the corresponding feature columns (`mean_escaped_Gen3`, `mean_escaped_Gen4`, `mean_precip`):

In [5]:
# Feature-selection switches: when a flag is True, the matching column is
# removed from numeric_feat in the next cell.
gen_3_drop = False  # removes 'mean_escaped_Gen3'
gen_4_drop = False  # removes 'mean_escaped_Gen4'
precip_drop = False  # removes 'mean_precip'

In [6]:
# Identifier columns: carried along, never used in the distance computation.
non_numeric_feat = ['year', 'location', 'ClimateScenario']

# Numeric features the similarity is measured on (before any requested drop).
numeric_feat = [
    'medianDoY',
    'NumLarvaGens_Aug',
    'mean_escaped_Gen1',
    'mean_escaped_Gen2',
    'mean_escaped_Gen3',
    'mean_escaped_Gen4',
    'mean_precip',
    'mean_gdd',
]

# Apply the drop switches one by one.
if gen_3_drop:
    numeric_feat.remove('mean_escaped_Gen3')
if gen_4_drop:
    numeric_feat.remove('mean_escaped_Gen4')
if precip_drop:
    numeric_feat.remove('mean_precip')

print(numeric_feat)

['medianDoY', 'NumLarvaGens_Aug', 'mean_escaped_Gen1', 'mean_escaped_Gen2', 'mean_escaped_Gen3', 'mean_escaped_Gen4', 'mean_precip', 'mean_gdd']


### Read data

In [7]:
# Historical (all-USA) data, restricted to the identifier and feature columns.
hist_orig = pd.read_csv(in_dir + "all_data_usa.csv").loc[:, non_numeric_feat + numeric_feat]

In [8]:
# Future (RCP 4.5) data, restricted to the identifier and feature columns.
future_orig = pd.read_csv(in_dir + "averaged_data_rcp45.csv").loc[:, non_numeric_feat + numeric_feat]

In [9]:
# future_orig = future_orig.iloc[0:1000].copy()

# Driver


#### Filter the locations
Some locations in the local (future) data are not present in the all-USA data (`all_data_usa.csv`).
Here we keep only the future rows whose sites also exist in the all-USA data.

In [10]:
# Keep only the future rows whose site also exists in the historical data.
future_orig = filter_locations(all_dt=hist_orig, local_dt=future_orig)

In [None]:
# pick up one location data
# curr_location_df = future_orig[future_orig.location == future_orig.location.iloc[0]].copy()
# complete_hist_df = hist_orig.copy()
# curr_location_df.head(2)

In [None]:
# Scratch check on one site: run the concat-based driver on all of its future rows ...
future_dtt = future_orig[future_orig.location=='43.59375_-116.78125'].copy()
all_NNs_df, all_dists_df = find_ALL_location_NNs_not_efficient(future_df = future_dtt, 
                                                               analog_pool=hist_orig, 
                                                               numeric_feat=numeric_feat, 
                                                               NN_count=20)



In [None]:
# Rows of the result that belong to that site (future_dtt holds only this site).
A_def = all_NNs_df[all_NNs_df.location=='43.59375_-116.78125']

In [None]:
# Distinct neighbour lists across the future years (identifier columns excluded).
all_NNs_df.iloc[:, 2:].drop_duplicates()

In [None]:
# ... then on the first future row alone ...
all_NNs_df_0, all_dists_df_0 = find_ALL_location_NNs_not_efficient(future_df = future_dtt.iloc[0:1, :], 
                                                               analog_pool=hist_orig, 
                                                               numeric_feat=numeric_feat, 
                                                               NN_count=20)

In [None]:
# ... and on the second future row alone.
all_NNs_df_1, all_dists_df_1 = find_ALL_location_NNs_not_efficient(future_df = future_dtt.iloc[1:2, :], 
                                                                   analog_pool=hist_orig, 
                                                                   numeric_feat=numeric_feat, 
                                                                   NN_count=20)

In [None]:
# Stack the two one-row results (row 1 first, then row 0).
a = pd.concat([all_NNs_df_1, all_NNs_df_0])

In [None]:
# Distinct neighbour lists among the two one-row results.
a.iloc[:, 2:].drop_duplicates()

In [None]:
# Manual, cell-by-cell walk-through of the per-location normalization for two
# sites. Names ending in _0 belong to the first site in local_sites; the
# un-suffixed names belong to site "43.59375_-116.84375".
future_df = future_orig.copy()
analog_pool = hist_orig.copy()
complete_hist_df = analog_pool.copy()
local_sites = future_df.location.unique()

In [None]:
loc = local_sites[0]
curr_loc_df_0 = future_df[future_df.location == loc].copy()

In [None]:
loc = "43.59375_-116.84375"
curr_loc_df = future_df[future_df.location == loc].copy()

In [None]:
curr_loc_df.head()

In [None]:
curr_loc_df_0.head()

In [None]:
# ICV of the first site: its historical rows, standardized column by column.
ICV_0 = complete_hist_df.copy()
ICV_0 = ICV_0.loc[ICV_0['location'] == curr_loc_df_0.location.unique()[0]] # filter corresponding location
ICV_means_0 = ICV_0.loc[:, numeric_feat].mean()
ICV_stds_0 = ICV_0.loc[:, numeric_feat].std()
ICV_stds_0[ICV_stds_0.le(10**(-10))] = 1  # (near-)constant columns: divide by 1 instead of ~0
ICV_0 = (ICV_0.loc[:, numeric_feat] - ICV_means_0) / ICV_stds_0

In [None]:
# Same for the second site. Must run BEFORE complete_hist_df is normalized below.
ICV = complete_hist_df.copy()
ICV = ICV.loc[ICV['location'] == curr_loc_df.location.unique()[0]] # filter corresponding location
ICV_means = ICV.loc[:, numeric_feat].mean()
ICV_stds = ICV.loc[:, numeric_feat].std()
ICV_stds[ICV_stds.le(10**(-10))] = 1  # (near-)constant columns: divide by 1 instead of ~0
ICV = (ICV.loc[:, numeric_feat] - ICV_means) / ICV_stds

In [None]:
ICV.head(3)

In [None]:
ICV_0.head(3)

In [None]:
# NOTE: overwrites curr_loc_df_0 in place; running this cell twice standardizes twice.
curr_loc_df_0.loc[:, numeric_feat] = (curr_loc_df_0.loc[:, numeric_feat] - ICV_means_0) / ICV_stds_0

In [None]:
# NOTE: overwrites curr_loc_df in place; running this cell twice standardizes twice.
curr_loc_df.loc[:, numeric_feat] = (curr_loc_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

In [None]:
curr_loc_df_0.head(2)

In [None]:
curr_loc_df.head(2)

In [None]:
# Separate copy of the (still un-normalized) pool for the first site.
complete_hist_df_0 = complete_hist_df.copy()

In [None]:
complete_hist_df_0.loc[:, numeric_feat]

In [None]:
# Express each pool in the units of its own site's ICV.
# NOTE: both assignments are in place, so this cell is not idempotent, and
# complete_hist_df no longer holds raw values afterwards.
complete_hist_df_0.loc[:, numeric_feat]=(complete_hist_df_0.loc[:, numeric_feat] - ICV_means_0) / ICV_stds_0
complete_hist_df.loc[:, numeric_feat] = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

In [None]:
complete_hist_df_0.head(2)

In [None]:
complete_hist_df.head(2)

In [None]:
# Numeric-only views used for the PCA / neighbour search.
complete_hist_df_numeric_0 = complete_hist_df_0.loc[:, numeric_feat].copy()
complete_hist_df_numeric_0.head()

In [None]:
complete_hist_df_numeric = complete_hist_df.loc[:, numeric_feat].copy()
complete_hist_df_numeric.head(3)

In [None]:
future_numeric_0 = curr_loc_df_0.loc[:, numeric_feat].copy()
future_numeric_0.head(3)

In [None]:
future_numeric = curr_loc_df.loc[:, numeric_feat].copy()
future_numeric.head(3)

In [None]:
# ICV_0 already contains only the numeric_feat columns; this selection keeps them all.
ICV_0 = ICV_0.loc[:, numeric_feat]
ICV_0.head()

In [None]:
ICV = ICV.loc[:, numeric_feat]
ICV.head()

In [None]:
pca_0 = PCA(n_components = detect_effective_compon(ICV_0))

In [None]:
pca = PCA(n_components = detect_effective_compon(ICV))

In [None]:
pca.fit(ICV_0);
pca.fit(ICV);

In [None]:
ICV_pca_0 = pca.transform(ICV_0)

In [None]:
ICV_pca = pca.transform(ICV)

In [None]:
hist_pca_0 = pca.transform(complete_hist_df_numeric_0)
hist_pca_0

In [None]:
hist_pca = pca.transform(complete_hist_df_numeric)
hist_pca

In [None]:
future_pca_0 = pca.transform(future_numeric_0)
future_pca_0

In [None]:
future_pca = pca.transform(future_numeric)
future_pca

In [None]:
# Number of neighbours requested for the manual walk-through.
NN_count=10

In [None]:
# NOTE(review): metric="mahalanobis" is used without metric_params (no V / VI
# covariance is supplied) — confirm which covariance scikit-learn applies here.
neigh_0 = NearestNeighbors(n_neighbors=NN_count, metric = "mahalanobis", algorithm="brute")

In [None]:
neigh = NearestNeighbors(n_neighbors=NN_count, metric = "mahalanobis", algorithm="brute")

In [None]:
# Each search structure is fitted on the pool expressed in its own site's PCA space.
neigh_0.fit(hist_pca_0)
neigh.fit(hist_pca)

In [None]:
# Result tables for the first site: identifier columns plus '-999' placeholders.
future_yr_count_0 = curr_loc_df_0.shape[0]
needed_col_count = NN_count * 2

NNs_df_0   = curr_loc_df_0[['year', 'location']].copy() # data frame containing (year, location)
dists_df_0 = curr_loc_df_0[['year', 'location']].copy() # data frame containing distances

## concatenate new data frame to above ones, to speed up
NNs_df_new_cols_0, dists_df_new_cols_0 = create_colnames(NN_count)

NNs_df_helper_0 = pd.DataFrame('-999', index = NNs_df_0.index,  columns = NNs_df_new_cols_0)
dists_df_helper_0 = pd.DataFrame('-999', index = dists_df_0.index,  columns = dists_df_new_cols_0)

NNs_df_0 = pd.concat([NNs_df_0, NNs_df_helper_0], axis=1)
dists_df_0 = pd.concat([dists_df_0, dists_df_helper_0], axis=1)

del(NNs_df_helper_0, dists_df_helper_0, NNs_df_new_cols_0, dists_df_new_cols_0)

In [None]:
# Same result tables for the second site.
future_yr_count = curr_loc_df.shape[0]
needed_col_count = NN_count * 2

NNs_df   = curr_loc_df[['year', 'location']].copy() # data frame containing (year, location)
dists_df = curr_loc_df[['year', 'location']].copy() # data frame containing distances

## concatenate new data frame to above ones, to speed up
NNs_df_new_cols, dists_df_new_cols = create_colnames(NN_count)

NNs_df_helper = pd.DataFrame('-999', index=NNs_df.index,  columns=NNs_df_new_cols)
dists_df_helper = pd.DataFrame('-999', index=dists_df.index,  columns=dists_df_new_cols)

NNs_df = pd.concat([NNs_df, NNs_df_helper], axis=1)
dists_df = pd.concat([dists_df, dists_df_helper], axis=1)

del(NNs_df_helper, dists_df_helper, NNs_df_new_cols, dists_df_new_cols)

In [None]:
for yr in np.arange(curr_loc_df_0.shape[0]):
    result_0 = neigh_0.kneighbors([future_pca_0[yr, ]])
    NNs_distances_0 = result_0[0][0]
    NNs_idx_0 = result_0[1][0]

    curr_NNs_0 = complete_hist_df_0.loc[NNs_idx_0, ['year', 'location']].copy()
    curr_NNs_0 = list(np.hstack(np.split(curr_NNs_0, NN_count))[0])
    NNs_df_0.iloc[yr, 2:] = curr_NNs_0
      
    dists_df_0.iloc[yr, 2:] = NNs_distances_0

In [None]:
for yr in np.arange(curr_loc_df.shape[0]):
    result = neigh.kneighbors([future_pca_0[yr, ]])
    NNs_distances = result[0][0]
    NNs_idx = result[1][0]

    curr_NNs = complete_hist_df.loc[NNs_idx, ['year', 'location']].copy()
    curr_NNs = list(np.hstack(np.split(curr_NNs, NN_count))[0])
    NNs_df.iloc[yr, 2:] = curr_NNs
      
    dists_df.iloc[yr, 2:] = NNs_distances

In [None]:
NNs_df

In [None]:
NNs_df_0.head()

In [None]:
NNs_df.head()

In [None]:
dists_df.head(2)

In [None]:
dists_df_0.head(2)

In [None]:
# Neighbour lookup for the second site (curr_loc_df), one future row at a time.
# The loop bound uses curr_loc_df, the frame that future_pca and NNs_df were built from.
for yr in np.arange(curr_loc_df.shape[0]):
    result = neigh.kneighbors([future_pca[yr, ]])
    NNs_distances = result[0][0]
    NNs_idx = result[1][0]

    # kneighbors returns row POSITIONS in the fitted array, hence iloc.
    # Flatten row by row: year_1, location_1, year_2, location_2, ...
    curr_NNs = complete_hist_df.iloc[NNs_idx][['year', 'location']]
    curr_NNs = curr_NNs.to_numpy().ravel().tolist()
    NNs_df.iloc[yr, 2:] = curr_NNs

    dists_df.iloc[yr, 2:] = NNs_distances

# Below this line

In [23]:
def find_1_location_NNs_builtin(curr_location_df, complete_hist_df, numeric_feat, NN_count, meter="euclidean"):
    """
    Find the NN_count nearest historical analogs of every future year of ONE location.

    Both inputs are standardized with the mean / standard deviation of the
    historical rows of this location (the "ICV"), projected onto the principal
    components of that ICV, and searched with scikit-learn's brute-force
    NearestNeighbors.

    Parameters
    ----------
    curr_location_df : pandas.DataFrame
        Future rows of a single location. Must contain 'year', 'location' and
        every column in numeric_feat. Not modified.
    complete_hist_df : pandas.DataFrame
        Pool of candidate analogs with the same columns. It must contain rows
        for the location of curr_location_df. Not modified.
    numeric_feat : list of str
        Numeric feature columns used to measure similarity.
    NN_count : int
        Number of neighbours returned per future row.
    meter : str, default "euclidean"
        Passed as `metric` to sklearn.neighbors.NearestNeighbors.

    Returns
    -------
    (NNs_df, dists_df) : tuple of pandas.DataFrame
        One row per row of curr_location_df (same index). After 'year' and
        'location', NNs_df holds year_NN_i / location_NN_i for i = 1..NN_count
        and dists_df holds dist_NN_i.

    Raises
    ------
    ValueError
        If complete_hist_df has no row for the location of curr_location_df.
    """
    curr_location_df_cp = curr_location_df.copy()
    future_yr_count = curr_location_df_cp.shape[0]

    # Allocate the outputs once; '-999' is the placeholder for unfilled cells.
    NNs_df_new_cols, dists_df_new_cols = create_colnames(NN_count)
    id_cols = curr_location_df_cp[['year', 'location']]
    NNs_df = pd.concat(
        [id_cols.copy(), pd.DataFrame('-999', index=id_cols.index, columns=NNs_df_new_cols)],
        axis=1)
    dists_df = pd.concat(
        [id_cols.copy(), pd.DataFrame('-999', index=id_cols.index, columns=dists_df_new_cols)],
        axis=1)

    if future_yr_count == 0:
        # Nothing to look up (and no location to read the ICV for).
        return (NNs_df, dists_df)

    # form the ICV: the historical rows of this very location
    site = curr_location_df_cp.location.unique()[0]
    ICV = complete_hist_df.loc[complete_hist_df['location'] == site, numeric_feat]
    if ICV.empty:
        raise ValueError("no historical rows found for location %r" % (site,))

    #############################################################################
    #
    #          Normalize before doing anything
    #
    #############################################################################
    ICV_means = ICV.mean()
    ICV_stds = ICV.std()
    ICV_stds[ICV_stds.le(10**(-10))] = 1  # (near-)constant features: avoid dividing by ~0

    # Standardize into NEW frames so that neither argument is altered; writing
    # the result back into complete_hist_df would corrupt the caller's pool.
    ICV = (ICV - ICV_means) / ICV_stds
    future_numeric = (curr_location_df_cp.loc[:, numeric_feat] - ICV_means) / ICV_stds
    complete_hist_df_numeric = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds

    ### Apply PCA here and use those to find analogs
    pca = PCA(n_components=detect_effective_compon(ICV))
    pca.fit(ICV)

    # transform data into PCA space to compute analogs
    hist_pca = pca.transform(complete_hist_df_numeric)
    future_pca = pca.transform(future_numeric)

    neigh = NearestNeighbors(n_neighbors=NN_count, metric=meter, algorithm="brute")
    neigh.fit(hist_pca)

    # (year, location) of every historical row, in the row order given to fit()
    hist_ids = complete_hist_df[['year', 'location']]

    # One query per future row (all of them, not just the first two).
    for yr in range(future_yr_count):
        distances, indices = neigh.kneighbors([future_pca[yr, ]])
        NNs_distances = distances[0]
        NNs_idx = indices[0]

        # kneighbors returns POSITIONS in the fitted array, hence iloc.
        # Flatten row by row so the columns read year_1, location_1, year_2, ...
        curr_NNs = hist_ids.iloc[NNs_idx].to_numpy().ravel().tolist()
        NNs_df.iloc[yr, 2:] = curr_NNs
        dists_df.iloc[yr, 2:] = NNs_distances
    return (NNs_df, dists_df)

In [61]:
# Step-by-step replay of find_1_location_NNs_builtin() for one site of interest.
site_of_int = future_orig[future_orig.location == "43.59375_-116.78125"]
NN_count = 5
meter = "euclidean" # mahalanobis, 

# Stand-ins for the function's two data arguments.
curr_location_df = site_of_int.copy()
complete_hist_df = hist_orig.copy()

In [62]:
# Working copies, as made at the top of the function body.
curr_location_df_cp = curr_location_df.copy()
complete_hist_df_cp = complete_hist_df.copy()

future_yr_count = curr_location_df_cp.shape[0]
needed_col_count = NN_count * 2

NNs_df   = curr_location_df_cp[['year', 'location']].copy() # data frame containing (year, location)
dists_df = curr_location_df_cp[['year', 'location']].copy() # data frame containing distances

## concatenate new data frame to above ones, to speed up
NNs_df_new_cols, dists_df_new_cols = create_colnames(NN_count)

# '-999' is the placeholder for cells not filled yet
NNs_df_helper = pd.DataFrame('-999', index=NNs_df.index,  columns=NNs_df_new_cols)
dists_df_helper = pd.DataFrame('-999', index=dists_df.index,  columns=dists_df_new_cols)

NNs_df = pd.concat([NNs_df, NNs_df_helper], axis=1)
dists_df = pd.concat([dists_df, dists_df_helper], axis=1)

del(NNs_df_helper, dists_df_helper, NNs_df_new_cols, dists_df_new_cols)

In [63]:
# ICV: the historical rows of the site of interest.
ICV = complete_hist_df.copy()
ICV = ICV.loc[ICV['location'] == curr_location_df_cp.location.unique()[0]] # filter corresponding location
#############################################################################
#
#          Normalize before doing anything
#
#############################################################################
ICV_means = ICV.loc[:, numeric_feat].mean()
ICV_stds = ICV.loc[:, numeric_feat].std()
ICV_stds[ICV_stds.le(10**(-10))] = 1  # (near-)constant features: avoid dividing by ~0

ICV = (ICV.loc[:, numeric_feat] - ICV_means) / ICV_stds
# Read from the untouched originals and write into the *_cp working copies, so
# that complete_hist_df / curr_location_df keep their raw values and this cell
# gives the same result however many times it is run.
curr_location_df_cp.loc[:, numeric_feat] = (curr_location_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
complete_hist_df_cp.loc[:, numeric_feat] = (complete_hist_df.loc[:, numeric_feat] - ICV_means) / ICV_stds
#
# pick numerical part of the data frame to do the operations:
#
complete_hist_df_numeric = complete_hist_df_cp.loc[:, numeric_feat].copy()
future_numeric = curr_location_df_cp.loc[:, numeric_feat].copy()
ICV = ICV.loc[:, numeric_feat]

In [64]:
# Number of principal components of the ICV with explained variance above 0.01.
detect_effective_compon(ICV)

7

In [76]:
ICV.shape

(37, 8)

In [77]:
### Apply PCA here and use those to find analogs
# NOTE: all 8 components are kept here (the number of ICV columns), not the 7
# reported by detect_effective_compon(ICV) above.
pca = PCA(n_components = 8)
pca.fit(ICV);
#
# transform data into PCA space to compute analogs
ICV_pca = pca.transform(ICV)
hist_pca = pca.transform(complete_hist_df_numeric)
future_pca = pca.transform(future_numeric)

In [78]:
# First row of the Gram matrix (row-by-row inner products) of the PCA scores of the ICV.
np.dot(ICV_pca, ICV_pca.transpose())[0]

array([ 0.67501134,  0.58296693,  0.82467531,  0.74516984,  1.26769116,
        0.85198479, -0.67452197, -0.7902211 , -0.39207248, -0.27255373,
       -0.27965518, -0.59185053,  0.27674365, -2.1516196 ,  0.81403737,
       -1.29193397,  0.2272856 ,  0.08476283, -0.03999558,  1.23218582,
       -0.07925489, -0.23312085,  0.12281806, -0.41657607, -0.07447846,
       -0.72805407,  0.46049125,  0.51987213, -2.01270611,  0.6814595 ,
        0.14134909,  0.22474309,  0.75333838,  0.15086148, -0.146713  ,
        0.6102927 , -1.07241274])

In [79]:
# First row of the Gram matrix of the standardized ICV itself (before PCA).
np.dot(ICV, ICV.transpose())[0]

array([ 0.67501134,  0.58296693,  0.82467531,  0.74516984,  1.26769116,
        0.85198479, -0.67452197, -0.7902211 , -0.39207248, -0.27255373,
       -0.27965518, -0.59185053,  0.27674365, -2.1516196 ,  0.81403737,
       -1.29193397,  0.2272856 ,  0.08476283, -0.03999558,  1.23218582,
       -0.07925489, -0.23312085,  0.12281806, -0.41657607, -0.07447846,
       -0.72805407,  0.46049125,  0.51987213, -2.01270611,  0.6814595 ,
        0.14134909,  0.22474309,  0.75333838,  0.15086148, -0.146713  ,
        0.6102927 , -1.07241274])

In [84]:
# Full Gram matrix of the PCA scores: one row and one column per ICV row.
np.matmul(ICV_pca, ICV_pca.transpose())

array([[ 0.67501134,  0.58296693,  0.82467531, ..., -0.146713  ,
         0.6102927 , -1.07241274],
       [ 0.58296693,  3.37156484,  3.8890029 , ..., -2.39836335,
         0.19731406, -4.77002201],
       [ 0.82467531,  3.8890029 ,  5.00621334, ..., -2.91636435,
         0.74780131, -5.20816261],
       ...,
       [-0.146713  , -2.39836335, -2.91636435, ...,  2.74199704,
        -0.71934626,  1.16097701],
       [ 0.6102927 ,  0.19731406,  0.74780131, ..., -0.71934626,
         1.62246561,  1.95746039],
       [-1.07241274, -4.77002201, -5.20816261, ...,  1.16097701,
         1.95746039, 13.69479919]])

In [110]:
# np.cov treats every ROW as a variable by default (rowvar=True), so a
# (20, 3) input yields a 20 x 20 matrix. The random draw is not seeded.
A = np.random.rand(20, 3)
np.cov(A).shape

(20, 20)

In [112]:
# Toy check on random data (not seeded, so the printed numbers change on every run).
A = np.random.rand(20, 3)
# NOTE: rebinding `pca` here replaces the model fitted on the ICV in an earlier cell.
pca = PCA(n_components = 3)
pca.fit(A);
A_pca = pca.transform(A)
# Covariance of the raw A (not of A_pca); the result is not used in this cell.
A_pca_1 = sklearn.covariance.empirical_covariance(A, assume_centered=False)
# Rows are treated as variables by np.cov, so this prints 20 values.
print(np.cov(A_pca)[0])

[ 0.2099751   0.1602145  -0.00314589 -0.04802889  0.06953303  0.12085592
 -0.18602613  0.01842461 -0.09123304  0.00043655 -0.13665469  0.0618712
 -0.04276755  0.01870939 -0.00386734 -0.15509927 -0.01483284  0.02354324
 -0.05794508  0.05603718]


In [13]:
one_ICV = hist_orig[hist_orig.location == hist_orig.location[0]].copy()

In [18]:
one_ICV_numeric = one_ICV.drop(['year', 'location', 'ClimateScenario', 'mean_escaped_Gen4'], axis=1)

In [21]:
one_ICV_numeric_centered = one_ICV_numeric - one_ICV_numeric

Unnamed: 0,medianDoY,NumLarvaGens_Aug,mean_escaped_Gen1,mean_escaped_Gen2,mean_escaped_Gen3,mean_precip,mean_gdd
0,68,2.939262,1.319697,7.080458,1.348167,300.525,4821.668922
1293,57,3.061175,1.004624,7.226781,2.052017,276.025,5031.662206


In [None]:
# Fit a PCA on the standardized ICV, keeping only as many components as
# detect_effective_compon() reports.
pca = PCA(n_components=detect_effective_compon(ICV)).fit(ICV)

# Project the future rows, the historical pool and the ICV into that PCA space.
future_pca = pca.transform(future_numeric)
hist_pca = pca.transform(complete_hist_df_numeric)
ICV_pca = pca.transform(ICV)