### Dataset from ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2_SR_HARMONIZED

In [1]:
import datetime
import geopandas
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

#from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib

 # to enable ipympl interactive interface for plots
%matplotlib widget

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from matplotlib.backends.backend_pdf import PdfPages

In [2]:
import time
import matplotlib
from distinctipy import distinctipy # generate N distinct colors

In [3]:
#local scripts
from scripts import veg_indices, utilities, plots
#from scripts.utilities import *

In [4]:
# Sentinel-2 band ids used in this notebook, mapped to readable names.
BANDS_DICT = dict(
    B2='Blue',
    B3='Green',
    B4='Red',
    B5='Red_Edge_1',
    B6='Red_Edge_2',
    B7='Red_Edge_3',
    B8='NIR',
    B8A='Red_Edge_4',
    B11='SWIR_1',
    B12='SWIR_2',
)

# Band ids in the same order as they are declared above.
BANDS = [band_id for band_id in BANDS_DICT]

In [5]:
# Load the "merged_images_train" data for the selected bands.
# NOTE(review): judging by the names and the printed output, get_df also adds the
# vegetation indices and their *_diff columns and returns the list of numeric
# columns plus a sample count — confirm in scripts/utilities.
df, NUMERIC_COLS, NUM_SAMPLES = utilities.get_df("merged_images_train", veg_indices, BANDS)
# For each 3-week image, standardize each column; outliers are kept here (rm_outliers=False)
df = utilities.get_rm_outlier_standarize(df, NUMERIC_COLS, rm_outliers=False)

Added:  ['RVI', 'ARVI', 'PSSRa', 'NDI45', 'GNDVI', 'MCARI', 'IRECI', 'CIr', 'MTCI', 'NDVIre', 'NIRv', 'EVI', 'NDTI', 'NDMI', 'MSI', 'GCI', 'NBRI', 'BSI', 'NDWI', 'NDSI']
(not in place), created : ['RVI_diff', 'ARVI_diff', 'PSSRa_diff', 'NDI45_diff', 'GNDVI_diff', 'MCARI_diff', 'IRECI_diff', 'CIr_diff', 'MTCI_diff', 'NDVIre_diff', 'NIRv_diff', 'EVI_diff', 'NDTI_diff', 'NDMI_diff', 'MSI_diff', 'GCI_diff', 'NBRI_diff', 'BSI_diff', 'NDWI_diff', 'NDSI_diff', 'NDVI_diff']
Index(['index', 'B11', 'B12', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A',
       'NDVI', 'finHarvDat', 'lat', 'lon', 'point_idx', 'start_date',
       'end_date', 'har_evnt', 'image_idx', 'geometry', 'RVI', 'ARVI', 'PSSRa',
       'NDI45', 'GNDVI', 'MCARI', 'IRECI', 'CIr', 'MTCI', 'NDVIre', 'NIRv',
       'EVI', 'NDTI', 'NDMI', 'MSI', 'GCI', 'NBRI', 'BSI', 'NDWI', 'NDSI',
       'RVI_diff', 'ARVI_diff', 'PSSRa_diff', 'NDI45_diff', 'GNDVI_diff',
       'MCARI_diff', 'IRECI_diff', 'CIr_diff', 'MTCI_diff', 'NDVIre_diff',


In [6]:
# For each 3-week image, standardize each column, after removing outliers
# (rm_outliers is left at its default in this call).
# NOTE(review): df was already passed through get_rm_outlier_standarize in the
# previous cell, so this call runs on that output — confirm this is intended.
df_trimmed = utilities.get_rm_outlier_standarize(df, NUMERIC_COLS)

# Show the resulting columns and the (rows, columns) shape
print(df_trimmed.columns, df_trimmed.shape)

Index(['index', 'B11', 'B12', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A',
       'NDVI', 'finHarvDat', 'lat', 'lon', 'point_idx', 'start_date',
       'end_date', 'har_evnt', 'image_idx', 'geometry', 'RVI', 'ARVI', 'PSSRa',
       'NDI45', 'GNDVI', 'MCARI', 'IRECI', 'CIr', 'MTCI', 'NDVIre', 'NIRv',
       'EVI', 'NDTI', 'NDMI', 'MSI', 'GCI', 'NBRI', 'BSI', 'NDWI', 'NDSI',
       'RVI_diff', 'ARVI_diff', 'PSSRa_diff', 'NDI45_diff', 'GNDVI_diff',
       'MCARI_diff', 'IRECI_diff', 'CIr_diff', 'MTCI_diff', 'NDVIre_diff',
       'NIRv_diff', 'EVI_diff', 'NDTI_diff', 'NDMI_diff', 'MSI_diff',
       'GCI_diff', 'NBRI_diff', 'BSI_diff', 'NDWI_diff', 'NDSI_diff',
       'NDVI_diff'],
      dtype='object') (15326, 62)


# NOTE: df_trimmed contains NaN values because outliers were replaced with NaN

In [None]:
# Apply utilities.stretch_cols to the trimmed frame over NUMERIC_COLS.
# NOTE(review): the box-plot cell below reads "value" and "class" columns from the
# result, so this presumably reshapes to long format — confirm in scripts/utilities.
stretched_df_trimmed = utilities.stretch_cols(df_trimmed, NUMERIC_COLS)
# Last expression of the cell: displays the frame in the notebook
stretched_df_trimmed

### Producing Box Plots per Numeric Column

In [None]:
%%script echo skipping

def save_multi_image(filename):
    """Write one box-plot page per image index into a multi-page PDF.

    Reads the notebook-level ``stretched_df_trimmed`` frame. For every unique
    ``image_idx`` it draws a horizontal box plot of ``value`` per ``class``,
    split by ``har_evnt``, and appends the figure to the PDF as a new page.

    Args:
        filename: Path of the PDF file to create.
    """
    # The context manager closes the PDF even if one of the plots raises.
    with PdfPages(filename) as pdf:
        for sample_idx in stretched_df_trimmed.image_idx.unique():
            curr_df = stretched_df_trimmed[stretched_df_trimmed.image_idx == sample_idx]
            # Figure height grows with the number of numeric columns (one box row each).
            fig = plt.figure(figsize=(16, int(len(NUMERIC_COLS) * 1.5)))
            # set showfliers to False to hide the outlier markers
            ax = sns.boxplot(data=curr_df, x="value", y="class", hue="har_evnt", showfliers=True)
            # Pass a string: a tuple would be rendered as its repr, e.g. "('Sample Index:', 6)".
            ax.set(title=f"Sample Index: {sample_idx}")
            fig.savefig(pdf, format='pdf')
            # Close this specific figure so figures do not accumulate in memory.
            plt.close(fig)

# Write the per-image box plots of the trimmed data to a single PDF
save_multi_image("../plots/box_plots/everything_trimmed_standarized.pdf")

### Producing Separability Plots

In [None]:
%%script echo skipping

def plot_lines_Mean_95CI(df: pd.DataFrame):
    """Draw one panel per band against ``finHarvDat`` and save the figure as a PDF.

    Each panel holds a seaborn line plot (default estimator and error band; the
    figure title labels them as mean and 95% CI) and a scatter plot of the band
    values, both coloured by ``har_evnt``, plus a 40-bin step histogram of
    ``finHarvDat`` on a twin y-axis. The figure is written to
    ``../plots/line_plots/mean&CIByBand.pdf`` through ``utilities.saveFigsAsPDF``.

    Args:
        df: Frame with a ``finHarvDat`` column, a ``har_evnt`` column and one
            column per entry of ``BANDS``.
    """
    n_bands = len(BANDS)
    # Note: sns.set changes the global seaborn/matplotlib settings, not just this figure.
    sns.set(rc={'figure.figsize': (n_bands * 8, n_bands * 5)})

    fig, axes = plt.subplots(n_bands, 1)
    fig.suptitle('Mean and 95% CI by Band')
    fig.subplots_adjust(hspace=0.5, wspace=0.5)

    # Keyword arguments shared by every seaborn call below.
    common = dict(data=df, x="finHarvDat")
    by_event = dict(hue="har_evnt")

    for panel, band_name in zip(axes, BANDS):
        sns.lineplot(ax=panel, y=band_name, **common, **by_event)
        sns.scatterplot(ax=panel, y=band_name, style="har_evnt", alpha=0.5, **common, **by_event)

        # Sample-count histogram on a secondary y-axis sharing the same x-axis.
        count_axis = panel.twinx()
        sns.histplot(ax=count_axis, bins=40, element="step", fill=False, color='green', **common)

        panel.set_title(BANDS_DICT[band_name] + f' ({band_name})')

    plt.close()  # closing figure

    utilities.saveFigsAsPDF([fig], "../plots/line_plots/mean&CIByBand.pdf")

# finHarvDat is the x-axis of every panel, so rows without it are dropped first
plot_lines_Mean_95CI(df.dropna(subset=["finHarvDat"]))

In [None]:
%%script echo skipping

def plot_lines_Median_IQR(df: pd.DataFrame):
    """Draw one panel per band (median line with a 50% percentile band) and save as PDF.

    Each panel holds a seaborn line plot using the median estimator and a
    ``("pi", 50)`` error band, a scatter plot of the band values (both coloured
    by ``har_evnt``), and a 40-bin step histogram of ``finHarvDat`` on a twin
    y-axis. The figure is written to ``../plots/line_plots/median&IQRByBand.pdf``
    through ``utilities.saveFigsAsPDF``.

    Args:
        df: Frame with a ``finHarvDat`` column, a ``har_evnt`` column and one
            column per entry of ``BANDS``.
    """
    n_bands = len(BANDS)
    # Note: sns.set changes the global seaborn/matplotlib settings, not just this figure.
    sns.set(rc={'figure.figsize': (n_bands * 8, n_bands * 5)})
    fig, axes = plt.subplots(n_bands, 1)
    fig.suptitle('Median and IQR by Band')
    fig.subplots_adjust(hspace=0.5, wspace=0.5)

    # Keyword arguments shared by every seaborn call below.
    common = dict(data=df, x="finHarvDat")

    # https://stackoverflow.com/questions/52525476/seaborn-lineplot-using-median-instead-of-mean
    for panel, band_name in zip(axes, BANDS):
        # ("pi", 50) shows the inner quartile range
        # https://seaborn.pydata.org/tutorial/error_bars.html
        sns.lineplot(
            ax=panel,
            y=band_name,
            hue="har_evnt",
            estimator="median",
            errorbar=("pi", 50),
            **common,
        )

        sns.scatterplot(
            ax=panel,
            y=band_name,
            hue="har_evnt",
            style="har_evnt",
            alpha=0.5,
            **common,
        )

        # Sample-count histogram on a secondary y-axis sharing the same x-axis.
        count_axis = panel.twinx()
        sns.histplot(ax=count_axis, bins=40, element="step", fill=False, color='green', **common)

        panel.set_title(BANDS_DICT[band_name] + f' ({band_name})')

    plt.close()  # closing figure
    utilities.saveFigsAsPDF([fig], "../plots/line_plots/median&IQRByBand.pdf")

# finHarvDat is the x-axis of every panel, so rows without it are dropped first
plot_lines_Median_IQR(df.dropna(subset=["finHarvDat"]))

## Normalize using the max value in each 3-week period (i.e., each image)


In [9]:
def get_top_N_features(N: int, COLUMN_NAMES: list[str], df_trimmed: pd.DataFrame) -> list[str]:
    """Rank features by a combined separability score and return the best ``N``.

    ``plots.plot_per_period`` is called once per separability metric
    (``metric=0..3``); each call also writes
    ``../plots/bar_plots/seperability{metric}.pdf``. Its result is used as a
    mapping from image index to a DataFrame with ``class`` and ``value``
    columns. For every metric and image the values become scores (inverted for
    metrics 2 and 3), are divided by the largest score of that image, and are
    added to the running total of the matching feature.

    Args:
        N: Number of top-ranked features to return.
        COLUMN_NAMES: Feature (column) names to rank.
        df_trimmed: Data passed through to ``plots.plot_per_period``.

    Returns:
        The ``N`` feature names with the highest accumulated score, best first.
        The list is also printed.
    """
    num_metrics = 4
    # Metrics from this index on mean "more separable" when the value is LOWER,
    # opposite to the earlier ones, so their values are inverted before scoring.
    first_inverted_metric = 2

    frames_per_metric = [
        plots.plot_per_period(
            utilities.get_classes_colors(COLUMN_NAMES),
            df_trimmed,
            COLUMN_NAMES,
            f"../plots/bar_plots/seperability{metric}.pdf",
            metric=metric,
        )
        for metric in range(num_metrics)
    ]

    df_scores = pd.DataFrame(
        {"class": np.array(COLUMN_NAMES), "score": np.zeros(len(COLUMN_NAMES))}
    ).set_index("class")

    # Every metric is looked up with the image indices of the first one.
    image_indices = sorted(frames_per_metric[0].keys())

    for metric, frames in enumerate(frames_per_metric):
        for image_idx in image_indices:
            frame = frames[image_idx]
            # Work on a separate Series so the frames returned by
            # plot_per_period are not modified.
            score = pd.Series(np.array(frame["value"]), index=frame["class"].to_numpy())

            if metric >= first_inverted_metric:
                # NOTE(review): a value of exactly 0 becomes inf here; after the
                # normalization below every score of this image then ends up as
                # 0 (inf/inf -> NaN -> 0). Confirm whether that can happen.
                score = 1 / score

            # normalize based on the max value
            score = score / score.max()

            # Features missing from this frame (and NaN scores) contribute 0.
            df_scores["score"] += score.reindex(df_scores.index).fillna(0)

    ranked = df_scores.sort_values(by=["score"], ascending=False)

    TOP_FEATURES = list(ranked[:N].index)
    print()
    print(TOP_FEATURES)
    print()

    return TOP_FEATURES

# Rank every numeric column and report the 15 highest-scoring features
get_top_N_features(15, NUMERIC_COLS, df_trimmed)

  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))



['B8', 'RVI', 'IRECI_diff', 'B4', 'NBRI_diff', 'NDTI_diff', 'MSI', 'NDMI_diff', 'CIr', 'NDI45', 'NDMI', 'NDI45_diff', 'BSI', 'NIRv', 'ARVI']



['B8',
 'RVI',
 'IRECI_diff',
 'B4',
 'NBRI_diff',
 'NDTI_diff',
 'MSI',
 'NDMI_diff',
 'CIr',
 'NDI45',
 'NDMI',
 'NDI45_diff',
 'BSI',
 'NIRv',
 'ARVI']