### Dataset from ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2_SR_HARMONIZED

In [2]:
import datetime
import geopandas
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

#from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib

 # to enable ipympl interactive interface for plots
%matplotlib widget

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from matplotlib.backends.backend_pdf import PdfPages


In [3]:
import time
import matplotlib
from distinctipy import distinctipy # generate N distinct colors


In [4]:
#local scripts
from scripts import veg_indices, utilities, plots
#from scripts.utilities import *

In [5]:
BANDS_DICT = {   'B2': 'Blue',
            'B3': 'Green',
            'B4': 'Red',
            'B5': 'Red_Edge_1',
            'B6': 'Red_Edge_2',
            'B7': 'Red_Edge_3',
            'B8': 'NIR',
            'B8A': 'Red_Edge_4',
            'B11': 'SWIR_1',
            'B12': 'SWIR_2'}

BANDS = list(BANDS_DICT.keys())

In [6]:
DF = geopandas.read_file('../data/merged_images.geojson')
DF.rename(columns = {'is_within_period':'har_evnt'}, inplace = True)
NUM_SAMPLES = len(np.unique(DF.image_idx)) - 1

In [8]:
df = DF.copy()
df = df[(df.NDVI) != 0] # drop invalid points
VEG_INDICES_NAMES = veg_indices.add_veg_indices(df) + ['NDVI'] 
df, VEG_DIFF_NAMES = veg_indices.get_added_veg_diff(df, VEG_INDICES_NAMES)
NUMERIC_COLS = BANDS + VEG_INDICES_NAMES + VEG_DIFF_NAMES

df.columns, df.shape

Added:  ['RVI', 'ARVI', 'PSSRa', 'NDI45', 'GNDVI', 'MCARI', 'IRECI', 'CIr', 'MTCI', 'NDVIre', 'NIRv', 'EVI', 'NDTI', 'NDMI', 'MSI', 'GCI', 'NBRI', 'BSI', 'NDWI', 'NDSI']
Added:  ['image_idx', 'RVI_diff', 'ARVI_diff', 'PSSRa_diff', 'NDI45_diff', 'GNDVI_diff', 'MCARI_diff', 'IRECI_diff', 'CIr_diff', 'MTCI_diff', 'NDVIre_diff', 'NIRv_diff', 'EVI_diff', 'NDTI_diff', 'NDMI_diff', 'MSI_diff', 'GCI_diff', 'NBRI_diff', 'BSI_diff', 'NDWI_diff', 'NDSI_diff', 'NDVI_diff']


(Index(['B11', 'B12', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'NDVI',
        'finHarvDat', 'lat', 'lon', 'point_idx', 'start_date', 'end_date',
        'har_evnt', 'image_idx', 'geometry', 'RVI', 'ARVI', 'PSSRa', 'NDI45',
        'GNDVI', 'MCARI', 'IRECI', 'CIr', 'MTCI', 'NDVIre', 'NIRv', 'EVI',
        'NDTI', 'NDMI', 'MSI', 'GCI', 'NBRI', 'BSI', 'NDWI', 'NDSI', 'pt_idx',
        'img_idx', 'RVI_diff', 'ARVI_diff', 'PSSRa_diff', 'NDI45_diff',
        'GNDVI_diff', 'MCARI_diff', 'IRECI_diff', 'CIr_diff', 'MTCI_diff',
        'NDVIre_diff', 'NIRv_diff', 'EVI_diff', 'NDTI_diff', 'NDMI_diff',
        'MSI_diff', 'GCI_diff', 'NBRI_diff', 'BSI_diff', 'NDWI_diff',
        'NDSI_diff', 'NDVI_diff', 'image_idx'],
       dtype='object'),
 (6678, 64))

In [8]:
df_trimmed = utilities.get_rm_outlier_standarize(df, NUMERIC_COLS)
df_trimmed # replacing outliers with NaN

Unnamed: 0,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,...,NDTI_diff,NDMI_diff,MSI_diff,GCI_diff,NBRI_diff,BSI_diff,NDWI_diff,NDSI_diff,NDVI_diff,image_idx
560,-1.758315,-1.228066,,,,,,,,,...,0.107886,,-1.860549,-0.030922,,-1.810909,,,,s0
562,0.093869,-0.017076,-0.006116,-0.028410,0.314507,-0.357688,-0.638427,-0.653510,-0.569851,-0.685480,...,1.306184,-0.548800,0.591765,-0.529621,0.610562,0.642642,0.023557,-0.506998,-1.009715,s0
563,-0.072285,0.124070,-0.587259,-0.638744,-0.402065,-0.770173,-0.926422,-0.864083,-0.834291,-0.878740,...,0.457577,-0.718428,0.964065,-0.879390,0.273475,0.974112,-0.500572,-0.787805,-0.441273,s0
564,-0.481550,-0.126515,,,,,,,,,...,2.249689,,-1.632557,-0.176661,,-1.573719,,,,s0
565,0.898406,0.965831,0.367475,0.071644,0.190128,0.071748,-0.143986,-0.209713,-0.211944,-0.228465,...,-0.509591,-0.329283,0.575539,0.260371,-0.251442,0.643998,-0.421109,-0.414234,0.097295,s0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9475,,,,,,,,,,,...,,2.503322,-1.260644,,-1.557954,-1.564169,1.083308,2.457773,0.566755,s15
9478,-1.859430,-1.270119,,,,,,,,,...,-1.380441,,,0.332125,,,,,,s15
9481,2.283580,1.993840,1.022129,1.208319,1.691890,1.539519,0.706555,0.686441,0.932873,0.961789,...,-0.130376,-0.242088,0.173819,0.799855,-0.457603,0.198682,-0.532579,0.007235,0.259201,s15
9482,0.072878,-0.033267,-1.031873,-0.543741,-0.490727,-0.638186,-0.365838,-0.436975,-0.300813,-0.414587,...,0.344133,-0.276465,0.286687,0.711421,-0.554375,0.387551,-0.814779,-0.117184,0.585370,s15


In [9]:
stretched_df_trimmed = utilities.stretch_cols(df_trimmed, NUMERIC_COLS)
stretched_df_trimmed

Unnamed: 0,value,class,har_evnt,image_idx,start_date
560,,B2,0.0,s0,2022-01-29
562,-0.006116,B2,0.0,s0,2022-01-29
563,-0.587259,B2,0.0,s0,2022-01-29
564,,B2,0.0,s0,2022-01-29
565,0.367475,B2,0.0,s0,2022-01-29
...,...,...,...,...,...
9475,0.566755,NDVI_diff,False,s15,2022-12-10
9478,,NDVI_diff,False,s15,2022-12-10
9481,0.259201,NDVI_diff,False,s15,2022-12-10
9482,0.585370,NDVI_diff,False,s15,2022-12-10


### Producing Box Plots per Numeric Column

In [10]:
%%script echo skipping

def save_multi_image(filename):
    pp = PdfPages(filename)
    for sampleIdx in stretched_df_trimmed.image_idx.unique():
        curr_df = stretched_df_trimmed[stretched_df_trimmed.image_idx == sampleIdx]
        fig = plt.figure(figsize=(16,int(len(NUMERIC_COLS) * 1.5)))            # set showfliers to False to remove outliers
        sns.boxplot(data=curr_df, x="value", y="class", hue="har_evnt", showfliers = True).set(title= ("Sample Index:", sampleIdx))
        fig.savefig(pp, format='pdf')
        plt.close() # closing figure
    pp.close()

save_multi_image("../plots/box_plots/everything_trimmed_standarized.pdf")

skipping


### Producing Separability Plots

In [11]:
#%%script echo skipping
dics = []
for i in range(0,4):
    dataframes_dic = plots.plot_per_period(utilities.get_classes_colors(NUMERIC_COLS), df_trimmed, NUMERIC_COLS, f"../plots/bar_plots/seperability{i}.pdf", metric=i)
    dics.append(dataframes_dic)

Future exception was never retrieved
future: <Future finished exception=BrokenPipeError(32, 'Broken pipe')>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/unix_events.py", line 676, in write
    n = os.write(self._fileno, data)
BrokenPipeError: [Errno 32] Broken pipe
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))
  return np.sum(np.where(A != 0, np.multiply(A, np.log(np.divide(A, M))), 0))


In [12]:
print(len(dics)) # first two: more means separate, last two, less means separate
indices = dics[0].keys() # s6 s7 s8 s9
print(indices)

4
dict_keys(['s6', 's7', 's8', 's9'])


In [13]:

for i in range(4): # for each separability metric
    for image_idx in indices:
        pass

In [None]:

fig = plt.figure(figsize=(16,len(BANDS)))
sns.lineplot(x="start_date", y="value",
         hue="har_evnt", style="class",
         data=stretched_df_trimmed)
plt.close() # closing figure


fig = plt.figure(figsize=(16,len(BANDS)))
sns.lineplot(x="start_date", y="value",
         hue="har_evnt",
         data=stretched_df_trimmed)
plt.close() # closing figure

In [None]:

sns.set(rc={'figure.figsize':(len(BANDS) * 8, len(BANDS) * 5)})

fig, axes = plt.subplots(len(BANDS), 1)
fig.suptitle('Mean and 95% CI by Band')
fig.subplots_adjust(hspace=0.5, wspace=0.5)


for i in range(len(BANDS)):
    band_name = BANDS[i]
    sns.lineplot(ax=axes[i], x="finHarvDat", y=band_name,
        hue="har_evnt",
        data=df)
    sns.scatterplot(ax=axes[i], data=df, x="finHarvDat", y=band_name, hue="har_evnt", style="har_evnt", alpha=0.5)

    ax2 = axes[i].twinx()
    sns.histplot(ax=ax2, data=df, x="finHarvDat", bins=40, element="step", fill=False, color='green')
    if(i % 2 == 0):
        #axes[i].tick_params(bottom=False)
        #axes[i].legend_.remove()
        pass
    title = (BANDS_DICT[band_name] + f' ({band_name})')
    axes[i].set_title(title)
plt.close() # closing figure


utilities.saveFigsAsPDF([fig], "../plots/line_plots/mean&CIByBand.pdf")


In [None]:
sns.set(rc={'figure.figsize':(len(BANDS) * 8, len(BANDS) * 5)})
fig, axes = plt.subplots(len(BANDS), 1)
fig.suptitle('Median and IQR by Band')
fig.subplots_adjust(hspace=0.5, wspace=0.5)

# https://stackoverflow.com/questions/52525476/seaborn-lineplot-using-median-instead-of-mean
for i in range(len(BANDS)):
    band_name = BANDS[i]
    sns.lineplot(ax=axes[i], x="finHarvDat", y=band_name,
        hue="har_evnt", estimator="median",errorbar=("pi", 50),# show inner quartile range #https://seaborn.pydata.org/tutorial/error_bars.html
        data=df)
    
    sns.scatterplot(ax=axes[i], data=df, x="finHarvDat", y=band_name, hue="har_evnt", style="har_evnt", alpha=0.5)

    ax2 = axes[i].twinx()
    sns.histplot(ax=ax2, data=df, x="finHarvDat", bins=40, element="step", fill=False, color='green')

    if(i % 2 == 0):
        #axes[i].tick_params(bottom=False)
        #axes[i].legend_.remove()
        pass

    title = (BANDS_DICT[band_name] + f' ({band_name})')
    axes[i].set_title(title)
plt.close() # closing figure
utilities.saveFigsAsPDF([fig], "../plots/line_plots/median&IQRByBand.pdf")


# PCA

In [None]:
pca = PCA(n_components=3)
numeric_col_values = (df.loc[:, NUMERIC_COLS]).copy().values
pca_result = pca.fit_transform(numeric_col_values)

df['pca_one'] = pca_result[:,0]
df['pca_two'] = pca_result[:,1] 
df['pca_three'] = pca_result[:,2]

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

In [None]:
fig = plt.figure(figsize=(16,10))
sns.scatterplot(
    x="pca_one", y="pca_two",
    hue= 'image_idx',
    palette=sns.color_palette("brg", NUM_SAMPLES),
    data=df,
    alpha=0.4,
    legend="full"
)
plt.show()

In [None]:
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(projection='3d')

p = ax.scatter(
    xs=df["pca_one"], 
    ys=df["pca_two"], 
    zs=df["pca_three"], 
    color= df['har_evnt'].apply(lambda x: "red" if(x) else "blue"),
    alpha=0.5
    )

ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

In [None]:
colors = distinctipy.get_colors(NUM_SAMPLES)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(projection='3d')

p = ax.scatter(
    xs=df["pca_one"], 
    ys=df["pca_two"], 
    zs=df["pca_three"], 
    color= df['image_idx'].apply(lambda x: colors[int(x[1:])]),
    alpha=0.5,
    )

ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')

plt.show()

In [None]:

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(projection='3d')

p = ax.scatter(
    xs=df["pca_one"], 
    ys=df["pca_two"], 
    zs=df["pca_three"], 
    c=df["NDVI"], 
    cmap='brg'
    )

ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')

fig.colorbar(p, ax=ax)
plt.show()

# T-NSE

In [None]:
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(numeric_col_values)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

df['tsne_2d_one'] = tsne_results[:,0]
df['tsne_2d_two'] = tsne_results[:,1]

In [None]:
fig = plt.figure(figsize=(16,10))

sns.scatterplot(
    x="tsne_2d_one", y="tsne_2d_two",
    hue="image_idx",
    palette=sns.color_palette("hls", NUM_SAMPLES),
    data=df,
    legend="full",
    alpha=0.5
)

fig = plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne_2d_one", y="tsne_2d_two",
    hue="har_evnt",
    palette=sns.color_palette("hls", 2),
    data=df,
    legend="full",
    alpha=0.5
)

In [None]:
time_start = time.time()
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(numeric_col_values.copy())

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

df['tsne_3d_one'] = tsne_results[:,0]
df['tsne_3d_two'] = tsne_results[:,1]
df['tsne_3d_three'] = tsne_results[:,2]

In [None]:
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

p = ax.scatter(
    xs=df["tsne_3d_one"], 
    ys=df["tsne_3d_two"], 
    zs=df["tsne_3d_three"], 
    c=df["NDVI"], 
    cmap='brg'
    )

ax.set_xlabel('tsne_3d_one')
ax.set_ylabel('tsne_3d_two')
ax.set_zlabel('tsne_3d_three')

fig.colorbar(p, ax=ax)
plt.show()



fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(projection='3d')

p = ax.scatter(
    xs=df["tsne_3d_one"], 
    ys=df["tsne_3d_two"], 
    zs=df["tsne_3d_three"], 
    c=df["har_evnt"],
    cmap='Paired'
    )

ax.set_xlabel('tsne_3d_one')
ax.set_ylabel('tsne_3d_two')
ax.set_zlabel('tsne_3d_three')

fig.colorbar(p, ax=ax)
plt.show()

In [None]:
x = np.empty(NUM_SAMPLES)
y = np.zeros(NUM_SAMPLES)

for image_idx in range(NUM_SAMPLES):
    curr_df = df.loc[df['image_idx'] == 's'+str(image_idx)]
    x = curr_df['']

In [None]:

title = "finHarvDat By Sample's Date Range"
my_bins = pd.date_range(start=min(df.start_date), end=max(df.end_date),freq='3W')
sns.displot(data=curr_df,x="finHarvDat", bins=matplotlib.dates.date2num(my_bins)).set(title=title)


In [None]:
sns.displot(data=curr_df,x="finHarvDat", bins=40)

In [None]:
[0.83519663 0.         0.06660164 0.13135956 0.37101957 0.90362449
 1.30888853 1.47253057 1.59509951 1.7424843  1.74944273 1.77397426
 1.8056872  1.81273194 1.55127767 1.36529779 1.65753067 1.46888735
 0.         0.         1.68152529 0.         0.         0.
 0.         1.8561824  0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         1.81273194 0.         0.         0.                nan
        nan 0.        ]

In [None]:
np.log(np.array([0.83519663, 0, 0.63519663]))

In [None]:
plt.ioff()

sns.set(rc={'figure.figsize':(len(BANDS) * 8, len(BANDS) * 5)})

fig, axes = plt.subplots(len(BANDS), 1)
fig.suptitle('mean&CIByBand & frequencies by Band')
fig.subplots_adjust(hspace=0.5, wspace=0.5)


for i in range(len(BANDS)):
    band_name = BANDS[i]
    sns.lineplot(ax=axes[i], x="finHarvDat", y=band_name,
        hue="har_evnt",
        data=df)
    ax2 = axes[i].twinx()
    sns.histplot(ax=ax2, data=df, x="finHarvDat", bins=40, element="step", fill=False)
    if(i % 2 == 0):
        #axes[i].tick_params(bottom=False)
        #axes[i].legend_.remove()
        pass
    title = (BANDS_DICT[band_name] + f' ({band_name})')
    axes[i].set_title(title)


utilities.saveFigsAsPDF([fig], "../plots/line_plots/mean&CIByBand_freqs.pdf")
plt.ion()

In [None]:
df