In [None]:
import ee
import google.auth
import math
import numpy as np
import tensorflow as tf
import folium
import time
import geopandas
import pandas
import ast

In [None]:
# Workaround needed to successfully save the authorization token from ee.Authenticate().
# SECURITY NOTE(review): this swaps the factory used for default HTTPS contexts
# for the unverified one, so TLS certificate verification is switched off for
# default HTTPS contexts created by this process from here on, not only for
# the Earth Engine login. Prefer fixing the local certificate store;
# TODO confirm whether this workaround is still required.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
ee.Authenticate()

In [None]:
ee.Initialize()

In [None]:
# bands used for prediction
BANDS = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12'] # bands with <= 30 resolution (presumably metres)
# Sub-folders of ../data holding the per-image overlay exports, one entry per dataset.
# Both lists are indexed with DATASET_IDX in the loading cells below.
OVERLAYED_FOLDER_NAMES = ["overlayed_3week_images_groundTruth/", "overlayed_3week_images_supSpecSmall/", "overlayed_3week_images_supSpecLarge/"]
# File names (no extension) under ../data used when saving the merged dataset.
MERGED_FILE_NAMES = ["merged_images_groundTruth", "merged_images_supSpecSmall", "merged_images_supSpecLarge"]
class Dataset:
    """Positions of each dataset in OVERLAYED_FOLDER_NAMES and MERGED_FILE_NAMES."""
    # These must be assignments. Written as `NAME: 0` they are only annotations,
    # no attribute is created, and `Dataset.NAME` raises AttributeError.
    GROUND_TRUTH = 0
    SUP_SPEC_SMALL = 1
    SUP_SPEC_LARGE = 2
# Features of the LSIB simplified boundaries table whose country name is Ukraine;
# used below to clip the imagery.
COUNTRY_GEOMETRY = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017").filter(ee.Filter.eq('country_na', 'Ukraine'))
# Passed as the initial `location` of the folium map further down.
COUNTRY_LATLON = 50., 31 # coordinate of center of ukraine

## Reading Labeled Shape File

In [None]:
#%%script echo skipping
DATASET_IDX = Dataset.SUP_SPEC_LARGE
OVERLAYED_FOLDER_NAME = OVERLAYED_FOLDER_NAMES[DATASET_IDX]
MERGED_FILE_NAME = MERGED_FILE_NAMES[DATASET_IDX]

# Maps the period index stored in the "0_constant" column to a date string (dd/mm/yy).
# NOTE(review): entries 1-7 are two weeks apart, which would make entry 8
# "19/09/22"; "19/10/22" may be a typo. Confirm against the labelling source
# before changing it.
TIMESTAMP_DIC ={
                "1" : "13/06/22",
                "2" : "27/06/22",
                "3" : "11/07/22",
                '4' :  "25/07/22",
                '5' : "08/08/22",
                '6' :  "22/08/22",
                '7' :  "05/09/22",
                '8' :  "19/10/22",
                }

# THIS LOOP HAS A PROBLEM
# TODO fix the problem of rereading the same points (ask Shabarinath and he will explain)
# we did not fix this because we simply drop the duplicate points later
# NOTE(review): only files 1..7 are read although TIMESTAMP_DIC has 8 entries; confirm.
per_file_frames = []
for i in range(1, 8):
    curr_df = pandas.read_csv(f"../data/HarvestSupervised2/supSpectralBands_{i}.csv")[["0_constant", ".geo"]]
    curr_df["0_constant"] = curr_df["0_constant"].astype(str)
    # A value of "0" marks a point that is not harvested in this file.
    curr_df["is_harvested"] = curr_df["0_constant"] != "0"
    # Non-harvested points get the file's own period index. The result is
    # assigned back: a chained `df[col].mask(..., inplace=True)` only changes a
    # temporary Series under pandas Copy-on-Write and leaves "0" in the frame,
    # which then fails the TIMESTAMP_DIC lookup below.
    curr_df["0_constant"] = curr_df["0_constant"].mask(curr_df["0_constant"] == "0", str(i))
    per_file_frames.append(curr_df)

# Concatenate once instead of growing the frame inside the loop.
coor_points_df = pandas.concat(per_file_frames)

# ".geo" holds a GeoJSON-like point string; parse it once and split [lon, lat].
coordinates = coor_points_df[".geo"].apply(lambda geo: ast.literal_eval(geo)["coordinates"])
coor_points_df["lon"] = coordinates.apply(lambda xy: xy[0])
coor_points_df["lat"] = coordinates.apply(lambda xy: xy[1])

# Direct dict lookup (not .map) so an unknown period index fails loudly.
coor_points_df["finHarvDat"] = coor_points_df["0_constant"].apply(lambda x: TIMESTAMP_DIC[str(x)])
# Row position after concatenation; the concatenated index itself repeats per file.
coor_points_df["point_id"] = np.arange(len(coor_points_df), dtype=int)
coor_points_df = coor_points_df.drop([".geo", "0_constant"], axis=1)
print(coor_points_df.head(), coor_points_df.shape)

In [None]:
%%script echo skipping
DATASET_IDX = Dataset.GROUND_TRUTH
OVERLAYED_FOLDER_NAME = OVERLAYED_FOLDER_NAMES[DATASET_IDX]
MERGED_FILE_NAME = MERGED_FILE_NAMES[DATASET_IDX]

shapeFile = geopandas.read_file("../data/validation_data/merged_harvest_validation_20220919.shp")
shapeFile.head()

coor_points_df = shapeFile[['lat', 'lon', 'val_set1', 'finHarvDat']].dropna(subset=['lat', 'lon', 'val_set1'])
coor_points_df['is_harvested'] = coor_points_df['val_set1'].apply(lambda x: x == 1)
coor_points_df = coor_points_df.drop(['val_set1'], axis=1)
coor_points_df.finHarvDat = coor_points_df.finHarvDat.apply(lambda x: str(x))
coor_points_df['point_id'] = np.arange(0, coor_points_df.shape[0], 1, dtype=int)
print(coor_points_df.head(), coor_points_df.shape)

In [None]:
def overlay_points(img: ee.Image, df:pandas.DataFrame, coordinate_col_names:(str, str)=('lon', 'lat'), scale: int = 10) -> ee.FeatureCollection:
  """
  Samples the bands of img at the points listed in df.

  Every row of df becomes an ee.Feature: a point geometry built from the
  coordinate columns, with all columns of df as properties. The features are
  then passed to img.sampleRegions, so the returned collection carries the
  original columns plus the band values of img at each point.
  This function does not export anything; pass the result to export_to_drive.

  Args:
      img (ee.Image): image whose band values are sampled.
      df (pandas.DataFrame): contains longitude and latitude cols, describing the points to be overlayed.
      coordinate_col_names (str, str, optional): the name of columns (in the dataframe df) that have the
          coordinates, as (longitude, latitude). Defaults to ('lon', 'lat').
      scale (int, optional): scale argument forwarded to sampleRegions. Defaults to 10.

  Returns:
      ee.FeatureCollection: the result of img.sampleRegions over the points.

  Usage:
      export_to_drive(overlay_points(image, coor_points_df), 'points_from_sph')
      # will save points_from_sph.geojson into drive.
  """
  lon_col, lat_col = coordinate_col_names
  property_names = list(df.columns)

  # Convert one dataframe row to an ee.Feature
  def createFeature(row):
      geometry = ee.Geometry.Point([row[lon_col], row[lat_col]])
      properties = {name: row[name] for name in property_names}
      return ee.Feature(geometry, properties)

  # Use the `df` argument (the previous version read the global
  # coor_points_df and silently ignored the dataframe passed in).
  # iterrows also copes with an empty dataframe.
  features = [createFeature(row) for _, row in df.iterrows()]
  fc = ee.FeatureCollection(features)

  # Overlay the points on the imagery to get training.
  return img.sampleRegions(collection=fc, scale=scale)


def export_to_drive(fc: ee.FeatureCollection, file:str, folder:str="NASA_Harvest"):
  """
  Exports fc to Google Drive as a GeoJSON table and blocks until the task ends.

  Args:
      fc (ee.FeatureCollection): collection to export.
      file (str): task description, also used as the exported file name in the messages.
      folder (str, optional): Drive folder to export into. Defaults to "NASA_Harvest".

  Returns:
      None. Progress and the final outcome are printed.
  """
  # Export the ee.FeatureCollection as a .GeoJSON file.
  task = ee.batch.Export.table.toDrive(
    collection=fc,
    description=file,
    fileFormat='GeoJSON',
    folder=folder,
  )
  task.start()

  print('----')
  print(f'Polling for file name= {file}...')
  while task.active():
    time.sleep(5)

  # An inactive task is not necessarily a successful one: report the real
  # final state instead of always claiming the file was written.
  status = task.status()
  state = status.get('state')
  if state == 'COMPLETED':
    print(f'Wrote {file}.GeoJSON. Check {folder} folder in Drive.')
  else:
    print(f'Export of {file} ended in state {state}: {status.get("error_message", "no error message")}')
  print('----')

In [None]:
from datetime import datetime

def read_geojson(file:str, date_col_name='finHarvDat') -> pandas.DataFrame:
    """
        Reads ../data/{file}.geojson and converts the date column to datetime objects.

        Expects the date column to have this format "dd/mm/yy", example: "14/07/22".
        Missing dates (None, NaN, or their string forms such as "nan"/"None") become None.
        Expects that the file path looks like this: ../data/{file}.geojson
    Args:
        file (str): name of file (without the .geojson extention)
        date_col_name (str, optional):
        if df has no date column, pass None. Defaults to 'finHarvDat'.

    Returns:
        pandas.DataFrame

    Raises:
        ValueError: if a non-missing date value is not made of integers separated by "/".

    Usage:
            read_geojson(file='points_from_sph').head()
    """
    # Text forms a missing date takes once it has been passed through str().
    missing_markers = {'', 'nan', 'None', 'NaT', '<NA>'}

    def to_datetime(value):
        # None and float NaN (NaN != NaN) are how missing values arrive from the reader.
        if value is None or (isinstance(value, float) and value != value):
            return None
        text = str(value).strip()
        if text in missing_markers:
            return None
        splitted = text.split('/')

        day = int(splitted[0])
        month = int(splitted[1])
        year = 2000 + int(splitted[2])  # two-digit years are always read as 20yy
        return datetime(year=year, month=month, day=day)

    df = geopandas.read_file(f"../data/{file}.geojson")
    def fixDate(x):
        # 2013-06-22 to 13/06/22
        string = str(x)
        return string[2:4] + "/" + string[5:7] + "/" + string[8:]

    #df["finHarvDat"] = df["finHarvDat"].apply(fixDate) # uncomment for supSpec

    if date_col_name is not None:
        df[date_col_name] = df[date_col_name].apply(to_datetime)

    return df

## Create 3-week image collection from Sentinel-2 in 2022

reference: https://medium.com/@moraesd90/creating-monthly-ndvi-composites-sentinel-2-on-google-earth-engine-a5c2d49bc9ca

In [None]:
def get_image(s2_img_collection: ee.ImageCollection) -> ee.Image:
    """
    Builds one cloud-masked median composite from a Sentinel-2 collection.

    Images with CLOUDY_PIXEL_PERCENTAGE >= 20 are discarded, the rest are
    cloud-masked with QA60, clipped to COUNTRY_GEOMETRY and reduced with a
    per-pixel median over BANDS. Masked pixels are filled with 0.0 and an
    'NDVI' band is appended.

    Args:
        s2_img_collection (ee.ImageCollection): collection providing the BANDS
            and QA60 bands and the CLOUDY_PIXEL_PERCENTAGE property.

    Returns:
        ee.image: a cloud masked sentinel-2 image (float bands BANDS + 'NDVI').
    """
    def cloudmask_and_clip(image: ee.Image) -> ee.Image:
        # QA60 bits 10 and 11 flag opaque and cirrus clouds; keep pixels with both bits clear.
        opaqueClouds_mask = 1 << 10
        cirrusClouds_mask =1 << 11
        bit_mask =opaqueClouds_mask | cirrusClouds_mask
        qa = image.select('QA60')
        mask = qa.bitwiseAnd(bit_mask).eq(0)
        return image.clip(COUNTRY_GEOMETRY).updateMask(mask)

    def add_ndvi(image):
        # normalizedDifference(['B8', 'B4']) is (B8 - B4) / (B8 + B4)
        ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')
        image = image.addBands(ndvi.toFloat())
        return image.toFloat()

    default_value = 0.0
    composite = (
        s2_img_collection
        # Filter on metadata first so that cloudy images are dropped before any
        # per-image masking/clipping is scheduled, and so that the filter does
        # not depend on properties surviving the mapped function.
        .filter(ee.Filter.lt("CLOUDY_PIXEL_PERCENTAGE", 20))
        .map(cloudmask_and_clip)
        # QA60 is only needed for masking; keep the prediction bands afterwards.
        .select(BANDS)
        .median()
        .unmask(default_value)
        .float()
    )
    return add_ndvi(composite)

In [None]:
# Date range of the Sentinel-2 surface-reflectance imagery used in this notebook.
DATE_START = ee.Date('2022-01-01')
DATE_END= ee.Date('2022-12-27')
SURF_REF_SEN2 = ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED").filterDate(DATE_START, DATE_END)

# start_weeks.getInfo -> [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52]
# NOTE(review): the list above may be stale. With the dates above the rounded
# difference is presumably 51 weeks, which would end the sequence at 49;
# verify by evaluating start_weeks.getInfo().
# NOTE(review): the sequence starts at week 1, not 0, so the first week after
# DATE_START is not part of any window. Confirm this is intended.
total_weeks = ee.Number(DATE_END.difference(DATE_START, 'week')).round().getInfo()
start_weeks = ee.List.sequence(1, total_weeks, 3)

def extract_subset(start_week):
    """
    Returns the composite (see get_image) for the window that starts
    `start_week` weeks after DATE_START.

    The window end is computed as start + 3 weeks - 1 day.
    NOTE(review): filterDate presumably treats its end date as exclusive, in
    which case the last day of every window is left out. Confirm intent.
    """
    window_start = DATE_START.advance(start_week, 'week')
    window_end = window_start.advance(3, 'week').advance(-1, 'day')
    return get_image(SURF_REF_SEN2.filterDate(window_start, window_end))
    
    

# Map the extract_subset function over the list of start weeks to create a new image collection
# with one composite image per 3-week window.
new_img_collection = ee.ImageCollection.fromImages(start_weeks.map(extract_subset))
# getInfo() fetches the collection size as a plain Python value for the cells below.
num_of_images = new_img_collection.size().getInfo()
# Print the number of images in the new image collection
print('Number of images in the new image collection:', num_of_images)
# List view of the collection so that single composites can be fetched by position.
images_list = new_img_collection.toList(num_of_images)

In [None]:
images_list

In [None]:
# Pick the composite at position 12 of images_list for inspection.
image = ee.Image(images_list.get(12))

In [None]:
# Composite over the whole DATE_START..DATE_END collection. This rebinds
# `image`, so any earlier value of `image` is replaced when this cell runs.
image = get_image(SURF_REF_SEN2)

In [92]:
# Restrict the image to the country outline and request map tiles rendered
# from bands B4/B3/B2 with values stretched between 0 and 3000.
image = image.clip(COUNTRY_GEOMETRY)
vis_params = dict(min=0, max=3000, bands=["B4", "B3", "B2"])
mapid = image.getMapId(vis_params)

# NOTE: `map` shadows the Python builtin of the same name; the name is kept
# because later cells refer to it.
map = folium.Map(location=COUNTRY_LATLON, zoom_start=13)
folium.TileLayer(
    tiles=mapid['tile_fetcher'].url_format,
    attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
    overlay=True,
    name='median composite',
).add_to(map)
folium.LayerControl().add_to(map)


<folium.map.LayerControl at 0x2898a1000>

In [None]:
map

In [91]:
# Load the merged ground-truth dataset and keep the rows of composite "i7".
# NOTE(review): the name is hard-coded here instead of using MERGED_FILE_NAME, and
# the file is the one written by the save cell at the end of this notebook, so on a
# fresh kernel this cell relies on the output of an earlier run.
file_name = "merged_images_groundTruth"
df = geopandas.read_file(f'../data/{file_name}')
df = df[df.image_idx == "i7"]

In [93]:
from typing import Iterable
import folium

def pinned_points(df:pandas.DataFrame):
    """
    Adds one marker per row of df to the notebook-level folium `map`.

    Args:
        df (pandas.DataFrame): must contain 'lon' and 'lat' columns.

    Returns:
        The global `map` object, so that the cell displays it.

    Note:
        The global `map` is modified in place, and a LayerControl is added on
        every call, so repeated calls accumulate markers and controls.
    """
    # .tolist() yields plain Python numbers, so the popup text reads
    # "[lon, lat]" regardless of how NumPy prints its scalar types.
    for lon, lat in zip(df['lon'].tolist(), df['lat'].tolist()):
        folium.Marker(location=[lat, lon], popup = str([lon, lat])).add_to(map)

    folium.LayerControl().add_to(map)
    return map
pinned_points(df)

In [79]:
# Reload the full merged dataset (all composites); relies on `file_name` set in an earlier cell.
df = geopandas.read_file(f'../data/{file_name}')
df

Unnamed: 0,level_0,INDEX,index,B11,B12,B2,B3,B4,B5,B6,...,NDTI_diff,NDMI_diff,MSI_diff,GCI_diff,NBRI_diff,BSI_diff,NDWI_diff,NDSI_diff,NDVI_diff,geometry
0,0,p15694,15694,0.23650,0.16900,0.04935,0.08010,0.10350,0.14255,0.17240,...,-0.044877,-0.279841,0.490136,-1.112650,-0.304765,0.277223,0.130734,-0.084003,-0.285695,POINT (33.75093 46.34732)
1,2067,p15694,15694,0.24800,0.19840,0.06410,0.08020,0.11040,0.11960,0.14190,...,-0.055350,-0.129529,0.341772,-0.509485,-0.187725,0.076637,0.091216,-0.017275,-0.129710,POINT (33.75093 46.34732)
2,4524,p15694,15694,0.38320,0.27630,0.12740,0.17840,0.23820,0.25630,0.26410,...,0.050981,0.090585,-0.249371,-0.343894,0.144338,-0.042790,0.080652,0.146601,-0.072076,POINT (33.75093 46.34732)
3,1,p15695,15695,0.26630,0.16070,0.06420,0.09100,0.09740,0.14350,0.23670,...,0.019594,-0.049278,0.090105,-0.686246,-0.026372,0.066846,0.071390,0.038415,-0.139560,POINT (28.99082 46.27818)
4,2068,p15695,15695,0.32930,0.20170,0.11160,0.15020,0.19300,0.20990,0.28560,...,-0.007005,0.013523,-0.025581,-0.703414,0.005641,0.027794,0.103517,0.117110,-0.187788,POINT (28.99082 46.27818)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5840,4522,p18314,18314,0.23995,0.18210,0.06440,0.09275,0.13040,0.16295,0.17865,...,-0.027446,-0.131064,0.256750,-1.334332,-0.154819,0.159337,0.172108,0.073386,-0.317999,POINT (31.30458 48.29508)
5841,6505,p18314,18314,0.26860,0.18630,0.08600,0.10680,0.12940,0.15590,0.20690,...,0.043850,-0.069920,0.168358,-0.362265,-0.025366,0.008730,0.073469,0.011434,-0.009650,POINT (31.30458 48.29508)
5842,8441,p18314,18314,0.28320,0.19100,0.07300,0.10240,0.10150,0.15580,0.24680,...,0.013514,0.107492,-0.248949,0.717221,0.120225,-0.096642,-0.131432,-0.037873,0.223222,POINT (31.30458 48.29508)
5843,10574,p18314,18314,0.15510,0.09505,0.03575,0.05850,0.04325,0.08865,0.15335,...,0.045623,0.054749,-0.107004,0.186735,0.097221,-0.063601,-0.026223,0.016632,0.129895,POINT (31.30458 48.29508)


In [90]:
df

Unnamed: 0,level_0,INDEX,index,B11,B12,B2,B3,B4,B5,B6,...,NDTI_diff,NDMI_diff,MSI_diff,GCI_diff,NBRI_diff,BSI_diff,NDWI_diff,NDSI_diff,NDVI_diff,geometry
0,0,p15694,15694,0.23650,0.16900,0.04935,0.08010,0.1035,0.14255,0.17240,...,-0.044877,-0.279841,0.490136,-1.112650,-0.304765,0.277223,0.130734,-0.084003,-0.285695,POINT (33.75093 46.34732)
3,1,p15695,15695,0.26630,0.16070,0.06420,0.09100,0.0974,0.14350,0.23670,...,0.019594,-0.049278,0.090105,-0.686246,-0.026372,0.066846,0.071390,0.038415,-0.139560,POINT (28.99082 46.27818)
5,2,p15696,15696,0.21195,0.14595,0.05530,0.07625,0.0958,0.12925,0.19655,...,-0.067727,-0.287614,0.438607,-1.957148,-0.317266,0.307128,0.175986,-0.027434,-0.361959,POINT (33.09322 47.07386)
10,3,p15697,15697,0.23410,0.15580,0.05540,0.08020,0.0687,0.11790,0.23980,...,0.142080,0.039511,-0.066787,2.372616,0.175567,-0.019652,-0.455985,-0.446061,0.509808,POINT (38.59677 49.45542)
12,4,p15698,15698,0.18290,0.10085,0.04030,0.06220,0.0472,0.09535,0.23335,...,-0.019241,-0.122096,0.140232,-1.569276,-0.106949,0.130730,0.068975,-0.000053,-0.094571,POINT (33.72342 48.10159)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5832,2062,p18310,18310,0.32210,0.19090,0.06480,0.10700,0.1972,0.24050,0.27020,...,-0.042915,-0.355576,0.487341,-2.817205,-0.327529,0.465846,0.185333,-0.053532,-0.553070,POINT (34.56668 47.67184)
5833,2063,p18311,18311,0.22660,0.15200,0.05020,0.07940,0.1085,0.14965,0.20050,...,0.157840,0.030924,-0.060579,1.784601,0.187620,0.043941,-0.400999,-0.376391,0.309958,POINT (31.02569 47.54457)
5835,2064,p18312,18312,0.16170,0.09315,0.03200,0.05555,0.0429,0.08590,0.22715,...,-0.031701,-0.042913,0.050093,-0.657191,-0.057518,0.060195,0.030336,0.008620,-0.056861,POINT (36.55101 49.60944)
5837,2065,p18313,18313,0.17530,0.12110,0.03760,0.06680,0.0542,0.10330,0.21050,...,-0.039494,-0.064806,0.085690,-0.617171,-0.090828,0.090267,0.045323,0.003711,-0.100411,POINT (37.74556 47.11833)


In [85]:
df.image_idx == "i8"

0       False
1        True
2       False
3       False
4        True
        ...  
5840     True
5841    False
5842    False
5843    False
5844     True
Name: image_idx, Length: 5845, dtype: bool

In [None]:
# Import the class, not the module. A plain `import datetime` here rebinds the
# name `datetime` (previously the class) to the module, after which every
# `datetime(year=..., month=..., day=...)` call in this notebook raises
# "TypeError: 'module' object is not callable".
from datetime import datetime

In [None]:
start_end_dates = []
start_weeks = ee.List.sequence(1, total_weeks, 3)

# record the dates into start_end_dates array
def eeDate_to_datetime(eeDate: ee.Date)->datetime:
    """Converts an ee.Date to a Python datetime (date part only).

    Each getInfo() call is a blocking request to Earth Engine.
    Requires the name `datetime` to be the class, not the module.
    """
    year = eeDate.get('year').getInfo()
    month = eeDate.get('month').getInfo()
    day = eeDate.get('day').getInfo()
    return datetime(year=year, month=month, day=day)

# Fetch all start weeks in one request instead of one request per window.
start_week_values = start_weeks.getInfo()
for start_week in start_week_values[:num_of_images]:
    # Same window definition as extract_subset: 3 weeks minus one day.
    start = DATE_START.advance(start_week, 'week')
    end = start.advance(3, 'week').advance(-1, 'day')
    start_end_dates.append((eeDate_to_datetime(start), eeDate_to_datetime(end)))

In [None]:
start_end_dates[0], len(start_end_dates)

In [None]:
start_end_dates

In [None]:
# One export/file name per composite; the merge loop below reads
# ../data/<dataset folder>/<name>.geojson for each of these.
file_names = [f'img{idx}_overlayed' for idx in range(num_of_images)]

In [None]:
%%script echo skipping

# for each image, overlay the points (i.e. export the sampled table to Drive)
# NOTE(review): the loop starts at index 12, so composites 0-11 are not exported
# here. Presumably a resume point after an earlier partial run; confirm before re-running.
for idx in range(12, num_of_images):
    currImg = ee.Image(images_list.get(idx))
    # same naming pattern as file_names[idx]
    file = f'img{idx}_overlayed'
    export_to_drive(overlay_points(currImg, coor_points_df),file=file )    

## Download From Drive & Move to 'data' Folder.

In [None]:

# true if (is_harvested is true) and if (the finHarvDat is within start_date & end_date)
def get__is_within_period(row):
    """
    Tells whether the row's harvest date falls inside the row's image window.

    Args:
        row: mapping/Series with 'is_harvested', 'finHarvDat', 'start_date'
            and 'end_date'.

    Returns:
        bool: True only when is_harvested is truthy and
        start_date <= finHarvDat <= end_date (both ends inclusive).
        A harvested row without a date yields False instead of raising.
    """
    if not row['is_harvested']:
        return False
    harvest_date = row['finHarvDat']
    # A missing date cannot be compared with the window bounds.
    if harvest_date is None:
        return False
    return bool(row['start_date'] <= harvest_date <= row['end_date'])
    
# Builds one table per composite image: the sampled points of that image plus the
# image's time window, the harvest-event flag and string ids for image and point.
#samples = ee.List([]) # containing images
samples = [None] * (num_of_images) # placeholders, filled by the loop below

# Folder (inside ../data) of the dataset selected by DATASET_IDX.
path_inside_data_folder= OVERLAYED_FOLDER_NAMES[DATASET_IDX]
# NOTE(review): `a` is not used anywhere in this cell.
a= None
for idx in range(num_of_images):
    curr_img_df = read_geojson(file=path_inside_data_folder + file_names[idx])
    
    curr_img_df = curr_img_df.sort_values('point_id') # to make sure the points are aligned when we subtract ndvi values below
    
    # add time cols: every row of this table gets the window of image idx
    start, end = start_end_dates[idx]
    curr_img_df['start_date'] = np.tile(np.array([start]), curr_img_df.shape[0])
    curr_img_df['end_date'] = np.tile(np.array([end]), curr_img_df.shape[0])
    
    # needs the start_date/end_date columns added just above
    curr_img_df['is_within_period'] = curr_img_df.apply(get__is_within_period, axis=1)
    
    # ids are stored as strings: 'i<image index>' and 'p<point id>'
    curr_img_df['image_idx'] = np.tile(np.array(['i'+str(idx)]), curr_img_df.shape[0])
    curr_img_df.rename(columns = {'point_id':'point_idx'}, inplace = True)
    curr_img_df.point_idx = curr_img_df.point_idx.apply(lambda x: 'p' + str(x))
    
    samples[idx] = curr_img_df

In [None]:
# Stack the per-image tables and drop the columns that are not needed downstream.
merged_images = pandas.concat(samples, sort=False).drop(columns=['geometry', 'id', 'is_harvested'])
# Band values are scaled by 10000
# (according to https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2),
# so divide to undo the scaling.
merged_images[BANDS] = merged_images[BANDS] / 10000
merged_images.shape

In [None]:
merged_images.head()

In [None]:
sum(merged_images.is_within_period) # we expect 366

In [None]:
# copied from learning_about-data.ipynb
# Sentinel-2 band id -> readable band name.
BANDS_DICT = dict(
    B2='Blue',
    B3='Green',
    B4='Red',
    B5='Red_Edge_1',
    B6='Red_Edge_2',
    B7='Red_Edge_3',
    B8='NIR',
    B8A='Red_Edge_4',
    B11='SWIR_1',
    B12='SWIR_2',
)

# Band ids in dictionary (insertion) order.
BANDS = list(BANDS_DICT)

In [None]:
MERGED_FILE_NAME

In [None]:
# Project-local helpers (not shown in this notebook).
from scripts import utilities, veg_indices

# convert pandas df to geopandas df (point geometry built from the lon/lat columns)
# NOTE(review): no CRS is set on the GeoDataFrame; confirm lon/lat are WGS84 degrees.
merged_samples_gdf = geopandas.GeoDataFrame(
    merged_images, geometry=geopandas.points_from_xy(merged_images.lon, merged_images.lat))
# 'har_evnt' is the name used for the harvest-event flag from here on.
merged_samples_gdf.rename(columns = {'is_within_period':'har_evnt'}, inplace = True)
merged_samples_gdf = merged_samples_gdf[(merged_samples_gdf.NDVI) != 0] # drop invalid points

# presumably adds the vegetation-index columns; see scripts/utilities.get_df - TODO confirm
df, NUMERIC_COLS, NUM_SAMPLES = utilities.get_df(merged_samples_gdf.copy(), veg_indices, BANDS)

# save dataset (written without a file extension; the loading cells read it back by the same name)
df.to_file(f"../data/{MERGED_FILE_NAME}", driver='GeoJSON')

In [None]:
NUMERIC_COLS