In [1]:
import ee
import google.auth
import math
import numpy as np
import tensorflow as tf
import folium
import time
import geopandas
import pandas
import ast

In [2]:
# Workaround so that ee.Authenticate() can successfully save the authorization token.
# WARNING: this swaps the default HTTPS context factory for the *unverified* one, which
# turns off TLS certificate verification for any code in this kernel that relies on that default.
# NOTE(review): prefer fixing the local CA certificates over disabling verification -- confirm this is still needed.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
# Run the Earth Engine authentication flow (the recorded output reports the token as saved).
ee.Authenticate()


Successfully saved authorization token.


In [4]:
# Initialize the Earth Engine client library; every ee.* call in the cells that follow depends on it.
ee.Initialize()

In [5]:
# Sentinel-2 bands used for prediction (selected from every image before compositing).
BANDS = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12'] # bands with <= 30 resolution (presumably metres per pixel -- TODO confirm)
# Both lists share the same order: index 0 is the validation set, index 1 is the training set.
OVERLAYED_FOLDER_NAMES = ["overlayed_3week_images_validation/", "overlayed_3week_images_train/"]
MERGED_FILE_NAMES = ["merged_images_validation", "merged_images_train"]


# Features of the LSIB 2017 simple-boundaries collection whose 'country_na' property is 'Ukraine'.
COUNTRY_GEOMETRY = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017").filter(ee.Filter.eq('country_na', 'Ukraine'))
# Tuple passed as the folium map centre, i.e. (latitude, longitude).
COUNTRY_LATLON = 50., 31 # coordinate of center of ukraine

## Reading Labeled Shape File

In [7]:
#%%script echo skipping
# Training-set variant: build `coor_points_df` (one row per labelled point) from the
# seven per-time-step CSV files supSpectralBands_1.csv ... supSpectralBands_7.csv.
OVERLAYED_FOLDER_NAME = OVERLAYED_FOLDER_NAMES[1]
MERGED_FILE_NAME = MERGED_FILE_NAMES[1]

# Time-step id (as a string) -> date in "dd/mm/yy" format.
# NOTE(review): ids 1-7 are spaced 14 days apart, which would put id 8 on "19/09/22";
# "19/10/22" may be a typo -- confirm against the labelling source before relying on it.
TIMESTAMP_DIC = {
    "1": "13/06/22",
    "2": "27/06/22",
    "3": "11/07/22",
    "4": "25/07/22",
    "5": "08/08/22",
    "6": "22/08/22",
    "7": "05/09/22",
    "8": "19/10/22",
}

csv_frames = []
for i in range(1, 8):
    curr_df = pandas.read_csv(f"../data/HarvestSupervised2/supSpectralBands_{i}.csv")[["0_constant", ".geo"]]
    curr_df["0_constant"] = curr_df["0_constant"].astype(str)
    # A value of "0" marks the point as not harvested; anything else marks it as harvested.
    curr_df["is_harvested"] = curr_df["0_constant"] != "0"
    # Points labelled "0" take the id of the file they came from so that they can still be
    # looked up in TIMESTAMP_DIC. The result is assigned back explicitly: an in-place mask
    # on a column selection is chained assignment and is not guaranteed to reach the frame.
    curr_df["0_constant"] = curr_df["0_constant"].mask(curr_df["0_constant"] == "0", str(i))
    csv_frames.append(curr_df)

# Concatenate once instead of growing the frame inside the loop (the per-file index is kept, as before).
coor_points_df = pandas.concat(csv_frames)

# ".geo" holds a dict literal whose "coordinates" entry is indexed as [lon, lat]; parse it once per row.
coordinates = coor_points_df[".geo"].apply(lambda geo: ast.literal_eval(geo)["coordinates"])
coor_points_df["lon"] = coordinates.apply(lambda lon_lat: lon_lat[0])
coor_points_df["lat"] = coordinates.apply(lambda lon_lat: lon_lat[1])

# Raises KeyError for an id that is missing from TIMESTAMP_DIC; the looked-up values are already strings.
coor_points_df["finHarvDat"] = coor_points_df["0_constant"].apply(lambda step_id: TIMESTAMP_DIC[str(step_id)])
# Running id over the concatenated rows; used later to sort the sampled points.
coor_points_df["point_id"] = np.arange(0, coor_points_df.shape[0], 1, dtype=int)
coor_points_df = coor_points_df.drop([".geo", "0_constant"], axis=1)
print(coor_points_df.head(), coor_points_df.shape)

   is_harvested        lon        lat finHarvDat  point_id
0         False  33.750930  46.347320   13/06/22         0
1         False  28.990823  46.278183   13/06/22         1
2         False  33.093223  47.073863   13/06/22         2
3         False  38.596773  49.455416   13/06/22         3
4         False  33.723420  48.101587   13/06/22         4 (18316, 5)


In [8]:
%%script echo skipping
OVERLAYED_FOLDER_NAME = OVERLAYED_FOLDER_NAMES[0]
MERGED_FILE_NAME = MERGED_FILE_NAMES[0]

shapeFile = geopandas.read_file("../data/validation_data/merged_harvest_validation_20220919.shp")
shapeFile.head()

coor_points_df = shapeFile[['lat', 'lon', 'val_set1', 'finHarvDat']].dropna(subset=['lat', 'lon', 'val_set1'])
coor_points_df['is_harvested'] = coor_points_df['val_set1'].apply(lambda x: x == 1)
coor_points_df = coor_points_df.drop(['val_set1'], axis=1)
coor_points_df.finHarvDat = coor_points_df.finHarvDat.apply(lambda x: str(x))
coor_points_df['point_id'] = np.arange(0, coor_points_df.shape[0], 1, dtype=int)
print(coor_points_df.head(), coor_points_df.shape)

skipping


In [9]:
def overlay_points(img: ee.Image, df:pandas.DataFrame, coordinate_col_names:(str, str)=('lon', 'lat')) -> ee.FeatureCollection:
  """
  Sample the bands of `img` at every point listed in `df`.

  Each row of `df` becomes an ee.Feature: a point geometry built from the two
  coordinate columns, carrying every column of `df` as a property. The image is
  then sampled at those points with `img.sampleRegions` at a scale of 10.
  This function only builds the collection; pass the result to `export_to_drive`
  to write it to Drive.

  Args:
      img (ee.Image): image whose band values are sampled.
      df (pandas.DataFrame): contains longitude and latitude columns describing the points to be overlayed.
      coordinate_col_names (str, str, optional): names of the (longitude, latitude) columns in `df`. Defaults to ('lon', 'lat').

  Returns:
      ee.FeatureCollection: the result of `img.sampleRegions` over the points.

  Usage:
      export_to_drive(overlay_points(image, coor_points_df), 'points_from_sph')
      # will save points_from_sph.geojson into drive.
  """
  lon_col = coordinate_col_names[0]
  lat_col = coordinate_col_names[1]

  def createFeature(row):
    # One point feature per row, with every column of the row attached as a property.
    geometry = ee.Geometry.Point([row[lon_col], row[lat_col]])
    properties = {col_name: row[col_name] for col_name in df.columns.values}
    return ee.Feature(geometry, properties)

  # Build the features from the `df` argument. Reading the notebook-level
  # `coor_points_df` here would silently ignore whatever frame the caller passed in.
  features = df.apply(createFeature, axis=1).tolist()
  fc = ee.FeatureCollection(features)

  # Overlay the points on the imagery to get the sampled band values.
  return img.sampleRegions(collection=fc, scale=10)


def export_to_drive(fc: ee.FeatureCollection, file:str, folder:str="NASA_Harvest", poll_interval:float=5):
  """
  Export `fc` to Google Drive as GeoJSON and block until the export task stops.

  Args:
      fc (ee.FeatureCollection): collection to export.
      file (str): task description, also used as the file name in the printed messages.
      folder (str, optional): Drive folder passed to the export. Defaults to "NASA_Harvest".
      poll_interval (float, optional): seconds to sleep between task polls. Defaults to 5.

  Returns:
      None. Progress and the final outcome are printed.
  """
  # Export the ee.FeatureCollection as a .GeoJSON file.
  task = ee.batch.Export.table.toDrive(
    collection=fc,
    description=file,
    fileFormat='GeoJSON',
    folder=folder,
  )
  task.start()

  print('----')
  print(f'Polling for file name= {file}...')
  while task.active():
    time.sleep(poll_interval)

  # A task also stops being active when it fails or is cancelled, so check the
  # final state instead of always reporting success.
  status = task.status()
  state = status.get('state')
  if state == 'COMPLETED':
    print(f'Wrote {file}.GeoJSON. Check {folder} folder in Drive.')
  else:
    error_message = status.get('error_message', 'no error message reported')
    print(f'Export of {file} did not complete (state={state}): {error_message}')
  print('----')

In [10]:
from datetime import datetime

def read_geojson(file:str, date_col_name='finHarvDat') -> pandas.DataFrame:
    """
    Load ../data/{file}.geojson and convert its date column to datetime objects.

    The date column is expected to hold "dd/mm/yy" values, example: "14/07/22".
    Values that come back from the reader in the form "2013-06-22" (the same
    "13/06/22" date with its parts read as yy-mm-dd behind a "20" prefix) are
    unscrambled before conversion. Missing values become None.

    Args:
        file (str): name of file (without the .geojson extension), relative to ../data/
        date_col_name (str, optional): name of the date column to convert.
        If df has no date column, pass None; no column is touched then. Defaults to 'finHarvDat'.

    Returns:
        pandas.DataFrame

    Usage:
            read_geojson(file='points_from_sph').head()
    """

    def to_datetime(value):
        text = "" if value is None else str(value).strip()
        if text in ("", "nan", "NaT", "None"):
            return None

        if "/" in text:
            # Already in the expected dd/mm/yy form.
            parts = text.split("/")
            day, month, year = parts[0], parts[1], parts[2]
        else:
            # "20dd-mm-yy", optionally followed by a time part: drop the "20" prefix to recover the day.
            parts = text[:10].split("-")
            day, month, year = parts[0][2:], parts[1], parts[2]

        return datetime(year=2000 + int(year), month=int(month), day=int(day))

    df = geopandas.read_file(f"../data/{file}.geojson")

    # Only the requested column is converted; previously "finHarvDat" was rewritten
    # unconditionally, which raised KeyError for frames without that column.
    if date_col_name is not None:
        df[date_col_name] = df[date_col_name].apply(to_datetime)

    return df

## Create 3-week image collection from Sentinel-2 in 2022

reference: https://medium.com/@moraesd90/creating-monthly-ndvi-composites-sentinel-2-on-google-earth-engine-a5c2d49bc9ca

In [11]:
def get_image(s2_img_collection: ee.ImageCollection) -> ee.Image:
    """
    Reduce a Sentinel-2 collection to a single cloud-masked median composite.

    Every image is cloud-masked with its QA60 band and clipped to COUNTRY_GEOMETRY.
    Only the BANDS are kept, and only images whose CLOUDY_PIXEL_PERCENTAGE is below 20.
    The per-pixel median is taken, masked pixels are filled with 0.0, and an
    'NDVI' band computed from B8 and B4 is appended.

    Returns:
        ee.Image: the float composite with the BANDS plus 'NDVI'.
    """
    # QA60 bit 10 flags opaque clouds, bit 11 flags cirrus clouds.
    cloud_bits = (1 << 10) | (1 << 11)
    fill_value = 0.0

    def cloudmask_and_clip(image: ee.Image) -> ee.Image:
        # Keep a pixel only when neither cloud bit is set.
        cloud_free = image.select('QA60').bitwiseAnd(cloud_bits).eq(0)
        return image.clip(COUNTRY_GEOMETRY).updateMask(cloud_free)

    def add_ndvi(image):
        ndvi_band = image.normalizedDifference(['B8', 'B4']).rename('NDVI').toFloat()
        return image.addBands(ndvi_band).toFloat()

    masked_collection = s2_img_collection.map(cloudmask_and_clip)
    low_cloud_collection = masked_collection.select(BANDS).filter(ee.Filter.lt("CLOUDY_PIXEL_PERCENTAGE", 20))
    composite = low_cloud_collection.median().unmask(fill_value).float()
    return add_ndvi(composite)

In [12]:
DATE_START = ee.Date('2022-01-01')
DATE_END= ee.Date('2022-12-27')
SURF_REF_SEN2 = ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED").filterDate(DATE_START, DATE_END)

# Week offsets from DATE_START at which each 3-week window begins.
# 2022-01-01 -> 2022-12-27 is 360 days, i.e. 51 weeks after rounding, so
# start_weeks.getInfo() -> [1, 4, 7, ..., 46, 49] (17 windows, matching the count printed below).
total_weeks = ee.Number(DATE_END.difference(DATE_START, 'week')).round().getInfo()
start_weeks = ee.List.sequence(1, total_weeks, 3)

def extract_subset(start_week):
    """
    Build the composite for the 3-week window that starts `start_week` weeks after DATE_START.

    Args:
        start_week: week offset from DATE_START (an element of `start_weeks`).

    Returns:
        ee.Image: the result of `get_image` for the images that fall inside the window.
    """
    start = DATE_START.advance(start_week, 'week')
    # filterDate excludes its end date, so the window has to end at the start of the
    # next window; ending one day earlier would drop the last day of every window.
    end_exclusive = start.advance(3, 'week')
    return get_image(SURF_REF_SEN2.filterDate(start, end_exclusive))


# Map the extract_subset function over the list of start weeks to create a new image collection that contains the subsets of the original image collection
new_img_collection = ee.ImageCollection.fromImages(start_weeks.map(extract_subset))
num_of_images = new_img_collection.size().getInfo()
# Print the number of images in the new image collection
print('Number of images in the new image collection:', num_of_images)
images_list = new_img_collection.toList(num_of_images)

Number of images in the new image collection: 17


In [13]:
# Display the list handle; the recorded output is an ee.List object repr, not the images themselves.
images_list

<ee.ee_list.List at 0x17698ad40>

In [14]:
# Pick the composite at index 6 of images_list (the 7th window) for the preview map.
image = ee.Image(images_list.get(6))

In [15]:
# True-colour (B4/B3/B2) preview of the selected composite, clipped to the country outline.
# NOTE(review): the name `map` shadows Python's builtin `map`; it is kept because the
# next cell displays the map by that name.
image = image.clip(COUNTRY_GEOMETRY)
vis_params = dict(min=0, max=3000, bands=["B4", "B3", "B2"])
mapid = image.getMapId(vis_params)

map = folium.Map(location=COUNTRY_LATLON, zoom_start=13)
folium.TileLayer(
    name='median composite',
    overlay=True,
    attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
    tiles=mapid['tile_fetcher'].url_format,
).add_to(map)
folium.LayerControl().add_to(map)


<folium.map.LayerControl at 0x17694a200>

In [16]:
# Render the folium map built in the previous cell.
map

In [17]:
start_end_dates = []
start_weeks = ee.List.sequence(1, total_weeks, 3)

# record the dates into start_end_dates array
def eeDate_to_datetime(eeDate: ee.Date)->datetime:
    """
    Convert a server-side ee.Date into a client-side datetime at midnight of the same calendar day.

    The date is fetched as one formatted string, so each conversion costs a single
    round trip instead of separate requests for year, month and day.
    """
    return datetime.strptime(eeDate.format('yyyy-MM-dd').getInfo(), '%Y-%m-%d')

# Fetch all week offsets in one request instead of one getInfo() call per index.
for start_week in start_weeks.getInfo()[:num_of_images]:
    start = DATE_START.advance(start_week, 'week')
    # Last calendar day of the 3-week window (inclusive), used later to label harvest events.
    end = start.advance(3, 'week').advance(-1, 'day')
    start_end_dates.append((eeDate_to_datetime(start), eeDate_to_datetime(end)))

In [18]:
# Sanity check: the first (start, end) window and the number of windows recorded.
start_end_dates[0], len(start_end_dates)

((datetime.datetime(2022, 1, 8, 0, 0), datetime.datetime(2022, 1, 28, 0, 0)),
 17)

In [19]:
# One base file name (no extension) per composite, in the same order as images_list.
file_names = [f'img{idx}_overlayed' for idx in range(num_of_images)]

In [25]:
#%%script echo skipping

# Sample each composite at the labelled points and export the result to Drive, one file per composite.
# NOTE(review): the loop starts at index 12, not 0; presumably the earlier composites were
# exported in a previous run -- confirm before re-running from a fresh kernel.
for idx in range(12, num_of_images):
    file = f'img{idx}_overlayed'
    currImg = ee.Image(images_list.get(idx))
    overlayed_points = overlay_points(currImg, coor_points_df)
    export_to_drive(overlayed_points, file=file)

## Download From Drive & Move to 'data' Folder.

In [21]:

# true if (is_harvested is true) and if (the finHarvDat is within start_date & end_date)
def get__is_within_period(row):
    """
    Tell whether a sampled point was harvested during its image window.

    Args:
        row: a row holding 'is_harvested', 'finHarvDat', 'start_date' and 'end_date'.

    Returns:
        A falsy value when the point is not harvested; otherwise whether
        finHarvDat lies between start_date and end_date, both ends included.
    """
    return row['is_harvested'] and (row['start_date'] <= row['finHarvDat'] <= row['end_date'])

# Read the overlays of the data set selected earlier in the notebook, so that the folder
# read here always matches MERGED_FILE_NAME, the name the merged result is saved under.
path_inside_data_folder = OVERLAYED_FOLDER_NAME

samples = []  # one labelled frame per composite, in image order
for idx in range(num_of_images):
    curr_img_df = read_geojson(file=path_inside_data_folder + file_names[idx])

    # keep the points in the same order in every per-image frame
    curr_img_df = curr_img_df.sort_values('point_id')

    # add time cols: every row of this frame belongs to the same window
    start, end = start_end_dates[idx]
    curr_img_df['start_date'] = start
    curr_img_df['end_date'] = end

    curr_img_df['is_within_period'] = curr_img_df.apply(get__is_within_period, axis=1)

    # string ids: 'i<image index>' for the composite and 'p<point id>' for the point
    curr_img_df['image_idx'] = 'i' + str(idx)
    curr_img_df = curr_img_df.rename(columns={'point_id': 'point_idx'})
    curr_img_df['point_idx'] = curr_img_df['point_idx'].apply(lambda x: 'p' + str(x))

    samples.append(curr_img_df)

In [60]:
# Stack the per-image frames and drop the columns that are not needed downstream.
merged_images = (
    pandas.concat(samples, sort=False)
    .drop(columns=['geometry', 'id', 'is_harvested'])
)
# The band values are stored scaled by 10000, so divide to undo the scaling
# (according to https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S2)
merged_images[BANDS] = merged_images[BANDS] / 10000
merged_images.shape

(49606, 19)

In [61]:
# Preview the merged table: scaled bands, NDVI, point/image ids, window dates and the label.
merged_images.head()

Unnamed: 0,B11,B12,B2,B3,B4,B5,B6,B7,B8,B8A,NDVI,finHarvDat,lat,lon,point_idx,start_date,end_date,is_within_period,image_idx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-06-13,46.34732,33.75093,p0,2022-01-08,2022-01-28,False,i0
1,0.2643,0.2107,0.0356,0.0523,0.0915,0.1225,0.1268,0.134,0.1444,0.157,0.224248,2022-06-13,46.278183,28.990823,p1,2022-01-08,2022-01-28,False,i0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-06-13,47.073863,33.093223,p2,2022-01-08,2022-01-28,False,i0
3,0.1392,0.1225,0.2996,0.3321,0.3483,0.3758,0.3833,0.3814,0.3987,0.3801,0.06747,2022-06-13,49.455416,38.596773,p3,2022-01-08,2022-01-28,False,i0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-06-13,48.101587,33.72342,p4,2022-01-08,2022-01-28,False,i0


In [62]:
# Number of rows flagged as falling inside their image window.
# NOTE(review): the recorded output (1584) does not match the "366" expectation below;
# the expectation may refer to the other data set -- confirm.
sum(merged_images.is_within_period) # we expect 366

1584

In [64]:
# Turn the merged table into a GeoDataFrame with one point geometry per row (lon -> x, lat -> y)
# and expose the label under the name 'har_evnt'.
merged_samples_gdf = geopandas.GeoDataFrame(
    merged_images,
    geometry=geopandas.points_from_xy(merged_images.lon, merged_images.lat),
).rename(columns={'is_within_period': 'har_evnt'})

# Rows whose NDVI is exactly 0 are treated as invalid points and removed.
merged_samples_gdf = merged_samples_gdf.loc[merged_samples_gdf.NDVI != 0]

# Write the data set as GeoJSON under the name selected earlier in the notebook.
merged_samples_gdf.to_file(f"../data/{MERGED_FILE_NAME}", driver='GeoJSON')