# Load, preprocess, and save train and test data
# John Brandt
# April 1, 2020

- Fuse Sentinel-1 and Sentinel-2 data
- Reconstruct a 2D label array for each plot from the CEO output CSV
- Match Sentinel data to CEO labels
- Stack data_x, data_y, length
- Save numpy arrays for data_x, data_y, length

The notebook additionally contains some development code for:
- Parameter selection in Whittaker smoothing
- Graphing plot locations on a map

# Package imports and source code

In [1]:
from tqdm import tqdm_notebook, tnrange
import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools
from scipy.ndimage import median_filter

# Set before any cell below loads numeric libraries.
# NOTE(review): KMP_DUPLICATE_LIB_OK is presumably here to stop the OpenMP
# runtime aborting when two copies of it get loaded -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# IPython magic: executes slope.py and keeps its names in the notebook namespace.
# NOTE(review): presumably this is what provides calcSlope, which process_dem
# calls further down -- confirm.
%run ../src/preprocessing/slope.py

In [2]:
def reconstruct_images(plot_id):
    '''Rebuilds the 2D grid of labels for one plot from the global `df`

       The samples of the plot are grouped by latitude. Rows of the result run
       from the highest latitude to the lowest, and inside each row the samples
       run from the lowest longitude to the highest. The result is only
       rectangular when every latitude of the plot holds the same number of
       samples; callers that need a fixed shape have to check for that.

        Parameters:
          plot_id: value looked up in the 'PLOT_ID' column of `df`

         Returns:
          rows (list): one inner list of 'TREE' values per unique latitude,
                       or an empty list when `df` has no rows for `plot_id`
    '''
    subs = df[df['PLOT_ID'] == plot_id]
    rows = []
    # Highest latitude first, so the first row is the top of the grid
    for lat in sorted(subs['LAT'].unique(), reverse = True):
        subs_lat = subs[subs['LAT'] == lat].sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [3]:
# Data split being processed; every directory below is derived from this name
source = 'train'
sentinel_1 = True

# Input directories
s2_path = f"../data/{source}-s2-new/"
s1_path = f"../data/{source}-s1-radiometric/"
csv_path = f"../data/{source}-csv/"
dem_path = f"../data/{source}-dem/"

# Output directory
output_path = f"../data/{source}-processed/"

In [12]:
# Load and edit bad plot ids if needed
# The file holds the plot IDs that the loading cell must skip. This cell adds
# and removes a few IDs and writes the file back in place.
verified_lu_change = np.load("bad_plot_ids.npy")

# Only append IDs that are not in the file yet, so re-running is harmless
to_add = [139430763]
to_add = [x for x in to_add if x not in verified_lu_change]
# Reuse the dtype of the stored array: np.array([]) is float64, so without it
# an empty to_add would silently turn every stored ID into a float
verified_lu_change = np.concatenate([verified_lu_change,
                     np.array(to_add, dtype = verified_lu_change.dtype).flatten()])

to_remove = [139189689, 139189690, 139320349]

ids_to_drop = set(to_remove)
verified_lu_change = [x for x in verified_lu_change if x not in ids_to_drop]
np.save("bad_plot_ids.npy", np.array(verified_lu_change))
print(len(verified_lu_change))



2127


In [17]:
# Read every CEO label CSV of the current split into one dataframe, keeping
# only the samples that actually carry a TREE label

cols_to_keep = ['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'ANALYSES', 'USER_ID',
       'COLLECTION_TIME', 'ANALYSIS_DURATION', 'TREE']
csvs = [name for name in sorted(os.listdir(csv_path)) if 'potential-tea' in name]

dfs = []
for csv_name in csvs:
    print(csv_name)
    frame = pd.read_csv(csv_path + csv_name, encoding = "ISO-8859-1")
    # Column names are compared in upper case
    frame.columns = [col.upper() for col in frame.columns]

    # Drop everything that is not on the keep list in a single call
    unwanted = [col for col in frame.columns if col not in cols_to_keep]
    frame = frame.drop(unwanted, axis = 1)
    # Tag each row with the file it came from (file name up to the first dot)
    frame['country'] = csv_name.split(".")[0]
    dfs.append(frame)

df = pd.concat(dfs, ignore_index = True, sort = True)
# Samples without a TREE value are unlabelled and cannot be used
df = df[df['TREE'].notna()]

plot_ids = sorted(df['PLOT_ID'].unique())
plot_ids_loaded = plot_ids

print(f"There are {len(plot_ids)} plots")

ceo-potential-tea-11-data-2021-03-24.csv
ceo-potential-tea-2-sample-data-2021-03-26.csv
ceo-potential-tea-22-data-2021-03-25.csv
ceo-potential-tea-3-sample-data-2021-03-26.csv
ceo-potential-tea-4-sample-data-2021-03-26.csv
ceo-potential-tea-banana-sample-data-2021-03-24.csv
ceo-potential-tea-espirito-sample-data-2021-03-25.csv
ceo-potential-tea-new-5-sample-data-2021-03-28.csv
ceo-potential-tea-new-6-sample-data-2021-03-28.csv
ceo-potential-tea-new-7-sample-data-2021-03-28.csv
ceo-potential-tea-new-8-sample-data-2021-03-29.csv
ceo-potential-tea-zdry-forest-sample-data-2021-04-03.csv
There are 879 plots


In [18]:
def to_int16(array: np.ndarray) -> np.ndarray:
    '''Quantizes values in [0, 1] to unsigned 16-bit integers

       Every value is multiplied by 65535 and truncated, so 0.0 becomes 0 and
       1.0 becomes 65535. Despite the name of the function the result is
       uint16, not int16. Compared with float32 this halves the storage.

        Parameters:
          array (np.ndarray): values to convert, all within [0, 1]

         Returns:
          quantized (np.ndarray): uint16 array with the shape of the input

         Raises:
          AssertionError: if any value is below 0, above 1 or NaN
    '''
    array = np.asarray(array)
    # np.min / np.max raise on empty input, and there is nothing to check
    if array.size == 0:
        return array.astype(np.uint16)

    assert np.min(array) >= 0, np.min(array)
    assert np.max(array) <= 1, np.max(array)

    # Kept so the range still holds when assertions are disabled (python -O)
    array = np.clip(array, 0, 1)
    array = np.trunc(array * 65535)
    # Compare the extreme values themselves; reducing the boolean mask with
    # np.max would pass as soon as a single element was in range
    assert np.min(array) >= 0, np.min(array)
    assert np.max(array) <= 65535, np.max(array)

    return array.astype(np.uint16)

def process_dem(dem):
    '''Converts an elevation tile into a smoothed, rescaled slope layer

       The tile is median filtered, passed through calcSlope, divided by 90,
       stripped of its one pixel border, median filtered a second time and
       finally cropped by four pixels on every side.

        Parameters:
          dem (arr): elevation values, must hold 34 * 34 elements

         Returns:
          slope (arr): (24, 24, 1) array

       NOTE(review): calcSlope is defined outside this cell. The two constant
       grids of 10 are presumably the pixel size and the division by 90
       presumably maps degrees to [0, 1] -- confirm against slope.py.
    '''
    side = 32 + 2
    smoothed = median_filter(dem, size = 5)
    slope = calcSlope(smoothed.reshape((1, side, side)),
                      np.full((side, side), 10),
                      np.full((side, side), 10),
                      zScale = 1, minSlope = 0.02)
    slope = slope / 90
    slope = slope.reshape((side, side, 1))
    # 34 -> 32 by removing the border, then 32 -> 24 after the second filter
    inner = slope[1:-1, 1:-1]
    return median_filter(inner, 5)[4:-4, 4:-4]

def grndvi(array):
    '''Computes (nir - (green + red)) / (nir + (green + red)) per pixel

       Bands are read from the last axis: index 1 is used as green, index 2
       as red and index 3 as nir. Each band is clipped to [0, 1] first and a
       small constant in the denominator prevents division by zero.

        Parameters:
          array (arr): array with at least four entries on its last axis

         Returns:
          index (arr): array with the shape of the input minus its last axis
    '''
    green, red, nir = (np.clip(array[..., band], 0, 1) for band in (1, 2, 3))
    visible = green + red
    return (nir - visible) / ((nir + visible) + 1e-5)


In [19]:
# FOR THE TIME BEING!
from skimage.transform import resize

%run ../src/preprocessing/indices.py

count = 0
dataframe = pd.DataFrame({'plot_id': [''], 'lat': [0.325], 'long': [0.325],
                          'y': [0]})

# Identify shape of data to load
plot_ids_to_load = []
for i in tnrange(len(plot_ids)):
    s1_i = f'{s1_path}{str(plot_ids[i])}.npy'
    s2_i = f'{s2_path}{str(plot_ids[i])}.npy'
    dem_i = f'{dem_path}{str(plot_ids[i])}.npy'
    s1_new_i = f'../data/{source}-s1/{str(plot_ids[i])}.npy'
    s1_exists = (os.path.exists(s1_i) or os.path.exists(s1_new_i))
    
    
    if os.path.isfile(s2_i) and s1_exists:
        if plot_ids[i] not in verified_lu_change:
            plot_ids_to_load.append(plot_ids[i])

print(f"There are {len(plot_ids_to_load)} plots")
plot_ids_to_load = [x for x in plot_ids_to_load if x not in  [139077414,
                                                              139187051,
                                                              139187043,
                                                             139187133, 139187134]]
data_x = np.zeros((len(plot_ids_to_load), 12, 24, 24, 13)).astype(np.uint16)
data_y = np.zeros((len(plot_ids_to_load), 14, 14))
            
    

# Iterate over each plot
to_remove = []
#139187043
for i in tnrange(len(plot_ids_to_load)):
    print(plot_ids_to_load[i])
    s1_i = f'{s1_path}{str(plot_ids_to_load[i])}.npy'
    s2_i = f'{s2_path}{str(plot_ids_to_load[i])}.npy'
    dem_i = f'{dem_path}{str(plot_ids_to_load[i])}.npy'

    x = np.load(s2_i)
    if os.path.exists(s1_i):
        s1 = np.load(s1_i)
    else:
        s1 = np.load(f'../data/{source}-s1/{str(plot_ids_to_load[i])}.npy')
    s1 = np.reshape(s1, (12, 12, 2, 12, 2, 2))
    s1 = np.mean(s1, axis = (2, 4))
    s1 = resize(s1, (12, 24, 24, 2))
    
    
    
    dem = np.load(dem_i)
    dem = process_dem(dem)
    dem = np.tile(dem.reshape((1, 24, 24)), (x.shape[0], 1, 1))
    x[..., 10] = dem
    #x = evi(x, True)
    #x = bi(x, True)
    #x = msavi2(x, True)
    #x= ndvi(x, True)
    
    x = np.concatenate([x, s1], axis = -1)
    #median = np.median(x, axis = 0)
    #x = np.concatenate([x, median[np.newaxis]], axis = 0)
    count += 1
    y = reconstruct_images(plot_ids_to_load[i])
    long = np.mean(df[df['PLOT_ID'] == plot_ids_to_load[i]]['LON'])
    lat = np.mean(df[df['PLOT_ID'] == plot_ids_to_load[i]]['LAT'])
    dataframe = dataframe.append({'plot_id': str(plot_ids_to_load[i]),
                                  'lat': lat, 'long': long,
                                 'y': np.sum(np.array(y))}, 
                                 ignore_index = True)
    dataframe.append([plot_ids_to_load[i], lat, long])
    # The indices can range from -1 to 1, clip to 0-1
    #x[..., 11:15] = np.clip(x[..., 11:15], -1, 1)
    #x[..., 11:15] = (x[..., 11:15] + 1) / 2
    if np.sum(np.isnan(x)) > 0:
        to_remove.append(i)
    else:
        x = np.clip(x, 0, 1)
        x = to_int16(x)
        data_x[i] = x
        try:
            data_y[i] = np.array(y)
        except:
            to_remove.append(i)
            
# Remove any data samples that had missing values
if len(to_remove) > 0:
    print(f"Removing {to_remove}")
    data_x = np.delete(data_x, to_remove, 0)
    data_y = np.delete(data_y, to_remove, 0)
            
print(f"Finished loading: {data_x.shape} of {data_x.dtype} type")

HBox(children=(IntProgress(value=0, max=879), HTML(value='')))


There are 761 plots


HBox(children=(IntProgress(value=0, max=761), HTML(value='')))

139397643
139397644
139397646
139397647
139397648
139397649
139397650
139397651
139397652
139397653
139397654
139397655
139397656
139397657
139397658
139397659
139397660
139397661
139397662
139397663
139397664
139397665
139397666
139397667
139397668
139397669
139397672
139397673
139397674
139397675
139397676
139397677
139397678
139397679
139397680
139397681
139397682
139397683
139397684
139397685
139397686
139397687
139397689
139397690
139397691
139397692
139397693
139397694
139397695
139397696
139397697
139397698
139397700
139397701
139397737
139397738
139397739
139397740
139397741
139397742
139397743
139397744
139397745
139397746
139397747
139397748
139397749
139397750
139397751
139397752
139397753
139397754
139397755
139397756
139397757
139397758
139397759
139397760
139397761
139397762
139397763
139397764
139397765
139397766
139397767
139397768
139397769
139397770
139397771
139397772
139397773
139397774
139397775
139397776
139397777
139397778
139397779
139397780
139397781
139397782


In [20]:
import hickle as hkl

# Row 0 of `dataframe` is the placeholder it was created with. Once it is gone
# and the index is reset, row k describes plot k of the loading loop, which is
# the numbering that `to_remove` uses.
# `index=` is spelled out because pandas 2 no longer accepts axis positionally.
dataframe = dataframe.drop(index = 0)
dataframe.reset_index(inplace = True, drop = True)
if len(to_remove) > 0:
    dataframe = dataframe.drop(index = to_remove)
    dataframe.reset_index(inplace = True, drop = True)

# Create the target folder on first use instead of failing inside the writer
os.makedirs(f"../tile_data/{source}", exist_ok = True)

print(f"Writing {source} data")
hkl.dump(data_x, f"../tile_data/{source}/{source}_x.hkl", mode='w', compression='gzip')
hkl.dump(data_y, f"../tile_data/{source}/{source}_y.hkl", mode='w', compression='gzip')
dataframe.to_csv(f"../tile_data/{source}/{source}_plot_ids.csv", index = False)
print("Finished!")

Writing train data
Finished!
