# Load, preprocess, and save train and test data
# John Brandt
# April 1, 2020

- Fuse Sentinel 1/2 data
- Reconstruct 2D-array from CEO output CSV by plot
- Match sentinel data to CEO labels
- Stack data_x, data_y, length
- Save numpy arrays for data_x, data_y, length

The notebook additionally contains some development code for:
- Parameter selection in whittaker smoothing
- Graphing plot locations on map

# Package imports and source code

In [3]:
from tqdm import tqdm_notebook, tnrange
import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools
from scipy.ndimage import median_filter

os.environ['KMP_DUPLICATE_LIB_OK']='True'

%run ../src/preprocessing/slope.py

In [4]:
def reconstruct_images(plot_id):
    '''Takes a plot ID and subsets the input pd.DataFrame to that plot ID
       returns a (14, 14) array-like list with binary labels
       
        Parameters:
          batch_ids (list):
          batch_size (int):
          
         Returns:
          x_batch (arr):
          y_batch (arr):
    '''
    subs = df[df['PLOT_ID'] == plot_id]
    rows = []
    lats = reversed(sorted(subs['LAT'].unique()))
    for i, val in enumerate(lats):
        subs_lat = subs[subs['LAT'] == val]
        subs_lat = subs_lat.sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [15]:
source = 'train'
sentinel_1 = True
s2_path = "../data/{}-s2-new/".format(source)
s1_path = "../data/{}-s1-new/".format(source)
csv_path = "../data/{}-csv/".format(source)
output_path = "../data/{}-processed/".format(source)
dem_path = "../data/{}-dem/".format(source)

In [16]:
# Load and edit bad plot ids if needed
verified_lu_change = np.load("bad_plot_ids.npy")
len(verified_lu_change)

#verified_lu_change = np.concatenate([verified_lu_change, 
#                     np.array([138948201, 138948267, 138948365, 138948427, 138948534]).flatten()])
print(len(verified_lu_change))

to_remove = []

verified_lu_change = [x for x in verified_lu_change if x not in to_remove]
np.save("bad_plot_ids.npy", np.array(verified_lu_change))
print(len(verified_lu_change))



992
992


In [17]:
# For either train or test data, loop through each plot and determine whether there is
# labelled Y data for it -- returning one dataframe for the entire data set

cols_to_keep = ['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'ANALYSES', 'USER_ID',
       'COLLECTION_TIME', 'ANALYSIS_DURATION', 'TREE']
csvs = [x for x in os.listdir(csv_path) if '.csv' in x]

dfs = []
for i in csvs:
    print(i)
    df = pd.read_csv(csv_path + i, encoding = "ISO-8859-1")
    df.columns = [x.upper() for x in df.columns]

    for column in df.columns:
        if column not in cols_to_keep:
            df = df.drop(column, axis = 1)
    df['country'] = i.split(".")[0]
    dfs.append(df)

df = pd.concat(dfs, ignore_index = True)
df = df[~pd.isna(df['TREE'])]

plot_ids = sorted(df['PLOT_ID'].unique())
plot_ids_loaded = plot_ids

print(f"There are {len(plot_ids)} plots")

tanzania-region-val.csv
ceo-brazil-finetune-sample-data-2020-09-14.csv
ghana-kwofu-train.csv
kenya-makueni-train.csv
cameroon-finetune-3.csv
ceo-makueni-fix-2-sample-data-2020-10-22.csv
ceo-brazil-mid-coast-sample-data-2020-07-24.csv
lac-south-train.csv
india-sidhi-train.csv
cameroon-finetune-2.csv
koure-finetune.csv
todo-brazil-north.csv
kenya-makueni-train-2.csv
ceo-brazil-paraiba-train-sample-data-2020-07-22.csv
train-honduras-2.csv
kenya-planet.csv
ghana-ashanti-train-2.csv
africa-east-train.csv
mexico-campeche-train.csv
ghana-farm-train.csv
sa-train.csv
europe-sw-asia-train.csv
ghana-south-train.csv
centralasia-train.csv
todo-afr-south.csv
ceo-lac-random-points-sample-data-2020-07-29.csv
honduras-2-train.csv
cameroonnigerghana-train.csv
train-sa-west.csv
ceo-br-gain-4-sample-data-2020-09-29.csv
malawi-rumphi-train.csv
ceo-elsalvador-train-sample-data-2020-07-22.csv
ghana-ashanti-train-small.csv
ghana-train.csv
ceo-south-central-america-train.csv
kenya-train.csv
ceo-brazil-gain-ove

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




There are 6817 plots


In [18]:
def to_int16(array: np.array) -> np.array:
    '''Converts a float32 array to int16, reducing storage costs by three-fold'''
    assert np.min(array) >= 0, np.min(array)
    assert np.max(array) <= 1, np.max(array)
    
    array = np.clip(array, 0, 1)
    array = np.trunc(array * 65535)
    assert np.min(array >= 0)
    assert np.max(array <= 65535)
    
    return array.astype(np.uint16)

def process_dem(dem):
    dem =  median_filter(dem, size = 5)
    dem = calcSlope(dem.reshape((1, 32+2, 32+2)),
                      np.full((32+2, 32+2), 10),
                      np.full((32+2, 32+2), 10), 
                      zScale = 1, minSlope = 0.02)
    dem = dem / 90
    dem = dem.reshape((32+2, 32+2, 1))
    dem = dem[1:-1, 1:-1]
    dem = median_filter(dem, 5)[4:-4, 4:-4]
    return dem

In [19]:
count = 0
dataframe = pd.DataFrame({'plot_id': [''], 'lat': [0.325], 'long': [0.325]})

# Identify shape of data to load
plot_ids_to_load = []
for i in tnrange(len(plot_ids)):
    s1_i = f'{s1_path}{str(plot_ids[i])}.npy'
    s2_i = f'{s2_path}{str(plot_ids[i])}.npy'
    dem_i = f'{dem_path}{str(plot_ids[i])}.npy'
    if os.path.isfile(s2_i) and os.path.isfile(s1_i):
        if plot_ids[i] not in verified_lu_change:
            plot_ids_to_load.append(plot_ids[i])

print(f"There are {len(plot_ids_to_load)} plots")
data_x = np.zeros((len(plot_ids_to_load), 13, 24, 24, 17)).astype(np.uint16)
data_y = np.zeros((len(plot_ids_to_load), 14, 14))
            
    

# Iterate over each plot
to_remove = []
for i in tnrange(len(plot_ids_to_load)):
    s1_i = f'{s1_path}{str(plot_ids_to_load[i])}.npy'
    s2_i = f'{s2_path}{str(plot_ids_to_load[i])}.npy'
    dem_i = f'{dem_path}{str(plot_ids_to_load[i])}.npy'

    x = np.load(s2_i)
    s1 = np.load(s1_i)
    s1_median = np.median(s1, axis = 0)
    s1 = np.concatenate([s1, s1_median[np.newaxis]], axis = 0)
    dem = np.load(dem_i)
    dem = process_dem(dem)
    dem = np.tile(dem.reshape((1, 24, 24)), (x.shape[0], 1, 1))
    x[..., 10] = dem
    x = np.concatenate([x, s1], axis = -1)
    count += 1
    y = reconstruct_images(plot_ids_to_load[i])
    long = np.mean(df[df['PLOT_ID'] == plot_ids_to_load[i]]['LON'])
    lat = np.mean(df[df['PLOT_ID'] == plot_ids_to_load[i]]['LAT'])
    dataframe = dataframe.append({'plot_id': str(plot_ids_to_load[i]), 'lat': lat, 'long': long}, ignore_index = True)
    dataframe.append([plot_ids_to_load[i], lat, long])
    # The indices can range from -1 to 1, clip to 0-1
    x[..., 11:15] = np.clip(x[..., 11:15], -1, 1)
    x[..., 11:15] = (x[..., 11:15] + 1) / 2
    if np.sum(np.isnan(x)) > 0:
        to_remove.append(i)
    else:
        x = np.clip(x, 0, 1)
        x = to_int16(x)
        data_x[i] = x
        data_y[i] = y
            
# Remove any data samples that had missing values
if len(to_remove) > 0:
    print(f"Removing {to_remove}")
    data_x = np.delete(data_x, to_remove, 0)
    data_y = np.delete(data_y, to_remove, 0)
            
print(f"Finished loading: {data_x.shape} of {data_x.dtype} type")

HBox(children=(IntProgress(value=0, max=6817), HTML(value='')))


There are 5939 plots


HBox(children=(IntProgress(value=0, max=5939), HTML(value='')))


Removing [231]
Finished loading: (5938, 13, 24, 24, 17) of uint16 type


In [20]:
import hickle as hkl
dataframe = dataframe.drop(0, 0)
dataframe.reset_index(inplace = True, drop = True)

print(f"Writing {source} data")
hkl.dump(data_x, f"../tile_data/{source}/{source}_x.hkl", mode='w', compression='gzip')
hkl.dump(data_y, f"../tile_data/{source}/{source}_y.hkl", mode='w', compression='gzip')
dataframe.to_csv(f"../tile_data/{source}/{source}_plot_ids.csv", index = False)

Writing train data
