# Load, preprocess, and save train and test data
# John Brandt
# April 1, 2020

- Fuse Sentinel 1/2 data
- Reconstruct a 2D array of labels for each plot from the CEO output CSV
- Match Sentinel data to CEO labels
- Stack data_x, data_y, length
- Save numpy arrays for data_x, data_y, length

The notebook additionally contains some development code for:
- Parameter selection for Whittaker smoothing
- Graphing plot locations on map

# Package imports and source code

In [1]:
from tqdm import tqdm_notebook, tnrange

import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools

os.environ['KMP_DUPLICATE_LIB_OK']='True'

#!source ~/.bash_profile
#!export PATH="/usr/lib/google-cloud-sdk/bin:$PATH"

In [2]:
%run ../src/preprocessing/slope.py

In [3]:
def reconstruct_images(plot_id):
    '''Rebuild the grid of TREE labels for one plot.

       Reads the module-level dataframe `df` (columns PLOT_ID, LAT, LON
       and TREE) rather than taking it as an argument.

        Parameters:
          plot_id: value matched against df['PLOT_ID']

         Returns:
          rows (list of lists): one inner list per unique LAT of the plot,
                                ordered from the largest LAT to the smallest;
                                each inner list holds the TREE values of that
                                LAT ordered by ascending LON. Empty list when
                                no row matches plot_id.
    '''
    plot_rows = df[df['PLOT_ID'] == plot_id]
    # Largest latitude first, so the first row of the grid is the highest LAT.
    latitudes = sorted(plot_rows['LAT'].unique())[::-1]
    return [
        list(plot_rows[plot_rows['LAT'] == latitude]
             .sort_values('LON', axis = 0)['TREE'])
        for latitude in latitudes
    ]

In [4]:
# Note, the data in data/new-data/train-s2/2019 and 2018/ is bad (missing / NA values)

# Name of the data split; every directory below is derived from it.
source = 'test'
# NOTE(review): this flag is not read by any other cell visible in this notebook.
sentinel_1 = True

# Input directories, one .npy file per plot ID.
s2_path = f"../data/{source}-s2-new/"
s1_path = f"../data/{source}-s1-new/"
dem_path = f"../data/{source}-dem/"
# Directory holding the label CSVs.
csv_path = f"../data/{source}-csv/"
output_path = f"../data/{source}-processed/"

In [5]:
# Regional models: each list holds substrings that identify the label CSVs
# belonging to a region.
# NOTE(review): 'africawest' is listed twice in west_africa, and 'ethiopia' and
# 'hyperarid' appear in both the east and west lists, so `africa` contains
# duplicate entries.
east_africa = [
    'ethiopia', 'africa-east', 'hyperarid', 'kenya', 'malawi',
    'rwanda', 'subplot', 'sudan', 'africaeast', 'makueni',
]

west_africa = [
    'africa-west', 'ghana', 'niger', 'koure', 'hyperarid',
    'africawest', 'cameroon', 'ethiopia', 'africawest',
]

asia = ['australia', 'asia', 'india', 'global']

americas = [
    'brazil', 'elsalvador', 'honduras', 'lac',
    'mexico', 'sa-', 'america',
]

africa = east_africa + west_africa

malawi = ['malawi', 'africaeast']
nigercameroon = ['niger', 'koure', 'cameroon']

# Every entry of the label CSV directory, unfiltered.
globe = os.listdir(csv_path)

# Region selected for this run.
region = east_africa

In [6]:
# Load the saved array of plot IDs to exclude; the loading loop further down
# skips every plot whose ID is in this collection.
verified_lu_change = np.load("bad_plot_ids.npy")
# Display how many IDs were loaded.
len(verified_lu_change)

987

In [7]:
#verified_lu_change = np.concatenate([verified_lu_change, 
#                     np.array([[135542383, 135702506, 135807759, 136434961, 136435074, 136752744, 136752846, 136752868]
#                     ]).flatten()])
#print(len(verified_lu_change))


In [8]:
# IDs listed here are taken out of the exclusion list before it is re-saved.
to_remove = []

# Keep, in order, every ID that is not in to_remove; the result is a plain list.
verified_lu_change = list(
    filter(lambda bad_id: bad_id not in to_remove, verified_lu_change)
)
# Overwrite the saved exclusion list with the pruned version and report its size.
np.save("bad_plot_ids.npy", np.array(verified_lu_change))
print(len(verified_lu_change))




987


In [9]:
# For either train or test data, read every label CSV in csv_path and combine
# them into one dataframe for the entire data set. Rows without a TREE label
# are discarded, so only labelled samples remain.

# Imagery-metadata columns that are dropped whenever a CSV contains them.
drop_columns = ['IMAGERY_TITLE', 'STACKINGPROFILEDG',
                'PL_PLOTID', 'IMAGERYYEARDG',
                'IMAGERYMONTHPLANET', 'IMAGERYYEARPLANET',
                'IMAGERYDATESECUREWATCH',
                'IMAGERYENDDATESECUREWATCH',
                'IMAGERYFEATUREPROFILESECUREWATCH',
                'IMAGERYSTARTDATESECUREWATCH',
                'IMAGERY_ATTRIBUTIONS',
                'SAMPLE_GEOM']

dfs = []
for i in os.listdir(csv_path):
    # Match on the extension: a substring test would also accept names such
    # as "labels.csv.bak".
    if not i.endswith(".csv"):
        continue
    #if any([x in i for x in region]):
    print(i)
    df = pd.read_csv(csv_path + i, encoding = "ISO-8859-1")
    # Column names are upper-cased so the same names work for every CSV.
    df.columns = [x.upper() for x in df.columns]
    # errors="ignore" skips the columns a given CSV does not have.
    df = df.drop(columns = drop_columns, errors = "ignore")
    # The file name up to its first "." is recorded as the row's country.
    df['country'] = i.split(".")[0]
    dfs.append(df)

if not dfs:
    raise FileNotFoundError(f"No .csv files found in {csv_path}")

df = pd.concat(dfs, ignore_index = True)
df = df[~pd.isna(df['TREE'])]

# Sorted unique plot IDs across all CSVs; plot_ids_loaded starts as the same list.
plot_ids = sorted(df['PLOT_ID'].unique())
plot_ids_loaded = plot_ids

africaoceana-test.csv
india-test.csv
lac-north-test.csv
cameroon-test.csv
europe-test.csv
australia-test.csv
eurasia-test.csv
americas-test-random.csv
ghana-test.csv
kenya-planet-test.csv
global-test.csv
kenya-test.csv
lac-south-test.csv
ethiopia-test.csv
lac-brazil-test.csv
ghana-test-large.csv
africaeast-test.csv
africawest-test.csv


In [10]:
# Display the number of unique plot IDs found in the label CSVs.
len(plot_ids)

1189

In [None]:
from scipy.ndimage import median_filter


def _slope_layer(dem_file):
    '''Load one DEM file and convert it to a (1, 24, 24) slope layer.

       The stored array must hold 34 * 34 values for the reshapes to succeed.
       calcSlope is not defined in this notebook; presumably it comes from the
       slope.py script run in an earlier cell -- confirm.

        Parameters:
          dem_file (str): path of the .npy DEM file

         Returns:
          slope (arr): array of shape (1, 24, 24)
    '''
    dem = np.load(dem_file)
    dem = median_filter(dem, size = 5)
    dem = calcSlope(dem.reshape((1, 32+2, 32+2)),
                    np.full((32+2, 32+2), 10),
                    np.full((32+2, 32+2), 10),
                    zScale = 1, minSlope = 0.02)
    # NOTE(review): dividing by 90 looks like scaling degrees to [0, 1] -- confirm.
    dem = dem / 90
    dem = dem.reshape((32+2, 32+2, 1))
    # Trim the 1-pixel border, smooth again, then crop 4 pixels per side: 34 -> 32 -> 24.
    dem = dem[1:-1, 1:-1]
    dem = median_filter(dem, 5)[4:-4, 4:-4]
    return dem.reshape((1, 24, 24))


# Initiate empty lists to store the X and Y data in
data_x, data_y, lengths = [], [], []

# country -> [index of its first loaded plot, index of its last loaded plot]
countries = {}
count = 0
to_remove = []
plot_ids_loaded = []
pl_plot_ids_loaded = []
sentinel_1_len = []

# One record per loaded plot. The first record is a placeholder row that the
# saving cell removes again with a drop of index 0.
plot_records = [{'plot_id': '', 'lat': 0.325, 'long': 0.325}]

# Build the exclusion set once instead of scanning the whole collection per plot.
excluded_ids = set(np.asarray(verified_lu_change).ravel().tolist())

# Iterate over each plot
for plot_id in tqdm_notebook(plot_ids):
    plot_name = str(plot_id)
    s1_i = f'{s1_path}{plot_name}.npy'
    s2_i = f'{s2_path}{plot_name}.npy'
    dem_i = f'{dem_path}{plot_name}.npy'

    # Plots lacking either Sentinel file, or listed as excluded, are skipped.
    # A missing DEM file is not skipped: np.load raises for it.
    if not (os.path.isfile(s2_i) and os.path.isfile(s1_i)):
        continue
    if plot_id in excluded_ids:
        continue

    # Label rows of this plot, filtered once and reused below.
    plot_rows = df[df['PLOT_ID'] == plot_id]

    country = str(plot_rows['country'].unique()[0])
    if country not in countries:
        countries[country] = [count, count]
    countries[country][1] = count

    x = np.load(s2_i)
    s1 = np.load(s1_i)
    # Append the median over axis 0 as one extra entry along that axis.
    s1_median = np.median(s1, axis = 0)
    s1 = np.concatenate([s1, s1_median[np.newaxis]], axis = 0)

    # Repeat the slope layer along axis 0 of x and write it into index 10 of
    # the last axis, overwriting whatever was stored there.
    x[..., 10] = np.tile(_slope_layer(dem_i), (x.shape[0], 1, 1))
    x = np.concatenate([x, s1], axis = -1)
    count += 1

    y = reconstruct_images(plot_id)
    plot_records.append({'plot_id': plot_name,
                         'lat': np.mean(plot_rows['LAT']),
                         'long': np.mean(plot_rows['LON'])})
    plot_ids_loaded.append(plot_name)
    lengths.append(x.shape[0])
    data_x.append(x)
    data_y.append(y)
print("Finished data loading")

# Built once from the records; DataFrame.append no longer exists in pandas 2.x
# and copied the whole frame on every call.
dataframe = pd.DataFrame(plot_records, columns = ['plot_id', 'lat', 'long'])

data_x = np.stack(data_x)
data_x = np.float32(data_x)
data_y = np.stack(data_y)
lengths = np.stack(lengths)
print(f"The data shape is: {data_x.shape}")

HBox(children=(IntProgress(value=0, max=1189), HTML(value='')))

In [None]:
import hickle as hkl

# Remove the placeholder first row that the loading cell seeds the dataframe
# with, then renumber the index. `index=0` is passed by keyword because
# pandas 2.x no longer accepts the axis as a positional argument to drop().
# NOTE: re-running this cell drops one more (real) row each time.
dataframe = dataframe.drop(index = 0)
dataframe.reset_index(inplace = True, drop = True)

# Write the stacked arrays as gzip-compressed hickle files and the plot
# ID / location table as CSV, all named after the data split in `source`.
print(f"Writing {source} data")
hkl.dump(data_x, f"../tile_data/processed/{source}_x.hkl", mode='w', compression='gzip')
hkl.dump(data_y, f"../tile_data/processed/{source}_y.hkl", mode='w', compression='gzip')
hkl.dump(lengths, f"../tile_data/processed/{source}_l.hkl", mode='w', compression='gzip')
dataframe.to_csv(f"../tile_data/processed/{source}_plot_ids.csv", index = False)