# Load, preprocess, and save train and test data
# John Brandt
# April 1, 2020

- Fuse Sentinel 1/2 data
- Reconstruct 2D-array from CEO output CSV by plot
- Match sentinel data to CEO labels
- Stack data_x, data_y, length
- Save numpy arrays for data_x, data_y, length

The notebook additionally contains some development code for:
- Parameter selection in whittaker smoothing
- Graphing plot locations on map

# Package imports and source code

In [1]:
#TODO Remove imports that aren't needed to save RAM
from tqdm import tqdm_notebook, tnrange

import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools

os.environ['KMP_DUPLICATE_LIB_OK']='True'

#!source ~/.bash_profile
#!export PATH="/usr/lib/google-cloud-sdk/bin:$PATH"

%run ../src/utils/utils.py
%run ../src/utils/slope.py

Instructions for updating:
Colocations handled automatically by placer.


In [2]:
def reconstruct_images(plot_id):
    '''Takes a plot ID and subsets the input pd.DataFrame to that plot ID
       returns a (14, 14) array-like list with binary labels
       
        Parameters:
          batch_ids (list):
          batch_size (int):
          
         Returns:
          x_batch (arr):
          y_batch (arr):
    '''
    subs = df[df['PLOT_ID'] == plot_id]
    rows = []
    lats = reversed(sorted(subs['LAT'].unique()))
    for i, val in enumerate(lats):
        subs_lat = subs[subs['LAT'] == val]
        subs_lat = subs_lat.sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [3]:
source = 'test'
sentinel_1 = True
s2_path = "../data/{}-s2/".format(source)
s1_path = "../data/{}-s1/".format("old")
csv_path = "../data/{}-csv/".format(source)
output_path = "../data/{}-processed/".format(source)

#s2_path = "../data/drylands/s2/"
#s1_path = "../data/drylands/s1/"
#csv_path = "../data/drylands/csv/"

#s2_path = "../data/project-val/sentinel-2/"
#s1_path = "../data/project-val/sentinel-1/"
#csv_path = "../data/project-val/"

In [4]:
# Regional models
east_africa = ['ethiopia', 'africa-east', 'hyperarid', 'kenya',
               'malawi', 'rwanda', 'subplot', 'sudan', 'africaeast',] # 1442, 236

west_africa = ['africa-west', 'ghana', 'niger', 'koure', 
               'hyperarid', 'africawest', 'cameroon'] # 1685, 171

asia = ['australia', 'asia', 'india', 'global'] # 471, 346

americas = ['brazil', 'elsalvador', 'honduras', 
            'lac', 'mexico', 'sa-', 'america'] # 976, 218

africa = east_africa + west_africa

malawi = ['malawi', 'africaeast']
nigercameroon = ['niger', 'koure', 'cameroon']

globe = [x for x in os.listdir(csv_path)]

region = globe

In [5]:
# For either train or test data, loop through each plot and determine whether there is
# labelled Y data for it -- returning one dataframe for the entire data set

dfs = []
for i in os.listdir(csv_path):
    if ".csv" in i:
        if any([x in i for x in region]):

            print(i)
            df = pd.read_csv(csv_path + i).drop('IMAGERY_TITLE', axis = 1)
            df['country'] = i.split(".")[0]
            dfs.append(df)

for i in range(len(dfs)):
    if "PL_PLOTID" not in dfs[i].columns:
        dfs[i]['PL_PLOTID'] = 0
            #dfs[i] = dfs[i].drop("PL_PLOTID", axis = 1)
    if 'STACKINGPROFILEDG' in dfs[i].columns:
        dfs[i] = dfs[i].drop('STACKINGPROFILEDG', axis = 1)
    if 'IMAGERYYEARDG' in dfs[i].columns:
        dfs[i] = dfs[i].drop('IMAGERYYEARDG', axis = 1)

df = pd.concat(dfs, ignore_index = True)
df = df.dropna(axis = 0)

existing = [int(x[:-4]) for x in os.listdir(s2_path) if ".DS" not in x]

df = df[df['PLOT_ID'].isin(existing)]
plot_ids = sorted(df['PLOT_ID'].unique())

africaoceana-test.csv
india-test.csv
lac-north-test.csv
cameroon-test.csv
europe-test.csv
australia-test.csv
eurasia-test.csv
americas-test-random.csv
ghana-test.csv
global-test.csv
kenya-test.csv
lac-south-test.csv
ethiopia-test.csv
ghana-test-large.csv
africaeast-test.csv
africawest-test.csv


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [6]:
# Initiate empty lists to store the X and Y data in
data_x, data_y, lengths = [], [], []
    
    
countries = {}
count = 0
to_remove = []
plot_ids_loaded = []
pl_plot_ids_loaded = []
# Iterate over each plot
for i in tnrange(len(plot_ids)):
    skip = True if sentinel_1 else False
    # Load the sentinel imagery
    if (str(plot_ids[i]) + ".npy")in os.listdir(s2_path):
        if plot_ids[i] not in [136077593, 136776935, 136776898]:
            country = df[df['PLOT_ID'] == plot_ids[i]]['country'].unique()
            if str(country[0]) not in countries.keys():
                countries[str(country[0])] = [count, count]
            countries[str(country[0])][1] = count
            x = np.load(s2_path + str(plot_ids[i]) + ".npy")
            if sentinel_1 and os.path.isfile(s1_path + str(plot_ids[i]) + ".npy"):
                skip = False
                s1 = np.load(s1_path + str(plot_ids[i]) + ".npy")
                print(np.max(s1))
                x = np.concatenate([x, s1], axis = -1)
            count += 1
        y = reconstruct_images(plot_ids[i])
        if not skip:
            plot_ids_loaded.append(plot_ids[i])
            lengths.append(x.shape[0])
            data_x.append(x)
            data_y.append(y)
print("Finished data loading")

data_x = np.stack(data_x)
data_y = np.stack(data_y)
lengths = np.stack(lengths)

HBox(children=(IntProgress(value=0, max=480), HTML(value='')))

0.6349875
0.8291
0.6322
1.6892
0.79295
0.7629
0.9239
0.5698
0.82225
0.5364
0.3167
0.9208
0.6715
0.7271
0.510975
0.92745
1.5934626
1.82055
0.894
0.8344
1.0317
1.18005
0.67385
0.7927
0.923425
0.88825
1.3549
1.258225
0.6253
1.2297499
6.5535
1.3349
2.18895
0.88205
0.73209995
0.7169
0.629575
1.1153499
1.6402
0.7622
0.7232
0.73885
1.3936499
2.0568
0.773575
0.663
1.0672
0.57565
0.62037504
0.65197504
0.9572
0.54684997
0.49385
0.23664999
0.5805
1.809
1.9339001
1.1348
0.8991
0.7712
0.69565
1.165675
0.6784
2.165
1.333125
1.8530501
0.7142
3.3221
0.94079995
0.67114997
0.7589
0.4705
1.1215
0.66595
0.9469
0.7741
0.90825
5.4129
0.418425
1.8788002
0.6663
1.5507
0.696375
3.33455
0.57755
0.8585
4.064525
1.8556
0.8429
0.59985
0.9392
1.1213
0.568
0.63215
0.57455003
0.691025
1.277575
0.4731
0.5301
1.5206
0.6839
0.64769995
0.547
0.5494
0.953975
1.3421
0.7002
0.57140005
0.92480004
2.373525
0.5321
0.47025
0.67497504
0.70085
0.64575005
0.44325
0.293025
0.64545
0.4376
0.59525
1.3382
0.594325
0.60275
0.5411
0.388

In [7]:
print(data_x.shape)

(477, 24, 16, 16, 17)


In [8]:
countries

{'india-test': [0, 50],
 'lac-north-test': [51, 91],
 'cameroon-test': [92, 130],
 'americas-test-random': [131, 181],
 'eurasia-test': [182, 281],
 'africaoceana-test': [282, 347],
 'australia-test': [348, 397],
 'europe-test': [398, 477]}

In [9]:
if source == 'train':
    np.save("../tile_data/processed/data_x_l2a_processed.npy", data_x)
    np.save("../tile_data/processed/data_y_l2a_processed.npy", np.array(data_y))
    np.save("../tile_data/processed/length_l2a_processed.npy", np.array(lengths))
if source == 'test' or source == "project":
    print("Writing test data")
    np.save("../tile_data/processed/test_x_l2a_processed.npy", data_x)
    np.save("../tile_data/processed/test_y_l2a_processed.npy", data_y)
    np.save("../tile_data/processed/test_length_l2a_processed.npy", lengths)

Writing test data


# Development code

## Parameter evaluation for lambda in whittaker smoother

In [107]:
def mask_segments(arr, percent):
    '''Docstring
    
         Parameters:
          arr (arr):
          percent (float):
          
         Returns:
          masked (arr):
    '''
    masked = np.copy(arr)
    for sample in range(masked.shape[0]):
        start = random.sample(range(72 - int(72*(percent))), 1)[0]
        length = int(72*(percent))
        
        masked[sample, start:start+length] = np.tile(
            (masked[sample, start] + masked[sample, start+length])/2,
            (1, length, 1, 1, 1))
        
    return masked

def apply_smoothing(arr, lmb):
    '''Docstring
    
         Parameters:
          arr (arr):
          lmb (float):
          
         Returns:
          smoothed (arr):
    '''
    smoothed = np.copy(arr)
    for sample in tnrange(smoothed.shape[0]):
        for row in range(0, 16):
            for column in range(0, 16):
                for band in [x for x in range(0, 15) if x != 10]:
                    sm = smooth(smoothed[sample, :, row, column, band], lmb, d = 2)
                    smoothed[sample, :, row, column, band] = sm
    return smoothed

#for lmb in [1, 20, 50, 100, 500, 750]:
#    for percent in [20, 30, 40, 50]:
#        masked = mask_segments(data_x, percent/100)
#        smoothed = apply_smoothing(masked, lmb)
#        mse = np.mean(abs(smoothed - data_x))
#        #mse = calculate_mse(smoothed, data_x)
#        print("{}% masked data, {} lambda: {} error".format(percent, lmb, mse))

## Extraction of lat and longs for training / testing data

In [None]:
lats = []
longs = []
plot_ids = []
df = pd.read_csv("../data/science-2017-test.csv")
df = df.sample(frac=1, random_state = 5)
df['PLOT_ID'] = range(0, len(df), 1)
df['group'] = df['tree_cover'] * 100
print(np.unique(df['group']))
df['group'] = df['group'].astype(int)
print(np.unique(df['group']))
for i in range(0, len(df), 100):
    lat = df['location_y'][i]
    lon = df['location_x'][i]
    lats.append(lat)
    longs.append(lon)
    plot_ids.append(i)

In [10]:
lats = []
longs = []
for i in plot_ids_loaded:
    subs = df[df['PLOT_ID'] == i]
    lat = float(subs['LAT'].head(1))
    lon = float(subs['LON'].head(1))
   # print(subs['LAT'])
    #if lon > region[0] and lon < region[1]:
        #if lat > region[2] and lat < region[3]:
    lats.append(lat)
    longs.append(lon)
    
lats = np.array(lats)
longs = np.array(longs)
    
BBox = ((longs.min(),   longs.max(),      
         lats.min(), lats.max()))

print(BBox)
print(lats.shape)

(-123.7612999908743, 156.998167669016, -54.904626104679984, 67.53857583112786)
(1007,)


In [11]:
#np.save("../data/metrics/plotids.npy", plot_ids_loaded)
testing_latlongs = pd.DataFrame({'lats':lats,'longs':longs, 'id':plot_ids_loaded})
testing_latlongs.to_csv("../data/drylands/latlong.csv", index = False)