# Load, preprocess, and save train and test data
# John Brandt
# April 1, 2020

- Fuse Sentinel 1/2 data
- Reconstruct 2D-array from CEO output CSV by plot
- Match Sentinel data to CEO labels
- Stack data_x, data_y, length
- Save numpy arrays for data_x, data_y, length

The notebook additionally contains some development code for:
- Parameter selection for Whittaker smoothing
- Graphing plot locations on map

# Package imports and source code

In [1]:
from tqdm import tqdm_notebook, tnrange

import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools

os.environ['KMP_DUPLICATE_LIB_OK']='True'

#!source ~/.bash_profile
#!export PATH="/usr/lib/google-cloud-sdk/bin:$PATH"

In [2]:
def reconstruct_images(plot_id, data=None):
    '''Rebuild the 2D label grid of one plot from its point-level rows.

       Rows whose 'PLOT_ID' equals plot_id are grouped by latitude. Grid rows
       run from the largest latitude to the smallest, and inside each grid
       row the labels are ordered by increasing longitude.

        Parameters:
          plot_id: value matched against the 'PLOT_ID' column
          data (pd.DataFrame): frame with 'PLOT_ID', 'LAT', 'LON' and 'TREE'
                               columns; when None the notebook-level df is used

         Returns:
          rows (list): list of lists holding the 'TREE' values, one inner
                       list per distinct latitude; empty if no row matches
    '''
    if data is None:
        # Fall back to the notebook-level frame so existing calls keep working
        data = df
    subs = data[data['PLOT_ID'] == plot_id]
    rows = []
    # Largest latitude first, so the first grid row is the northernmost one
    for lat in sorted(subs['LAT'].unique(), reverse = True):
        subs_lat = subs[subs['LAT'] == lat].sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [3]:
# Dataset split to process; every data directory below is derived from it so
# that changing this one value cannot mix imagery and labels of different splits
source = 'train'
# When True, Sentinel-1 arrays are appended to the Sentinel-2 arrays on load
sentinel_1 = True
s2_path = "../data/{}-s2/".format(source)
s1_path = "../data/{}-s1/".format(source)
csv_path = "../data/{}-csv/".format(source)
output_path = "../data/{}-processed/".format(source)

#s2_path = "../data/drylands/s2/"
#s1_path = "../data/drylands/s1/"
#csv_path = "../data/drylands/csv/"

#s2_path = "../data/project-val/sentinel-2/"
#s1_path = "../data/project-val/sentinel-1/"
#csv_path = "../data/project-val/"

In [4]:
# Regional models: each list holds substrings that are matched against the
# label CSV file names to decide which files belong to a region
east_africa = [
    'ethiopia', 'africa-east', 'hyperarid', 'kenya', 'malawi',
    'rwanda', 'subplot', 'sudan', 'africaeast',
]

west_africa = [
    'africa-west', 'ghana', 'niger', 'koure',
    'hyperarid', 'africawest', 'cameroon',
]

asia = ['australia', 'asia', 'india', 'global']

americas = [
    'brazil', 'elsalvador', 'honduras',
    'lac', 'mexico', 'sa-', 'america',
]

# Continent-level list is the two African sub-regions back to back
africa = [*east_africa, *west_africa]

malawi = ['malawi', 'africaeast']
nigercameroon = ['niger', 'koure', 'cameroon']

# Every entry of the CSV directory, i.e. no regional filtering at all
globe = list(os.listdir(csv_path))

# Region used by the loading cell
region = globe

In [5]:
# For either train or test data, loop through each label CSV of the selected
# region and return one dataframe for the entire data set, restricted to the
# plots that have Sentinel-2 imagery on disk

# Columns that only some CSV exports carry; dropped so all frames line up
optional_columns = ['STACKINGPROFILEDG', 'IMAGERYYEARDG']

dfs = []
for csv_name in os.listdir(csv_path):
    # endswith() rather than a substring test, so e.g. "x.csv.bak" is ignored
    if not csv_name.endswith(".csv"):
        continue
    if not any(pattern in csv_name for pattern in region):
        continue
    print(csv_name)
    frame = pd.read_csv(csv_path + csv_name).drop('IMAGERY_TITLE', axis = 1)
    frame = frame.drop(columns = optional_columns, errors = 'ignore')
    # The file name up to its first "." identifies where the plot came from
    frame['country'] = csv_name.split(".")[0]
    if "PL_PLOTID" not in frame.columns:
        frame['PL_PLOTID'] = 0
    dfs.append(frame)

# sort is passed explicitly: older pandas emits a FutureWarning when it is
# left out, and columns are only ever accessed by name afterwards
df = pd.concat(dfs, ignore_index = True, sort = False)
df = df.dropna(axis = 0)

# IDs of the plots with a Sentinel-2 array on disk; anything that is not a
# "<number>.npy" file is skipped instead of crashing int()
existing = [int(name[:-4]) for name in os.listdir(s2_path)
            if name.endswith(".npy") and name[:-4].isdigit()]

df = df[df['PLOT_ID'].isin(existing)]
plot_ids = sorted(df['PLOT_ID'].unique())

ghana-kwofu-train.csv
kenya-makueni-train.csv
cameroon-finetune-3.csv
lac-south-train.csv
india-sidhi-train.csv
cameroon-finetune-2.csv
koure-finetune.csv
ghana-ashanti-train-2.csv
africa-east-train.csv
mexico-campeche-train.csv
ghana-farm-train.csv
sa-train.csv
europe-sw-asia-train.csv
ghana-south-train.csv
centralasia-train.csv
honduras-2-train.csv
cameroonnigerghana-train.csv
brazil-paraiba-train.csv
lac-north-train.csv
malawi-rumphi-train.csv
ghana-ashanti-train-small.csv
ghana-train.csv
kenya-train.csv
india-train.csv
malawi-rumphi-small.csv
honduras-train.csv
rwanda-train.csv
cameroon-train.csv
hyperarid-train.csv
subplot4.csv
kenya-farm-train-2.csv
elsalvador-train.csv
india-kochi-train.csv
subplot.csv
southamerica-train.csv
sudan-train.csv
niger-train.csv
subplot2.csv
subplot3.csv
northamerica-train.csv
ghana-ashanti-train.csv
ghana-upperwest-train.csv
malawi-train.csv
honduras-train-north.csv
africa-west-train.csv
ghana-kwofu-large.csv
lac-train.csv
ghana-mid-train.csv
ghana-k

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [6]:
# Initiate empty lists to store the X and Y data in
data_x, data_y, lengths = [], [], []

# countries maps a country name to [first, last] index into data_x
countries = {}
count = 0
to_remove = []
plot_ids_loaded = []
pl_plot_ids_loaded = []

# Plots that are always left out of the processed data
excluded_plot_ids = {136077593, 136776935, 136776898}
# List the imagery directory once instead of once per plot
s2_files = set(os.listdir(s2_path))

# Iterate over each plot
for i in tnrange(len(plot_ids)):
    plot_id = plot_ids[i]
    file_name = str(plot_id) + ".npy"
    if file_name not in s2_files or plot_id in excluded_plot_ids:
        continue
    s1_file = s1_path + file_name
    # With Sentinel-1 fusion enabled, a plot without a Sentinel-1 array is
    # dropped entirely
    if sentinel_1 and not os.path.isfile(s1_file):
        continue

    # Load the sentinel imagery
    x = np.load(s2_path + file_name)
    if sentinel_1:
        x = np.concatenate([x, np.load(s1_file)], axis = -1)
    y = reconstruct_images(plot_id)

    # The counter only advances for plots that are actually stored, so the
    # ranges in countries stay aligned with the positions in data_x
    country = str(df[df['PLOT_ID'] == plot_id]['country'].unique()[0])
    if country not in countries:
        countries[country] = [count, count]
    countries[country][1] = count
    count += 1

    plot_ids_loaded.append(plot_id)
    lengths.append(x.shape[0])
    data_x.append(x)
    data_y.append(y)
print("Finished data loading")

data_x = np.stack(data_x)
data_y = np.stack(data_y)
lengths = np.stack(lengths)

HBox(children=(IntProgress(value=0, max=4395), HTML(value='')))


Finished data loading


In [7]:
# Append a second set of samples (data_x2, data_y2, lengths_2) to the arrays
# built by the loading cell.
# NOTE(review): data_x2, data_y2 and lengths_2 are not defined anywhere in the
# visible notebook and the recorded output of this cell is a NameError --
# confirm where they are meant to come from, or remove the cell.
data_x = np.concatenate([data_x, data_x2])
data_y = np.concatenate([data_y, data_y2])
lengths = np.concatenate([lengths, lengths_2])

NameError: name 'data_x2' is not defined

In [18]:
# Sanity check: show the dimensions of the stacked feature array
print(data_x.shape)

(1649, 24, 16, 16, 17)


In [19]:
# Show the [first, last] counter values recorded for each country file
countries

{'brazil-paraiba-train': [0, 75],
 'mexico-campeche-train': [76, 163],
 'elsalvador-train': [164, 236],
 'sa-train': [237, 270],
 'lac-train': [271, 324],
 'honduras-train': [325, 425],
 'honduras-train-north': [426, 470],
 'honduras-2-train': [471, 578],
 'lac-south-train': [579, 696],
 'lac-north-train': [697, 717],
 'southamerica-train': [718, 823]}

In [9]:
# Write the three arrays to disk; the file names depend on the value of source
if source == 'train':
    train_outputs = (
        ("../tile_data/processed/data_x_l2a_processed.npy", data_x),
        ("../tile_data/processed/data_y_l2a_processed.npy", np.array(data_y)),
        ("../tile_data/processed/length_l2a_processed.npy", np.array(lengths)),
    )
    for target, values in train_outputs:
        np.save(target, values)
elif source in ('test', "project"):
    print("Writing test data")
    test_outputs = (
        ("../tile_data/processed/test_x_l2a_processed.npy", data_x),
        ("../tile_data/processed/test_y_l2a_processed.npy", data_y),
        ("../tile_data/processed/test_length_l2a_processed.npy", lengths),
    )
    for target, values in test_outputs:
        np.save(target, values)

## Extraction of latitudes and longitudes for training / testing data

In [None]:
# Shuffle the reference points, give them sequential plot IDs and keep the
# coordinates of every 100th point
df = pd.read_csv("../data/science-2017-test.csv")
df = df.sample(frac=1, random_state = 5)
df['PLOT_ID'] = range(0, len(df), 1)
df['group'] = df['tree_cover'] * 100
print(np.unique(df['group']))
df['group'] = df['group'].astype(int)
print(np.unique(df['group']))

# sample() keeps the original index labels, so the rows have to be taken by
# position (iloc). A label lookup such as df['location_y'][i] would return the
# row whose old label is i, which is not the row that now has PLOT_ID == i.
every_100th = df.iloc[::100]
lats = every_100th['location_y'].tolist()
longs = every_100th['location_x'].tolist()
plot_ids = every_100th['PLOT_ID'].tolist()

In [8]:
# Coordinates of every loaded plot, taken from the first row of each plot.
# drop_duplicates keeps the first occurrence per PLOT_ID, so one indexed
# lookup replaces a full scan of df for every plot and avoids calling float()
# on a Series, which recent pandas versions deprecate.
first_rows = df.drop_duplicates(subset = 'PLOT_ID').set_index('PLOT_ID')
located = first_rows.loc[plot_ids_loaded, ['LAT', 'LON']]

lats = np.asarray(located['LAT'], dtype = float)
longs = np.asarray(located['LON'], dtype = float)

# Bounding box as (min lon, max lon, min lat, max lat)
BBox = ((longs.min(),   longs.max(),
         lats.min(), lats.max()))

print(BBox)
print(lats.shape)

(-121.31806300493471, 156.60271959506528, -54.427197440476576, 59.66233807380722)
(4390,)


In [9]:
#np.save("../data/metrics/plotids.npy", plot_ids_loaded)
# One row per loaded plot: latitude, longitude and plot id, written without
# the dataframe index
latlong_columns = {'lats': lats, 'longs': longs, 'id': plot_ids_loaded}
testing_latlongs = pd.DataFrame(data = latlong_columns)
testing_latlongs.to_csv(path_or_buf = "../data/latlongs/training.csv", index = False)