# Load, preprocess, and save train and test data
# John Brandt
# April 1, 2020

- Fuse Sentinel 1/2 data
- Reconstruct 2D-array from CEO output CSV by plot
- Match sentinel data to CEO labels
- Stack data_x, data_y, length
- Save numpy arrays for data_x, data_y, length

The notebook additionally contains some development code for:
- Parameter selection in whittaker smoothing
- Graphing plot locations on map

# Package imports and source code

In [6]:
from tqdm import tqdm_notebook, tnrange

import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools

os.environ['KMP_DUPLICATE_LIB_OK']='True'

#!source ~/.bash_profile
#!export PATH="/usr/lib/google-cloud-sdk/bin:$PATH"

In [7]:
%run ../src/preprocessing/slope.py

In [8]:
def reconstruct_images(plot_id):
    '''Takes a plot ID and subsets the input pd.DataFrame to that plot ID
       returns a (14, 14) array-like list with binary labels
       
        Parameters:
          batch_ids (list):
          batch_size (int):
          
         Returns:
          x_batch (arr):
          y_batch (arr):
    '''
    subs = df[df['PLOT_ID'] == plot_id]
    rows = []
    lats = reversed(sorted(subs['LAT'].unique()))
    for i, val in enumerate(lats):
        subs_lat = subs[subs['LAT'] == val]
        subs_lat = subs_lat.sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [9]:
# Note, the data in data/new-data/train-s2/2019 and 2018/ is bad (missing / NA values)
source = 'train'
sentinel_1 = True
s2_path = "../data/{}-s2-24/".format(source)
s1_path = "../data/{}-s1-new/".format(source)
csv_path = "../data/{}-csv/".format(source)
output_path = "../data/{}-processed/".format(source)
dem_path = "../data/{}-dem/".format(source)

In [10]:
# Regional models
east_africa = ['ethiopia', 'africa-east', 'hyperarid', 'kenya',
               'malawi', 'rwanda', 'subplot', 'sudan', 'africaeast', 'makueni']

west_africa = ['africa-west', 'ghana', 'niger', 'koure', 
               'hyperarid', 'africawest', 'cameroon', 'ethiopia', 'africawest'] 

asia = ['australia', 'asia', 'india', 'global']

americas = ['brazil', 'elsalvador', 'honduras', 
            'lac', 'mexico', 'sa-', 'america']

africa = east_africa + west_africa

malawi = ['malawi', 'africaeast']
nigercameroon = ['niger', 'koure', 'cameroon']

globe = [x for x in os.listdir(csv_path)]

region = east_africa

In [18]:
verified_lu_change = np.load("bad_plot_ids.npy")
len(verified_lu_change)

869

In [19]:
#verified_lu_change = np.concatenate([verified_lu_change, 
#                     np.array([
#                     ])])


In [25]:
to_remove = []

verified_lu_change = [x for x in verified_lu_change if x not in to_remove]
np.save("bad_plot_ids.npy", np.array(verified_lu_change))
print(len(verified_lu_change))




851


851

In [14]:
# For either train or test data, loop through each plot and determine whether there is
# labelled Y data for it -- returning one dataframe for the entire data set

dfs = []
for i in os.listdir(csv_path):
    if ".csv" in i:#".csv" in i:
    #if any([x in i for x in region]):
        print(i)
        df = pd.read_csv(csv_path + i)
        df.columns = [x.upper() for x in df.columns]
        df = df.drop('IMAGERY_TITLE', axis = 1)
        df['country'] = i.split(".")[0]
        dfs.append(df)

for i in range(len(dfs)):
    
    if "PL_PLOTID" not in dfs[i].columns:
        dfs[i]['PL_PLOTID'] = 0
            #dfs[i] = dfs[i].drop("PL_PLOTID", axis = 1)
    if 'STACKINGPROFILEDG' in dfs[i].columns:
        dfs[i] = dfs[i].drop('STACKINGPROFILEDG', axis = 1)
    if 'IMAGERYYEARDG' in dfs[i].columns:
        dfs[i] = dfs[i].drop('IMAGERYYEARDG', axis = 1)
    if 'IMAGERYDATESECUREWATCH' in dfs[i].columns:
        dfs[i] = dfs[i].drop('IMAGERYDATESECUREWATCH', axis = 1)
    if 'IMAGERYENDDATESECUREWATCH' in dfs[i].columns:
        dfs[i] = dfs[i].drop('IMAGERYENDDATESECUREWATCH', axis = 1)
    if 'IMAGERYSTARTDATESECUREWATCH' in dfs[i].columns:
        dfs[i] = dfs[i].drop('IMAGERYSTARTDATESECUREWATCH', axis = 1)
    if 'IMAGERYFEATUREPROFILESECUREWATCH' in dfs[i].columns:
        dfs[i] = dfs[i].drop('IMAGERYFEATUREPROFILESECUREWATCH', axis = 1)

df = pd.concat(dfs, ignore_index = True)
df = df[~pd.isna(df['TREE'])]
#df = df.dropna(axis = 0)

plot_ids = sorted(df['PLOT_ID'].unique())
plot_ids_loaded = plot_ids

ceo-brazil-finetune-sample-data-2020-09-14.csv
ghana-kwofu-train.csv
kenya-makueni-train.csv
cameroon-finetune-3.csv
ceo-makueni-fix-2-sample-data-2020-10-22.csv
ceo-brazil-mid-coast-sample-data-2020-07-24.csv
lac-south-train.csv
india-sidhi-train.csv
cameroon-finetune-2.csv
koure-finetune.csv
kenya-makueni-train-2.csv
ceo-brazil-paraiba-train-sample-data-2020-07-22.csv
kenya-planet.csv
ghana-ashanti-train-2.csv
africa-east-train.csv
mexico-campeche-train.csv
ghana-farm-train.csv
sa-train.csv
europe-sw-asia-train.csv
ghana-south-train.csv
centralasia-train.csv
ceo-lac-random-points-sample-data-2020-07-29.csv
honduras-2-train.csv
cameroonnigerghana-train.csv
ceo-br-gain-4-sample-data-2020-09-29.csv
malawi-rumphi-train.csv
ceo-elsalvador-train-sample-data-2020-07-22.csv
ghana-ashanti-train-small.csv
ghana-train.csv
kenya-train.csv
ceo-brazil-gain-overall-sample-data-2020-10-08.csv
india-train.csv
ceo-brazil-south-small-sample-data-2020-07-23.csv
ceo-makueni-fix-sample-data-2020-10-20.csv

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [15]:
from scipy.ndimage import median_filter


# Initiate empty lists to store the X and Y data in
data_x, data_y, lengths = [], [], []
    
countries = {}
count = 0
to_remove = []
plot_ids_loaded = []
pl_plot_ids_loaded = []
dataframe = pd.DataFrame({'plot_id': [''], 'lat': [0.325], 'long': [0.325]})

sentinel_1_len = []

# Iterate over each plot
for i in tnrange(len(plot_ids)):
    s1_i = f'{s1_path}{str(plot_ids[i])}.npy'
    s2_i = f'{s2_path}{str(plot_ids[i])}.npy'
    dem_i = f'{dem_path}{str(plot_ids[i])}.npy'
    if not os.path.isfile(s1_i):
        s1_i = f'../data/{source}-s1/{str(plot_ids[i])}.npy'
        print(s1_i)
    if (os.path.isfile(s1_i) and os.path.isfile(s2_i)):
        if plot_ids[i] in verified_lu_change:
            country = df[df['PLOT_ID'] == plot_ids[i]]['country'].unique()
            if str(country[0]) not in countries.keys():
                countries[str(country[0])] = [count, count]

            countries[str(country[0])][1] = count
            x = np.load(s2_i)
            #s1 = np.load(s1_i)
            dem = np.load(dem_i)
            dem =  median_filter(dem, size = 5)
            dem = calcSlope(dem.reshape((1, 32+2, 32+2)),
                              np.full((32+2, 32+2), 10),
                              np.full((32+2, 32+2), 10), 
                              zScale = 1, minSlope = 0.02)
            dem = dem / 90
            dem = dem.reshape((32+2, 32+2, 1))
            dem = dem[1:-1, 1:-1]
            dem = median_filter(dem, 5)[4:-4, 4:-4]
            dem = np.tile(dem.reshape((1, 24, 24)), (x.shape[0], 1, 1))
            x[..., 10] = dem
            #x = np.concatenate([x, s1], axis = -1)
            count += 1
            y = reconstruct_images(plot_ids[i])
            long = np.mean(df[df['PLOT_ID'] == plot_ids[i]]['LON'])
            lat = np.mean(df[df['PLOT_ID'] == plot_ids[i]]['LAT'])
            dataframe = dataframe.append({'plot_id': str(plot_ids[i]), 'lat': lat, 'long': long}, ignore_index = True)
            dataframe.append([plot_ids[i], lat, long])
            plot_ids_loaded.append(str(plot_ids[i]))
            lengths.append(x.shape[0])
            data_x.append(x)
            data_y.append(y)
print("Finished data loading")

data_x = np.stack(data_x)
data_x = np.float32(data_x)
data_y = np.stack(data_y)
lengths = np.stack(lengths)

HBox(children=(IntProgress(value=0, max=5475), HTML(value='')))

../data/train-s1/135191148.npy
../data/train-s1/135191205.npy
../data/train-s1/135191344.npy
../data/train-s1/135224709.npy
../data/train-s1/135224761.npy
../data/train-s1/135224765.npy
../data/train-s1/135224902.npy
../data/train-s1/135345907.npy
../data/train-s1/135345984.npy
../data/train-s1/135346007.npy
../data/train-s1/135346010.npy
../data/train-s1/135346079.npy
../data/train-s1/135697778.npy
../data/train-s1/135704110.npy
../data/train-s1/135704122.npy
../data/train-s1/135787860.npy
../data/train-s1/136434311.npy
../data/train-s1/136434496.npy
../data/train-s1/136434797.npy
../data/train-s1/136434834.npy
../data/train-s1/137547503.npy
../data/train-s1/137891144.npy
../data/train-s1/137891961.npy
../data/train-s1/137891972.npy
../data/train-s1/137891973.npy
../data/train-s1/137891976.npy

Finished data loading


In [16]:
import hickle as hkl
if source == 'train':
    hkl.dump(data_x, "../tile_data/processed/train_x.hkl", mode='w', compression='gzip')
    hkl.dump(data_y, "../tile_data/processed/train_y.hkl", mode='w', compression='gzip')
    hkl.dump(lengths, "../tile_data/processed/train_l.hkl", mode='w', compression='gzip')

if source == 'test' or source == "project":
    print("Writing test data")
    hkl.dump(data_x, "../tile_data/processed/test_x.hkl", mode='w', compression='gzip')
    hkl.dump(data_y, "../tile_data/processed/test_y.hkl", mode='w', compression='gzip')
    hkl.dump(lengths, "../tile_data/processed/test_lengths.hkl", mode='w', compression='gzip')

In [17]:
dataframe = dataframe.drop(0, 0)
dataframe.reset_index(inplace = True, drop = True)
dataframe.to_csv(f"../tile_data/processed/{source}_plot_ids.csv", index = False)


## Extraction of lat and longs for training / testing data

In [None]:
lats = []
longs = []
plot_ids = []
df = pd.read_csv("../data/science-2017-test.csv")
df = df.sample(frac=1, random_state = 5)
df['PLOT_ID'] = range(0, len(df), 1)
df['group'] = df['tree_cover'] * 100
print(np.unique(df['group']))
df['group'] = df['group'].astype(int)
print(np.unique(df['group']))
for i in range(0, len(df), 100):
    lat = df['location_y'][i]
    lon = df['location_x'][i]
    lats.append(lat)
    longs.append(lon)
    plot_ids.append(i)

In [132]:
lats = []
longs = []
for i in plot_ids:
    subs = df[df['PLOT_ID'] == i]
    lat = float(subs['LAT'].head(1))
    lon = float(subs['LON'].head(1))
   # print(subs['LAT'])
    #if lon > region[0] and lon < region[1]:
        #if lat > region[2] and lat < region[3]:
    lats.append(lat)
    longs.append(lon)
    
lats = np.array(lats)
longs = np.array(longs)
    
BBox = ((longs.min(),   longs.max(),      
         lats.min(), lats.max()))

print(BBox)
print(lats.shape)

(-121.31806300493471, 156.60271959506528, -54.427197440476576, 59.66233807380722)
(5451,)


In [26]:
#np.save("../data/metrics/plotids.npy", plot_ids_loaded)
testing_latlongs = pd.DataFrame({'lats':lats,'longs':longs, 'id':plot_ids})
testing_latlongs.to_csv("../data/latlongs/training_plots.csv", index = False)

# Validation section (compare with predictions)

In [177]:
#to_remove = [148, 465, 699, 1072, 1299, 610, 707, 778, 1151, 1160 ]
#np.array([val for x, val in enumerate(np.array(plot_ids_loaded).astype(int)) if x in to_remove] )
location = 4855 + 1
dataframe.loc[location]['lat'], dataframe.loc[location]['long']

(-2.081520502573931, 37.9517595996893)

In [None]:
np.array([val for x, val in enumerate(np.array(plot_ids_loaded).astype(int)) if x in to_remove] )

In [20]:
len(dataframe)

4767