# Load, preprocess, and save train and test data
# John Brandt
# April 1, 2020

- Fuse Sentinel 1/2 data
- Reconstruct 2D-array from CEO output CSV by plot
- Match Sentinel data to CEO labels
- Stack data_x, data_y, length
- Save numpy arrays for data_x, data_y, length

The notebook additionally contains some development code for:
- Parameter selection in Whittaker smoothing
- Graphing plot locations on a map

# Package imports and source code

In [1]:
from tqdm import tqdm_notebook, tnrange

import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools

# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort that
# occurs when several imported libraries bundle their own OpenMP -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

#!source ~/.bash_profile
#!export PATH="/usr/lib/google-cloud-sdk/bin:$PATH"

In [2]:
def reconstruct_images(plot_id, data = None):
    '''Rebuild the 2D label grid of one plot from the CEO sample table.

       The rows whose PLOT_ID equals `plot_id` are grouped by LAT, with the
       groups ordered from the highest LAT to the lowest. Inside each group
       the samples are ordered by ascending LON and their TREE values are
       collected into one list.

        Parameters:
          plot_id: value matched against the 'PLOT_ID' column
          data (pd.DataFrame): table with 'PLOT_ID', 'LAT', 'LON' and 'TREE'
                               columns. Defaults to the module-level `df`,
                               which is what the function always used before
                               this parameter existed.

         Returns:
          rows (list of lists): one inner list of TREE labels per unique LAT.
                                The grid size is whatever the data holds
                                (the original note expected (14, 14)); it is
                                not enforced here. Empty list if no row
                                matches `plot_id`.
    '''
    if data is None:
        # Fall back to the notebook-global sample table (looked up at call time)
        data = df
    subs = data[data['PLOT_ID'] == plot_id]
    rows = []
    for lat in sorted(subs['LAT'].unique(), reverse = True):
        subs_lat = subs[subs['LAT'] == lat].sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [3]:
# Note, the data in data/new-data/train-s2/2019 and 2018/ is bad (missing / NA values)
# Data split being processed; it is substituted into every directory name below
source = 'train'
# When True, Sentinel-1 arrays are concatenated onto the Sentinel-2 arrays by the loader
sentinel_1 = True
# Directories for Sentinel-2 arrays, Sentinel-1 arrays, CEO label CSVs and processed output
s2_path, s1_path, csv_path, output_path = (
    "../data/{}-{}/".format(source, suffix)
    for suffix in ("s2", "s1", "csv", "processed")
)

#s2_path = "../data/train-s2/"
#s1_path = "../data/train-s1/"
#csv_path = "../data/train-csv/"

#s2_path = "../data/project-val/sentinel-2/"
#s1_path = "../data/project-val/sentinel-1/"
#csv_path = "../data/project-val/"

In [4]:
# Echo the CSV directory currently configured (notebook cell output)
csv_path

'../data/train-csv/'

In [5]:
# Regional models
# NOTE(review): these entries appear to be matched as substrings of the CSV file
# names (see the disabled `any([x in i for x in region])` filter in the loading
# cell) -- confirm. 'africawest' is listed twice in west_africa, and 'hyperarid'
# and 'ethiopia' occur in both the east and west lists; kept exactly as they were.
east_africa = [
    'ethiopia', 'africa-east', 'hyperarid', 'kenya', 'malawi',
    'rwanda', 'subplot', 'sudan', 'africaeast', 'makueni',
]

west_africa = [
    'africa-west', 'ghana', 'niger', 'koure', 'hyperarid',
    'africawest', 'cameroon', 'ethiopia', 'africawest',
]

asia = ['australia', 'asia', 'india', 'global']

americas = [
    'brazil', 'elsalvador', 'honduras', 'lac',
    'mexico', 'sa-', 'america',
]

# Both African lists, east entries first
africa = [*east_africa, *west_africa]

malawi = ['malawi', 'africaeast']
nigercameroon = ['niger', 'koure', 'cameroon']

# Every entry found in the CSV directory
globe = list(os.listdir(csv_path))

# Region currently selected
region = east_africa

In [6]:
# PLOT_IDs that the data-loading loop excludes: a plot is only loaded when its
# ID is NOT in this list.
# NOTE(review): the name suggests plots with verified land-use change -- confirm.
# Several IDs occur more than once (e.g. 136029514 and 136029689); this is
# harmless for the membership test the list is used for.
verified_lu_change = [135703477, 136075784, 136089231, 136134581, 136134595, 136134787,
       137532501, 137535072, 137535083, 137587676, 135191141, 135191227,
       135191272, 135697462, 135697723, 135702981, 135703022, 135703057,
       135703258, 135703288, 135704195, 135704222, 135704474, 135704510,
       135704558, 135732286, 135780032, 135787072, 135787363, 135787380,
       135787903, 135788087, 135809757, 136029535, 136029610, 136075882,
       136134449, 136134537, 136134558, 136434718, 136434928, 136446295,
       136446356, 136456479, 136456560, 136456588, 136457103, 136457117,
       137517310, 137526963, 137532588, 137532613, 137535055, 137547467,
       137547478, 137547498, 137587717, 135505800, 135505834, 135542550,
       135702454, 135702495, 136077533, 136434987, 136434995, 136435043,
       136435064, 136435124, 136752672, 136752771, 136752779, 136752885,
       136752998, 136776870, 135191300, 135346081, 135654670, 135672801,
       135680048, 135680131, 135680267, 135680293, 135697731, 135697750,
       135698164, 135702833, 135703043, 135703066, 135703082, 135703150,
       135704171, 135704230, 135704400, 135704410, 135704413, 135704433,
       135704546, 135724890, 135724985, 135787455, 135787924, 136029704,
       136089025, 136089169, 136134415, 136134697, 136434418, 136434513,
       136434869, 136446291, 136446450, 136456646, 136457119, 137517272,
       137517323, 137535067, 137535095, 137535131, 137535172, 137547517,
       137547519, 137547530, 137587922, 135703457, 135703793, 135787012,
       135787179, 135787198, 136029499, 136029524, 136029531, 136029573,
       136029600, 136029642, 136029714, 136089012, 136089023, 136089058,
       136089067, 136089088, 136089189, 136089259, 136134540, 137517058,
       137517099, 137517118, 137517361, 137532494, 137532586, 137535140,
       137547513, 137587761, 137588043, 137891126, 136077575, 136077600,
       135703266, 135703558, 137532571, 137535093, 136029514, 136029689,
       136075828, 136089079, 136134497, 136134692, 137532612, 137535181,
       137587843, 136029514, 136029689, 136075828, 136089079, 136134497,
       136134692, 137532612, 137535181, 137587843, 135703186, 135780106,
       135787177, 136134740, 137526991, 137532549, 137588037, 137891125,
       135191140, 135191173, 135191213, 135191220, 135191234, 135191243,
       135191281, 135191304, 135191309, 135191314, 135191346, 135224767,
       135224808, 135345887, 135345895, 135345936, 135346016, 135346019,
       135346079, 135346094, 135542666, 135680238, 135680255, 135697473,
       135697529, 135697602, 135697627, 135697783, 135702872, 135704096,
       135704180, 135704226, 135704450, 135724981, 135787457, 135803842,
       135809743, 135809761, 135809889, 135840870, 135840902, 135841010,
       135841051, 135846908, 135847217, 136029501, 136029726, 136089139,
       136134477, 136134482, 136434825, 136434827, 136434842, 136434893,
       136434901, 136434930, 136456448, 136456966, 137517285, 137527052,
       137891143, 135542391, 135702537, 136752753, 136752804, 136752859,
       137862294, 135680221, 135680224, 135697631, 135697710, 135697726, 
       135698188, 135703084, 135703100, 135703146, 135703155, 135703166,
       135703207, 135704553, 136134617, 136446452, 137517316, 135703204,
       135703850, 135703880, 135703898, 135787029, 136029530, 136029746,
       136075851, 136089034, 136089038, 136089089, 136089184, 136456539,
       136456541, 136456580, 136456620, 136456641, 137517338, 137532624,
       137535081, 137547534, 137588119, 135698162, 136776935, 136776898,
       136134446, 136134502, 136456523, 136456525, 137587839, 136077593,
       135703212, 135703279, 135703321, 135703786, 135787227, 136029538,
       136029562, 136029661, 136029741, 136134418, 136134728, 136456454,
       136456511, 136456606, 137517315, 137526984, 137535050, 137587704,
       137587791, 138004137, 135191372, 135787932, 
       135780008, 135780027, 135787016, 136029515, 136029572, 136075916,
       136089017, 136134405, 136134565, 136456467, 136456478, 136456534,
       136456627, 137517351, 137535135, 137587687, 137891168, 138004139,
       136434478, 135191163, 135191175, 135191200, 135191204, 135191226,
       135224758, 135344169, 135344194, 135345939, 135346084, 135542619,
       135546262, 135672788, 135672798, 135672814, 135702874, 135724903,
       135724913, 135724914, 135724921, 135724963, 135724971, 135732254,
       135505810, 135505833, 135542457, 135774794, 135804085, 135807739,
       136077552, 136434950, 136434972, 136434979, 136435044, 136435058,
       136435101, 136752669, 136752678, 136752697, 136752704, 136752799,
       136752810, 136752840, 136752871, 136752888, 136753001, 136776840,
       136776886, 136776912, 138303010,
       135191321, 135224667, 135224688, 135224789, 135542625, 135542643,
       135542645, 135546200, 135546334, 135546394, 135680098, 135680234,
       135680249, 135697759, 135697808, 135702935, 135703272, 135704149,
       135724980, 135732331, 135803942, 135803966, 135809843, 135809846,
       135809850, 135809875, 136075843, 136075848, 136089043, 136134424,
       136134510, 136134664, 136434747, 136434760, 136434766, 136434783,
       136434839, 136434939, 136446443, 136456574, 136456956, 136456971,
       136457106, 137527044, 137587751, 137587924, 137891150,
       136077629, 136435080, 136435099, 136435119, 136752808, 136776871,
       135191267, 135224653, 135546181, 135654752, 135680088, 135680623,
       135680717, 135680766, 135680806, 135697572, 135697577, 135697640,
       135697802, 135702922, 135703075, 135703474, 135704042, 135704455,
       135724909, 135724986, 135738208, 135787902, 135788017, 135788021,
       135788066, 135809815, 135841107, 135846794, 136029534, 136029742,
       136075801, 136075846, 136434567, 136434765, 136434868, 136446359,
       136446425, 136457074, 138076925, 138302769,  135224766,
       135191148, 135191156, 135191312, 135191336, 135191385, 135224703,
         135345854, 135345883, 135345894, 135542632, 135542658,
         135546227, 135546231, 135546355, 135345987, 135546386, 135546400, 
         135654720, 135680052, 135680167, 135680220, 135680294, 135680715,
         135680768, 135680779, 135680966, 135697463, 135697497,
         135697647, 135697746, 135697778, 135702895, 135703795, 135703817,
         135704041, 135787421, 135787861, 135809830, 135809888, 136134475,
         136134553, 136134754, 136434534, 136434769, 136434781, 136446370,
         136446438, 136457133, 137517233, 137517308, 137517356, 137532561,
         137547535, 137587699, 137588045, 137891110,  137891145, 138152213,
         138302791
       ]

# Second list of PLOT_IDs; in the loader it is only referenced from a
# commented-out `+ gain`, so it currently has no effect on which plots load.
# NOTE(review): 13789118 has 8 digits while every other ID has 9 -- possibly a
# truncated ID; value kept as-is, confirm against the source data.
gain = [
    137891975, 137966730, 137966725, 137966729,
    137587698, 137587772, 137587784, 137587786,
    137891127, 137891978, 137966723, 137966724,
    137966731, 137966732, 137966726, 137966727,
    137966728, 137891964, 137891961, 137891968,
    13789118, 138004137, 137966721, 137966722,
]

In [7]:
# For either train or test data, loop through each plot and determine whether there is
# labelled Y data for it -- returning one dataframe for the entire data set

# Columns removed from every CSV whenever they are present
unused_columns = [
    'STACKINGPROFILEDG',
    'IMAGERYYEARDG',
    'IMAGERYDATESECUREWATCH',
    'IMAGERYENDDATESECUREWATCH',
    'IMAGERYSTARTDATESECUREWATCH',
    'IMAGERYFEATUREPROFILESECUREWATCH',
]

dfs = []
for csv_name in os.listdir(csv_path):
    # Only files whose name contains "salv" are loaded at the moment; the
    # region-based filter is disabled:
    #   if any([x in csv_name for x in region]):
    if "salv" not in csv_name:
        continue
    print(csv_name)
    # IMAGERY_TITLE is dropped unconditionally, so a CSV without it raises KeyError
    frame = pd.read_csv(csv_path + csv_name).drop('IMAGERY_TITLE', axis = 1)
    # The file name up to its first "." is recorded as the 'country' of every row
    frame['country'] = csv_name.split(".")[0]
    # Files lacking PL_PLOTID get the column filled with 0
    if "PL_PLOTID" not in frame.columns:
        frame['PL_PLOTID'] = 0
    # errors='ignore' skips the listed columns a given file does not contain
    frame = frame.drop(columns = unused_columns, errors = 'ignore')
    dfs.append(frame)

if not dfs:
    # pd.concat would raise a ValueError as well; this message names the directory
    raise ValueError("No matching CSV files found in " + csv_path)

df = pd.concat(dfs, ignore_index = True)
# Keep only samples that carry a TREE label
df = df[~pd.isna(df['TREE'])]
#df = df.dropna(axis = 0)

#existing = [int(x[:-4]) for x in os.listdir(s2_path) if ".DS" not in x]

#df = df[df['PLOT_ID'].isin(existing)]
plot_ids = sorted(df['PLOT_ID'].unique())
plot_ids_loaded = plot_ids

ceo-elsalvador-train-sample-data-2020-07-22.csv
ceo-el-salvador-train-sample-data-2020-07-29.csv


In [8]:
# Initiate empty lists to store the X and Y data in
data_x, data_y, lengths = [], [], []

countries = {}
count = 0
to_remove = []
plot_ids_loaded = []
pl_plot_ids_loaded = []
# Row 0 is a placeholder (empty plot_id, lat = long = 0.325). It is kept so the
# row labels of `dataframe` stay offset by one, exactly as they were before.
latlong_rows = [{'plot_id': '', 'lat': 0.325, 'long': 0.325}]

# Built once: a single directory listing and O(1) membership tests, instead of
# re-listing the directory and scanning the exclusion list for every plot
s2_files = set(os.listdir(s2_path))
excluded_plots = set(verified_lu_change)  # + gain

# Iterate over each plot
for plot_id in tqdm_notebook(plot_ids):
    npy_name = str(plot_id) + ".npy"
    # Plots without Sentinel-2 data and excluded plots are skipped outright, so
    # an array left over from a previous iteration can never be appended
    if npy_name not in s2_files or plot_id in excluded_plots:
        continue

    plot_rows = df[df['PLOT_ID'] == plot_id]
    country = str(plot_rows['country'].unique()[0])
    # countries[name] holds the first and the latest value of `count` seen for it.
    # NOTE(review): `count` also advances for plots that are skipped below for
    # lack of Sentinel-1 data, so these ranges need not match positions in
    # data_x -- behaviour kept unchanged, confirm intent.
    if country not in countries:
        countries[country] = [count, count]
    countries[country][1] = count
    count += 1

    # With sentinel_1 enabled, a plot is only kept when its Sentinel-1 file exists
    s1_file = s1_path + npy_name
    if sentinel_1 and not os.path.isfile(s1_file):
        continue

    # Load the sentinel imagery
    x = np.load(s2_path + npy_name)
    if sentinel_1:
        # Sentinel-1 is appended along the last axis
        x = np.concatenate([x, np.load(s1_file)], axis = -1)

    y = reconstruct_images(plot_id)
    latlong_rows.append({'plot_id': str(plot_id),
                         'lat': np.mean(plot_rows['LAT']),
                         'long': np.mean(plot_rows['LON'])})
    plot_ids_loaded.append(str(plot_id))
    lengths.append(x.shape[0])
    data_x.append(x)
    data_y.append(y)
print("Finished data loading")

# Built in one go; DataFrame.append no longer exists in pandas >= 2.0
dataframe = pd.DataFrame(latlong_rows, columns = ['plot_id', 'lat', 'long'])

data_x = np.stack(data_x)
data_x = np.float32(data_x)
data_y = np.stack(data_y)
lengths = np.stack(lengths)

HBox(children=(IntProgress(value=0, max=204), HTML(value='')))


Finished data loading


In [9]:
print(countries)

{'ceo-elsalvador-train-sample-data-2020-07-22': [0, 65], 'ceo-el-salvador-train-sample-data-2020-07-29': [66, 126]}


In [10]:
import hickle as hkl
# Training split: write the stacked inputs, labels and lengths
if source == 'train':
    #hkl.dump(data_x, "../tile_data/processed/data_x_l2a_processed.hkl", mode='w', compression='gzip')
    #hkl.dump(data_y, "../tile_data/processed/data_y_l2a_processed.hkl", mode='w', compression='gzip')
    #hkl.dump(lengths, "../tile_data/processed/length_l2a_processed.hkl", mode='w', compression='gzip')
    np.save("../tile_data/processed/data_x_l2a_processed.npy", data_x)
    np.save("../tile_data/processed/data_y_l2a_processed.npy", np.asarray(data_y))
    np.save("../tile_data/processed/length_l2a_processed.npy", np.asarray(lengths))

# Test / project splits: same three arrays under the test_* file names
if source in ('test', "project"):
    print("Writing test data")
    np.save("../tile_data/processed/test_x_l2a_processed.npy", data_x)
    np.save("../tile_data/processed/test_y_l2a_processed.npy", data_y)
    np.save("../tile_data/processed/test_length_l2a_processed.npy", lengths)

## Extraction of latitudes and longitudes for training / testing data

In [None]:
# Shuffle the table reproducibly and give each row a positional PLOT_ID
df = pd.read_csv("../data/science-2017-test.csv")
df = df.sample(frac=1, random_state = 5)
df['PLOT_ID'] = range(0, len(df), 1)
# tree_cover scaled by 100, printed before and after truncation to int
df['group'] = df['tree_cover'] * 100
print(np.unique(df['group']))
df['group'] = df['group'].astype(int)
print(np.unique(df['group']))

# Take every 100th row of the SHUFFLED table by position. Plain `df[col][i]`
# is a label lookup, which would undo the shuffle and pair the coordinates
# with a plot id that differs from the row's PLOT_ID.
sampled = df.iloc[::100]
lats = sampled['location_y'].tolist()
longs = sampled['location_x'].tolist()
plot_ids = sampled['PLOT_ID'].tolist()

In [25]:
# Collect the LAT / LON of the first sample of every plot
lats = []
longs = []
for plot_id in plot_ids:
    # .iloc[0] selects the first matching row explicitly; calling float() on a
    # one-element Series is deprecated in recent pandas versions
    first_row = df[df['PLOT_ID'] == plot_id].iloc[0]
    lats.append(float(first_row['LAT']))
    longs.append(float(first_row['LON']))

lats = np.array(lats)
longs = np.array(longs)

# (min longitude, max longitude, min latitude, max latitude)
BBox = (longs.min(), longs.max(),
        lats.min(), lats.max())

print(BBox)
print(lats.shape)

(-121.31806300493471, 156.60271959506528, -54.427197440476576, 59.66233807380722)
(4231,)


In [26]:
#np.save("../data/metrics/plotids.npy", plot_ids_loaded)
# One row per plot: latitude, longitude and plot id
# NOTE(review): the variable is named "testing" but the file written is
# training_plots.csv -- confirm which split this is meant for.
latlong_columns = {'lats': lats, 'longs': longs, 'id': plot_ids}
testing_latlongs = pd.DataFrame(latlong_columns)
testing_latlongs.to_csv(path_or_buf="../data/latlongs/training_plots.csv", index=False)

In [119]:
# Positions within plot_ids_loaded that a later cell maps back to plot ids
to_remove = [
    133, 572, 656, 724, 945, 1018, 1064,
    1078, 1096, 1156, 1246, 1263, 1543, 1610,
    1969, 2641, 2980, 3160, 3398, 3573, 3878,
]

# Validation section (compare with predictions)

In [177]:
#to_remove = [148, 465, 699, 1072, 1299, 610, 707, 778, 1151, 1160 ]
#np.array([val for x, val in enumerate(np.array(plot_ids_loaded).astype(int)) if x in to_remove] )
# Show the stored (lat, long) of one row of `dataframe`, selected by row label.
# NOTE(review): the `+ 1` presumably skips the placeholder first row that
# `dataframe` is created with -- confirm.
location = 4855 + 1
dataframe.loc[location]['lat'], dataframe.loc[location]['long']

(-2.081520502573931, 37.9517595996893)

In [121]:
# Plot ids (as ints) whose position in plot_ids_loaded is listed in to_remove
loaded_ids = np.array(plot_ids_loaded).astype(int)
np.array([plot_id for position, plot_id in enumerate(loaded_ids) if position in to_remove])

array([135191267, 135224653, 135546181, 135654752, 135680088, 135680623,
       135680717, 135680766, 135680806, 135697572, 135697577, 135697640,
       135697802, 135702922, 135703075, 135703474, 135704042, 135704455,
       135724909, 135724986, 135738208, 135787902, 135788017, 135788021,
       135788066, 135809815, 135841107, 135846794, 136029534, 136029742,
       136075801, 136075846, 136434567, 136434765, 136434868, 136446359,
       136446425, 136457074, 138076925, 138302769])

In [189]:
# Write the plot_id / lat / long table to CSV without the index column
dataframe.to_csv("../data/latlongs/test_plot_ids.csv", index = False)