# Load, preprocess, and save train and test data

This notebook preprocesses and collates the training and testing data for model creation.

# John Brandt
# July 11, 2021

- Fuse Sentinel-1 and Sentinel-2 data
- Reconstruct a 2D label array for each plot from the CEO output CSVs
- Match Sentinel data to CEO labels
- Stack data_x, data_y, length
- Save arrays for data_x, data_y, length


# Package imports and source code

In [1]:
from tqdm import tqdm_notebook, tnrange
import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools
from scipy.ndimage import median_filter
import hickle as hkl

os.environ['KMP_DUPLICATE_LIB_OK']='True'

%run ../src/preprocessing/slope.py

In [2]:
def reconstruct_images(plot_id, data=None):
    '''Rebuild the 2D grid of tree labels for a single plot.

       The samples of `plot_id` are grouped by latitude in descending order
       and, within each latitude, sorted by ascending longitude.

        Parameters:
          plot_id: value matched against the 'PLOT_ID' column
          data (pd.DataFrame): table holding 'PLOT_ID', 'LAT', 'LON' and
              'TREE' columns. Defaults to the notebook-level `df`.

         Returns:
          rows (list): one list of 'TREE' values per unique latitude. This is
              a (14, 14) nested list when the plot carries a complete
              14 x 14 sample grid; the shape is NOT enforced here, and an
              unknown plot id yields an empty list.
    '''
    if data is None:
        # Backward-compatible default: the label table built by the CSV cell
        data = df
    subs = data[data['PLOT_ID'] == plot_id]
    rows = []
    for lat in sorted(subs['LAT'].unique(), reverse=True):
        subs_lat = subs[subs['LAT'] == lat].sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [3]:
# Which split to process; every directory below is derived from this name
source = 'train'
sentinel_1 = True

# Inputs: Sentinel-2, Sentinel-1, label CSVs and DEM tiles
s2_path = f"../data/{source}-s2/"
s1_path = f"../data/{source}-s1/"
csv_path = f"../data/{source}-csv/"
dem_path = f"../data/{source}-dem/"

# Output directory for processed data
output_path = f"../data/{source}-processed/"

In [4]:
# Load the stored list of excluded ("bad") plot ids, apply the manual edits
# below, and write the list back to the same file.
verified_lu_change = np.load("bad_plot_ids.npy")
id_dtype = verified_lu_change.dtype

# Plot ids to add to the list; ids that are already stored are skipped
to_add = [141238348]
to_add = [x for x in to_add if x not in verified_lu_change]
# Build the addition with the stored dtype: a bare np.array([]) is float64, so
# concatenating it would silently promote integer ids to float whenever there
# is nothing new to add.
verified_lu_change = np.concatenate([verified_lu_change,
                     np.array(to_add, dtype=id_dtype).flatten()])

# Plot ids to take back off the list
to_remove = []

verified_lu_change = [x for x in verified_lu_change if x not in to_remove]
# Pin the dtype on save as well so an emptied list does not fall back to float64
np.save("bad_plot_ids.npy", np.array(verified_lu_change, dtype=id_dtype))
print(len(verified_lu_change))

2328


In [5]:
# For either train or test data, read every label CSV, normalise its column
# names, make sure plot ids are unique across files, and merge everything into
# one dataframe (`df`) holding the labelled samples of the entire data set.
# NOTE: every CSV is rewritten in place with the cleaned columns and ids.
import re

# 14 x 14 samples per plot (the grid shape used for data_y further down)
SAMPLES_PER_PLOT = 196

cols_to_keep = ['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'ANALYSES', 'USER_ID',
       'COLLECTION_TIME', 'ANALYSIS_DURATION', 'TREE', 'plotid', 'sampleid']
csvs = [x for x in sorted(os.listdir(csv_path)) if ".csv" in x]
#csvs = [x for x in csvs if 'uuid'in x]
csvs = [x for x in csvs if "chaco" in x]
#csvs = [x for x in csvs if "senegal" not in x]

print(csvs)

dfs = []
for i in csvs:
    df = pd.read_csv(csv_path + i, encoding = "ISO-8859-1")
    # Strip non-word characters from the headers; the 'ïplotid' rename handles
    # what is presumably a UTF-8 byte-order mark decoded as ISO-8859-1.
    df.columns = [re.sub(r'\W+', '', x) for x in df.columns]
    df = df.rename(columns={'ïplotid':'plotid'})
    print(df.columns)
    df.columns = [x.upper() for x in df.columns]
    df = df.rename(columns={'PLOTID': 'PLOT_ID', 'SAMPLEID': 'SAMPLE_ID'})
    print(df.columns)
    first_plot_id = df['PLOT_ID'].iloc[0]
    print(first_plot_id)

    # If there are no unique IDs already, go ahead and assign them: the two
    # characters before ".csv" in the file name + '00' + the absolute plot number.
    if abs(first_plot_id) == 1:
        print(first_plot_id)
        print(f"No unique ID for {i}")
        file_number = str(i[-6:-4])
        # Assign the whole column at once. The previous per-row chained
        # assignment (df['PLOT_ID'][index] = ...) raised SettingWithCopyWarning
        # and is a silent no-op under pandas Copy-on-Write.
        df['PLOT_ID'] = [int(file_number + '00' + str(abs(plot_id)))
                         for plot_id in df['PLOT_ID']]

    print(df['PLOT_ID'].unique())

    # Keep only the whitelisted columns
    df = df.drop(columns=[c for c in df.columns if c not in cols_to_keep])

    df['country'] = i.split(".")[0]
    print(len(df.columns))
    df.to_csv(csv_path + i, index = False)
    dfs.append(df)

df = pd.concat(dfs, ignore_index = True, sort = True)
print(len(df) // SAMPLES_PER_PLOT)
# Samples without a TREE label cannot be used as Y data
df = df[~pd.isna(df['TREE'])]
print(len(df) // SAMPLES_PER_PLOT)

plot_ids = sorted(df['PLOT_ID'].unique())
plot_ids_loaded = plot_ids

print(f"There are {len(plot_ids)} plots")

['ceo-2022chaco_rubber-sample-29.csv', 'ceo-chaco-eucalyptus-30.csv', 'ceo-chaco-global-uuid-11.csv']
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'COLLECTION_TIME',
       'ANALYSIS_DURATION', 'TREE', 'country'],
      dtype='object')
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'COLLECTION_TIME',
       'ANALYSIS_DURATION', 'TREE', 'COUNTRY'],
      dtype='object')
29001
[ 29001  29002  29003  29004  29005  29006  29007  29008  29009 290010
 290011 290012 290013 290014 290015 290016 290017 290018 290019 290020
 290021 290022 290023 290024 290025 290026 290027 290028 290029 290030
 290031 290032 290033 290034 290035 290036 290037 290038 290039 290040
 290041 290042 290043 290044 290045 290046 290047 290048 290049 290050
 290051 290052 290053 290054 290055 290056 290057 290058 290059 290060
 290061 290062 290063 290064 290065 290066 290067 290068 290069 290070
 290071 290072 290073 290074 290075 290076 290077 290078 290079]
9
Index(['plotid', 'sampleid', 'lon', 'l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[ 30001  30002  30003  30004  30005  30006  30007  30008  30009 300010
 300011 300012 300013 300014 300015 300016 300017 300018 300019 300020
 300021 300022 300023 300024 300025 300026 300027 300028 300029 300030
 300031 300032 300033 300034 300035 300036 300037 300038 300039 300040
 300041 300042 300043 300044 300045 300046 300047 300048 300049 300050
 300051 300052 300053 300054 300055 300056 300057 300058 300059 300060
 300061 300062 300063 300064]
9
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'COLLECTION_TIME',
       'ANALYSIS_DURATION', 'TREE', 'country'],
      dtype='object')
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'COLLECTION_TIME',
       'ANALYSIS_DURATION', 'TREE', 'COUNTRY'],
      dtype='object')
11001
[ 11001  11002  11003  11004  11005  11006  11007  11008  11009 110010
 110011 110012 110013 110014 110015 110016 110017 110018 110019 110020
 110021 110022 110023 110024 110025 110026 110027 110028 110029 110030
 110031 110032 110033 110034 1100

In [6]:
def to_int16(array: np.ndarray) -> np.ndarray:
    '''Quantises an array of values in [0, 1] to 16-bit unsigned integers.

    Despite the name, the result is *unsigned* (np.uint16): values are scaled
    by 65535 and truncated toward zero. Stored as uint16, the data takes half
    the space of float32.

        Parameters:
          array (np.ndarray): numeric array whose values all lie in [0, 1]

         Returns:
          quantised (np.ndarray): np.uint16 array of the same shape

         Raises:
          AssertionError: if any value lies outside [0, 1] (NaN included)
    '''
    assert np.min(array) >= 0, np.min(array)
    assert np.max(array) <= 1, np.max(array)

    scaled = np.trunc(np.clip(array, 0, 1) * 65535)
    # Range check on the scaled values themselves. The previous form,
    # np.max(array <= 65535), reduced a boolean mask and so only verified
    # that *some* element was in range.
    assert np.min(scaled) >= 0, np.min(scaled)
    assert np.max(scaled) <= 65535, np.max(scaled)

    return scaled.astype(np.uint16)

def process_dem(dem):
    '''Converts a DEM tile into a smoothed slope layer.

    Steps, in order: 5-wide median filter, slope through `calcSlope` on a
    (1, 34, 34) view with both cell-size grids set to 10, division by 90,
    removal of a one-pixel border, a second 5-wide median filter, and a
    two-pixel crop on the first two axes.

        Parameters:
          dem (arr): elevation array holding 34 * 34 values

         Returns:
          slope (arr): array of shape (28, 28, 1)
    '''
    side = 32 + 2  # 32-pixel tile plus a one-pixel border on each side

    smoothed = median_filter(dem, size = 5)
    slope = calcSlope(smoothed.reshape((1, side, side)),
                      np.full((side, side), 10),
                      np.full((side, side), 10),
                      zScale = 1, minSlope = 0.02)
    # NOTE(review): presumably rescales a slope in degrees to [0, 1] -- confirm
    # against calcSlope's output units
    slope = slope / 90

    # Drop the border, smooth again, then trim two more pixels per side
    inner = slope.reshape((side, side, 1))[1:-1, 1:-1]
    return median_filter(inner, 5)[2:-2, 2:-2]

def grndvi(array):
    '''Computes (NIR - (green + red)) / (NIR + (green + red)) per pixel.

    Bands are read from the last axis (index 1: green, 2: red, 3: NIR) and
    clipped to [0, 1] first; 1e-5 is added to the denominator to avoid a
    division by zero.
    '''
    green, red, nir = (np.clip(array[..., band], 0, 1) for band in (1, 2, 3))
    visible = green + red
    return (nir - visible) / ((nir + visible) + 1e-5)


In [7]:
from skimage.transform import resize

%run ../src/preprocessing/indices.py

def to_float32(array: np.ndarray) -> np.ndarray:
    """Converts an int_x array to float32.

    Non-float input is treated as 16-bit quantised data and divided by 65535.
    Float input of another width (e.g. float64) is cast to float32, and
    float32 input is returned unchanged.

    Raises:
        AssertionError: if a non-float array has no value above 1 (it does not
            look quantised), or if the result exceeds 1.
    """
    # Inspect the dtype rather than the first element, so the check does not
    # depend on the array being non-empty.
    if not np.issubdtype(array.dtype, np.floating):
        assert np.max(array) > 1
        array = np.float32(array) / 65535.
    elif array.dtype != np.float32:
        array = array.astype(np.float32)
    assert np.max(array) <= 1
    assert array.dtype == np.float32
    return array

def _as_label_grid(rows, shape=(14, 14)):
    """Return `rows` as a float array of exactly `shape`, or None if impossible."""
    try:
        grid = np.asarray(rows, dtype=np.float64)
    except (ValueError, TypeError):
        # Ragged rows or non-numeric labels
        return None
    return grid if grid.shape == shape else None


count = 0

# Per-plot metadata, collected as records and turned into a dataframe once at
# the end. Row 0 is a placeholder that the save cell drops (it removes index 0
# before writing), so it has to stay here.
plot_records = [{'plot_id': '', 'lat': 0.325, 'long': 0.325, 'y': 0}]

# Identify the plots to load: both the Sentinel-2 and Sentinel-1 files must
# exist, and the plot must not be on the exclusion list
excluded_plot_ids = set(verified_lu_change)
plot_ids_to_load = []
for plot_id in plot_ids:
    s1_i = f'{s1_path}{str(plot_id)}.hkl'
    s2_i = f'{s2_path}{str(plot_id)}.hkl'
    if os.path.isfile(s2_i) and os.path.exists(s1_i):
        if plot_id not in excluded_plot_ids:
            plot_ids_to_load.append(plot_id)

print(f"There are {len(plot_ids_to_load)} plots")

# Plots excluded by hand (the reason is not recorded in this notebook)
manually_excluded = [139077414, 139187051, 139187043, 139187133, 139187134]
plot_ids_to_load = [x for x in plot_ids_to_load if x not in manually_excluded]

# Allocate directly as uint16 instead of building a float64 array and converting
data_x = np.zeros((len(plot_ids_to_load), 12, 28, 28, 14), dtype=np.uint16)
data_y = np.zeros((len(plot_ids_to_load), 14, 14))

# Iterate over each plot
to_remove = []

for i, plot_id in enumerate(plot_ids_to_load):
    s1_i = f'{s1_path}{str(plot_id)}.hkl'
    s2_i = f'{s2_path}{str(plot_id)}.hkl'
    dem_i = f'{dem_path}{str(plot_id)}.npy'

    x = to_float32(hkl.load(s2_i))

    # Sentinel-1: average 2 x 2 pixel blocks, resize back to 32 x 32 and trim
    # a two-pixel border to get 28 x 28
    s1 = hkl.load(s1_i)
    s1 = np.reshape(s1, (12, 16, 2, 16, 2, 2))
    s1 = np.mean(s1, axis = (2, 4))
    s1 = resize(s1, (12, 32, 32, 2), order = 1)
    s1 = s1[:, 2:-2, 2:-2, :]

    # The processed DEM is repeated for every time step and written to band 10
    dem = process_dem(np.load(dem_i))
    dem = np.tile(dem.reshape((1, 28, 28)), (x.shape[0], 1, 1))
    x[..., 10] = dem
    x = np.concatenate([x, s1], axis = -1)

    count += 1
    y = reconstruct_images(plot_id)
    plot_rows = df[df['PLOT_ID'] == plot_id]
    # One record per plot, including rejected ones: the save cell drops the
    # rejected rows by their position in this list.
    plot_records.append({'plot_id': str(plot_id),
                         'lat': np.mean(plot_rows['LAT']),
                         'long': np.mean(plot_rows['LON']),
                         'y': sum(sum(row) for row in y)})

    # Reject plots with missing imagery values or without a complete 14 x 14
    # label grid. A wrong-shaped grid that merely broadcasts is rejected too.
    labels = _as_label_grid(y)
    if np.isnan(x).any() or labels is None:
        to_remove.append(i)
        continue

    data_x[i] = to_int16(np.clip(x, 0, 1))
    data_y[i] = labels

dataframe = pd.DataFrame(plot_records, columns=['plot_id', 'lat', 'long', 'y'])

# Remove any data samples that had missing values
if len(to_remove) > 0:
    print(f"Removing {to_remove}")
    data_x = np.delete(data_x, to_remove, 0)
    data_y = np.delete(data_y, to_remove, 0)

print(f"Finished loading: {data_x.shape} of {data_x.dtype} type")

There are 1521 plots
Finished loading: (1521, 12, 28, 28, 14) of uint16 type


In [8]:
# Align the plot metadata with data_x / data_y, then write everything to disk.
# The length guard keeps the cell idempotent: before clean-up `dataframe`
# always has more rows than data_x (placeholder + rejected plots), and after
# clean-up the lengths match, so re-running does not drop further rows.
if len(dataframe) != len(data_x):
    # Row 0 is the placeholder that `dataframe` was seeded with
    dataframe = dataframe.drop(index=0).reset_index(drop=True)
    if len(to_remove) > 0:
        # Rows of the plots rejected while loading
        dataframe = dataframe.drop(index=to_remove).reset_index(drop=True)
assert len(dataframe) == len(data_x), (len(dataframe), len(data_x))

output_dir = f"../data/{source}/"
os.makedirs(output_dir, exist_ok=True)

print(f"Writing {source} data")
hkl.dump(data_x, f"{output_dir}{source}_x.hkl", mode='w', compression='gzip')
hkl.dump(data_y, f"{output_dir}{source}_y.hkl", mode='w', compression='gzip')
dataframe.to_csv(f"{output_dir}{source}_plot_ids.csv", index = False)
print("Finished!")

Writing train data


  


Finished!
