In [None]:
from tqdm import tqdm_notebook, tnrange
import numpy as np
from random import shuffle
import pandas as pd
import os
import random

# Additional function definitions

In [None]:
def reconstruct_images(plot_id, data=None):
    '''
    Rebuild the label grid of one plot from the long-format label table.

    The rows whose 'PLOT_ID' equals `plot_id` are grouped by latitude, with
    the groups ordered by descending 'LAT'; inside each group the rows are
    ordered by ascending 'LON' and their 'TREE' values are collected.

    Parameters
    ----------
    plot_id
        Value matched against the 'PLOT_ID' column.
    data : pd.DataFrame, optional
        Table with 'PLOT_ID', 'LAT', 'LON' and 'TREE' columns. When omitted,
        the notebook-level `df` is used, which is what existing callers rely on.

    Returns
    -------
    list of list
        One inner list of 'TREE' values per unique latitude. For a complete
        plot this is expected to be a (14, 14) grid of binary labels; an
        unknown `plot_id` yields an empty list.
    '''
    if data is None:
        # Fall back to the notebook-global label table (original behaviour).
        data = df
    subs = data[data['PLOT_ID'] == plot_id]
    rows = []
    # Highest latitude first, so the first row of the output is the top of the grid.
    for lat in sorted(subs['LAT'].unique(), reverse=True):
        row = subs[subs['LAT'] == lat].sort_values('LON', axis=0)
        rows.append(row['TREE'].tolist())
    return rows

# Data loading

In [None]:
# Which data split to process. The loading cell below handles 'train' and 'test';
# the value also selects the "../data/{source}-super" and "../data/{source}-s1" folders.
source = 'train'
# When True, each plot's "../data/{source}-s1/<plot_id>.npy" array is concatenated onto
# its imagery along the last axis, and plots without such a file are skipped.
sentinel_1 = True

In [None]:
# For either train or test data, read every label CSV of the split and keep only the
# plots for which imagery exists -- returning one dataframe for the entire data set.
#
# Outputs of this cell (used by later cells):
#   folder   -- directory holding the "<plot_id>.npy" imagery of the split
#   df       -- concatenated label table, NaN rows dropped, restricted to plots with imagery
#   existing -- integer plot IDs that have an imagery file in `folder`
#   plot_ids -- sorted unique plot IDs remaining in `df`

# Columns that are only present in some label exports; they are dropped so that all
# files share the same schema (otherwise the NaNs introduced by concat would make
# dropna() discard every row of the files that lack them).
_OPTIONAL_COLUMNS = ("PL_PLOTID", "STACKINGPROFILEDG", "IMAGERYYEARDG")

# Presumably the number of labelled pixels per plot (14 x 14); only used for a
# diagnostic print of the plot count.
_PIXELS_PER_PLOT = 196


def _load_label_csvs(csv_dir):
    """Read all label CSVs in `csv_dir` into a single NaN-free DataFrame.

    Files are visited in sorted order so the result is deterministic. Each file
    loses its 'IMAGERY_TITLE' column and any of `_OPTIONAL_COLUMNS`, and gains a
    'country' column holding the file name up to its first dot.

    Raises
    ------
    FileNotFoundError
        If `csv_dir` contains no CSV file.
    """
    frames = []
    for fname in sorted(os.listdir(csv_dir)):
        if ".DS" in fname or ".csv" not in fname:
            continue
        print(fname)
        frame = pd.read_csv(os.path.join(csv_dir, fname)).drop('IMAGERY_TITLE', axis = 1)
        frame = frame.drop([c for c in _OPTIONAL_COLUMNS if c in frame.columns], axis = 1)
        frame['country'] = fname.split(".")[0]
        print(frame.shape[0])
        frames.append(frame)
    if not frames:
        raise FileNotFoundError("No label CSV files found in {}".format(csv_dir))
    return pd.concat(frames, ignore_index = True).dropna(axis = 0)


def _plot_ids_with_imagery(imagery_dir):
    """Return the integer plot IDs that have a '<plot_id>.npy' file in `imagery_dir`.

    Entries that are not named '<integer>.npy' (e.g. .DS_Store, checkpoint
    folders) are ignored instead of crashing the int() conversion.
    """
    return [
        int(name[:-4])
        for name in os.listdir(imagery_dir)
        if name.endswith(".npy") and name[:-4].isdigit()
    ]


# Any other value of `source` leaves the notebook state untouched, as before.
if source in ('train', 'test'):
    folder = '../data/{}-super/'.format(source)
    df = _load_label_csvs('../data/{}-csv/'.format(source))
    if source == 'test':
        print(df.shape[0] / _PIXELS_PER_PLOT)
        print(df.columns)

    existing = _plot_ids_with_imagery(folder)
    df = df[df['PLOT_ID'].isin(existing)]
    if source == 'test':
        df = df.sort_values('country', axis = 0)
    plot_ids = sorted(df['PLOT_ID'].unique())

In [None]:
# Subset out the plot ids to remove those which do not have X data.
# Only directory entries named "<integer>.npy" are parsed, so stray entries
# (e.g. .DS_Store or .ipynb_checkpoints) cannot break the int() conversion.
existing = [
    int(name[:-4])
    for name in os.listdir("../data/{}-super".format(source))
    if name.endswith(".npy") and name[:-4].isdigit()
]
df = df[df['PLOT_ID'].isin(existing)]

In [None]:
# Initiate empty lists to store the X and Y data in
data_x, data_y, lengths = [], [], []

# country name -> [index of its first kept sample, index of its last kept sample];
# the indices refer to positions in data_x / data_y.
countries = {}
count = 0  # number of samples appended so far
to_remove = []

super_dir = "../data/{}-super/".format(source)
s1_dir = "../data/{}-s1/".format(source)
# List the imagery folder once instead of once per plot.
available = set(os.listdir(super_dir))

# Iterate over each plot
for i in tnrange(len(plot_ids)):
    plot_id = plot_ids[i]
    fname = str(plot_id) + ".npy"

    # A plot without imagery can never become a sample.
    if fname not in available:
        continue
    # With Sentinel-1 enabled, plots lacking the extra array are skipped entirely.
    if sentinel_1 and not os.path.isfile(s1_dir + fname):
        continue

    # Load the sentinel imagery
    x = np.load(super_dir + fname)
    if sentinel_1:
        s1 = np.load(s1_dir + fname)
        # Append the Sentinel-1 array along the last axis.
        x = np.concatenate([x, s1], axis = -1)

    # Record the index range per country only for samples that are actually kept,
    # so the ranges stay aligned with data_x.
    country = str(df[df['PLOT_ID'] == plot_id]['country'].unique()[0])
    if country not in countries:
        countries[country] = [count, count]
    countries[country][1] = count
    count += 1

    lengths.append(x.shape[0])
    data_x.append(x)
    data_y.append(reconstruct_images(plot_id))
print("Finished data loading")

if not data_x:
    raise ValueError("No plots with usable imagery were found for source {!r}".format(source))

data_x = np.stack(data_x)
data_y = np.stack(data_y)
lengths = np.array(lengths)

# Data writing

In [None]:
# This writes a (N_samples, time, width, height, channels) X data that is not standardized
# and a (N_samples, width, height) Y data where the Y is a 0 / 1 binary tree presence
# The X data is 16x16, and the Y data is 14x14 -- so the X data incorporates an extra boundary
# pixel, which should be downsampled before output layer with a convolution layer with no padding

output_dir = "../tile_data/processed/"

# (X file, Y file, lengths file) per value of `source`; 'project' shares the test names.
_test_names = ("test_x_l2a_processed.npy",
               "test_y_l2a_processed.npy",
               "test_length_l2a_processed.npy")
output_names = {
    'train': ("data_x_l2a_processed.npy",
              "data_y_l2a_processed.npy",
              "length_l2a_processed.npy"),
    'test': _test_names,
    'project': _test_names,
}

# Any other value of `source` writes nothing, as before.
if source in output_names:
    if source != 'train':
        print("Writing test data")
    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok = True)
    x_name, y_name, length_name = output_names[source]
    np.save(output_dir + x_name, data_x)
    np.save(output_dir + y_name, np.array(data_y))
    np.save(output_dir + length_name, np.array(lengths))