# Process the data to create an internal test list

In [1]:
import pandas as pd
import numpy as np
import os
from shutil import copy

In [2]:
metadata_file_path = {
    "old": "data/GLC24_PA_metadata_train.csv",
    "new": "metadata.csv"}
satellite_rgb_file_path = {
    "old": "data/PA_Train_SatellitePatches_RGB/pa_train_patches_rgb",
    "new": "satellite_rgb"}
satellite_nir_file_path = {
    "old": "data/PA_Train_SatellitePatches_NIR/pa_train_patches_nir",
    "new": "satellite_nir"}
landsat = {
    "old": "data/PA-train-landsat_time_series",
    "new": "landsat"}
rasters_climate_average = {
    "old": "data/EnvironmentalRasters/EnvironmentalRasters/Climate/Average 1981-2010/GLC24-PA-train-bioclimatic.csv",
    "new": "rasters-climate_average.csv"}
rasters_climate_monthly = {
    "old": "data/EnvironmentalRasters/EnvironmentalRasters/Climate/Monthly/GLC24-PA-train-bioclimatic_monthly.csv",
    "new": "rasters-climate_monthly.csv"}
rasters_elevation = {
    "old": "data/EnvironmentalRasters/EnvironmentalRasters/Elevation/GLC24-PA-train-elevation.csv",
    "new": "rasters-climate_elevation.csv"}
rasters_human_footprint = {
    "old": "data/EnvironmentalRasters/EnvironmentalRasters/Human Footprint/GLC24-PA-train-human_footprint.csv",
    "new": "rasters-human_footprint.csv"}
rasters_landcover = {
    "old": "data/EnvironmentalRasters/EnvironmentalRasters/LandCover/GLC24-PA-train-landcover.csv",
    "new": "rasters-landcover.csv"}
rasters_soilgrids = {
    "old": "data/EnvironmentalRasters/EnvironmentalRasters/SoilGrids/GLC24-PA-train-soilgrids.csv",
    "new": "rasters-soilgrids.csv"}
# cubes_landsat = {
#     "old": "data/TimeSeries-Cubes/GLC24-PA-train-landsat_time_series",
#     "new": "cubes_landsat"}
dir_paths= [satellite_rgb_file_path, satellite_nir_file_path]

In [3]:
for type in ["train", "test"]:
    for path in dir_paths:
        new_path = os.path.join("processed_data/internal_split", type, path["new"])
        if not os.path.exists(new_path):
            os.makedirs(new_path)

In [4]:
metadata_df = pd.read_csv(metadata_file_path["old"])

In [5]:
metadata_df.surveyId=metadata_df.surveyId.astype(str)
metadata_df.speciesId=metadata_df.speciesId.astype(int)
metadata_df.speciesId=metadata_df.speciesId.astype(str)

In [6]:
train_metadata_surveyIds = np.loadtxt("train_metadata_surveyIds.csv", delimiter=",", dtype=str)
test_metadata_surveyIds = np.loadtxt("test_metadata_surveyIds.csv", delimiter=",", dtype=str)

In [7]:
train_df = metadata_df.query("surveyId in @train_metadata_surveyIds")

In [8]:
train_df['speciesId_count'] = train_df.groupby('surveyId')['surveyId'].transform('count')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['speciesId_count'] = train_df.groupby('surveyId')['surveyId'].transform('count')


## Test Data

In [9]:
test_df = metadata_df.query("surveyId in @test_metadata_surveyIds")

### Create a _perfect_ submission file to test against

In [10]:
submission_actual = test_df.groupby('surveyId').apply(lambda x : pd.Series(x['speciesId'].str.cat(sep=' '), index=['predictions']))

  submission_actual = test_df.groupby('surveyId').apply(lambda x : pd.Series(x['speciesId'].str.cat(sep=' '), index=['predictions']))


In [11]:
submission_actual.to_csv("submission_actual.csv")

### Test dataset

In [12]:
test_df = test_df.drop(columns=['speciesId']).groupby('surveyId').first().reset_index()

In [13]:
test_df.to_csv(os.path.join("processed_data/internal_split/test", metadata_file_path["new"]), index=False)

## Satellite Images

In [14]:
for file_path in [satellite_rgb_file_path, satellite_nir_file_path]:
    for cd_folder in os.listdir(file_path["old"]):
        for ad_folder in os.listdir(os.path.join(file_path["old"], cd_folder)):
            for file in os.listdir(os.path.join(file_path["old"], cd_folder, ad_folder)):
                if file[:-5] in test_metadata_surveyIds:
                    copy(os.path.join(file_path["old"], cd_folder, ad_folder, file), os.path.join("processed_data/internal_split/test", file_path["new"]))
                else:
                    copy(os.path.join(file_path["old"], cd_folder, ad_folder, file), os.path.join("processed_data/internal_split/train", file_path["new"]))


## Landsat Time Series

In [15]:
landsat_blue_df = pd.read_csv(os.path.join(landsat["old"], "GLC24-PA-train-landsat_time_series-blue.csv"))
landsat_green_df = pd.read_csv(os.path.join(landsat["old"], "GLC24-PA-train-landsat_time_series-green.csv"))
landsat_red_df = pd.read_csv(os.path.join(landsat["old"], "GLC24-PA-train-landsat_time_series-red.csv"))
landsat_nir_df = pd.read_csv(os.path.join(landsat["old"], "GLC24-PA-train-landsat_time_series-nir.csv"))
landsat_swir1_df = pd.read_csv(os.path.join(landsat["old"], "GLC24-PA-train-landsat_time_series-swir1.csv"))
landsat_swir2_df = pd.read_csv(os.path.join(landsat["old"], "GLC24-PA-train-landsat_time_series-swir2.csv"))

In [16]:
landsat_blue_df.surveyId=landsat_blue_df.surveyId.astype(str)
landsat_green_df.surveyId=landsat_green_df.surveyId.astype(str)
landsat_red_df.surveyId=landsat_red_df.surveyId.astype(str)
landsat_nir_df.surveyId=landsat_nir_df.surveyId.astype(str)
landsat_swir1_df.surveyId=landsat_swir1_df.surveyId.astype(str)
landsat_swir2_df.surveyId=landsat_swir2_df.surveyId.astype(str)

In [17]:
landsat_blue_train_df = landsat_blue_df.query("surveyId in @train_metadata_surveyIds")
landsat_green_train_df = landsat_green_df.query("surveyId in @train_metadata_surveyIds")
landsat_red_train_df = landsat_red_df.query("surveyId in @train_metadata_surveyIds")
landsat_nir_train_df = landsat_nir_df.query("surveyId in @train_metadata_surveyIds")
landsat_swir1_train_df = landsat_swir1_df.query("surveyId in @train_metadata_surveyIds")
landsat_swir2_train_df = landsat_swir2_df.query("surveyId in @train_metadata_surveyIds")

In [18]:
landsat_blue_test_df = landsat_blue_df.query("surveyId in @test_metadata_surveyIds")
landsat_green_test_df = landsat_green_df.query("surveyId in @test_metadata_surveyIds")
landsat_red_test_df = landsat_red_df.query("surveyId in @test_metadata_surveyIds")
landsat_nir_test_df = landsat_nir_df.query("surveyId in @test_metadata_surveyIds")
landsat_swir1_test_df = landsat_swir1_df.query("surveyId in @test_metadata_surveyIds")
landsat_swir2_test_df = landsat_swir2_df.query("surveyId in @test_metadata_surveyIds")

In [19]:
landsat_blue_train_df.to_csv(os.path.join("processed_data/internal_split/train", "landsat-blue.csv"), index=False)
landsat_green_train_df.to_csv(os.path.join("processed_data/internal_split/train", "landsat-green.csv"), index=False)
landsat_red_train_df.to_csv(os.path.join("processed_data/internal_split/train", "landsat-red.csv"), index=False)
landsat_nir_train_df.to_csv(os.path.join("processed_data/internal_split/train", "landsat-nir.csv"), index=False)
landsat_swir1_train_df.to_csv(os.path.join("processed_data/internal_split/train", "landsat-swir1.csv"), index=False)
landsat_swir2_train_df.to_csv(os.path.join("processed_data/internal_split/train", "landsat-swir2.csv"), index=False)

In [20]:
landsat_blue_test_df.to_csv(os.path.join("processed_data/internal_split/test", "landsat-blue.csv"), index=False)
landsat_green_test_df.to_csv(os.path.join("processed_data/internal_split/test", "landsat-green.csv"), index=False)
landsat_red_test_df.to_csv(os.path.join("processed_data/internal_split/test", "landsat-red.csv"), index=False)
landsat_nir_test_df.to_csv(os.path.join("processed_data/internal_split/test", "landsat-nir.csv"), index=False)
landsat_swir1_test_df.to_csv(os.path.join("processed_data/internal_split/test", "landsat-swir1.csv"), index=False)
landsat_swir2_test_df.to_csv(os.path.join("processed_data/internal_split/test", "landsat-swir2.csv"), index=False)

## Environmental Rasters

In [21]:
to_process = [rasters_climate_average, rasters_climate_monthly, rasters_elevation, rasters_human_footprint, rasters_landcover, rasters_soilgrids]

for file_path in to_process:
    df = pd.read_csv(file_path["old"])
    df.surveyId=df.surveyId.astype(str)
    train_df = df.query("surveyId in @train_metadata_surveyIds")
    test_df = df.query("surveyId in @test_metadata_surveyIds")
    train_df.to_csv(os.path.join("processed_data/internal_split/train", file_path["new"]), index=False)
    test_df.to_csv(os.path.join("processed_data/internal_split/test", file_path["new"]), index=False)