In [2]:
import pandas as pd
original_train_df = pd.read_csv("data/GLC24_PA_metadata_train.csv")



KeyError: 'surveyId'

In [11]:
X_test = pd.read_pickle("processed_data/internal/test_X.pkl")

In [12]:
X_test.head()

Unnamed: 0,surveyId,lon,lat,year,geoUncertaintyInM,areaInM2,Bio1,Bio2,Bio3,Bio4,...,country_Slovakia,country_Slovenia,country_Spain,country_Switzerland,region_ALPINE,region_ATLANTIC,region_BLACK SEA,region_CONTINENTAL,region_MEDITERRANEAN,region_PANNONIAN
20,970,-2.758823,43.068574,2017,5.0,100.0,2858,79,3,4366,...,False,False,True,False,False,True,False,False,False,False
32,1650,3.0315,42.749162,2018,10.0,16.0,2890,76,3,5810,...,False,False,False,False,False,False,False,False,True,False
34,1741,4.991745,43.433207,2018,0.0,1.0,2886,81,3,6054,...,False,False,False,False,False,False,False,False,True,False
38,1910,12.04873,54.91366,2018,10.0,79.0,2821,25,1,5721,...,False,False,False,False,False,False,False,True,False,False
43,2037,9.56194,56.07804,2017,10.0,79.0,2810,61,2,5664,...,False,False,False,False,False,False,False,True,False,False


In [6]:
# group by surveyId and get the number of unique species
species_per_survey = original_train_df.groupby("surveyId")["speciesId"].nunique()

In [10]:
print(f"{species_per_survey.mean()=}")
print(f"{species_per_survey.std()=}")
print(f"{species_per_survey.max()=}")
print(f"{species_per_survey.min()=}")

species_per_survey.mean()=16.114713385101194
species_per_survey.std()=9.5728748131864
species_per_survey.max()=100
species_per_survey.min()=1


In [3]:
original_train_df.head()

Unnamed: 0,lon,lat,year,geoUncertaintyInM,areaInM2,region,country,speciesId,surveyId
0,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,6874.0,212
1,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,476.0,212
2,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,11157.0,212
3,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,8784.0,212
4,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,4530.0,212


In [13]:
# Data Processor
#
# Desired output
# - Full train dataset - for training model for Kaggle submission
# - Full test dataset - for Kaggle submission
# - Internal train dataset - 80% of full train dataset for internal banchmarking
# - Internal test dataset - 20% of full train dataset for internal benchmarking
# - All tabular data in one file

import pandas as pd
import numpy as np
import os
from PIL import Image
from tqdm import tqdm


print("Processing data\n")

# Load data
print("Load data")
original_train_df = pd.read_csv("data/GLC24_PA_metadata_train.csv")
original_test_df = pd.read_csv("data/GLC24_PA_metadata_test.csv")
metadata_train_df = original_train_df.copy()
metadata_test_df = original_test_df.copy()

# Drop: 'Andorra', 'Hungary', 'Ireland', 'Latvia', 'Luxembourg', 'Monaco', 'Norway',
# 'Portugal', 'Romania', 'Serbia', 'The former Yugoslav Republic of Macedonia' they
# are not represented in the test set
print("Drop countries not in test data")
drop_countries = {
    "Andorra",
    "Hungary",
    "Ireland",
    "Latvia",
    "Luxembourg",
    "Monaco",
    "Norway",
    "Portugal",
    "Romania",
    "Serbia",
    "The former Yugoslav Republic of Macedonia",
}
metadata_train_df = metadata_train_df[~metadata_train_df.country.isin(drop_countries)]

# -inf, inf replaced by NaN
print("Replace -inf, inf with NaN")
metadata_train_df = metadata_train_df.replace([np.inf, -np.inf], np.nan)
metadata_test_df = metadata_test_df.replace([np.inf, -np.inf], np.nan)

# For the train data NaN in geoUncertaintyInM, and areaInM2 replaced with the country median values
print("Process NaN values")
for column in ["areaInM2", "geoUncertaintyInM"]:
    print(f" - Processing train data: {column}")
    for country in tqdm(metadata_train_df.country.unique()):
        metadata_train_df.loc[
            metadata_train_df.country == country, column
        ] = metadata_train_df.query(f"country == '{country}'")[column].fillna(
            metadata_train_df.query(f"country == '{country}'")[column].median()
        )

# For the test data NaN in geoUncertaintyInM, and areaInM2 replaced with the training data country
# median values (note this is data leakage, but I believe it is tolerable)
for column in ["areaInM2", "geoUncertaintyInM"]:
    print(f" - Processing test data: {column}")
    for country in metadata_test_df.country.unique():
        metadata_test_df.loc[
            metadata_test_df.country == country, column
        ] = metadata_test_df.query(f"country == '{country}'")[column].fillna(
            metadata_train_df.query(f"country == '{country}'")[column].median()
        )

# Resulting dataframes
print("Resulting dataframes")
print(f" - Train data: {metadata_train_df.shape}")
print(f" - Test data: {metadata_test_df.shape}")

# set speciesId as int
print("Set speciesId as int")
metadata_train_df["speciesId"] = metadata_train_df["speciesId"].astype(int)

# Resulting dataframes
print("Resulting dataframes")
print(f" - Train data: {metadata_train_df.shape}")
print(f" - Test data: {metadata_test_df.shape}")

# Combine all environmental data
print("Combine environmental data")
files_to_combine = [
    "data/EnvironmentalRasters/EnvironmentalRasters/Climate/Average 1981-2010/GLC24-PA-{}-bioclimatic.csv",
    "data/EnvironmentalRasters/EnvironmentalRasters/Climate/Monthly/GLC24-PA-{}-bioclimatic_monthly.csv",
    "data/EnvironmentalRasters/EnvironmentalRasters/Elevation/GLC24-PA-{}-elevation.csv",
    "data/EnvironmentalRasters/EnvironmentalRasters/Human Footprint/GLC24-PA-{}-human_footprint.csv",
    "data/EnvironmentalRasters/EnvironmentalRasters/LandCover/GLC24-PA-{}-landcover.csv",
    "data/EnvironmentalRasters/EnvironmentalRasters/SoilGrids/GLC24-PA-{}-soilgrids.csv",
]
combined_train_df = metadata_train_df
combined_test_df = metadata_test_df

for file in files_to_combine:
    print(" - Processing train data:", file.format("train"))
    combined_train_df = pd.merge(
        combined_train_df, pd.read_csv(file.format("train")), on="surveyId"
    )

for file in files_to_combine:
    print(" - Processing test data:", file.format("test"))
    combined_test_df = pd.merge(
        combined_test_df, pd.read_csv(file.format("test")), on="surveyId"
    )


# Resulting dataframes
print("Resulting dataframes")
print(f" - Train data: {combined_train_df.shape}")
print(f" - Test data: {combined_test_df.shape}")

# Handle missing data
print("Handle missing data")
for column in list(
    combined_train_df.isna()
    .sum()[combined_train_df.isna().sum() > 0]
    .keys()
):
    print(f" - Processing train data: {column}")
    for country in tqdm(combined_train_df.country.unique()):
        combined_train_df.loc[
            combined_train_df.country == country, column
        ] = combined_train_df.query(f"country == '{country}'")[column].fillna(
            combined_train_df.query(f"country == '{country}'")[column].median()
        )

for column in list(
    combined_train_df.isna()
    .sum()[combined_train_df.isna().sum() > 0]
    .keys()
):
    print(f" - Processing train data: {column}")
    for country in tqdm(combined_train_df.country.unique()):
        combined_train_df.loc[
            combined_train_df.country == country, column
        ] = combined_train_df.query(f"country == '{country}'")[column].fillna(
            combined_train_df.query(f"country == '{country}'")[column].median()
        )

for column in list(
    combined_test_df.isna()
    .sum()[combined_test_df.isna().sum() > 0]
    .sort_values(ascending=False)
    .keys()
):
    print(f" - Processing train data: {column}")
    for country in tqdm(combined_test_df.country.unique()):
        combined_test_df.loc[
            combined_test_df.country == country, column
        ] = combined_test_df.query(f"country == '{country}'")[column].fillna(
            combined_train_df.query(f"country == '{country}'")[column].median()
        )
# Resulting dataframes
print("Resulting dataframes")
print(f" - Train data: {combined_train_df.shape}")
print(f" - Test data: {combined_test_df.shape}")

print(f"{combined_train_df.index=}")
print(f"{combined_test_df.index=}")

# for training data country, region, and speciesId one-hot encoded
print("One-hot encode country, region, and speciesId in training data")
for column in ["country", "region", "speciesId"]:
    print(f" - Processing: {column}")
    ohe_train_df = pd.concat(
        [combined_train_df, pd.get_dummies(combined_train_df[column], prefix=column)],
        axis=1,
    )
    ohe_train_df = ohe_train_df.drop(columns=[column])

# for test data country and region one-hot encoded
print("One-hot encode country and region in test data")
for column in ["country", "region"]:
    print(f" - Processing: {column}")
    ohe_test_df = pd.concat(
        [combined_test_df, pd.get_dummies(combined_test_df[column], prefix=column)],
        axis=1,
    )
    ohe_test_df = ohe_test_df.drop(columns=[column])


# Grouped by surveyId, use max
print("Group by surveyId")
grouped_train_df = ohe_train_df.groupby("surveyId", as_index=False).max()

# Resulting dataframes
print("Resulting dataframes")
print(f" - Train data: {grouped_train_df.shape}")
print(f" - Test data: {ohe_test_df.shape}")


Processing data

Load data
Drop countries not in test data
Replace -inf, inf with NaN
Process NaN values
 - Processing train data: areaInM2


100%|██████████| 18/18 [00:01<00:00, 14.34it/s]


 - Processing train data: geoUncertaintyInM


100%|██████████| 18/18 [00:01<00:00, 14.52it/s]


 - Processing test data: areaInM2
 - Processing test data: geoUncertaintyInM
Resulting dataframes
 - Train data: (1460051, 9)
 - Test data: (4716, 8)
Set speciesId as int
Resulting dataframes
 - Train data: (1460051, 9)
 - Test data: (4716, 8)
Combine environmental data
 - Processing train data: data/EnvironmentalRasters/EnvironmentalRasters/Climate/Average 1981-2010/GLC24-PA-train-bioclimatic.csv
 - Processing train data: data/EnvironmentalRasters/EnvironmentalRasters/Climate/Monthly/GLC24-PA-train-bioclimatic_monthly.csv
 - Processing train data: data/EnvironmentalRasters/EnvironmentalRasters/Elevation/GLC24-PA-train-elevation.csv
 - Processing train data: data/EnvironmentalRasters/EnvironmentalRasters/Human Footprint/GLC24-PA-train-human_footprint.csv
 - Processing train data: data/EnvironmentalRasters/EnvironmentalRasters/LandCover/GLC24-PA-train-landcover.csv
 - Processing train data: data/EnvironmentalRasters/EnvironmentalRasters/SoilGrids/GLC24-PA-train-soilgrids.csv
 - Processi

100%|██████████| 18/18 [00:09<00:00,  1.95it/s]


 - Processing train data: HumanFootprint-NavWater1994


100%|██████████| 18/18 [00:09<00:00,  1.96it/s]


 - Processing train data: HumanFootprint-NavWater2009


100%|██████████| 18/18 [00:09<00:00,  1.95it/s]


 - Processing train data: HumanFootprint-Roads


100%|██████████| 18/18 [00:09<00:00,  1.95it/s]


 - Processing train data: HumanFootprint-HFP1993


100%|██████████| 18/18 [00:09<00:00,  1.93it/s]


 - Processing train data: HumanFootprint-HFP2009


100%|██████████| 18/18 [00:09<00:00,  1.96it/s]


 - Processing train data: Soilgrid-bdod


100%|██████████| 18/18 [00:09<00:00,  1.96it/s]


 - Processing train data: Soilgrid-cec


100%|██████████| 18/18 [00:09<00:00,  1.94it/s]


 - Processing train data: Soilgrid-cfvo


100%|██████████| 18/18 [00:09<00:00,  1.95it/s]


 - Processing train data: Soilgrid-clay


100%|██████████| 18/18 [00:09<00:00,  1.95it/s]


 - Processing train data: Soilgrid-nitrogen


100%|██████████| 18/18 [00:09<00:00,  1.95it/s]


 - Processing train data: Soilgrid-phh2o


100%|██████████| 18/18 [00:09<00:00,  1.95it/s]


 - Processing train data: Soilgrid-sand


100%|██████████| 18/18 [00:09<00:00,  1.94it/s]


 - Processing train data: Soilgrid-silt


100%|██████████| 18/18 [00:09<00:00,  1.94it/s]


 - Processing train data: Soilgrid-soc


100%|██████████| 18/18 [00:09<00:00,  1.96it/s]


 - Processing train data: Soilgrid-bdod


100%|██████████| 18/18 [00:05<00:00,  3.26it/s]


 - Processing train data: Soilgrid-soc


100%|██████████| 18/18 [00:05<00:00,  3.28it/s]


 - Processing train data: Soilgrid-cec


100%|██████████| 18/18 [00:05<00:00,  3.31it/s]


 - Processing train data: Soilgrid-cfvo


100%|██████████| 18/18 [00:05<00:00,  3.28it/s]


 - Processing train data: Soilgrid-clay


100%|██████████| 18/18 [00:05<00:00,  3.32it/s]


 - Processing train data: Soilgrid-nitrogen


100%|██████████| 18/18 [00:05<00:00,  3.35it/s]


 - Processing train data: Soilgrid-phh2o


100%|██████████| 18/18 [00:05<00:00,  3.33it/s]


 - Processing train data: Soilgrid-sand


100%|██████████| 18/18 [00:05<00:00,  3.30it/s]


 - Processing train data: Soilgrid-silt


100%|██████████| 18/18 [00:05<00:00,  3.28it/s]


 - Processing train data: HumanFootprint-NavWater1994


100%|██████████| 18/18 [00:05<00:00,  3.29it/s]


 - Processing train data: HumanFootprint-NavWater2009


100%|██████████| 18/18 [00:05<00:00,  3.30it/s]


 - Processing train data: HumanFootprint-Roads


100%|██████████| 18/18 [00:05<00:00,  3.32it/s]


 - Processing train data: HumanFootprint-HFP1993


100%|██████████| 18/18 [00:05<00:00,  3.30it/s]


 - Processing train data: HumanFootprint-HFP2009


100%|██████████| 18/18 [00:05<00:00,  3.35it/s]

Resulting dataframes
 - Train data: (1460051, 967)
 - Test data: (4716, 966)
combined_train_df.index=RangeIndex(start=0, stop=1460051, step=1)
combined_test_df.index=RangeIndex(start=0, stop=4716, step=1)





In [4]:
print(f"{original_train_df.shape=}")
print(f"{original_test_df.shape=}")
print(f"{metadata_train_df.shape=}")
print(f"{metadata_test_df.shape=}")
print(f"{combined_train_df.shape=}")
print(f"{combined_test_df.shape=}")
print(f"{ohe_train_df.shape=}")
print(f"{ohe_test_df.shape=}")
print(f"{grouped_train_df.shape=}")

original_train_df.shape=(1483637, 9)
original_test_df.shape=(4716, 8)
metadata_train_df.shape=(1460051, 9)
metadata_test_df.shape=(4716, 8)
combined_train_df.shape=(1460051, 967)
combined_test_df.shape=(4716, 966)
ohe_train_df.shape=(1460051, 5893)
ohe_test_df.shape=(4716, 969)
grouped_train_df.shape=(88428, 5892)


In [6]:
# find nan in grouped_train_df
print("Find NaN in grouped_train_df")
for column in ohe_test_df.columns:
    if ohe_test_df[column].isna().sum() > 0:
        print(f" - {column}: {ohe_test_df[column].isna().sum()}")

Find NaN in grouped_train_df


In [9]:
grouped_train_df.index

Index([    212,     222,     243,     324,     333,     391,     410,     489,
           590,     607,
       ...
       3919341, 3919365, 3919375, 3919517, 3919518, 3919553, 3919592, 3919620,
       3919640, 3919655],
      dtype='int64', name='surveyId', length=88428)

In [3]:
metadata_train_df.speciesId.value_counts()

speciesId
540      21478
4397     19441
254      18113
4499     15065
10317    14538
         ...  
5806         1
8272         1
416          1
3907         1
8119         1
Name: count, Length: 4927, dtype: int64

In [10]:
ohe_test_df.index

RangeIndex(start=0, stop=4716, step=1)