## Quick start XGBC

In [1]:
import numpy as np
import pandas as pd 
import os
import gc

from sklearn.preprocessing import MultiLabelBinarizer
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Move up to the folder that holds "Dataset/" so the relative data paths used
# below resolve. Guarded so that re-running this cell does not climb one more
# directory each time (assumes "Dataset/" is not also present next to the
# notebook -- TODO confirm).
if not os.path.isdir("Dataset"):
    os.chdir("..")

### Loading datasets

In [3]:
# Root folder of the competition files, relative to the working directory.
path = "Dataset/geolifeclef-2024/"

# Survey metadata tables: the train split is read first, then the test split.
train, test = (
    pd.read_csv(f"{path}GLC24_PA_metadata_{split}.csv")
    for split in ("train", "test")
)

In [4]:
# Elevation table for each split (train is read first, then test).
elevation_train, elevation_test = (
    pd.read_csv(f"{path}EnvironmentalRasters/EnvironmentalRasters/Elevation/GLC24-PA-{split}-elevation.csv")
    for split in ("train", "test")
)

In [5]:
# Human-footprint table for each split (train is read first, then test).
human_footprint_train, human_footprint_test = (
    pd.read_csv(f"{path}EnvironmentalRasters/EnvironmentalRasters/Human Footprint/GLC24-PA-{split}-human_footprint.csv")
    for split in ("train", "test")
)

In [6]:
# Land-cover table for each split (test is read first, then train).
landcover_test, landcover_train = (
    pd.read_csv(f"{path}EnvironmentalRasters/EnvironmentalRasters/LandCover/GLC24-PA-{split}-landcover.csv")
    for split in ("test", "train")
)

In [7]:
# SoilGrids table for each split (test is read first, then train).
soilgrids_test, soilgrids_train = (
    pd.read_csv(f"{path}EnvironmentalRasters/EnvironmentalRasters/SoilGrids/GLC24-PA-{split}-soilgrids.csv")
    for split in ("test", "train")
)

In [8]:
print(train.shape)

# Left-join every feature table onto the training metadata, keyed on surveyId.
# The shape is printed after each join so unexpected row growth is visible.
for dataset in [elevation_train, human_footprint_train, landcover_train, soilgrids_train]:
    train = pd.merge(train, dataset, how="left", on="surveyId")
    print(train.shape)

# Deleting only the loop variable would leave the four source frames alive
# under their own names; drop those names too so the memory can be reclaimed.
del dataset, elevation_train, human_footprint_train, landcover_train, soilgrids_train

(1483637, 9)
(1483637, 10)
(1483637, 26)
(1483637, 27)
(1483637, 36)


In [9]:
print(test.shape)

# Left-join every feature table onto the test metadata, keyed on surveyId.
# The shape is printed after each join so unexpected row growth is visible.
for dataset in [elevation_test, human_footprint_test, landcover_test, soilgrids_test]:
    test = pd.merge(test, dataset, how="left", on="surveyId")
    print(test.shape)

# Deleting only the loop variable would leave the four source frames alive
# under their own names; drop those names too so the memory can be reclaimed.
del dataset, elevation_test, human_footprint_test, landcover_test, soilgrids_test

(4716, 8)
(4716, 9)
(4716, 25)
(4716, 26)
(4716, 35)


In [10]:
# Run a full garbage-collection pass now; binding the return value to `_`
# keeps it from being echoed as the cell's output.
_ = gc.collect()

### Data Preparation

Filter regions and rarest species

In [11]:
# Training rows whose region never occurs in the test set are discarded.
print(train.shape)
train = train.loc[lambda frame: frame["region"].isin(test["region"].unique())]
print(train.shape)

(1483637, 36)
(1465435, 36)


Keep only the most frequent species in each region

In [12]:
NB_SPECIES_TO_KEEP = 1000 # Choose the number of most present species you want to keep in each region

print(train.shape)

# region -> list of the NB_SPECIES_TO_KEEP most frequent speciesId in that region
species_to_keep = {}

# Positional boolean mask over the rows of `train`, filled one region at a time.
# This replaces a row-by-row DataFrame.apply (one Python call and one list scan
# per row) with a handful of vectorized passes that select the same rows.
rows_to_keep = np.zeros(len(train), dtype=bool)

for region in train["region"].unique():
    in_region = (train["region"] == region).to_numpy()
    # value_counts() sorts by descending frequency, so the head is the top-N.
    top_species = train.loc[in_region, "speciesId"].value_counts().index[:NB_SPECIES_TO_KEEP]
    species_to_keep[region] = list(top_species)
    rows_to_keep |= in_region & train["speciesId"].isin(top_species).to_numpy()

train = train[rows_to_keep]
print(train.shape)

(1465435, 36)
(1432730, 36)


Regroup all species by survey

In [13]:
# Species ids are cast to integers before being gathered into per-survey lists.
train["speciesId"] = train["speciesId"].astype(int)

# Aggregation plan: keep the first value of every column except the grouping
# key, and collect all species ids of a survey into a list.
dict_groupby = {
    column: ["first"]
    for column in train.columns
    if column not in ("speciesId", "surveyId")
}
dict_groupby["speciesId"] = [list]

train = train.groupby("surveyId").agg(dict_groupby).reset_index()
# List-valued aggregation specs produce two-level column labels; keep the top level.
train.columns = [top_level for top_level, _ in train.columns]
print(train.shape)

# Remove duplicate species ids within each survey.
train["speciesId"] = train["speciesId"].map(lambda ids: list(set(ids)))
train.head(2)

(88571, 36)


Unnamed: 0,surveyId,lon,lat,year,geoUncertaintyInM,areaInM2,region,country,Elevation,HumanFootprint-Built1994,...,Soilgrid-bdod,Soilgrid-cec,Soilgrid-cfvo,Soilgrid-clay,Soilgrid-nitrogen,Soilgrid-phh2o,Soilgrid-sand,Soilgrid-silt,Soilgrid-soc,speciesId
0,212,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,47.0,0.0,...,140.0,214.0,151.0,292.0,159.0,73.0,284.0,422.0,176.0,"[1304, 9030, 8784, 4530, 9458, 51, 11157, 982,..."
1,222,9.88456,56.91214,2017,10.0,79.0,CONTINENTAL,Denmark,6.0,0.0,...,120.0,225.0,101.0,94.0,379.0,58.0,650.0,255.0,609.0,"[433, 4499, 9816, 540, 254]"


Handle NaN and infinite values, and cast column data types

In [14]:
def preprocess(df_to_preprocess: pd.DataFrame) -> pd.DataFrame:
    """Impute missing and infinite values and normalise column dtypes.

    The input frame is not modified; a processed copy is returned.

    Steps, in order:
      * +/-inf in float columns is turned into NaN so it is imputed like any
        other missing value (done here explicitly instead of through the
        ``mode.use_inf_as_na`` option, which pandas deprecated in 2.1 and
        removed in 3.0);
      * ``geoUncertaintyInM``, ``Elevation`` and every ``Soilgrid*`` column are
        filled with their most frequent value;
      * ``areaInM2`` and every ``HumanFootprint*`` column are filled with their mean;
      * ``Elevation``, ``geoUncertaintyInM``, the two ``HumanFootprint-Pasture*``
        columns and every ``Soilgrid*`` column are cast to int;
      * ``surveyId`` is cast to int, then to str;
      * ``country`` is dropped when present;
      * any value still missing is replaced by 0.

    Fill statistics are computed on the frame that is passed in.

    Parameters
    ----------
    df_to_preprocess : pd.DataFrame
        Frame containing the columns listed above.

    Returns
    -------
    pd.DataFrame
        The processed copy.
    """
    df = df_to_preprocess.copy()

    # Only float columns can hold infinities; other dtypes are left untouched.
    float_columns = df.select_dtypes(include=[np.floating]).columns
    if len(float_columns):
        df[float_columns] = df[float_columns].replace([np.inf, -np.inf], np.nan)

    def _fill_with_mode(column: str) -> None:
        """Fill NaN in `column` with its most frequent value, if it has one."""
        modes = df[column].mode()
        # A column with no valid value has no mode; it is handled by the
        # zero-fill further down instead of raising here.
        if not modes.empty:
            df[column] = df[column].fillna(modes.iloc[0])

    def _fill_with_mean(column: str) -> None:
        """Fill NaN in `column` with the mean of its valid values."""
        df[column] = df[column].fillna(df[column].mean())

    soilgrid_columns = list(df.filter(regex='Soilgrid').columns)

    _fill_with_mode('geoUncertaintyInM')
    _fill_with_mean('areaInM2')
    _fill_with_mode('Elevation')

    for column in df.filter(regex='HumanFootprint').columns:
        _fill_with_mean(column)

    for column in soilgrid_columns:
        _fill_with_mode(column)

    columns_to_convert = ['Elevation', 'geoUncertaintyInM', 'HumanFootprint-Pasture1993', 'HumanFootprint-Pasture2009']

    # fillna(0) is a no-op for imputed columns; it only matters for a column
    # that was entirely missing, which could not be cast to int otherwise.
    df[columns_to_convert] = df[columns_to_convert].fillna(0).astype(int)

    for column in soilgrid_columns:
        df[column] = df[column].fillna(0).astype(int)

    df["surveyId"] = df["surveyId"].astype(int).astype(str)

    # errors="ignore" lets the function run on a frame that was already processed.
    df = df.drop(columns="country", errors="ignore")

    df = df.fillna(0)

    return df


# Run the same preprocessing on both splits (train first, then test).
train, test = map(preprocess, (train, test))

### Models training and predict on test

Train one model per region and, for each test survey, predict the top-K species, where K is the mean number of species per survey in that region's training data.

In [15]:
%%time

output = pd.DataFrame()

# Train one model per region
for region in train["region"].unique():
    
    print(f"Training model for region {region}")

    # Select train and test data corresponding to the current region
    X_train = train[train["region"] == region].copy().set_index("surveyId")
    nb_mean_species = int(np.round(X_train["speciesId"].apply(lambda x : len(x)).mean(),0))
    print("Train set size :",X_train.shape)
    print("Number of species to predict :",nb_mean_species)

    X_test = test[test["region"] == region].copy().set_index("surveyId")
    X_test.drop("region", axis =1, inplace = True)
    
    #Label encoding
    mlb = MultiLabelBinarizer()

    y_train = pd.DataFrame(mlb.fit_transform(X_train["speciesId"]),
                       columns=mlb.classes_,
                       index=X_train.index)

    X_train.drop(["speciesId", "region"], axis = 1, inplace = True)
    X_test = X_test[X_train.columns]
    
    #Hyperparameters could be optimized depending on the region
    xgb_params = {
        'objective' : 'binary:logistic',
        'eval_metric' : 'logloss',
        'colsample_bytree': 0.8,
        'learning_rate': 0.1,
        'max_depth': 9,
        'n_estimators': 1000,
        'reg_alpha': 0.2,
        'reg_lambda': 0.8,
        'tree_method':'hist',
        'device' : 'gpu:0' #comment this line if not using GPU, or select good GPU
    }

    model = XGBClassifier(**xgb_params)

    %time model.fit(X_train, y_train)  
    
    # Predict probabilities to select top-k species
    proba_predictions = model.predict_proba(X_test)
    classes = mlb.classes_

    top_k_species = pd.DataFrame(data=proba_predictions,
                                  index=X_test.index,
                                  columns=classes).apply(lambda x: pd.Series(x.nlargest(nb_mean_species).index), axis=1)
    

    

Training model for region MEDITERRANEAN
Train set size : (7282, 34)
Number of species to predict : 15
CPU times: total: 11min 24s
Wall time: 19min 24s
Training model for region CONTINENTAL
Train set size : (43128, 34)
Number of species to predict : 17
CPU times: total: 29min 20s
Wall time: 49min 42s
Training model for region ATLANTIC
Train set size : (36335, 34)
Number of species to predict : 13
CPU times: total: 18min 22s
Wall time: 34min 12s
Training model for region ALPINE
Train set size : (1826, 34)
Number of species to predict : 26
CPU times: total: 7min 55s
Wall time: 13min 49s
CPU times: total: 1h 7min 53s
Wall time: 1h 58min 59s


### Result

In [16]:
# Append the predictions currently held in `top_k_species` to `output`
output = pd.concat((output, top_k_species), axis=0)

# Drop the references to the model and the large per-region frames,
# then force a garbage-collection pass
del model, X_train, X_test, y_train, top_k_species, proba_predictions
_ = gc.collect()

In [17]:
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("output", exist_ok=True)
output.to_csv("output/xgb_output.csv")