# Different Naive Approaches
 All the approaches are based on the provided observation metadata and does not need any training.

## Loading Metadata
First we load all the metadata and import some very basic libraries.

In [1]:
import pandas as pd
from tqdm import tqdm
import os
tqdm.pandas()
os.chdir("../../")
# Load train and test metadata
po_train = pd.read_csv('Dataset/geolifeclef-2024/GLC24_P0_metadata_train.csv', dtype={'speciesId':int})
pa_train = pd.read_csv('Dataset/geolifeclef-2024/GLC24_PA_metadata_train.csv', dtype={'speciesId':int})
test = pd.read_csv('Dataset/geolifeclef-2024/GLC24_PA_metadata_test.csv',  dtype={'speciesId':int})

#  Upscale GPS to higher-level regions.
Location plays a crutial role in the prediction. Therefore, I will use [reverse_geocoder](https://github.com/thampiman/reverse-geocoder) to upscale the GPS to higher ranks such as district and county.

In [2]:
!pip install reverse_geocoder



In [2]:
from joblib import Parallel, delayed
import multiprocessing
import reverse_geocoder

n_jobs = multiprocessing.cpu_count()

def upscale_gps(survey):

    return dict(**reverse_geocoder.search((survey.lat, survey.lon), mode=1)[0], **{'surveyId': survey.surveyId})
    

def update_metadata():
    pass

### Update PA Train Surveys

In [3]:
pa_train_surveys = pa_train.drop_duplicates("surveyId").reset_index(drop=True)

pa_location_data = Parallel(n_jobs=n_jobs)(
            delayed(upscale_gps)(survey)
            for _, survey in tqdm(pa_train_surveys[["lon", "lat", "surveyId"]].iterrows(), total=len(pa_train_surveys))
        )

pa_location_data = pd.DataFrame(pa_location_data).rename(
    columns={"location_lvl3":"location_name",
             "cc": "countryCode",
             "admin1":"county",
             "admin2": "district"}
)
pa_location_data.head()

  0%|          | 0/88987 [00:00<?, ?it/s]

100%|██████████| 88987/88987 [00:45<00:00, 1946.77it/s]


Unnamed: 0,lat,lon,name,county,district,countryCode,surveyId
0,43.10759,3.08651,Gruissan,Languedoc-Roussillon,Departement de l'Aude,FR,212.0
1,56.88536,9.83839,Stovring,North Denmark,Rebild Kommune,DK,222.0
2,55.6268,8.28757,Oksbol,South Denmark,Varde Kommune,DK,243.0
3,43.4,-0.38333,Sauvagnon,Aquitaine,Departement des Pyrenees-Atlantiques,FR,324.0
4,45.82231,-0.50539,Ecoyeux,Poitou-Charentes,Departement de la Charente-Maritime,FR,333.0


In [4]:
# Lets check the granularity.
print(f"Number of unique countires: {len(pa_location_data.countryCode.unique())}")
print(f"Number of unique counties: {len(pa_location_data.county.unique())}")
print(f"Number of unique districts: {len(pa_location_data.district.unique())}")
print(f"Number of unique names: {len(pa_location_data.name.unique())}")

Number of unique countires: 33
Number of unique counties: 208
Number of unique districts: 772
Number of unique names: 5752


In [5]:
pa_location_data.drop(columns=["lat", "lon"])
pa_train = pd.merge(pa_train, pa_location_data.drop(columns=["lat", "lon"]), on=['surveyId'], how='inner')

### Update Test Surveys

In [6]:
test_location_data = Parallel(n_jobs=n_jobs)(
            delayed(upscale_gps)(survey)
            for _, survey in tqdm(test[["lon", "lat", "surveyId"]].iterrows(), total=len(test))
        )

test_location_data = pd.DataFrame(test_location_data).rename(
    columns={"location_lvl3":"location_name",
             "cc": "countryCode",
             "admin1":"county",
             "admin2": "district"}
)
test_location_data.head()

100%|██████████| 4716/4716 [00:02<00:00, 1981.65it/s]


Unnamed: 0,lat,lon,name,county,district,countryCode,surveyId
0,57.10854,10.02215,Vodskov,North Denmark,Alborg Kommune,DK,642.0
1,46.25115,7.34558,Saviese,Valais,Sion District,CH,1792.0
2,42.54277,1.73361,Pas de la Casa,Encamp,,AD,3256.0
3,46.15667,11.72998,Canal San Bovo,Trentino-Alto Adige,Provincia di Trento,IT,3855.0
4,55.83977,9.24998,Give,South Denmark,Vejle Kommune,DK,4889.0


In [7]:
test_location_data.drop(columns=["lat", "lon"])
test = pd.merge(test, test_location_data.drop(columns=["lat", "lon"]), on=['surveyId'], how='inner')

#  First Approach Top25 PA species **[0.11505]**

In [8]:
test['predictions'] = ' '.join([str(species) for species in pa_train.speciesId.value_counts().nlargest(25).index])

output = test[['surveyId','predictions']]
output.to_csv('research/Baseline_experiments/outputs/naive-approach/output-top25-pa.csv', index=False)
output

Unnamed: 0,surveyId,predictions
0,642,540 4397 254 4499 10317 2885 1964 10600 10073 ...
1,1792,540 4397 254 4499 10317 2885 1964 10600 10073 ...
2,3256,540 4397 254 4499 10317 2885 1964 10600 10073 ...
3,3855,540 4397 254 4499 10317 2885 1964 10600 10073 ...
4,4889,540 4397 254 4499 10317 2885 1964 10600 10073 ...
...,...,...
4711,3915838,540 4397 254 4499 10317 2885 1964 10600 10073 ...
4712,3916502,540 4397 254 4499 10317 2885 1964 10600 10073 ...
4713,3917793,540 4397 254 4499 10317 2885 1964 10600 10073 ...
4714,3918865,540 4397 254 4499 10317 2885 1964 10600 10073 ...


# Second Approach: Top-25 species per Country (PA) [**0.15677**]

In [9]:
!pip install -U ipywidgets swifter



In [10]:
import swifter
tqdm.pandas()

def get_topk_pa_species(country, k=25):
    query = list(pa_train.loc[(pa_train.country == country), "speciesId"].value_counts().nlargest(k).index)
    query.sort()
    return " ".join([str(c) for c in query]) if len(query) > 0 else "0"

test["predictions"] = test.swifter.apply(lambda test_observation: get_topk_pa_species(test_observation["country"]), axis=1)
output = test[["surveyId","predictions"]]

output.to_csv("research/Baseline_experiments/outputs/naive-approach/output-pa-country-top25.csv", index=False)
output

Pandas Apply:   0%|          | 0/4716 [00:00<?, ?it/s]

Unnamed: 0,surveyId,predictions
0,642,254 540 581 1964 2885 4397 4499 4638 6310 6491...
1,1792,262 838 1015 1018 1092 1254 1497 3451 3639 374...
2,3256,146 963 981 1888 2474 4498 4609 4871 5071 5412...
3,3855,963 1677 1736 2386 2474 2715 3125 3166 4659 54...
4,4889,254 540 581 1964 2885 4397 4499 4638 6310 6491...
...,...,...
4711,3915838,254 540 1539 2025 2885 3722 4397 4499 5386 607...
4712,3916502,254 540 1539 2025 2885 3722 4397 4499 5386 607...
4713,3917793,423 1018 1254 1497 1818 3123 3722 4748 4758 53...
4714,3918865,254 540 581 1964 2885 4397 4499 4638 6310 6491...


# Third Approach: Top-25 species per district (PA) [**0.20001**]

In [11]:
import swifter
tqdm.pandas()

def get_topk_pa_species(district, k=25):
    query = list(pa_train.loc[(pa_train.district == district), "speciesId"].value_counts().nlargest(k).index)
    query.sort()
    return " ".join([str(c) for c in query]) if len(query) > 0 else "0"

test["predictions"] = test.swifter.apply(lambda test_observation: get_topk_pa_species(test_observation["district"]), axis=1)
output = test[["surveyId","predictions"]]

output.to_csv("research/Baseline_experiments/outputs/naive-approach/output-pa-district-top25.csv", index=False)
output

Pandas Apply:   0%|          | 0/4716 [00:00<?, ?it/s]

Unnamed: 0,surveyId,predictions
0,642,254 340 540 581 1964 2885 4397 4499 4638 6310 ...
1,1792,0
2,3256,423 559 694 963 1015 1086 1497 1818 2753 2891 ...
3,3855,53 423 963 1162 2184 2474 2799 4109 4112 4734 ...
4,4889,254 540 581 843 958 963 1910 1964 2025 2142 28...
...,...,...
4711,3915838,254 540 581 1170 2025 2885 3361 4483 4492 4499...
4712,3916502,340 540 1545 1716 2025 2761 2885 2922 3226 385...
4713,3917793,1092 1139 1254 1851 2747 3043 5146 5412 6208 6...
4714,3918865,53 254 791 843 963 1964 2025 2823 3294 5114 51...


# Fourth Approach: Top-25 species per district & biogeographical zones (PA) [**0.20567**]

In [12]:
def get_topk_pa_species(district, region, k=25):
    query = list(pa_train.loc[(pa_train.region == region) & (pa_train.district == district), "speciesId"].value_counts().nlargest(k).index)
    query.sort()
    return " ".join([str(c) for c in query]) if len(query) > 0 else "0"

test["predictions"] = test.swifter.apply(lambda test_observation: get_topk_pa_species(test_observation["district"], test_observation["region"]), axis=1)
output = test[["surveyId","predictions"]]

output.to_csv("research/Baseline_experiments/outputs/naive-approach/output-pa-district+region.csv", index=False)
output

Pandas Apply:   0%|          | 0/4716 [00:00<?, ?it/s]

Unnamed: 0,surveyId,predictions
0,642,254 340 540 581 1964 2885 4397 4499 4638 6310 ...
1,1792,0
2,3256,53 544 791 963 1677 2002 3661 4946 5022 5071 5...
3,3855,53 423 963 1162 2184 2474 2799 4109 4112 4734 ...
4,4889,254 540 958 963 976 1910 1964 2025 2142 2398 2...
...,...,...
4711,3915838,254 540 581 1170 2025 2885 3361 4483 4492 4499...
4712,3916502,340 540 1545 1716 2025 2761 2885 2922 3226 385...
4713,3917793,963 1092 1139 1254 1851 2747 3043 5412 6208 66...
4714,3918865,53 254 791 843 963 1964 2025 2823 3294 5114 51...
