In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors


In [2]:
train_df = pd.read_csv('data-pipeline/_raw/GLC24_PA_metadata_train.csv')
test_df = pd.read_csv('data-pipeline/_raw/GLC24_PA_metadata_test.csv')

In [3]:
test_df.shape

(4716, 8)

In [4]:
test_df.surveyId.value_counts().sort_values(ascending=False)

surveyId
642        1
32525      1
4889       1
5884       1
6955       1
          ..
3917793    1
3918865    1
3914890    1
3908927    1
3919234    1
Name: count, Length: 4716, dtype: int64

In [5]:
test_df = test_df.drop(columns=['year', 'geoUncertaintyInM', 'areaInM2', 'region', 'country'])
train_df = train_df.drop(columns=['year', 'geoUncertaintyInM', 'areaInM2', 'region', 'country'])

In [6]:
train_df.speciesId = train_df.speciesId.astype(int)

In [7]:
train_df = pd.concat(
    [train_df, pd.get_dummies(train_df['speciesId'], prefix='speciesId')],
    axis=1,
)

In [8]:
train_df = train_df.drop(columns=['speciesId'])

In [9]:
train_df = train_df.groupby('surveyId').max().reset_index()

In [10]:
train_df = train_df.drop(columns=['surveyId'])

In [11]:
train_df.head()

Unnamed: 0,lon,lat,speciesId_2,speciesId_3,speciesId_4,speciesId_7,speciesId_9,speciesId_10,speciesId_12,speciesId_14,...,speciesId_11239,speciesId_11240,speciesId_11241,speciesId_11242,speciesId_11243,speciesId_11245,speciesId_11246,speciesId_11248,speciesId_11253,speciesId_11254
0,3.099038,43.134956,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,9.88456,56.91214,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,8.25602,55.63705,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,-0.40259,43.50563,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,-0.51736,45.80643,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
neigh = NearestNeighbors(n_neighbors=1)
neigh.fit(train_df[['lon', 'lat']])

In [13]:
test_df['nearest'] = neigh.kneighbors(test_df[['lon', 'lat']], 1, return_distance=False)

In [14]:
test_df.head()

Unnamed: 0,lon,lat,surveyId,nearest
0,10.03355,57.12081,642,51492
1,7.333,46.22997,1792,26503
2,1.843658,42.58006,3256,80814
3,11.72009,46.26149,3855,259
4,9.36187,55.90245,4889,21119


In [15]:
test_df.shape

(4716, 4)

In [16]:
train_df.loc[train_df.index.isin([51492, 26503, 80814,259,21119])]

Unnamed: 0,lon,lat,speciesId_2,speciesId_3,speciesId_4,speciesId_7,speciesId_9,speciesId_10,speciesId_12,speciesId_14,...,speciesId_11239,speciesId_11240,speciesId_11241,speciesId_11242,speciesId_11243,speciesId_11245,speciesId_11246,speciesId_11248,speciesId_11253,speciesId_11254
259,11.740398,46.260273,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
21119,9.37044,55.90973,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
26503,7.43762,46.16471,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
51492,10.0717,57.14022,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
80814,1.8525,42.64089,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
combined_test_df = pd.merge(test_df, train_df, left_on='nearest', right_index=True, how='left')

In [18]:
combined_test_df.head()

Unnamed: 0,lon_x,lat_x,surveyId,nearest,lon_y,lat_y,speciesId_2,speciesId_3,speciesId_4,speciesId_7,...,speciesId_11239,speciesId_11240,speciesId_11241,speciesId_11242,speciesId_11243,speciesId_11245,speciesId_11246,speciesId_11248,speciesId_11253,speciesId_11254
0,10.03355,57.12081,642,51492,10.0717,57.14022,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,7.333,46.22997,1792,26503,7.43762,46.16471,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1.843658,42.58006,3256,80814,1.8525,42.64089,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,11.72009,46.26149,3855,259,11.740398,46.260273,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,9.36187,55.90245,4889,21119,9.37044,55.90973,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
combined_test_df.shape

(4716, 5022)

In [20]:
combined_test_df.surveyId.value_counts().sort_values(ascending=False)

surveyId
642        1
32525      1
4889       1
5884       1
6955       1
          ..
3917793    1
3918865    1
3914890    1
3908927    1
3919234    1
Name: count, Length: 4716, dtype: int64

In [21]:
combined_test_df = combined_test_df.drop(columns=['lon_x','lat_x','nearest','lon_y','lat_y'])

In [22]:
combined_test_df.sort_values(by='surveyId').head()

Unnamed: 0,surveyId,speciesId_2,speciesId_3,speciesId_4,speciesId_7,speciesId_9,speciesId_10,speciesId_12,speciesId_14,speciesId_15,...,speciesId_11239,speciesId_11240,speciesId_11241,speciesId_11242,speciesId_11243,speciesId_11245,speciesId_11246,speciesId_11248,speciesId_11253,speciesId_11254
0,642,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1792,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3256,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3855,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,4889,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
from helper import verify_submission

In [28]:
species_columns = ["survey"] + list(
    combined_test_df.columns[combined_test_df.columns.str.startswith("speciesId_")]
)

species = [column[10:] for column in species_columns[1:]]

with open("submissions/submission-nearest.csv", "w") as f:
    f.write("surveyId,predictions\n")
    for _, row in tqdm(combined_test_df.sort_values("surveyId").iterrows()):
        f.write(
            str(row[0])
            + ","
            + " ".join([species[i - 1] for i in range(1, len(row)) if row[i] == 1])
            + "\n"
        )

  str(row[0])
  + " ".join([species[i - 1] for i in range(1, len(row)) if row[i] == 1])
4716it [01:13, 64.59it/s]


In [27]:
verify_submission("submission-nearest.csv")

True

In [32]:
count_species = 0
with open("submission-nearest.csv", "r") as f:
    for line in f:
        count_species += len(line.split(",")[1].split())
mean_species = count_species / 4716
print(f"Mean number of species per survey: {mean_species:.2f}")

Mean number of species per survey: 16.87
