In [5]:
from collections import Counter
import pandas as pd

In [6]:
# read in files
submission_df = pd.read_csv('output/2024-05-20_1606/submission.csv')
# submission_df = pd.read_csv('output/submission-nearest.csv')
# submission_df = pd.read_csv('output/2024-05-19_2017/submission.csv')
# submission_df = pd.read_csv('output/uploadable_2024-05-13_14-47_imp_nrm.csv')
current_best_df = pd.read_csv('output/current-best-0.34329.csv')


In [7]:
# check correct number of surveyIds and columns
assert submission_df.shape == current_best_df.shape

In [8]:
# check test surveyIds used
assert all(submission_df.surveyId == current_best_df.surveyId)

In [9]:
# check column headings
assert all(submission_df.columns == current_best_df.columns)

In [10]:
# check species is monotonically increasing
for surveyId, row in zip(submission_df.surveyId, submission_df.predictions):
    species = row.strip().split(' ')
    if not all(int(x) < int(y) for x, y in zip(species, species[1:])):
        print(f'Error in surveyId {surveyId}, not all species are monotonically increasing')
        print(f'Row: {row}')


In [11]:
# check species counts
for surveyId, row_pred, row_best in zip(submission_df.surveyId, submission_df.predictions, current_best_df.predictions):
    pred_species = row_pred.split(' ')
    best_species = row_best.split(' ')
    if len(best_species) != len(pred_species):
        print(f'Survey {surveyId} has a different number of species')
        print(len(pred_species), "pred", row_pred)
        print(len(best_species), "best", row_best, "\n")

Survey 642 has a different number of species
19 pred 254 540 1964 2885 3722 4397 4499 6310 6962 8208 8431 8818 9707 9816 10073 10317 10600 11140 11195
25 best 791 843 963 1910 1964 2025 2885 3177 3958 4483 5114 6788 6962 7760 7942 7999 8151 8807 9647 9669 10073 10600 10684 11195  

Survey 1792 has a different number of species
19 pred 254 540 1964 2885 3722 4397 4499 6310 6962 8208 8431 8818 9707 9816 10073 10317 10600 11140 11195
25 best 262 351 462 838 1092 1254 2492 3419 3451 4258 4755 5301 5483 5543 5557 6067 6603 6611 7493 7571 7588 7906 9555 10904  

Survey 3256 has a different number of species
16 pred 254 540 1964 2885 3722 4397 4499 6310 6962 8431 9816 10073 10317 10600 11140 11195
22 best 96 249 423 843 1018 2025 2644 3722 4109 4492 4638 5293 6497 7706 8428 8746 8747 8818 9024 10255 10600  

Survey 3855 has a different number of species
15 pred 254 540 1964 2885 3722 4397 4499 6310 8431 9816 10073 10317 10600 11140 11195
21 best 53 651 1162 2184 2587 2762 2799 3211 4109 4862 

In [12]:
# check for similarity to current best
row_similarity = []
for row_pred, row_best in zip(submission_df.predictions, current_best_df.predictions):
    pred = row_pred.split(' ')
    best = row_best.split(' ')
    row_similarity.append(len(set(pred) & set(best))/len(best))
print(f"Average similarity to current best: {sum(row_similarity)/len(row_similarity):.2%}")

Average similarity to current best: 16.42%
