### Match the scraped French bird data to the species in the database

In [54]:
import numpy as np
import pandas as pd
from nltk import edit_distance

In [81]:
def snakify_bird_name(name:str) -> str:
    """
    Convert a bird name to snake_case.

    Leading and trailing whitespace is stripped, the name is lower-cased,
    every space and hyphen becomes an underscore, and apostrophes are dropped.
    """
    # One translation table covers all three single-character substitutions.
    char_map = str.maketrans({" ": "_", "-": "_", "'": None})
    cleaned = name.strip().lower()
    return cleaned.translate(char_map)

In [82]:
# Annotations are quoted so they are never evaluated at definition time.
def find_closest_bird(bird_name: str, bird_lst: "list[str]", distance_fn=None) -> "tuple[int, list[str]]":
    """
    Find the names in bird_lst that are closest to bird_name.

    Args:
        bird_name: name to look up.
        bird_lst: candidate names to compare against.
        distance_fn: optional callable ``(str, str) -> int`` returning the
            distance between two names. Defaults to ``edit_distance``
            (Levenshtein distance).

    Returns:
        ``(min_dist, min_birds)``: the smallest distance found and every
        candidate at that distance, in the order they appear in bird_lst.
        When bird_lst is empty the result is ``(len(bird_name), [])``.
    """
    if distance_fn is None:
        # Resolved at call time so the default follows the module-level import.
        distance_fn = edit_distance

    # Start from "no candidate seen" instead of len(bird_name): a candidate
    # can be farther away than len(bird_name) (e.g. a much longer name), and
    # it must still be reported as the closest one if nothing beats it.
    min_dist = None
    min_birds = []
    for other_bird in bird_lst:
        name_dist = distance_fn(bird_name, other_bird)
        if min_dist is None or name_dist < min_dist:
            min_dist = name_dist
            min_birds = [other_bird]
        elif name_dist == min_dist:
            min_birds.append(other_bird)

    if min_dist is None:
        # No candidates at all: keep the historical return value.
        return len(bird_name), []
    return min_dist, min_birds

### load databases

In [84]:
# Input and output locations
database_file = "all_DIB_image_data.csv"
scrapped_data_file = "../../src/french_bird_wiki.csv"
merge_data_file = "../../french_birds_metadata.csv"

# Species database: default "," separator, first row holds the column names
database_df = pd.read_csv(database_file, header=0)
# Scraped French bird list: ";"-separated, first row holds the column names
french_bird_df = pd.read_csv(scrapped_data_file, sep=";", header=0)

### world birds database preprocessing

In [87]:
# Drop every row that contains at least one missing value.
# Observation: there are exactly 70 NA values, one for each file; the world
# metadata gathering needs to be revisited to check where they come from.
database_df = database_df.dropna(axis=0, how="any")


### French bird data preprocessing

In [88]:
# Normalise the English names ('Nom_EN') to snake_case so they can be compared with the database names
french_bird_df['english_name_snake_case'] = french_bird_df['Nom_EN'].map(snakify_bird_name)

### Species list preparation

In [89]:
# Plain Python list of every 'bird_name' value in the database
database_bird_lst = database_df['bird_name'].tolist()

### merge dataset and save

In [90]:
# Inner join: keep only the French species whose snake_case English name
# is exactly equal to a 'bird_name' in the database.
merge_df = french_bird_df.merge(
    database_df,
    how='inner',
    left_on='english_name_snake_case',
    right_on='bird_name',
)
merge_df.head()

# Save the merged table with the same ";" separator as the scraped input file
merge_df.to_csv(merge_data_file, sep=";", index=False, header=True)

### Missing birds analysis

In [91]:
# Sanity check that no bird is duplicated: for every French species, count
# how many database rows carry exactly the same name (every counter starts at 0).
french_birds_dict = dict.fromkeys(french_bird_df['english_name_snake_case'], 0)

for bird in database_df['bird_name']:
    if bird in french_birds_dict:
        french_birds_dict[bird] += 1

In [92]:
# One row per French species with its number of matches in the database
count_df = pd.DataFrame(
    [dict(species=name, count=n_matches) for name, n_matches in french_birds_dict.items()]
)
# Distribution of the match counts (0 = species not found in the database)
count_df['count'].value_counts()

count
1    546
0     76
Name: count, dtype: int64

In [40]:
# French species with no exact name match in the database
missing_birds = count_df.loc[count_df['count'] == 0, 'species'].tolist()

In [75]:
# For each unmatched species, print its name, the smallest distance found
# and every database name at that distance.
for bird in missing_birds:
    print(bird, *find_closest_bird(bird, database_bird_lst))


graylag_goose 1 ['greylag_goose']
brant 4 ['nene', 'besra', 'kagu', 'sora', 'brolga', 'ruff', 'kea', 'mao', 'brubru', 'rook', 'wrentit', 'omao']
mandarin_duck 4 ['andean_duck']
marbled_teal 4 ['marbled_duck']
ruddy_duck 4 ['ruddy_shelduck', 'wood_duck', 'musk_duck', 'ruddy_crake', 'rudds_lark']
black_grouse 4 ['black_guan', 'sage_grouse', 'black_crake', 'black_phoebe', 'black_oriole', 'black_drongo', 'black_robin', 'black_thrush']
gray_partridge 1 ['grey_partridge']
ring_necked_pheasant 7 ['white_eared_pheasant', 'brown_eared_pheasant', 'blue_eared_pheasant']
eared_grebe 3 ['horned_grebe']
rock_pigeon 3 ['trocaz_pigeon', 'pink_pigeon']
eurasian_nightjar 3 ['european_nightjar']
eurasian_moorhen 4 ['tristan_moorhen', 'eurasian_hoopoe', 'eurasian_wren']
eurasian_thick_knee 4 ['peruvian_thick_knee']
black_bellied_plover 5 ['black_bellied_tern', 'black_bellied_wren', 'black_billed_weaver']
little_ringed_plover 6 ['common_ringed_plover']
eurasian_whimbrel 5 ['eurasian_dotterel', 'eurasian_wr

In [None]:
# Spelling/naming variants: name in the French list -> name used in the database.
# Targets are taken from the closest-match output of the missing-birds analysis.
name_correspondance_dict = {
    'graylag_goose': 'greylag_goose',
    # The database uses the "grey" spelling (closest match at distance 1);
    # mapping the name to itself would leave the species unmatched.
    'gray_partridge': 'grey_partridge',
    'marbled_teal': 'marbled_duck',
}

In [None]:
# Other English names for species missing from the database.
# Keys are the snake_case names from the French list; values are candidate
# names in the same snake_case form (presumably to be looked up in the
# database — confirm against the code that consumes this dict).
alternative_names_dict = {
    'brant': ['brent_goose'],
    # Key is the missing species 'black_grouse' (not 'black_goose'):
    # every alternative listed here is a grouse name.
    'black_grouse': ['northern_black_grouse', 'eurasian_black_grouse', 'blackgame', 'blackcock'],
    'ring_necked_pheasant': ['common_pheasant', 'pheasant'],
    # Underscore instead of hyphen, consistent with snakify_bird_name output.
    'eared_grebe': ['black_necked_grebe'],
}

In [None]:
# Species from the missing-birds list that presumably have no entry in the
# database under any name — TODO confirm.
absent_species_lst = ['mandarin_duck', 'ruddy_duck']