### Make a correspondence between the scraped French bird data and the species in the database

In [16]:
import numpy as np
import pandas as pd
from nltk import edit_distance

In [17]:
def snakify_bird_name(name: str) -> str:
    """
    Convert a bird name to snake case format.

    Steps applied, in order:
    - strip leading and trailing whitespace
    - lower-case every character
    - turn each " " and "-" into "_"
    - drop every "'"
    """
    # One translation table handles the three character substitutions at once.
    char_map = str.maketrans({" ": "_", "-": "_", "'": None})
    cleaned = name.strip().lower()
    return cleaned.translate(char_map)

In [18]:
def find_closest_bird(bird_name:str, bird_lst:[str, ...]) -> (int, [str, ...]):
    """
    Find the string(s) of bird_lst closest to bird_name.

    The distance is the Levenshtein distance computed by nltk's
    ``edit_distance`` with its default settings.

    Parameters:
        bird_name: the name to look up.
        bird_lst: candidate names (iterated once, in order).

    Returns:
        min_dist (int): the smallest distance found.
        min_birds ([str, ...]): every candidate at that distance, in the
            order they appear in bird_lst.

    When bird_lst is empty, returns (len(bird_name), []).
    """
    name_len = len(bird_name)
    # No upper bound is assumed for the best distance: the first candidate
    # always becomes the current best. (Starting from len(bird_name) would
    # silently drop candidates that are farther than that, returning an
    # empty list even though candidates exist.)
    min_dist = None
    min_birds = []
    for other_bird in bird_lst:
        # The Levenshtein distance is at least the difference in length, so
        # a candidate whose length differs by more than the current best
        # cannot win or tie: skip the costly distance computation.
        if min_dist is not None and abs(len(other_bird) - name_len) > min_dist:
            continue
        name_dist = edit_distance(bird_name, other_bird)
        if min_dist is None or name_dist < min_dist:
            min_dist = name_dist
            min_birds = [other_bird]
        elif name_dist == min_dist:
            min_birds.append(other_bird)

    if min_dist is None:
        # No candidate at all: keep the historical return value.
        return name_len, []
    return min_dist, min_birds

### load databases

In [19]:
# Input files: the world image database and the scraped French bird list
database_file = "all_DIB_image_data.csv"
scrapped_data_file = "../../src/french_bird_wiki.csv"

# Output files (written further down in this notebook)
merge_data_file = "../../french_birds_metadata.csv"
missing_birds_data_file = "../../missing_french_birds_data.csv"

# Both files have a header row; the scraped file is ';'-separated
database_df = pd.read_csv(database_file, header=0)
french_bird_df = pd.read_csv(scrapped_data_file, sep=";", header=0)

### world birds database preprocessing

In [20]:
# There are exactly 70 NaN values, one for each file. The world metadata
# gathering needs to be revisited to check on them; until then, drop every
# row that contains at least one missing value.
database_df = database_df.dropna(axis=0, how="any")


### french bird data preprocessing

In [21]:
# Add snake-case versions of the English (Nom_EN) and Latin (Nom_LT) names
french_bird_df["english_name_snake_case"] = french_bird_df["Nom_EN"].map(snakify_bird_name)
french_bird_df["latin_name_snake_case"] = french_bird_df["Nom_LT"].map(snakify_bird_name)


### Species list preparation

In [22]:
# Plain Python list of the bird names present in the world database
database_bird_lst = database_df["bird_name"].tolist()

### merge dataset and save

In [23]:
# Inner join: keep the French birds whose snake-case English name equals a
# `bird_name` of the world database.
merge_df = french_bird_df.merge(
    database_df,
    how="inner",
    left_on="english_name_snake_case",
    right_on="bird_name",
)
merge_df.head()

# Write the merged table as a ';'-separated CSV with a header and no index
merge_df.to_csv(merge_data_file, index=False, header=True, sep=";")

### Missing birds analysis

In [24]:
# Sanity check that there is no duplicated bird: for each French bird name,
# count how many entries of the world database carry exactly that name.
french_birds_dict = dict.fromkeys(french_bird_df["english_name_snake_case"], 0)

for bird in database_df["bird_name"]:
    if bird not in french_birds_dict:
        continue
    french_birds_dict[bird] += 1

In [25]:
# One row per French bird name with its number of matches in the database
count_records = [
    {"species": species, "count": n_matches}
    for species, n_matches in french_birds_dict.items()
]
count_df = pd.DataFrame(count_records)
# Distribution of the match counts
count_df["count"].value_counts()

count
1    546
0     76
Name: count, dtype: int64

In [26]:
# French bird names that matched no entry of the world database
missing_birds = count_df.loc[count_df["count"] == 0, "species"].tolist()

In [27]:
# For each unmatched French bird, record the closest world database name(s)
# and the corresponding edit distance.
missing_french_birds = []
for bird in missing_birds:
    min_dist, min_birds = find_closest_bird(bird, database_bird_lst)
    missing_french_birds.append(
        {
            'missing_french_bird_name': bird,
            'min_distance_to_world_bird': min_dist,
            # several equally close names are stored in one ';'-joined cell
            'world_birds_with_min_dist': ';'.join(min_birds),
        }
    )

missing_birds_data_df = pd.DataFrame(missing_french_birds)
missing_birds_data_df.head()


KeyboardInterrupt: 

In [None]:
# Save the closest-match report as a ','-separated CSV (header row, no index)
missing_birds_data_df.to_csv(
    missing_birds_data_file,
    sep=",",
    index=False,
    header=True,
)

In [None]:
# Hand-written name correspondences (work in progress; not used by any cell
# visible here).
# NOTE(review): presumably maps a scraped name to its world database
# spelling — confirm the direction before using it.
name_correspondance_dict = {
    "graylag_goose": "greylag_goose",
    # NOTE(review): this entry maps the name to itself; 'grey_partridge' may
    # have been intended — confirm against the database.
    "gray_partridge": "gray_partridge",
    "marbled_teal": "marbled_duck",
}

In [None]:
# Hand-written alternative names for some birds (work in progress; not used
# by any cell visible here). Every name is written in the snake-case format
# produced by `snakify_bird_name` (no "-", no "'"), otherwise it could never
# match a snakified name.
alternative_names_dict = {
    'brant': ['brent_goose'],
    # NOTE(review): every alternative below is a grouse; the key may have
    # been meant to be 'black_grouse' — confirm against the source data.
    'black_goose': ['northern_black_grouse', 'eurasian_black_grouse', 'blackgame', 'blackcock'],
    'ring_necked_pheasant': ['common_pheasant', 'pheasant'],
    # was 'black-necked_grebe': the hyphen is not valid in a snakified name
    'eared_grebe': ['black_necked_grebe'],
}

In [None]:
# Hand-written list of species names (work in progress; not used by any cell
# visible here).
# NOTE(review): presumably French birds known to be absent from the world
# database — confirm.
absent_species_lst = [
    "mandarin_duck",
    "ruddy_duck",
]