# 0. Importing the libraries needed for the dataset preparation task

In [1]:
import os
import pathlib
import shutil
import pandas as pd
import random
import requests
import time
import os
from tqdm import tqdm
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import json

#this is the library 
try:
    from scripts import model_setup
    from scripts import utilities
except ImportError:
    !git clone https://github.com/thomaspierantozzi/PyTorch_Course_by_Daniel_Bourke
    !mv PyTorch_Course_by_Daniel_Bourke/05_PyTorch_Food101/scripts/ .
    !rm -rf ./PyTorch_Course_by_Daniel_Bourke
    from scripts import model_setup
    from scripts import utilities



####Here under we create an ImageFolder folder, and we base its structure on the DataSetFolder from Torchvision documentation: [here](https://docs.pytorch.org/vision/main/generated/torchvision.datasets.DatasetFolder.html)

![Folder structure](https://github.com/thomaspierantozzi/Project_SARA/raw/main/Notebook_pictures/DataSetFolder_Structure.png)

In [28]:
#CONSTANTS AND GENERAL CONFIGURATION FOR THE SCRIPT
IMAGEFOLDER = pathlib.Path('./ImageFolder') #a folder which will be used as a base for the DataSet at training time
DATASET_TRAIN_FOLDER = IMAGEFOLDER / 'train'
DATASET_TEST_FOLDER = IMAGEFOLDER / 'cv'
ENG_ITA_NAMES = './eng_ita_scientific_names.json'

if not IMAGEFOLDER.exists():
    os.mkdir(IMAGEFOLDER)
    os.makedirs(DATASET_TRAIN_FOLDER, exist_ok=True)
    os.makedirs(DATASET_TEST_FOLDER, exist_ok=True)
    print('Created the base folders for the ImageFolder requested tree...')

Created the base folders for the ImageFolder requested tree...


# 1. BirdSnap Dataset
<img src='https://github.com/thomaspierantozzi/Project_SARA/raw/main/Notebook_pictures/Dataset_Flusso.png' height=400>

### 1.a Sorting the dataset content
#### HuggingFace holds a repo of the dataset called Birdsnap, which can be a great resource for our goal. 
#### We will use this repo [here](https://huggingface.co/datasets/HuggingFaceM4/Birdsnap/tree/main) by SaulLu to download the data later on
#### Anyway a good resource to start collecting the species in the dataset is this file [here](https://huggingface.co/datasets/sasha/birdsnap/blob/main/species.txt) found in [this](https://huggingface.co/datasets/sasha/birdsnap) repo

In [3]:
BIRDSNAP_DATASET_PATH = pathlib.Path('./Datasets/BirdSnap')

birdsnap_species = {} #initializing a dict to hold the common and scientific names of the Birdsnap classes

list_of_lowercase = [
    'Necked',
    'Winged',
    'Breasted',
    'Legged',
    'Tailed',
    'Fronted',
    'Crowned',
    'Bellied',
    'Billed',
    'Throated',
    'Backed',
    'Rumped',
    'Eared',
    'Cheeked',
    'Eyed'
]

counter = 0
with open(f'{BIRDSNAP_DATASET_PATH}/species.txt', 'r') as images_text_file:
    for line in images_text_file.readlines():
        words = line.split(sep='\t')
        common_name_formatted = words[1].title()
        common_name_formatted = common_name_formatted.replace("\'S", "s").replace(' ', '_').replace('-', '_')
        for word_to_lowercase in list_of_lowercase:
            if word_to_lowercase in common_name_formatted:
                print(common_name_formatted, end=' -> ')
                common_name_formatted = common_name_formatted.replace(word_to_lowercase, word_to_lowercase.lower())
                print(common_name_formatted)
            
#        common_name_formatted = common_name_formatted.lower()
        birdsnap_species[words[2]] = common_name_formatted

birdsnap_species.pop('scientific') #to remove the data collected in the header of the csv

print(f'The system found {len(birdsnap_species)} birdsnap classes')

White_Tailed_Hawk -> White_tailed_Hawk
Zone_Tailed_Hawk -> Zone_tailed_Hawk
Red_Tailed_Hawk -> Red_tailed_Hawk
Rough_Legged_Hawk -> Rough_legged_Hawk
Broad_Winged_Hawk -> Broad_winged_Hawk
Swallow_Tailed_Kite -> Swallow_tailed_Kite
White_Tailed_Kite -> White_tailed_Kite
Green_Winged_Teal -> Green_winged_Teal
Blue_Winged_Teal -> Blue_winged_Teal
Ring_Necked_Duck -> Ring_necked_Duck
Long_Tailed_Duck -> Long_tailed_Duck
White_Winged_Scoter -> White_winged_Scoter
Red_Breasted_Merganser -> Red_breasted_Merganser
Greater_White_Fronted_Goose -> Greater_White_fronted_Goose
White_Throated_Swift -> White_throated_Swift
Yellow_Crowned_Night_Heron -> Yellow_crowned_Night_Heron
Black_Crowned_Night_Heron -> Black_crowned_Night_Heron
Rose_Breasted_Grosbeak -> Rose_breasted_Grosbeak
Black_Bellied_Plover -> Black_bellied_Plover
Band_Tailed_Pigeon -> Band_tailed_Pigeon
White_Winged_Dove -> White_winged_Dove
Black_Billed_Magpie -> Black_billed_Magpie
Yellow_Billed_Magpie -> Yellow_billed_Magpie
Groove_Bi

In [4]:
birdsnap_species

{'Accipiter cooperii': 'Coopers_Hawk',
 'Accipiter gentilis': 'Northern_Goshawk',
 'Accipiter striatus': 'Sharp_Shinned_Hawk',
 'Aquila chrysaetos': 'Golden_Eagle',
 'Buteo albicaudatus': 'White_tailed_Hawk',
 'Buteo albonotatus': 'Zone_tailed_Hawk',
 'Buteo jamaicensis': 'Red_tailed_Hawk',
 'Buteo lagopus': 'Rough_legged_Hawk',
 'Buteo lineatus': 'Red_Shouldered_Hawk',
 'Buteo platypterus': 'Broad_winged_Hawk',
 'Buteo regalis': 'Ferruginous_Hawk',
 'Buteo swainsoni': 'Swainsons_Hawk',
 'Buteogallus anthracinus': 'Common_Black_Hawk',
 'Circus cyaneus': 'Northern_Harrier',
 'Elanoides forficatus': 'Swallow_tailed_Kite',
 'Elanus leucurus': 'White_tailed_Kite',
 'Haliaeetus leucocephalus': 'Bald_Eagle',
 'Ictinia mississippiensis': 'Mississippi_Kite',
 'Parabuteo unicinctus': 'Harriss_Hawk',
 'Rostrhamus sociabilis': 'Snail_Kite',
 'Psaltriparus minimus': 'Bushtit',
 'Eremophila alpestris': 'Horned_Lark',
 'Megaceryle alcyon': 'Belted_Kingfisher',
 'Cepphus columba': 'Pigeon_Guillemot',

## 1.b - Most commonly spottable birds in Italy
#### Now let's define a list of most commonly spotted birds in Italy, since the final app is meant to be use by Italian birdwatchers.
#### The list is scraped from the Wikipedia [link](https://en.wikipedia.org/wiki/List_of_birds_of_Italy)

In [5]:
def create_italian_birds_list():
    
    #if a file has been already written from the scrape function in the past, then the function collects it back and return its content
    if pathlib.Path('./birds_of_italy.txt').exists():
        print(f'The list already exists in memory...retrieving {pathlib.Path('./birds_of_italy.txt')}')
        with open('./birds_of_italy.txt', 'r') as f:
            return f.read().splitlines()
            
    #extracting a list of birds most likely to be spotted in Italy from wikipedia
    import requests
    from bs4 import BeautifulSoup

    url = "https://en.wikipedia.org/wiki/List_of_birds_of_Italy" #the web page from witch we will basically scrap the info
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    birds = []

    for bird in soup.select("div div div main div div div div ul li i"):
        bird_name = bird.get_text(strip=True)
        birds.append(bird_name)

    #adding the birds of the orders represented by fewer species, which are formatted differently in the webpage
    for bird in soup.select("div div div main div div div ul li i"):
        bird_name = bird.get_text(strip=True)
        birds.append(bird_name)

    # Save to file
    with open("birds_of_italy.txt", "w", encoding="utf-8") as f:
        for bird in sorted(set(birds)):
            f.write(bird + "\n")
        print(f'A list of the italian birds has been exported in: {os.getcwd()}/birds_of_italy.txt')
    
    print('The variable now holds a list of the scientific names for the birds most likely to spot in Italy...')
    return birds

In [6]:
italian_birds_list = create_italian_birds_list() #this will be a list loaded in memory with all the italian birds. A *.txt file is available as well.
print(f'Found a list of {len(italian_birds_list)} italian birds')
print('First 10 samples:')
print('\t', italian_birds_list[:10], '....')


The list already exists in memory...retrieving birds_of_italy.txt
Found a list of 559 italian birds
First 10 samples:
	 ['Acanthis cabaret', 'Acanthis flammea', 'Accipiter brevipes', 'Accipiter gentilis', 'Accipiter nisus', 'Acrocephalus agricola', 'Acrocephalus arundinaceus', 'Acrocephalus dumetorum', 'Acrocephalus melanopogon', 'Acrocephalus paludicola'] ....


## 2. Matching the Birsdnap content with the 'Italian' birds list
#### Now we have to match what we have in our dataset and whose scientific name is available, with the list of the most common birds of Italy. First off, let's push the list of the scientific names just collected, into our dataframe

In [7]:
birds_for_training = [bird for bird in birdsnap_species if bird.replace('_', ' ').lower() in [bird.lower() for bird in italian_birds_list]]
print(f'Found {len(birds_for_training)} classes for training which are both in the \'italian birds\' and in the \'Birdsnap\' dataset...')

Found 101 classes for training which are both in the 'italian birds' and in the 'Birdsnap' dataset...


#### The result is quite disappointing. Only 101 birds match the fact of being in the Birdsnap and being commonly spottable in Italy.

## 2. Downloading the dataset content for italian species

In [8]:
birds_for_training[0]

'Accipiter gentilis'

In [9]:
birdsnap_species[birds_for_training[0]]

'Northern_Goshawk'

In [11]:
#let's import hugging face hub. a library to manage hugging_face repos
birdsnap_download_path = BIRDSNAP_DATASET_PATH / 'download'

if not os.listdir(birdsnap_download_path / 'images'): #the path where the original .tar files are download from the remote repo. If that's empty then the download starts
    from huggingface_hub import hf_hub_download

    downloaded_species = [] #local paths of files

    for class_for_training in birds_for_training:
        try:
            class_for_training = birdsnap_species[class_for_training]
            local_path = hf_hub_download(
                repo_id='HuggingFaceM4/Birdsnap',
                filename=f'images/{class_for_training}.tar',
                repo_type='dataset',
                local_dir=birdsnap_download_path,
            )
            downloaded_species.append(local_path)
        except Exception as exception:
            print(f'An exception occurred: {exception.__class__} for {class_for_training}')
    print(f'Downloaded {len(downloaded_species)} images')
else:
    #if the ./Birdsnap/download/images contains folders (which should be species) then it means that the download must not start and that we can retrieve a list of species directly from the folder structure
    downloaded_species = os.listdir(birdsnap_download_path / 'images')

In [12]:
import tarfile

for dirpath, dirnames, filenames in os.walk(birdsnap_download_path / 'images'):
    for tar_file in filenames:
        with tarfile.open(os.path.join(dirpath, tar_file)) as tar_file_opened:
            tar_file_opened.extractall(path=birdsnap_download_path / 'images')
        os.remove(os.path.join(dirpath, tar_file))

ReadError: file could not be opened successfully:
- method gz: ReadError('not a gzip file')
- method bz2: ReadError('not a bzip2 file')
- method xz: ReadError('not an lzma file')
- method tar: ReadError('bad checksum')

#### Once the folders are downloaded from the remote repo we must translate the names of the local folders to Italian, this way the labels used for the training (and then for the inference step) will be in Italian

In [25]:
#opens the database of species which gives the italian and scientific name of a bird identified by its english common name
with open(ENG_ITA_NAMES, 'r') as names_dict:
    eng_ita_names = json.load(names_dict)

for index, folder_name in enumerate(downloaded_species):
    try:
        ita_name = eng_ita_names[folder_name]['italian']
        print(f'{index} | {folder_name} ---> {ita_name}')
        if os.path.isdir(birdsnap_download_path / 'images' / folder_name):
            os.rename(birdsnap_download_path / 'images' / folder_name, birdsnap_download_path / 'images' / ita_name)
    except KeyError:
        print(f'No italian birds found for {folder_name}')
        continue

0 | Pectoral_Sandpiper ---> Piro pettorale
1 | Pacific_Golden_Plover ---> Piviere dorato del Pacifico
2 | Bairds_Sandpiper ---> Gambecchio di Baird
3 | Herring_Gull ---> Gabbiano reale americano
4 | Greater_Scaup ---> Moretta grigia
5 | Pine_Grosbeak ---> Beccogrosso di pini
6 | Short_eared_Owl ---> Allocco delle paludi
7 | Black_Tern ---> Sterna nera
8 | Barn_Swallow ---> Rondine comune
9 | Lesser_Yellowlegs ---> Piro pettegola minore
10 | Wilsons_Phalarope ---> Falco di Wilson
11 | Franklins_Gull ---> Gabbiano di Franklin
12 | Mute_Swan ---> Cigno reale
13 | Merlin ---> Smeriglio
14 | Ring_necked_Pheasant ---> Fagiano di monte
15 | Laughing_Gull ---> Gabbiano ridens
16 | Harlequin_Duck ---> Anatra arlecchino
17 | White_winged_Scoter ---> Moretta ali bianche
18 | Long_tailed_Duck ---> Moretta codona
19 | Green_winged_Teal ---> Alzavola americana
20 | Greater_White_fronted_Goose ---> Oca lombardella
21 | Hermit_Thrush ---> Tordo eremita
22 | Bohemian_Waxwing ---> Beccofrusone boreale
2

In [29]:
utilities.create_train_cv_from_folder(
    root=BIRDSNAP_DATASET_PATH/'download/images',
    train_cv_perc=0.9,
    train_folder=DATASET_TRAIN_FOLDER,
    cv_folder=DATASET_TEST_FOLDER,
    
)

Working on images | Moving 0 pictures
Working on Moretta americana | Moving 89 pictures
Working on Piro piccolo | Moving 97 pictures
Working on Astore | Moving 91 pictures
Working on Aquila reale | Moving 88 pictures
Working on Smeriglio | Moving 94 pictures
Working on Cuculo beccogiallo | Moving 89 pictures
Working on Gabbiano tridattilo | Moving 77 pictures
Working on Gabbiano reale nordico | Moving 69 pictures
Working on Airone bianco maggiore | Moving 89 pictures
Working on Passero domestico | Moving 87 pictures
Working on Sterna reale | Moving 73 pictures
Working on Piviere panciabianca | Moving 90 pictures
Working on Albanella codabianca | Moving 89 pictures
Working on Tordo eremita | Moving 94 pictures
Working on Piovanello viola | Moving 86 pictures
Working on Gabbiano glauco | Moving 85 pictures
Working on Averla maggiore americana | Moving 83 pictures
Working on Cigno siberiano | Moving 67 pictures
Working on Strolaga minore | Moving 84 pictures
Working on Crociere ali bianch