# 0. Importing the libraries needed for the dataset preparation task

In [1]:
import os
import pathlib
import shutil
import pandas as pd
import random
import requests
import time
import os
from tqdm import tqdm
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import json

# project-local helper library (scripts/); it is cloned from GitHub when it is not already importable
try:
    from scripts import model_setup
    from scripts import utilities
except ImportError:
    !git clone https://github.com/thomaspierantozzi/PyTorch_Course_by_Daniel_Bourke
    !mv PyTorch_Course_by_Daniel_Bourke/05_PyTorch_Food101/scripts/ .
    !rm -rf ./PyTorch_Course_by_Daniel_Bourke
    from scripts import model_setup
    from scripts import utilities



#### Below we create an ImageFolder directory, basing its structure on the DatasetFolder layout described in the Torchvision documentation: [here](https://docs.pytorch.org/vision/main/generated/torchvision.datasets.DatasetFolder.html)

![Folder structure](

In [2]:
# CONSTANTS AND GENERAL CONFIGURATION FOR THE SCRIPT
# Root folder which will be used as a base for the DataSet at training time.
IMAGEFOLDER = pathlib.Path('./ImageFolder')
DATASET_TRAIN_FOLDER = IMAGEFOLDER / 'train'
DATASET_TEST_FOLDER = IMAGEFOLDER / 'cv'

# Each sub-folder is checked on its own: testing only IMAGEFOLDER would leave
# 'train'/'cv' missing whenever the root already exists (e.g. after a partial run).
missing_folders = [folder for folder in (DATASET_TRAIN_FOLDER, DATASET_TEST_FOLDER) if not folder.exists()]
for folder in missing_folders:
    # parents=True also creates IMAGEFOLDER itself when needed.
    folder.mkdir(parents=True, exist_ok=True)
if missing_folders:
    print('Created the base folders for the ImageFolder requested tree...')

Created the base folders for the ImageFolder requested tree...
Dataset path exists:
	 /Users/thomaspierantozzi/PycharmProjects/Project_Birds/Datasets

Dataset_ready path exists:
	 /Users/thomaspierantozzi/PycharmProjects/Project_Birds/Datasets_ready


#### Now let's define a list of the birds most commonly spotted in Italy, since the final app is meant to be used by Italian birdwatchers.

In [3]:
def create_italian_birds_list():
    """Scrape the names of the birds of Italy from Wikipedia.

    Downloads the "List of birds of Italy" page, collects the text of every
    ``<i>`` element found inside the bullet lists of the main content, writes
    the sorted unique names to ``birds_of_italy.txt`` in the current working
    directory and returns that same list.

    NOTE(review): the italic entries are presumably the scientific names (that
    is what the final message states) -- confirm against the page layout.

    Returns:
        list[str]: the sorted, de-duplicated names (same content as the file).

    Raises:
        requests.HTTPError: if the page is answered with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # Imported here so that bs4 is only required when the list is rebuilt.
    import requests
    from bs4 import BeautifulSoup

    url = "https://en.wikipedia.org/wiki/List_of_birds_of_Italy"  # the web page the names are scraped from
    response = requests.get(url, timeout=30)  # never hang forever on a stalled connection
    response.raise_for_status()  # fail loudly instead of parsing an error page into an empty list
    soup = BeautifulSoup(response.content, "html.parser")

    # A single selector is enough: the deeper variant used previously
    # ("... main div div div div ul li i") only matches elements this one
    # already matches, so running both appended those names twice.
    selector = "div div div main div div div ul li i"
    scraped_names = (tag.get_text(strip=True) for tag in soup.select(selector))

    # De-duplicate and sort so that the returned list and the exported file agree.
    birds = sorted({name for name in scraped_names if name})

    # Save to file
    with open("birds_of_italy.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{bird}\n" for bird in birds)
    print(f'A list of the italian birds has been exported in: {os.getcwd()}/birds_of_italy.txt')

    print('The variable now holds a list of the scientific names for the birds most likely to spot in Italy...')
    return birds

In [4]:
# In-memory list of the Italian birds. The call scrapes Wikipedia (network access needed)
# and also exports the names to birds_of_italy.txt in the current working directory.
italian_birds_list = create_italian_birds_list()

A list of the italian birds has been exported in: /Users/thomaspierantozzi/PycharmProjects/Project_Birds/birds_of_italy.txt
The variable now holds a list of the scientific names for the birds most likely to spot in Italy...


# 1. BirdSnap Dataset

### 1. Sorting the dataset content
#### The dataset found on Hugging Face provides just a couple of *.txt files, which list only the species and the links needed to collect the 50,000+ pictures. The best way to deal with that is to write a few lines of code with the aid of the multithreading module.

In [5]:
BIRDSNAP_DATASET_PATH = pathlib.Path('./Datasets/BirdSnap')

# Parallel lists: birdsnap_links[i] is the first column and birdsnap_file_name[i]
# the third column of the same row of images.txt.
birdsnap_file_name = []
birdsnap_links = []

with open(BIRDSNAP_DATASET_PATH / 'images.txt', 'r', encoding='utf-8') as images_text_file:
    # The first line only holds the column headers: skip it up front
    # (the default keeps an empty file from raising StopIteration).
    next(images_text_file, None)
    # Iterate lazily instead of loading every line in memory with readlines().
    for line in images_text_file:
        words = line.split()
        if len(words) < 3:
            # Blank or truncated row (e.g. a trailing newline): nothing to collect.
            continue
        birdsnap_links.append(words[0])
        birdsnap_file_name.append(words[2])

print(f'The system found {len(birdsnap_file_name)} birdsnap examples and {len(birdsnap_links)} birdsnap links.')

The system found 49829 birdsnap examples and 49829 birdsnap links.


#### To turn the common names reported in the dataset into scientific names, we need to leverage an API. The documentation can be found [here](https://nuthatch.lastelm.software/swagger.html)

In [6]:
# Configuration for the Nuthatch endpoint, needed to turn the common names of birds into the scientific ones.
# The API key is read from the NUTHATCH_API_KEY environment variable so that the secret
# is never stored in the notebook (a key that was committed in the past should be revoked).
API_KEY = os.environ.get('NUTHATCH_API_KEY')
API_ENDPOINT = 'https://nuthatch.lastelm.software/v2/birds'
COMMON_TO_SCI_NAME_FILE = BIRDSNAP_DATASET_PATH / 'Common_to_sci_name_NUTHATCH.json'

headers = {
    'accept': 'application/json',
    'API-Key': API_KEY,
}

# Unique common names of the species listed in the dataset, in order of first appearance.
# assumes every entry of birdsnap_file_name looks like '<Common_Name>/<picture file>' -- TODO confirm against images.txt
birds_commons_names = list(dict.fromkeys(file_name.split('/')[0] for file_name in birdsnap_file_name))
birds_sci_names = {}  # common name -> scientific name (or '<common name>**' when no match was found)
ok_counter = 0  # counters for the OK and NOK results gathered while downloading
nok_counter = 0

if COMMON_TO_SCI_NAME_FILE.exists():
    # The mapping was already downloaded in a previous run: reuse it.
    with open(COMMON_TO_SCI_NAME_FILE, 'r') as file:
        birds_sci_names = json.load(file)
else:
    if not API_KEY:
        raise RuntimeError('Set the NUTHATCH_API_KEY environment variable to query the Nuthatch API.')

    for name in birds_commons_names:
        response_bird = requests.get(
            API_ENDPOINT,
            headers=headers,
            params={'name': name.replace('_', ' ')},
            timeout=30,  # never hang forever on a stalled connection
        )
        response_bird.raise_for_status()
        response_bird_json = response_bird.json()
        try:
            birds_sci_names[name] = response_bird_json['entities'][0]['sciName']
            ok_counter += 1
        except (IndexError, KeyError):
            # Where a sci_name is not found the common name is marked with ** to highlight that fact.
            birds_sci_names[name] = f'{name}**'
            nok_counter += 1
        # Progress is reported after every request, not only after the failed ones.
        print(f'Results gathered: {ok_counter:0>3} OK / {nok_counter:0>3} NOK - {ok_counter + nok_counter:0>3} / {len(birds_commons_names):0>3}', end='\r')
        time.sleep(0.3)  # small pause between two consecutive requests

    # 'w' (not 'a'): appending a second JSON document to an existing file would make it unreadable.
    with open(COMMON_TO_SCI_NAME_FILE, 'w') as file:
        json.dump(birds_sci_names, file, indent=4)

#### Now we have to match the species in our dataset whose scientific name is available against the list of the most common birds of Italy. First off, let's push the list of the scientific names just collected into our dataframe.

In [8]:
# Lower-cased lookup built once: rebuilding the whole lower-cased list for every
# candidate made the match quadratic, while a set gives constant-time membership tests.
italian_birds_lookup = {bird.lower() for bird in italian_birds_list}

# Keep the scientific names (values of birds_sci_names) that also appear among the Italian birds.
# Names marked with a trailing '**' (no scientific name found) never match.
birds_for_training = [bird for bird in birds_sci_names.values() if bird.replace('_', ' ').lower() in italian_birds_lookup]
print(f'Found {len(birds_for_training)} classes for training which are both in the \'italian birds\' and in the \'Birdsnap\' dataset...')

Found 71 classes for training which are both in the 'italian birds' and in the 'Birdsnap' dataset...


#### The result is quite disappointing. Only 71 birds are both in the Birdsnap dataset and commonly spotted in Italy.

## 2. Downloading the dataset content for italian species

#### It seems that downloading the dataset via Flickr URLs produces lots of invalid files. Let's try to find a repository where the pictures are already available and ready to use.
#### The following one ([here](https://huggingface.co/datasets/HuggingFaceM4/Birdsnap/tree/main)) seems promising

In [9]:
# huggingface_hub is the library used to manage Hugging Face repos
from huggingface_hub import hf_hub_download

# Swap keys and values of birds_sci_names (scientific name -> common name): the loop
# below starts from a scientific name and needs the matching common name.
birds_sci_names_inverted = dict(zip(birds_sci_names.values(), birds_sci_names.keys()))

# birds_for_training holds scientific names, while the archives on Hugging Face are
# labelled with the common name: translate each name before asking for its archive.
for sci_name in birds_for_training:
    class_for_training = birds_sci_names_inverted[sci_name]
    archive_in_repo = f'images/{class_for_training}.tar'
    hf_hub_download(
        repo_id='HuggingFaceM4/Birdsnap',
        filename=archive_in_repo,
        repo_type='dataset',
        local_dir=BIRDSNAP_DATASET_PATH / 'download',
    )

Northern_Goshawk.tar:   0%|          | 0.00/198M [00:00<?, ?B/s]

Golden_Eagle.tar:   0%|          | 0.00/260M [00:00<?, ?B/s]

Horned_Lark.tar:   0%|          | 0.00/159M [00:00<?, ?B/s]

Common_Murre.tar:   0%|          | 0.00/94.5M [00:00<?, ?B/s]

Northern_Pintail.tar:   0%|          | 0.00/264M [00:00<?, ?B/s]

Eurasian_Wigeon.tar:   0%|          | 0.00/93.1M [00:00<?, ?B/s]

Mallard.tar:   0%|          | 0.00/169M [00:00<?, ?B/s]

Gadwall.tar:   0%|          | 0.00/157M [00:00<?, ?B/s]

Greater_Scaup.tar:   0%|          | 0.00/121M [00:00<?, ?B/s]

Common_Goldeneye.tar:   0%|          | 0.00/131M [00:00<?, ?B/s]

Harlequin_Duck.tar:   0%|          | 0.00/129M [00:00<?, ?B/s]

Common_Merganser.tar:   0%|          | 0.00/146M [00:00<?, ?B/s]

Ruddy_Duck.tar:   0%|          | 0.00/161M [00:00<?, ?B/s]

Common_Eider.tar:   0%|          | 0.00/205M [00:00<?, ?B/s]

Brant.tar:   0%|          | 0.00/136M [00:00<?, ?B/s]

Canada_Goose.tar:   0%|          | 0.00/219M [00:00<?, ?B/s]

Snow_Goose.tar:   0%|          | 0.00/79.0M [00:00<?, ?B/s]

Tundra_Swan.tar:   0%|          | 0.00/107M [00:00<?, ?B/s]

Mute_Swan.tar:   0%|          | 0.00/155M [00:00<?, ?B/s]

Great_Egret.tar:   0%|          | 0.00/217M [00:00<?, ?B/s]

Cattle_Egret.tar:   0%|          | 0.00/169M [00:00<?, ?B/s]

Bohemian_Waxwing.tar:   0%|          | 0.00/123M [00:00<?, ?B/s]

Lapland_Longspur.tar:   0%|          | 0.00/160M [00:00<?, ?B/s]

Snow_Bunting.tar:   0%|          | 0.00/118M [00:00<?, ?B/s]

Rock_Pigeon.tar:   0%|          | 0.00/209M [00:00<?, ?B/s]

Common_Raven.tar:   0%|          | 0.00/168M [00:00<?, ?B/s]

Merlin.tar:   0%|          | 0.00/146M [00:00<?, ?B/s]

Peregrine_Falcon.tar:   0%|          | 0.00/65.3M [00:00<?, ?B/s]

Common_Redpoll.tar:   0%|          | 0.00/124M [00:00<?, ?B/s]

Red_Crossbill.tar:   0%|          | 0.00/85.1M [00:00<?, ?B/s]

Pine_Grosbeak.tar:   0%|          | 0.00/233M [00:00<?, ?B/s]

Common_Loon.tar:   0%|          | 0.00/194M [00:00<?, ?B/s]

Barn_Swallow.tar:   0%|          | 0.00/95.7M [00:00<?, ?B/s]

Bobolink.tar:   0%|          | 0.00/129M [00:00<?, ?B/s]

Herring_Gull.tar:   0%|          | 0.00/204M [00:00<?, ?B/s]

Mew_Gull.tar:   0%|          | 0.00/205M [00:00<?, ?B/s]

Iceland_Gull.tar:   0%|          | 0.00/89.8M [00:00<?, ?B/s]

Glaucous_Gull.tar:   0%|          | 0.00/163M [00:00<?, ?B/s]

Laughing_Gull.tar:   0%|          | 0.00/186M [00:00<?, ?B/s]

Black_Tern.tar:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Caspian_Tern.tar:   0%|          | 0.00/106M [00:00<?, ?B/s]

Roseate_Tern.tar:   0%|          | 0.00/84.5M [00:00<?, ?B/s]

Common_Tern.tar:   0%|          | 0.00/98.1M [00:00<?, ?B/s]

Arctic_Tern.tar:   0%|          | 0.00/224M [00:00<?, ?B/s]

Sandwich_Tern.tar:   0%|          | 0.00/115M [00:00<?, ?B/s]

American_Pipit.tar:   0%|          | 0.00/162M [00:00<?, ?B/s]

Northern_Bobwhite.tar:   0%|          | 0.00/101M [00:00<?, ?B/s]

Osprey.tar:   0%|          | 0.00/146M [00:00<?, ?B/s]

House_Sparrow.tar:   0%|          | 0.00/124M [00:00<?, ?B/s]

Great_Cormorant.tar:   0%|          | 0.00/98.9M [00:00<?, ?B/s]

Rock_Ptarmigan.tar:   0%|          | 0.00/150M [00:00<?, ?B/s]

Horned_Grebe.tar:   0%|          | 0.00/90.0M [00:00<?, ?B/s]

Eared_Grebe.tar:   0%|          | 0.00/89.3M [00:00<?, ?B/s]

Monk_Parakeet.tar:   0%|          | 0.00/208M [00:00<?, ?B/s]

Purple_Gallinule.tar:   0%|          | 0.00/167M [00:00<?, ?B/s]

Red_Phalarope.tar:   0%|          | 0.00/117M [00:00<?, ?B/s]

Ruddy_Turnstone.tar:   0%|          | 0.00/176M [00:00<?, ?B/s]

Upland_Sandpiper.tar:   0%|          | 0.00/80.8M [00:00<?, ?B/s]

Sanderling.tar:   0%|          | 0.00/103M [00:00<?, ?B/s]

Dunlin.tar:   0%|          | 0.00/92.7M [00:00<?, ?B/s]

Red_Knot.tar:   0%|          | 0.00/86.8M [00:00<?, ?B/s]

Purple_Sandpiper.tar:   0%|          | 0.00/216M [00:00<?, ?B/s]

Pectoral_Sandpiper.tar:   0%|          | 0.00/117M [00:00<?, ?B/s]

Least_Sandpiper.tar:   0%|          | 0.00/165M [00:00<?, ?B/s]

Whimbrel.tar:   0%|          | 0.00/162M [00:00<?, ?B/s]

Lesser_Yellowlegs.tar:   0%|          | 0.00/211M [00:00<?, ?B/s]

Willet.tar:   0%|          | 0.00/223M [00:00<?, ?B/s]

European_Starling.tar:   0%|          | 0.00/176M [00:00<?, ?B/s]

Northern_Gannet.tar:   0%|          | 0.00/158M [00:00<?, ?B/s]

Glossy_Ibis.tar:   0%|          | 0.00/167M [00:00<?, ?B/s]

Hermit_Thrush.tar:   0%|          | 0.00/183M [00:00<?, ?B/s]

In [10]:
import tarfile

# Folder where the *.tar archives were downloaded; their content is extracted in place.
images_dir = BIRDSNAP_DATASET_PATH / 'download/images'

# The archives come from the network: when the running Python supports extraction
# filters, use the 'data' one so that unsafe members cannot be written outside images_dir.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

# Only *.tar files are opened: treating every file as an archive fails on anything else
# (e.g. .DS_Store, or the pictures already extracted when the cell is run again).
# The list is materialised before extracting, so files created meanwhile are not visited.
for tar_path in sorted(images_dir.rglob('*.tar')):
    with tarfile.open(tar_path) as tar_file_opened:
        tar_file_opened.extractall(path=images_dir, **extract_kwargs)
    tar_path.unlink()  # the archive is not needed anymore once extracted

In [11]:
# Hand the extracted BirdSnap pictures to the project helper, together with the
# train and cv folders of the ImageFolder tree defined in the configuration cell.
# NOTE(review): train_cv_perc=0.9 presumably means 90% train / 10% cv, and the helper
# presumably moves (not copies) the pictures -- confirm in scripts/utilities.py.
utilities.create_train_cv_from_folder(
    root=BIRDSNAP_DATASET_PATH/'download/images',
    train_cv_perc=0.9,
    train_folder=DATASET_TRAIN_FOLDER,
    cv_folder=DATASET_TEST_FOLDER,
    
)

Working on images | Moving 0 pictures
Working on Pectoral_Sandpiper | Moving 78 pictures
Working on Herring_Gull | Moving 82 pictures
Working on Greater_Scaup | Moving 85 pictures
Working on Pine_Grosbeak | Moving 86 pictures
Working on Black_Tern | Moving 74 pictures
Working on Barn_Swallow | Moving 84 pictures
Working on Lesser_Yellowlegs | Moving 90 pictures
Working on Mute_Swan | Moving 74 pictures
Working on Merlin | Moving 94 pictures
Working on Laughing_Gull | Moving 88 pictures
Working on Harlequin_Duck | Moving 64 pictures
Working on Purple_Gallinule | Moving 67 pictures
Working on Hermit_Thrush | Moving 94 pictures
Working on Bohemian_Waxwing | Moving 69 pictures
Working on Northern_Pintail | Moving 91 pictures
Working on Sanderling | Moving 77 pictures
Working on Monk_Parakeet | Moving 91 pictures
Working on Common_Eider | Moving 87 pictures
Working on Lapland_Longspur | Moving 80 pictures
Working on Arctic_Tern | Moving 84 pictures
Working on Ruddy_Duck | Moving 89 pictures