In [22]:
import tarfile
import os
import pandas as pd
from pathlib import Path

In [23]:
def get_valid_bird_data(bird_metadata:'DataFrame', nb_image_threshold:int=50) -> 'DataFrame':
    """
    Filter bird data to keep only species with more than X images
    """
    return bird_metadata.loc[bird_metadata['nb_images'] >= nb_image_threshold,:]

In [24]:
def get_top_birds(bird_metadata:'DataFrame', nb_birds:int=50) -> 'DataFrame':
    '''
    Return dataframe with top nb_birds in term of image number
    '''
    return bird_metadata.sort_values(by='nb_images', ascending=False).iloc[:nb_birds,:]

In [25]:
def gzip_folder(folder_path:str, output_tgz_path:str) -> None:
    if not os.path.isdir(folder_path):
        raise ValueError(f"The folder {folder_path} does not exist.")
    
    with tarfile.open(output_tgz_path, "w:gz") as tar:
        tar.add(folder_path, arcname=os.path.basename(folder_path))

In [26]:
def extract_folder_from_tgz(tgz_path:str, target_folder:str, output_dir_path:str) -> None:
    """
    Extract target folder from a .tgz file to output_dir_path

    tgz_path: path to .tgz file
    target_folder: relative path to folder inside .tgz file
    output_dir_paht: path to compressed folder 
    """

    with tarfile.open(tgz_path, 'r:gz') as tar:
        for member in tar.getmembers():
            # print(member)
            # Check if the member belongs to the target folder
            if member.name.startswith(target_folder):
                # Extract the member to the output directory
                tar.extract(member, path=output_dir_path)

In [27]:
def extract_folders_from_tgz(tgz_path:str, target_folders_dict:set, output_dir_path:str) -> None:
    """
    Extract target folders from a .tgz file to given location

    tgz_path: path to .tgz file
    targets_folders_dict: dict (target_folder:species_name)
    output_dir_paht: path to compressed folder 
    """

    with tarfile.open(tgz_path, 'r:gz') as tar:
        for member in tar.getmembers():
            # print(member)
            # Check if the member belongs to the target folder
            for target_folder in target_folders_dict:
                if member.name.startswith(target_folder):
                    # Extract the member to the output directory
                    tar.extract(member, path=output_dir_path)

In [36]:
def rename_and_compress_folder(folder_path:str, bird_latin_name:str) -> None:
    """
    Rename extracted image folder to a proper name
    Also rename images
    Create an old_name, new_name table for each image
    Finally compress folder
    
    Naming convention for folder is latin_name
    naming convention for image is latin_name_imgNumber.jpg
    """
    target_folder_location = os.path.dirname(folder_path)
    
    # rename images
    files = Path(folder_path).glob("*.jpg")
    for index, filename in enumerate(files):
        new_image_name = bird_latin_name + "_" + str(index) + ".jpg"
        new_image_path = os.path.join(folder_path, new_image_name)
        os.rename(filename, new_image_path)

    # rename folder
    new_folder_path = os.path.join(target_folder_location, bird_latin_name)
    os.rename(folder_path, new_folder_path)

    # compress folder
    gzip_folder(new_folder_path, new_folder_path + '.tgz')



In [29]:
french_bird_metadata_file = "../../french_birds_metadata.csv"
french_bird_metadata = pd.read_csv(french_bird_metadata_file, sep=";", header=0)
french_bird_metadata.head(2)

Unnamed: 0,Categorie,Ordre,Famille,Nom_EN,Nom_LT,Nom_FR,Url_EN,Url_FR,english_name_snake_case,latin_name_snake_case,bird_name,bird_name_raw,dataset_part,folder,nb_images
0,"Ducks, geese, and waterfowl",Anseriformes,Anatidae,Snow goose,Anser caerulescens,oie des neiges,https://en.wikipedia.org/wiki/Snow_goose,https://fr.wikipedia.org/wiki/Oie_des_neiges,snow_goose,anser_caerulescens,snow_goose,Snow Goose,DIB-10K_1,83.Snow Goose,1908
1,"Ducks, geese, and waterfowl",Anseriformes,Anatidae,Greater white-fronted goose,Anser albifrons,oie rieuse,https://en.wikipedia.org/wiki/Greater_white-fr...,https://fr.wikipedia.org/wiki/Oie_rieuse,greater_white_fronted_goose,anser_albifrons,greater_white_fronted_goose,Greater White-fronted Goose,DIB-10K_2,89.Greater White-fronted Goose,2613


In [30]:
french_bird_metadata_valid = get_valid_bird_data(french_bird_metadata, 100)
french_bird_metadata.shape, french_bird_metadata_valid.shape

((546, 15), (541, 15))

In [31]:
top_birds_metadata = get_top_birds(french_bird_metadata_valid, 50)
top_birds_metadata.shape

(50, 15)

In [32]:
# data_folder = '../raw_data/IDB'
# folder = french_bird_metadata['folder'].iloc[16]
# tgz_path = data_folder + '/' + french_bird_metadata['dataset_part'].iloc[16] + ".tgz"

# output_dir='test'
# # os.mkdir(output_dir)
# extract_folder_from_tgz(tgz_path, folder, output_dir)

In [38]:
dataset_parts = list(top_birds_metadata['dataset_part'].unique())
output_dir='../french_bird_db'
data_folder = '../raw_data/IDB'

# for each dataset part, list folder that need to be retrieved and do it
for dataset_part in dataset_parts:
    print(dataset_part)
    sub_bird_df = top_birds_metadata.loc[top_birds_metadata['dataset_part'] == dataset_part]
    target_species_dict = dict(zip(sub_bird_df['folder'], sub_bird_df['english_name_snake_case']))
    folder2latin_dict = dict(zip(sub_bird_df['folder'], sub_bird_df['latin_name_snake_case']))
    tgz_path = data_folder + '/' + dataset_part + ".tgz"

    for target_folder in target_species_dict:
        print("extracting ", target_folder)
        extract_folder_from_tgz(tgz_path, target_folder, output_dir)
        extracted_folder = os.path.join(output_dir, target_folder)
        print("renaming images and folder then compressing")
        rename_and_compress_folder(extracted_folder, folder2latin_dict[target_folder])

DIB-10K_2
extracting  109.Egyptian Goose
renaming images and folder then compressing
extracting  120.Ruddy Shelduck
renaming images and folder then compressing
extracting  94.Mute Swan
renaming images and folder then compressing
extracting  92.Black Swan
renaming images and folder then compressing
DIB-10K_9
extracting  750.White Stork
renaming images and folder then compressing
extracting  745.Black Stork
renaming images and folder then compressing
extracting  758.African Sacred Ibis
renaming images and folder then compressing
DIB-10K_24
extracting  1762.Common Tern
renaming images and folder then compressing
extracting  1764.Arctic Tern


KeyboardInterrupt: 

In [37]:
dataset_parts = list(top_birds_metadata['dataset_part'].unique())
output_dir='../french_bird_db'
data_folder = '../raw_data/IDB'

# for each dataset part, list folder that need to be retrieved and do it
dataset_part = dataset_parts[0]

sub_bird_df = top_birds_metadata.loc[top_birds_metadata['dataset_part'] == dataset_part]
target_species_dict = dict(zip(sub_bird_df['folder'], sub_bird_df['english_name_snake_case']))
folder2latin_dict = dict(zip(sub_bird_df['folder'], sub_bird_df['latin_name_snake_case']))
tgz_path = data_folder + '/' + dataset_part + ".tgz"

for target_folder in target_species_dict:
    print("extracting ", target_folder)
    extract_folder_from_tgz(tgz_path, target_folder, output_dir)
    extracted_folder = os.path.join(output_dir, target_folder)
    print("renaming images and folder then compressing")
    rename_and_compress_folder(extracted_folder, folder2latin_dict[target_folder])
# extract_folders_from_tgz(tgz_path, target_species_dict, output_dir)

extracting  109.Egyptian Goose
renaming images and folder then compressing


OSError: [Errno 39] Directory not empty: '../french_bird_db/109.Egyptian Goose' -> '../french_bird_db/alopochen_aegyptiaca'

In [None]:
# compress all images into a tgz file