In [1]:
import tarfile
import os
import pandas as pd

In [27]:
def get_valid_bird_data(bird_metadata:'DataFrame', nb_image_threshold:int=50) -> 'DataFrame':
    """
    Keep only the rows whose image count reaches the threshold.

    bird_metadata: dataframe with an 'nb_images' column
    nb_image_threshold: minimum value of 'nb_images' (inclusive) for a row to be kept
    return: the rows of bird_metadata where 'nb_images' >= nb_image_threshold
    """
    has_enough_images = bird_metadata['nb_images'] >= nb_image_threshold
    return bird_metadata.loc[has_enough_images, :]

In [33]:
def get_top_birds(bird_metadata:'DataFrame', nb_birds:int=50) -> 'DataFrame':
    '''
    Select the rows with the highest image counts.

    bird_metadata: dataframe with an 'nb_images' column
    nb_birds: maximum number of rows to return
    return: the first nb_birds rows of bird_metadata once sorted by
            'nb_images' in descending order
    '''
    ranked_by_images = bird_metadata.sort_values(by='nb_images', ascending=False)
    return ranked_by_images.iloc[:nb_birds, :]

In [11]:
def gzip_folder(folder_path:str, output_tgz_path:str) -> None:
    """
    Compress a folder into a gzip-compressed tar archive (.tgz)

    The folder is stored in the archive under its own name, so the archive
    contains a single top-level directory.

    folder_path: path to the folder to compress
    output_tgz_path: path to the .tgz file to write

    Raises ValueError if folder_path is not an existing directory.
    """
    if not os.path.isdir(folder_path):
        raise ValueError(f"The folder {folder_path} does not exist.")

    # Resolve the path before taking its basename: os.path.basename("dir/")
    # is "" (and "." / ".." for dot paths), which would store the files
    # without their top-level folder.
    folder_name = os.path.basename(os.path.abspath(folder_path))

    with tarfile.open(output_tgz_path, "w:gz") as tar:
        tar.add(folder_path, arcname=folder_name)

In [16]:
def extract_folder_from_tgz(tgz_path:str, target_folder:str, output_dir_path:str) -> None:
    """
    Extract target folder from a .tgz file to output_dir_path

    Only the folder itself and the members located under it are extracted: a
    sibling whose name merely starts with the same characters (e.g.
    "1.Foo bar" when target_folder is "1.Foo") is left in the archive.

    tgz_path: path to .tgz file
    target_folder: relative path to folder inside .tgz file
    output_dir_path: path to the directory where the folder is extracted
    """
    # Member names inside a tar archive use "/" as separator
    folder_name = target_folder.rstrip('/')
    # Match on the folder boundary; an empty target keeps matching every member
    folder_prefix = folder_name + '/' if folder_name else ''

    # Use the safer "data" extraction filter when this Python version has it
    extract_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    with tarfile.open(tgz_path, 'r:gz') as tar:
        for member in tar.getmembers():
            # Check if the member is the target folder or lives inside it
            if member.name == folder_name or member.name.startswith(folder_prefix):
                # Extract the member to the output directory
                tar.extract(member, path=output_dir_path, **extract_options)

In [24]:
def extract_folders_from_tgz(tgz_path:str, target_folders_dict:dict, output_dir_path:str) -> None:
    """
    Extract target folders from a .tgz file to given location, then rename
    each extracted folder and compress it into its own .tgz file

    tgz_path: path to .tgz file
    target_folders_dict: dict (target_folder:species_name)
    output_dir_path: path to the directory where the folders are extracted

    For every target folder found in the archive, the extracted directory
    <output_dir_path>/<target_folder> is renamed to
    <output_dir_path>/<species_name> and compressed to
    <output_dir_path>/<species_name>.tgz. Target folders that are not present
    in the archive are skipped.
    """
    # Member names inside a tar archive use "/" as separator
    species_by_folder = {
        folder.rstrip('/'): species_name
        for folder, species_name in target_folders_dict.items()
    }

    # Use the safer "data" extraction filter when this Python version has it
    extract_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    extracted_folders = set()
    with tarfile.open(tgz_path, 'r:gz') as tar:
        for member in tar.getmembers():
            for target_folder in species_by_folder:
                # Match on the folder boundary so that a sibling sharing the
                # same prefix is not extracted by mistake
                if member.name == target_folder or member.name.startswith(target_folder + '/'):
                    # Extract the member to the output directory
                    tar.extract(member, path=output_dir_path, **extract_options)
                    extracted_folders.add(target_folder)
                    break

    # Rename and compress only once every member has been extracted: renaming
    # inside the extraction loop moves the folder away after its first member
    # and makes the next os.rename fail on an already existing directory.
    for target_folder, species_name in species_by_folder.items():
        if target_folder not in extracted_folders:
            continue
        current_dir_name = os.path.join(output_dir_path, target_folder)
        new_dir_name = os.path.join(output_dir_path, species_name)
        os.rename(current_dir_name, new_dir_name)
        gzip_folder(new_dir_name, new_dir_name + '.tgz')

In [None]:
def rename_and_compress_folder(folder_path:str, bird_name:str) -> None:
    """
    Placeholder for renaming an extracted image folder and compressing it

    folder_path: presumably the path to the extracted folder (unused for now)
    bird_name: presumably the new name for the folder (unused for now)

    NOTE: not implemented yet, calling this function does nothing.
    """
    return None

In [13]:
# Load the French bird metadata table (semicolon-separated CSV, first row used as header)
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# saved with the CSV -- confirm whether it should be read with index_col=0
french_bird_metadata_file = "../../french_birds_metadata.csv"
french_bird_metadata = pd.read_csv(french_bird_metadata_file, sep=";", header=0)
# Preview the first two rows
french_bird_metadata.head(2)

Unnamed: 0,Categorie,Ordre,Famille,Nom_EN,Nom_LT,Nom_FR,Url_EN,Url_FR,english_name_snake_case,bird_name,bird_name_raw,dataset_part,folder,nb_images
0,"Ducks, geese, and waterfowl",Anseriformes,Anatidae,Snow goose,Anser caerulescens,oie des neiges,https://en.wikipedia.org/wiki/Snow_goose,https://fr.wikipedia.org/wiki/Oie_des_neiges,snow_goose,snow_goose,Snow Goose,DIB-10K_1,83.Snow Goose,1908
1,"Ducks, geese, and waterfowl",Anseriformes,Anatidae,Greater white-fronted goose,Anser albifrons,oie rieuse,https://en.wikipedia.org/wiki/Greater_white-fr...,https://fr.wikipedia.org/wiki/Oie_rieuse,greater_white_fronted_goose,greater_white_fronted_goose,Greater White-fronted Goose,DIB-10K_2,89.Greater White-fronted Goose,2613


In [32]:
# Keep only the species with at least 100 images
french_bird_metadata_valid = get_valid_bird_data(french_bird_metadata, 100)
# Compare the table shape before and after filtering
french_bird_metadata.shape, french_bird_metadata_valid.shape

((546, 14), (424, 14))

In [34]:
# Among the valid species, keep the 50 with the largest number of images
top_birds_metadata = get_top_birds(french_bird_metadata_valid, 50)
top_birds_metadata.shape

(50, 14)

In [17]:
# data_folder = '../raw_data/IDB'
# folder = french_bird_metadata['folder'].iloc[16]
# tgz_path = data_folder + '/' + french_bird_metadata['dataset_part'].iloc[16] + ".tgz"

# output_dir='test'
# # os.mkdir(output_dir)
# extract_folder_from_tgz(tgz_path, folder, output_dir)

In [26]:
# Dataset parts (one .tgz archive each) that hold at least one of the selected species
dataset_parts = list(top_birds_metadata['dataset_part'].unique())
output_dir='test'
data_folder = '../raw_data/IDB'

# List the folders that need to be retrieved from a dataset part and extract them.
# NOTE: only the first dataset part is handled here; the remaining entries of
# dataset_parts are not processed by this cell.
dataset_part = dataset_parts[0]

# Rows of the selected species that belong to this dataset part
sub_bird_df = top_birds_metadata.loc[top_birds_metadata['dataset_part'] == dataset_part]
# Map each value of the 'folder' column to the matching 'bird_name'
target_species_dict = dict(zip(sub_bird_df['folder'], sub_bird_df['bird_name']))
tgz_path = data_folder + '/' + dataset_part + ".tgz"

# Extract the target folders from the archive, one folder per call
for target_folder in target_species_dict:
    extract_folder_from_tgz(tgz_path, target_folder, output_dir)
# extract_folders_from_tgz(tgz_path, target_species_dict, output_dir)