# First Download Data using api:
https://www.kaggle.com/discussions/general/74235

In [1]:
 !pip install -q kaggle

In [2]:
 from google.colab import files
 # upload json file:
 files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jackparkinson","key":"fab3b257b9f0ef42e6a66b63e329f43e"}'}

In [3]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [4]:
 ! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle datasets download -d gpiosenka/100-bird-species

Dataset URL: https://www.kaggle.com/datasets/gpiosenka/100-bird-species
License(s): CC0-1.0
Downloading 100-bird-species.zip to /content
100% 1.95G/1.96G [00:21<00:00, 117MB/s] 
100% 1.96G/1.96G [00:21<00:00, 98.6MB/s]


## Set up paths

In [6]:
import zipfile
import os

In [128]:
zip_path = '/content/100-bird-species.zip'

# directory to unzip (can change if you want)
extract_to = '/content/birds_dataset'

# removes any current directory with name
if os.path.exists(extract_to):
    !rm -rf {extract_to}

os.makedirs(extract_to, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("Unzipping completed!")

Unzipping completed!


In [129]:
train_dir = '/content/birds_dataset/train'
test_dir = '/content/birds_dataset/test'
valid_dir = '/content/birds_dataset/valid'

# counts files
def count_images(directory):
    count = 0
    for _, _, files in os.walk(directory):
        count += len(files)
    return count

train_count = count_images(train_dir)
valid_count = count_images(valid_dir)
test_count = count_images(test_dir)
train_split = train_count / (train_count+test_count+valid_count)
valid_split = valid_count / (train_count+test_count+valid_count)
test_split = test_count / (train_count+test_count+valid_count)


print(f"Number of images in Train: {train_count} with split: {train_split}")
print(f"Number of images in Valid: {valid_count} with split: {valid_split}")
print(f"Number of images in Test: {test_count} with split: {test_split}")

Number of images in Train: 84635 with split: 0.9415920342660066
Number of images in Valid: 2625 with split: 0.029203982866996717
Number of images in Test: 2625 with split: 0.029203982866996717


#### There's an extra space for one of the birds in the train and test dir:

In [130]:
def check_directories(path):
    subdirs = os.listdir(path)
    print(f'Total directories in {path}: {len(subdirs)}')
    return set(os.listdir(path))

# retrieve the list of species in each directory
train_species = check_directories(train_dir)
test_species = check_directories(test_dir)
valid_species = check_directories(valid_dir)

# find the union of all species (all species that appear in any directory)
all_species = train_species.union(test_species).union(valid_species)

# find what is missing from each set
missing_from_train = all_species - train_species
missing_from_test = all_species - test_species
missing_from_valid = all_species - valid_species

print("Species missing from train:", missing_from_train)
print("Species missing from test:", missing_from_test)
print("Species missing from valid:", missing_from_valid)


Total directories in /content/birds_dataset/train: 525
Total directories in /content/birds_dataset/test: 525
Total directories in /content/birds_dataset/valid: 525
Species missing from train: {'PARAKETT AUKLET'}
Species missing from test: {'PARAKETT AUKLET'}
Species missing from valid: {'PARAKETT  AUKLET'}


In [131]:
import os

train_dir = '/content/birds_dataset/train'
test_dir = '/content/birds_dataset/test'

# define the incorrect and correct paths for train and test
incorrect_path_train = os.path.join(train_dir, 'PARAKETT  AUKLET')
correct_path_train = os.path.join(train_dir, 'PARAKETT AUKLET')

incorrect_path_test = os.path.join(test_dir, 'PARAKETT  AUKLET')
correct_path_test = os.path.join(test_dir, 'PARAKETT AUKLET')

# rename the directories in train and test
try:
    os.rename(incorrect_path_train, correct_path_train)
    print(f"Renamed in train from '{incorrect_path_train}' to '{correct_path_train}'")
except FileNotFoundError as e:
    print(f"Error in train rename: {e}")

try:
    os.rename(incorrect_path_test, correct_path_test)
    print(f"Renamed in test from '{incorrect_path_test}' to '{correct_path_test}'")
except FileNotFoundError as e:
    print(f"Error in test rename: {e}")


Renamed in train from '/content/birds_dataset/train/PARAKETT  AUKLET' to '/content/birds_dataset/train/PARAKETT AUKLET'
Renamed in test from '/content/birds_dataset/test/PARAKETT  AUKLET' to '/content/birds_dataset/test/PARAKETT AUKLET'


In [132]:
def check_directories(path):
    subdirs = os.listdir(path)
    print(f'Total directories in {path}: {len(subdirs)}')
    return set(os.listdir(path))


train_species = check_directories(train_dir)
test_species = check_directories(test_dir)
valid_species = check_directories(valid_dir)


all_species = train_species.union(test_species).union(valid_species)


missing_from_train = all_species - train_species
missing_from_test = all_species - test_species
missing_from_valid = all_species - valid_species

print("Species missing from train:", missing_from_train)
print("Species missing from test:", missing_from_test)
print("Species missing from valid:", missing_from_valid)


Total directories in /content/birds_dataset/train: 525
Total directories in /content/birds_dataset/test: 525
Total directories in /content/birds_dataset/valid: 525
Species missing from train: set()
Species missing from test: set()
Species missing from valid: set()


In [133]:
def check_species_file_count(directory, expected_count, directory_name):
    discrepancies = []
    for species in os.listdir(directory):
        species_dir = os.path.join(directory, species)
        file_count = len(os.listdir(species_dir))
        if file_count != expected_count:
            discrepancies.append((species, file_count, directory_name))
    return discrepancies


expected_files = 5
# check discrepancies in test and valid directories
test_discrepancies = check_species_file_count(test_dir, expected_files, "Test")
valid_discrepancies = check_species_file_count(valid_dir, expected_files, "Valid")

print("printing species that don't have 5 birds in train or validation dir: ")
# print discrepancies
for species, count, directory in test_discrepancies:
    print(f"Species '{species}' has {count} files in {directory} directory, expected {expected_files}.")
for species, count, directory in valid_discrepancies:
    print(f"Species '{species}' has {count} files in {directory} directory, expected {expected_files}.")

printing species that don't have 5 birds in train or validation dir: 


## Move around images so its roughly 80/10/10 split:
Can manually specify a different split by choosing num_test_files and num_valid_files variables

By moving 25 files into test/valid

In [134]:
import os
import shutil

def adjust_dataset_split(train_dir, test_dir, valid_dir, num_test_files, num_valid_files):
    """
    Adjusts the dataset split by moving specified numbers of files from each species' training directory
    to its corresponding test and validation directories, ensuring files are moved in numerical order.
    Skips files named '1.jpg' to '5.jpg' to avoid conflicts.
    """
    skipped_files = {'1.jpg', '2.jpg', '3.jpg', '4.jpg', '5.jpg'}  # Files to skip

    for species in os.listdir(train_dir):
        species_train_dir = os.path.join(train_dir, species)
        species_test_dir = os.path.join(test_dir, species)
        species_valid_dir = os.path.join(valid_dir, species)

        os.makedirs(species_test_dir, exist_ok=True)
        os.makedirs(species_valid_dir, exist_ok=True)

        # sort by numeric order
        files = os.listdir(species_train_dir)
        files.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))  # Sort by numeric value extracted from filenames

        # ensure enough files are moved
        available_files = [f for f in files if f not in skipped_files]
        test_files_to_move = available_files[:num_test_files]
        for file in test_files_to_move:
            shutil.move(os.path.join(species_train_dir, file), os.path.join(species_test_dir, file))
           # print(f"Moved {file} to {species_test_dir}")

        # remaining files
        remaining_files = [f for f in os.listdir(species_train_dir) if f not in skipped_files]
        remaining_files.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))

        valid_files_to_move = remaining_files[:num_valid_files]
        for file in valid_files_to_move:
            shutil.move(os.path.join(species_train_dir, file), os.path.join(species_valid_dir, file))
            print(f"Moved {file} to {species_valid_dir}")

#  number of files to move to respective directories
num_test_files = 25
num_valid_files = 25

adjust_dataset_split(train_dir, test_dir, valid_dir, num_test_files, num_valid_files)
print("Files have been moved to achieve the specified dataset split.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Moved 027.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 028.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 029.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 030.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 031.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 032.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 033.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 034.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 035.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 036.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 037.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 038.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 039.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 040.jpg to /content/birds_dataset/valid/PALM NUT VULTURE
Moved 041.jpg to /content/birds_dataset/valid/PALM NU

In [135]:
def check_species_file_count(directory, expected_count, directory_name):
    discrepancies = []
    for species in os.listdir(directory):
        species_dir = os.path.join(directory, species)
        file_count = len(os.listdir(species_dir))
        if file_count != expected_count:
            discrepancies.append((species, file_count, directory_name))
    return discrepancies


expected_files = 30
# check discrepancies in test and valid directories
test_discrepancies = check_species_file_count(test_dir, expected_files, "Test")
valid_discrepancies = check_species_file_count(valid_dir, expected_files, "Valid")

print('number of species without 30 birds in train or test:')
# print discrepancies
for species, count, directory in test_discrepancies:
    print(f"Species '{species}' has {count} files in {directory} directory, expected {expected_files}.")
for species, count, directory in valid_discrepancies:
    print(f"Species '{species}' has {count} files in {directory} directory, expected {expected_files}.")

number of species without 30 birds in train or test:


In [136]:
train_count = count_images(train_dir)
valid_count = count_images(valid_dir)
test_count = count_images(test_dir)
train_split = train_count / (train_count+test_count+valid_count)
valid_split = valid_count / (train_count+test_count+valid_count)
test_split = test_count / (train_count+test_count+valid_count)


print(f"Number of images in Train: {train_count} with proportion: {train_split}")
print(f"Number of images in Valid: {valid_count} with proportion: {valid_split}")
print(f"Number of images in Test: {test_count} with proportion: {test_split}")

Number of images in Train: 58385 with proportion: 0.6495522055960394
Number of images in Valid: 15750 with proportion: 0.1752238972019803
Number of images in Test: 15750 with proportion: 0.1752238972019803


## Augmentations:
First check how many birds in max train, and how many birds we need to add:

In [137]:
import os
import matplotlib.pyplot as plt
import statistics

def count_images_by_species(directory):
    species_counts = {}
    for species in os.listdir(directory):
        species_path = os.path.join(directory, species)
        if os.path.isdir(species_path):  # Ensure it's a directory
            species_count = sum(len(files) for _, _, files in os.walk(species_path))
            species_counts[species] = species_count
    return species_counts

# Directories
train_dir = '/content/birds_dataset/train'
test_dir = '/content/birds_dataset/test'
valid_dir = '/content/birds_dataset/valid'

# Count images by species in train directory
train_counts_by_species = count_images_by_species(train_dir)

# Calculate total images and splits
train_count = sum(train_counts_by_species.values())
valid_count = count_images(valid_dir)
test_count = count_images(test_dir)
total_count = train_count + test_count + valid_count
train_split = train_count / total_count
valid_split = valid_count / total_count
test_split = test_count / total_count

# Maximum and Median calculations
max_count = max(train_counts_by_species.values())
max_species = max(train_counts_by_species, key=train_counts_by_species.get)  # get the species with the max count
median_count = statistics.median(train_counts_by_species.values())  # Calculate median number of images per species

# Calculate the difference from the maximum species count
sum_differences = sum(max_count - count for count in train_counts_by_species.values())

# Output results
print(f"Number of images in Train: {train_count} with split: {train_split}")
print(f"Number of images in Valid: {valid_count} with split: {valid_split}")
print(f"Number of images in Test: {test_count} with split: {test_split}")
print(f"Species with the most images: {max_species} with {max_count} images")
print(f"Median number of images per species: {median_count}")
print(f"Total difference sum from the maximum species count: {sum_differences}")


Number of images in Train: 58385 with split: 0.6495522055960394
Number of images in Valid: 15750 with split: 0.1752238972019803
Number of images in Test: 15750 with split: 0.1752238972019803
Species with the most images: RUFOUS TREPE with 213 images
Median number of images per species: 108
Total difference sum from the maximum species count: 53440


## Augmentations on random birds in train:

In [138]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

data_gen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)


In [139]:
def augment_images(train_dir, target_count, batch_size=32, seed=42):
    species_list = os.listdir(train_dir)
    for index, species in enumerate(species_list):
        species_dir = os.path.join(train_dir, species)
        total_images = len(os.listdir(species_dir))
        if total_images < target_count:
            additional_images_needed = target_count - total_images

            print(f"Processing {index+1}/{len(species_list)}: {species} - currently has {total_images} images; needs {additional_images_needed} more.")

            # Create a temporary generator for the current species
            generator = data_gen.flow_from_directory(
                directory=train_dir,
                classes=[species],
                target_size=(224, 224),
                batch_size=batch_size,
                class_mode='categorical',
                save_to_dir=species_dir,
                save_prefix='aug-',
                save_format='jpg',
                seed=seed
            )

            # Generate the required number of images
            num_batches = additional_images_needed // batch_size + (1 if additional_images_needed % batch_size != 0 else 0)
            for _ in range(num_batches):
                generator.next()


In [140]:
max_images_per_class = max_count

# Call the function to augment images
augment_images(train_dir, max_images_per_class)

Processing 1/525: KING EIDER - currently has 120 images; needs 93 more.
Found 120 images belonging to 1 classes.
Processing 2/525: BANANAQUIT - currently has 115 images; needs 98 more.
Found 115 images belonging to 1 classes.
Processing 3/525: CHESTNET BELLIED EUPHONIA - currently has 82 images; needs 131 more.
Found 82 images belonging to 1 classes.
Processing 4/525: AMERICAN GOLDFINCH - currently has 83 images; needs 130 more.
Found 83 images belonging to 1 classes.
Processing 5/525: ANIANIAU - currently has 100 images; needs 113 more.
Found 100 images belonging to 1 classes.
Processing 6/525: YELLOW BELLIED FLOWERPECKER - currently has 104 images; needs 109 more.
Found 104 images belonging to 1 classes.
Processing 7/525: TRICOLORED BLACKBIRD - currently has 89 images; needs 124 more.
Found 89 images belonging to 1 classes.
Processing 8/525: YELLOW CACIQUE - currently has 105 images; needs 108 more.
Found 105 images belonging to 1 classes.
Processing 9/525: VIOLET BACKED STARLING - c

In [141]:
train_dir = '/content/birds_dataset/train'
test_dir = '/content/birds_dataset/test'
valid_dir = '/content/birds_dataset/valid'

# counts files
def count_images(directory):
    count = 0
    for _, _, files in os.walk(directory):
        count += len(files)
    return count

train_count = count_images(train_dir)
valid_count = count_images(valid_dir)
test_count = count_images(test_dir)
train_split = train_count / (train_count+test_count+valid_count)
valid_split = valid_count / (train_count+test_count+valid_count)
test_split = test_count / (train_count+test_count+valid_count)


print(f"Number of images in Train: {train_count} with split: {train_split}")
print(f"Number of images in Valid: {valid_count} with split: {valid_split}")
print(f"Number of images in Test: {test_count} with split: {test_split}")

Number of images in Train: 114501 with split: 0.7842480530955267
Number of images in Valid: 15750 with split: 0.10787597345223662
Number of images in Test: 15750 with split: 0.10787597345223662
