### Part 1. Preprocessing and selecting images from hyena and leopard dataset

We want pictures that are sufficiently bright, in which the animal occupies a sufficiently large area, and that are not grayscale.

#### Step 1: Selecting large-area-of-animal pictures

In [52]:
import os
import random
import pandas as pd
import numpy as np
import shutil
from PIL import Image
import torch
from torchvision import transforms
import ast
from matplotlib import pyplot as plt
import statistics
import csv

Read the metadata file that contains the animal areas for the dataset.

In [4]:
# Load the per-image metadata table of the mixed leopard/hyena dataset.
# The displayed output below shows the columns file_name, width, height,
# bboxes, areas, normalized_bboxes, category and label.
leo_hye_df = pd.read_csv('C:/mixed/metadata.csv')

In [5]:
# Display the dataframe to inspect its columns and a sample of rows.
leo_hye_df

Unnamed: 0,file_name,width,height,bboxes,areas,normalized_bboxes,category,label
0,000000000001.jpg,2400,1800,"[[163.02083333333334, 411.9791666666667, 2161....",[2674804.6875],"[[0.06792534722222222, 0.22887731481481483, 0....",[0],leopard
1,000000000002.jpg,2400,1800,"[[448.43750000000006, 301.0416666666667, 1794....",[1952204.0473090282],"[[0.18684895833333337, 0.16724537037037038, 0....",[0],leopard
2,000000000003.jpg,2400,1800,"[[476.0416666666667, 628.6458333333334, 1743.7...",[1964443.3593750002],"[[0.19835069444444445, 0.34924768518518523, 0....",[0],leopard
3,000000000004.jpg,2400,1800,"[[206.77083333333334, 255.20833333333334, 2193...",[3080801.595052084],"[[0.0861545138888889, 0.1417824074074074, 0.91...",[0],leopard
4,000000000005.jpg,2400,1800,"[[296.3541666666667, 633.8541666666667, 1947.9...",[1867768.012152778],"[[0.12348090277777779, 0.3521412037037037, 0.8...",[0],leopard
...,...,...,...,...,...,...,...,...
9892,1000000003099.jpg,2400,1600,"[[754.7945205479451, 498.63013698630135, 764.3...",[520932.6327641209],"[[0.31449771689497713, 0.3116438356164383, 0.3...",[1],hyena
9893,1000000003100.jpg,2400,1600,"[[612.3287671232877, 503.42465753424653, 1026....",[754059.861137174],"[[0.2551369863013699, 0.3146404109589041, 0.42...",[1],hyena
9894,1000000003101.jpg,2400,2225,"[[0.0, 202.02020202020202, 2338.5858585858587,...",[4422053.259871442],"[[0, 0.09079559641357395, 0.9744107744107744, ...",[1],hyena
9895,1000000003102.jpg,2400,1600,"[[980.1369863013698, 276.027397260274, 1266.43...",[938552.2612122349],"[[0.4083904109589041, 0.17251712328767124, 0.5...",[1],hyena


Collect all the areas and examine their distribution.

In [7]:
# Every 'areas' cell holds a stringified list; parse it and keep only its
# first entry as the area of that image.
area_list = [
    ast.literal_eval(leo_hye_df.loc[row, 'areas'])[0]
    for row in range(len(leo_hye_df))
]

In [12]:
# Summarise the area distribution by its median and arithmetic mean.
median, mean = statistics.median(area_list), statistics.mean(area_list)
print(median, mean)

772578.125 1265025.384172909


After looking at the pictures around this region, I decided to use 750000 as the area threshold for selecting pictures.

In [15]:
input_path = 'C:/mixed'
output_path = 'C:/mixed_large'
# Row positions whose animal area exceeds the chosen threshold.
large_rows = [row for row, area in enumerate(area_list) if area > 750000]
for row in large_rows:
    image_name = leo_hye_df.loc[row, 'file_name']
    source_file = os.path.join(input_path, image_name)
    # Copy the selected picture into the "large" folder under the same name.
    shutil.copy(source_file, os.path.join(output_path, image_name))

Through this process, we obtain 5016 pictures out of 9200.

#### Step 2: Separate the grayscale pictures

In [20]:
def is_grayscale(image_path, threshold=0.95):
    """Tell whether an image file is (almost) entirely grayscale.

    A pixel counts as grayscale when its red, green and blue values are equal
    after the image has been converted to RGB.

    Args:
        image_path: Path of the image file to inspect.
        threshold: Minimum fraction of grayscale pixels (0..1) for the image
            to be reported as grayscale.

    Returns:
        True when the fraction of grayscale pixels is at least ``threshold``.
    """
    # The context manager closes the file once the RGB copy has been made.
    with Image.open(image_path) as img:
        img_array = np.array(img.convert('RGB'))

    # Compare the three channels for the whole image at once instead of
    # looping over every pixel in Python, which is far too slow for
    # multi-megapixel pictures.
    red, green, blue = img_array[..., 0], img_array[..., 1], img_array[..., 2]
    gray_mask = (red == green) & (green == blue)

    grayscale_count = int(gray_mask.sum())
    total_pixels = img_array.shape[0] * img_array.shape[1]
    grayscale_ratio = grayscale_count / total_pixels

    return grayscale_ratio >= threshold

In [21]:
image_dir = 'C:/mixed_large'
output_path_gray = 'C:/mixed_large_gray'
output_path_color = 'C:/mixed_large_color'
image_list = os.listdir(image_dir)
for image in image_list:
    source_file = os.path.join(image_dir, image)
    # Pick the destination folder from the grayscale test, then copy once.
    destination_dir = output_path_gray if is_grayscale(source_file) else output_path_color
    shutil.copy(source_file, os.path.join(destination_dir, image))

KeyboardInterrupt: 

It turned out to be too time-consuming, so I ended it and manually separated the pictures (tears). There are 657 grayscale pictures in total.

#### Step 3: Separate the extremely dark pictures, relatively dark pictures, and sufficiently bright pictures

In [22]:
def calculate_brightness(image):
    """Return the mean pixel value of *image* after conversion to mode 'L'.

    Args:
        image: An object with a ``convert`` method (such as the PIL images
            opened elsewhere in this notebook).

    Returns:
        The mean of all pixel values of the single-channel grayscale version.
    """
    # Mode 'L' is the single-channel grayscale representation.
    luminance = np.array(image.convert('L'))
    return np.mean(luminance)

In [26]:
input_path = 'C:/mixed_large'
output_path_vd = 'C:/mixed_large_verydark'
output_path_d = 'C:/mixed_large_dark'
out_path_b = 'C:/mixed_large_bright'
image_name_list = os.listdir(input_path)
for image_name in image_name_list:
    # Work on a copy scaled to 30% of the original size; the context manager
    # closes the source file as soon as the resized copy exists.
    with Image.open(os.path.join(input_path, image_name)) as image:
        width, height = image.size
        resized_image = image.resize((int(0.3*width), int(0.3*height)))

    # Measure the brightness once and reuse it for both threshold checks.
    brightness = calculate_brightness(resized_image)
    if brightness <= 20:
        target_dir = output_path_vd      # extremely dark
    elif brightness <= 70:
        target_dir = output_path_d       # relatively dark
    else:
        target_dir = out_path_b          # sufficiently bright

    # New name = label prefix + '000' + last 8 characters of the old name.
    # Names starting with '0' get prefix '9', all others get '10'
    # (in the metadata listing, leopard files start with '0' and hyena
    # files with '1').
    label_prefix = '9' if image_name[0] == '0' else '10'
    image_name_new = label_prefix + '0'*3 + image_name[-8:]
    resized_image.save(os.path.join(target_dir, image_name_new))

### Part 2. Separate the general animal dataset for later use

#### Step 1. Separate the animal datasets into original

The fox dataset contains dark images with very poor quality. We first separate them and get rid of them.

In [34]:
image_dir = 'C:/Users/leiyi/OneDrive/Desktop/Dataset/fox'
output_path = 'C:/Users/leiyi/OneDrive/Desktop/Dataset/red_fox'
image_list = os.listdir(image_dir)
for image_name in image_list:
    # Open each picture in a context manager so its file handle is released
    # before the next one is opened.
    with Image.open(os.path.join(image_dir, image_name)) as image:
        # Keep only pictures whose mean brightness is above 45.
        if calculate_brightness(image) > 45:
            image.save(os.path.join(output_path, image_name))

Now, we split these images according to a 0.6:0.1:0.1:0.1:0.1 ratio. The five resulting subsets will become, respectively, the sets of:
1) well-illuminated pictures
2) under-exposed pictures
3) night-setting pictures
4) extremely dark pictures
5) grayscale pictures

In [35]:
def split_list_by_ratio(lst, ratios):
    """Randomly split a list into consecutive sublists sized by given ratios.

    Args:
        lst: Items to split. The caller's list is not modified; a shuffled
            copy is split instead.
        ratios: Fractions of ``len(lst)``, one per sublist to produce.

    Returns:
        A list containing one sublist per ratio. Slice boundaries are taken
        from the cumulative ratio, so when the ratios add up to 1 every item
        ends up in exactly one sublist (no leftovers are dropped).
    """
    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(lst)
    random.shuffle(shuffled)

    total = len(shuffled)
    sublists = []
    start = 0
    cumulative = 0.0
    for ratio in ratios:
        cumulative += ratio
        # Rounding the cumulative boundary (instead of truncating each slice
        # length) prevents items from being lost to int() truncation.
        end = max(start, min(total, round(total * cumulative)))
        sublists.append(shuffled[start:end])
        start = end

    return sublists

In [39]:
input_path = 'C:/Users/leiyi/OneDrive/Desktop/Dataset'
output_path_gen = 'C:/Users/leiyi/OneDrive/Desktop/dataset_separated'
split_ratios = [0.6, 0.1, 0.1, 0.1, 0.1]

# os.listdir returns entries in arbitrary order, so sort the destination
# folders to make "subset i -> folder i" reproducible, and ignore stray files.
# NOTE(review): with this ordering the 0.6 share goes to the alphabetically
# first folder - confirm the folder names sort in the intended order.
output_path_list = sorted(
    name for name in os.listdir(output_path_gen)
    if os.path.isdir(os.path.join(output_path_gen, name))
)
if len(output_path_list) < len(split_ratios):
    # Fail before copying anything rather than part-way through.
    raise RuntimeError(
        f'{output_path_gen} must contain at least {len(split_ratios)} '
        f'folders, found {len(output_path_list)}'
    )

for animal_type in os.listdir(input_path):
    animal_dir = os.path.join(input_path, animal_type)
    if not os.path.isdir(animal_dir):
        # Skip loose files sitting next to the per-animal folders.
        continue
    image_name_list = os.listdir(animal_dir)
    image_name_sublists = split_list_by_ratio(image_name_list, split_ratios)
    # Pair every subset with its destination folder and copy its images.
    for subset_dir, image_names in zip(output_path_list, image_name_sublists):
        for image_name in image_names:
            shutil.copy(os.path.join(animal_dir, image_name),
                        os.path.join(output_path_gen, subset_dir, image_name))

Next, we process the images in Photoshop. We also add images of hyenas and leopards to the sets.

In [42]:
leo_hye_dir = 'C:/mixed_large_bright'
output_path = 'C:/Users/leiyi/OneDrive/Desktop/data_sep/well-illuminated'
image_name = os.listdir(leo_hye_dir)
# Draw 30% of the bright pictures WITHOUT replacement. random.choices samples
# with replacement, so it can pick the same file several times and ends up
# copying fewer distinct images than requested.
image_sublist = random.sample(image_name, k=int(0.3*len(image_name)))
for image in image_sublist:
    shutil.copy(os.path.join(leo_hye_dir, image), os.path.join(output_path, image))

Rename the ones with the wrong name:

In [None]:
def rename_files(directory,newnames):
    '''
    Rename every entry of a directory to the names given in a list.

    The directory entries are taken in sorted order and paired one-to-one
    with ``newnames``; surplus names are ignored. ``newnames`` itself is left
    unmodified.

    directory: path of the folder whose entries are renamed
    newnames: sequence of new file names, at least one per directory entry

    Raises IndexError, before anything is renamed, when ``newnames`` holds
    fewer names than the directory has entries.
    '''

    # os.listdir order is arbitrary; sorting makes the old -> new pairing
    # predictable.
    filenames = sorted(os.listdir(directory))

    # Validate up front so a short list cannot leave the folder half-renamed.
    if len(newnames) < len(filenames):
        raise IndexError(
            f'{len(filenames)} files to rename but only {len(newnames)} new names given'
        )

    for filename, new_filename in zip(filenames, newnames):
        old_path = os.path.join(directory, filename)
        new_path = os.path.join(directory, new_filename)

        # Rename file
        os.rename(old_path, new_path)

In [46]:
input_path = 'C:/Users/leiyi/OneDrive/Desktop/data_sep/grayscale'
# File-name length decides the new prefix: names as long as the reference
# name get '9000', names one character longer get '10000'; any other length
# is left untouched.
reference_length = len('000000001977.jpg')
prefix_by_length = {
    reference_length: '9000',
    reference_length + 1: '10000',
}
for filename in os.listdir(input_path):
    prefix = prefix_by_length.get(len(filename))
    if prefix is None:
        continue
    # Keep the last 8 characters of the old name behind the new prefix.
    filename_new = prefix + filename[-8:]
    old_path = os.path.join(input_path, filename)
    new_path = os.path.join(input_path, filename_new)
    os.rename(old_path, new_path)

Now, having processed the images, we add metadata files to each dataset.

In [40]:
# Category name for every numeric label encoded at the start of a file name;
# any label not listed here is treated as 'hyena'.
_CATEGORY_BY_LABEL = {
    0: 'antelope',
    1: 'bear',
    2: 'deer',
    3: 'fox',
    4: 'hare',
    5: 'lion',
    6: 'raccoon',
    7: 'tiger',
    8: 'wolf',
    9: 'leopard',
}


def create_a_metadata(images_dir, output_csv="C:/Users/leiyi/OneDrive/Desktop/metadata.csv"):
    '''
    take in an image dir, and create a metadata.csv file for the images contained

    images_dir: folder whose entries are all opened as images. Each file name
        must start with the numeric label, followed by 11 more characters
        (the label is parsed from ``filename[:-11]``).
    output_csv: path of the CSV file to write; an existing file is replaced.
        Defaults to the fixed Desktop location, so pass a different path per
        dataset to keep earlier results.

    The CSV has a header row and one row per image with the columns
    file_name, image_id (running index), width, height and objects, where
    objects is the dict {'pixel_values', 'labels', 'category'} written as text.
    '''

    transform = transforms.ToTensor()

    # Open the output once and keep it open for all rows instead of
    # reopening it in append mode for every image.
    with open(output_csv, "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(['file_name', 'image_id', 'width', 'height', 'objects'])

        for image_id, filename in enumerate(os.listdir(images_dir)):
            # Close each image file as soon as its size and tensor are read.
            with Image.open(os.path.join(images_dir, filename)) as image:
                width, height = image.size
                image_tensor = transform(image)

            # Everything before the last 11 characters is the numeric label.
            label = ast.literal_eval(filename[:-11])

            # NOTE(review): the tensor is written through its text
            # representation - confirm that downstream readers need
            # 'pixel_values' in the CSV at all.
            objects = {
                'pixel_values': image_tensor,
                'labels': label,
                'category': _CATEGORY_BY_LABEL.get(label, 'hyena'),
            }

            writer.writerow([filename, image_id, width, height, objects])

In [53]:
# Build the metadata CSV for the well-illuminated subset.
# NOTE: this call writes C:/Users/leiyi/OneDrive/Desktop/metadata.csv and
# replaces any file already there, so move or rename the result before
# running the next create_a_metadata cell.
create_a_metadata('C:/Users/leiyi/OneDrive/Desktop/data_sep/well-illuminated')

In [54]:
# Build the metadata CSV for the grayscale subset (this call writes
# Desktop/metadata.csv, replacing any file already there).
create_a_metadata('C:/Users/leiyi/OneDrive/Desktop/data_sep/grayscale')

In [55]:
# Build the metadata CSV for the very dark subset (this call writes
# Desktop/metadata.csv, replacing any file already there).
create_a_metadata('C:/Users/leiyi/OneDrive/Desktop/data_sep/very dark')

In [56]:
# Build the metadata CSV for the less-saturated subset (this call writes
# Desktop/metadata.csv, replacing any file already there).
create_a_metadata('C:/Users/leiyi/OneDrive/Desktop/data_sep/less-saturated')

In [57]:
# Build the metadata CSV for the underexposed subset (this call writes
# Desktop/metadata.csv, replacing any file already there).
create_a_metadata('C:/Users/leiyi/OneDrive/Desktop/data_sep/underexposed')

In [58]:
# Build the metadata CSV for the datasetmixed folder (this call writes
# Desktop/metadata.csv, replacing any file already there).
create_a_metadata('C:/Users/leiyi/OneDrive/Desktop/datasetmixed')