In [31]:
import os

def getListOfFiles(directory):
    whaleDict = {}

    # Walk through the directory tree
    for root, _, files in os.walk(directory):
        # Get the relative path from the root directory
        relativePath = os.path.relpath(root, directory)
        # Split the relative path to find the first subdirectory (whale name)
        pathParts = relativePath.split(os.sep)

        if len(pathParts) > 0 and pathParts[0] != ".":
            whaleName = pathParts[0]  # First subdirectory is the whale name

            # Initialize the list for this whale if it doesn't exist
            if whaleName not in whaleDict:
                whaleDict[whaleName] = []

            # Add each file to the whale's list
            for file in files:
                fullPath = os.path.join(root, file)
                whaleDict[whaleName].append((whaleName, fullPath))

    return whaleDict


In [32]:
from PIL import Image
from PIL.ExifTags import TAGS
from tqdm import tqdm

def getImageDate(filepath):
	try:
		with Image.open(filepath) as img:
			exif_data = img._getexif()
			if exif_data:
				for tag_id, value in exif_data.items():
					tag = TAGS.get(tag_id, tag_id)
					if tag == 'DateTimeOriginal':
						return value.split(' ')[0].replace(':', '-')
	except Exception:
		pass
	return None

def countDates(dataset):
	unique_dates = set()

	for whale, images in tqdm(dataset.items(), desc="Processing whales"):
		for _, filepath in images:
			date = getImageDate(filepath)
			if date:
				unique_dates.add(date)

	numberOfImages = sum(len(images) for images in dataset.values())
	print("Processed ", numberOfImages, " images")

	return len(unique_dates)


def countOfCameraUseage(dataset):
	camera_usage = {}

	for whale, images in tqdm(dataset.items(), desc="Counting camera usage"):
		for _, filepath in images:
			try:
				with Image.open(filepath) as img:
					exif_data = img._getexif()
					if exif_data:
						for tag_id, value in exif_data.items():
							tag = TAGS.get(tag_id, tag_id)
							if tag == 'Model':
								camera_make = value
								if camera_make not in camera_usage:
									camera_usage[camera_make] = 0
								camera_usage[camera_make] += 1
			except Exception:
				pass

			

	return camera_usage



In [33]:
allDataSets = {
	"pm": "G:\Whale Stuff\_data\cetaceans\curated\pm\\all",
	"pma": "G:\Whale Stuff\_data\cetaceans\curated\pm\\flank",
	"pmu": "G:\Whale Stuff\_data\cetaceans\curated\pm\\fluke",
	# "dd": "G:\Whale Stuff\_data\cetaceans\curated\dd",
	# "gg": "G:\Whale Stuff\_data\cetaceans\curated\gg",
	# "tt": "G:\Whale Stuff\_data\cetaceans\curated\\tt",
}

datasetCounts = {}

for dataset_name, directory in allDataSets.items():
	print(f"Processing dataset: {dataset_name}")
	whale_dataset = getListOfFiles(directory)

	# count = countDates(whale_dataset)
	# datasetCounts[dataset_name] = count
	cameraCounts = countOfCameraUseage(whale_dataset)
	sorted_cameras = sorted(cameraCounts.items(), key=lambda x: x[1], reverse=True)
	print("Camera usage counts (sorted):")
	for camera, count in sorted_cameras:
		print(f"{camera}: {count}")

	total_count = sum(cameraCounts.values())
	print(f"Total images with camera EXIF data: {total_count}")


print("\nEncounters for each dataset:")
for dataset_name, count in datasetCounts.items():
	print(f"{dataset_name}: {count}")


Processing dataset: pm


Counting camera usage: 100%|██████████| 103/103 [00:12<00:00,  7.99it/s]


Camera usage counts (sorted):
Canon EOS 40D: 7253
Canon EOS 10D: 1788
Canon EOS 5D Mark II: 439
Canon EOS 700D: 372
Canon EOS 80D: 366
Canon EOS 70D: 363
Canon EOS 350D DIGITAL: 299
Canon EOS 6D: 84
Canon EOS 5D Mark III: 79
Canon EOS DIGITAL REBEL XTi: 62
Canon EOS 450D: 41
Canon EOS 100D: 26
NIKON D800: 14
DSC-F828: 12
Canon EOS 5D Mark II           : 8
NIKON D5200: 8
Canon EOS 5D: 8
NIKON D5100: 7
Canon PowerShot A720 IS: 6
COOLPIX P600: 6
Canon EOS 300D DIGITAL: 4
NIKON D7500: 4
NIKON D90: 3
Canon EOS 600D: 3
Canon EOS 10D                  : 1
FinePix S5000 : 1
Canon EOS 7D                   : 1
Canon EOS 1000D: 1
Total images with camera EXIF data: 11259
Processing dataset: pma


Counting camera usage: 100%|██████████| 111/111 [00:09<00:00, 12.06it/s]


Camera usage counts (sorted):
Canon EOS 40D: 5092
Canon EOS 10D: 1423
Canon EOS 5D Mark II: 335
Canon EOS 700D: 326
Canon EOS 70D: 298
Canon EOS 80D: 278
Canon EOS 350D DIGITAL: 223
Canon EOS 6D: 66
Canon EOS 5D Mark III: 44
Canon EOS DIGITAL REBEL XTi: 40
Canon EOS 450D: 30
Canon EOS 100D: 19
DSC-F828: 11
Canon EOS 5D Mark II           : 7
NIKON D5200: 7
NIKON D800: 6
NIKON D5100: 6
COOLPIX P600: 4
NIKON D90: 3
Canon EOS 300D DIGITAL: 3
Canon EOS 5D: 2
NIKON D7500: 2
Canon PowerShot A720 IS: 1
Total images with camera EXIF data: 8226
Processing dataset: pmu


Counting camera usage: 100%|██████████| 106/106 [00:03<00:00, 29.90it/s]

Camera usage counts (sorted):
Canon EOS 40D: 2211
Canon EOS 10D: 385
Canon EOS 5D Mark II: 104
Canon EOS 80D: 88
Canon EOS 70D: 87
Canon EOS 350D DIGITAL: 79
Canon EOS 700D: 46
Canon EOS 5D Mark III: 35
Canon EOS DIGITAL REBEL XTi: 22
Canon EOS 6D: 20
Canon EOS 450D: 11
NIKON D800: 8
Canon EOS 100D: 7
Canon EOS 5D: 6
Canon PowerShot A720 IS: 5
Canon EOS 600D: 3
NIKON D7500: 2
NIKON D5200: 2
COOLPIX P600: 2
Canon EOS 10D                  : 1
Canon EOS 300D DIGITAL: 1
DSC-F828: 1
FinePix S5000 : 1
Canon EOS 5D Mark II           : 1
Canon EOS 7D                   : 1
Canon EOS 1000D: 1
NIKON D5100: 1
Total images with camera EXIF data: 3131

Encounters for each dataset:



