In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import requests
import json
from tqdm.notebook import tqdm
import requests
import shutil

with os.add_dll_directory('C://openslide-win64/bin'):
	import openslide

# Load the annotation table; only its 'diagnosis' column is used below.
data_file = 'annotation.csv'
df = pd.read_csv(data_file)

# Occurrences of each diagnosis value (missing values are not counted).
diagnosis_counts = df['diagnosis'].value_counts(dropna=True)

# Distinct diagnosis values, with missing entries represented by the string 'nan'.
diagnosis_labels = pd.unique(df['diagnosis'].fillna('nan'))

In [12]:
def get_image_with_min_size(file_path, min_size=2048):
    """Load a slide at the first pyramid level that is at least ``min_size`` wide and high.

    Levels are examined from the highest level index down to level 0 and the
    first one whose width and height are both >= ``min_size`` is selected.
    Reading a level can fail (e.g. JPEG decoding errors), so on failure the
    next lower level index is read instead and the result is resized to the
    dimensions of the selected level.

    Args:
        file_path: Path of the slide file, passed to ``openslide.OpenSlide``.
        min_size: Minimum width and height, in pixels, of the returned image.

    Returns:
        A tuple ``(image, metadata)``. ``metadata`` is a dict copy of the
        slide properties. ``image`` is the RGB image, or ``None`` when no
        level is large enough or no level could be read.
    """
    # Read the properties once and release the handle immediately so that no
    # code path below can leak it.
    ndpi_file = openslide.OpenSlide(file_path)
    try:
        ndpi_metadata = dict(ndpi_file.properties)
    finally:
        ndpi_file.close()

    def level_size(level):
        """Return (width, height) of a pyramid level from the slide properties."""
        return (
            int(ndpi_metadata[f'openslide.level[{level}].width']),
            int(ndpi_metadata[f'openslide.level[{level}].height']),
        )

    level_count = int(ndpi_metadata['openslide.level-count'])
    target_mag_level = next(
        (
            level
            for level in range(level_count - 1, -1, -1)
            if min(level_size(level)) >= min_size
        ),
        None,
    )
    if target_mag_level is None:
        print('Error: NDPI file is too small')
        return None, ndpi_metadata

    target_size = level_size(target_mag_level)

    # Sometimes there are errors reading the JPEG data, so we try lower level
    # indices in turn and downsample the result to the target size.
    for mag_level in range(target_mag_level, -1, -1):
        # Use a fresh handle per attempt: a handle on which a read failed is
        # never reused.
        ndpi_file = openslide.OpenSlide(file_path)
        try:
            ndpi_image = ndpi_file.read_region((0, 0), mag_level, level_size(mag_level))
            ndpi_image = ndpi_image.convert('RGB')
            if mag_level < target_mag_level:
                ndpi_image = ndpi_image.resize(target_size)
            return ndpi_image, ndpi_metadata
        except Exception:
            # Only announce a retry when there is a level left to retry with.
            if mag_level > 0:
                print('Trying again with mag', mag_level - 1)
        finally:
            ndpi_file.close()

    print(f'Error: Could not load image from {file_path} at any magnification level')
    # Keep the return shape consistent so callers can always unpack a tuple.
    return None, ndpi_metadata

def save_image(processed_path, ndpi_file_name):
    """Convert one slide to PNG and store its metadata as JSON.

    The PNG is written next to the slide (same base name, ``.png``
    extension) and the metadata goes to ``<processed_path>/metadata/<base>.json``.
    Failing to produce the PNG is reported with a message instead of raising;
    the metadata is still written whenever it is available.

    Args:
        processed_path: Directory that contains the slide file.
        ndpi_file_name: File name of the slide inside ``processed_path``.
    """
    file_path = os.path.join(processed_path, ndpi_file_name)
    # splitext instead of slicing off five characters, so the extension
    # length is not hard-coded.
    base_name = os.path.splitext(ndpi_file_name)[0]
    output_image_path = os.path.join(processed_path, base_name + '.png')

    # Tolerate a loader that returns a bare None instead of a tuple.
    result = get_image_with_min_size(file_path)
    ndpi_image, metadata = result if result is not None else (None, None)

    # Save the image
    if ndpi_image is None:
        print(f'Didn\'t save {output_image_path}')
    else:
        try:
            ndpi_image.save(output_image_path)
        except Exception as error:
            print(f'Didn\'t save {output_image_path}: {error}')
            # A partially written PNG would otherwise look like a finished
            # conversion to anything that checks for the file's existence.
            if os.path.exists(output_image_path):
                os.remove(output_image_path)

    if metadata is None:
        return

    # Save the metadata as a JSON file
    metadata_path = os.path.join(processed_path, 'metadata')
    os.makedirs(metadata_path, exist_ok=True)
    with open(os.path.join(metadata_path, base_name + '.json'), 'w') as metadata_file:
        json.dump(metadata, metadata_file)

In [27]:
# Recompute the per-diagnosis counts and the label list from `df`
# (missing diagnoses are not counted, and appear as 'nan' in the labels).
diagnosis_counts = df['diagnosis'].value_counts(dropna=True)

diagnosis_labels = pd.unique(df['diagnosis'].fillna('nan'))

def compare(compare_path):
    """Count the PNG files present for every diagnosis label.

    For each entry of the global ``diagnosis_labels`` the folder
    ``<compare_path>/<label with '/' replaced by '-'>`` is inspected.
    A folder that exists but holds no ``*.png`` file is deleted (with all of
    its contents) and then reported like a missing folder.

    Returns:
        A list of ``(label, count)`` tuples in the order of
        ``diagnosis_labels``; ``count`` is -1 when the folder does not exist.
    """
    def png_count(label):
        label_dir = os.path.join(compare_path, label.replace("/", "-"))
        found = len(glob.glob(os.path.join(label_dir, '*.png')))
        if found == 0 and os.path.exists(label_dir):
            shutil.rmtree(label_dir)
        return found if os.path.exists(label_dir) else -1

    return [(label, png_count(label)) for label in diagnosis_labels]

# Root folder that holds one sub-folder of converted PNGs per diagnosis.
folder_path = "C:/Users/Kontor/Github Repos/Notebooks/biomedical/processed"
missing_labels = compare(folder_path)

matching_labels = []
non_matching_labels = []
total_missing = 0

for label, num_files in missing_labels:
    # 'nan' is the placeholder for a missing diagnosis; value_counts() does
    # not count missing values, so there is nothing to compare against.
    if label == 'nan':
        continue
    expected = diagnosis_counts[label]
    if num_files == expected:
        matching_labels.append((label, num_files))
    else:
        # compare() reports a missing folder as -1. Count that as zero files
        # present, otherwise every missing folder inflates the total by one.
        total_missing += expected - max(num_files, 0)
        non_matching_labels.append((label, num_files))

matching_labels = sorted(matching_labels, key=lambda x: x[1], reverse=True)
non_matching_labels = sorted(non_matching_labels, key=lambda x: x[1], reverse=True)

#print("Matching labels:")
#for label, num_files in matching_labels:
#    print(f"{label}: {num_files}")
#print()
print('Total missing:', total_missing)
print("Non-matching labels:")
# Each line shows: label, number of PNGs found (-1 = no folder), expected count.
for label, num_files in non_matching_labels:
    print(f"{label}: {num_files}", diagnosis_counts[label])

Total missing: 2042
Non-matching labels:
Adamantinomatous craniopharyngioma: 65 85
Angiomatous meningioma: 31 32
Myxopapillary ependymoma: 22 23
Haemangioblastoma: 19 88
Pilomyxoid astrocytoma: 18 24
Anaplastic oligodendroglioma, IDH-mutant and 1p/19q codeleted: 17 91
Diffuse astrocytoma, IDH-wildtype: 17 19
Meningothelial meningioma: 16 104
Chondrosarcoma: 15 21
Giant cell glioblastoma: 15 21
Germinoma: 15 20
Haemangiopericytoma: 13 34
Psammomatous meningioma: 12 28
Microcystic meningioma: 12 23
Dysembryoplastic neuroepithelial tumour: 11 25
Atypical meningioma: 10 83
Olfactory neuroblastoma: 8 10
Osteoma: 8 9
Medulloblastoma, non-WNT/non-SHH: 8 32
Schwannoma: 7 81
Anaplastic astrocytoma, IDH-wildtype: 7 47
Gliosarcoma: 7 59
Fibrous meningioma: 7 57
Desmoplastic infantile astrocytoma and ganglioglioma: 5 11
Ganglioglioma: 4 88
Anaplastic meningioma: 3 46
Cellular schwannoma: 3 25
Pilocytic astrocytoma: 3 173
Papillary craniopharyngioma: 3 13
Neurofibroma: 3 16
Chordoid glioma of the t

# Automate download

Size of the dataset is 3948.2021661920003 GB

In [23]:
# Listing endpoint of the dataset (up to 5000 objects per request).
url = 'https://data-proxy.ebrains.eu/api/v1/datasets/8fc108ab-e2b4-406f-8999-60269dc1f994?limit=5000'

# SECURITY: the bearer token is a personal credential and must not be stored
# in the notebook. Export it before starting the kernel, e.g.
#     set EBRAINS_TOKEN=<token>        (Windows)
#     export EBRAINS_TOKEN=<token>     (Linux/macOS)
# Any token that was previously committed here should be considered leaked
# and be revoked.
header = 'Bearer ' + os.environ.get('EBRAINS_TOKEN', '')

def request(url, header, timeout=60):
    """GET ``url`` with an Authorization header and return the decoded JSON body.

    Args:
        url: Address to query.
        header: Value sent as the ``Authorization`` header.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the notebook forever.

    Returns:
        The parsed JSON body when the status code is 200, otherwise ``None``
        (after printing the error body and the status code).
    """
    response = requests.get(url, headers={'Authorization': header}, timeout=timeout)
    if response.status_code == 200:
        return response.json()

    # Error responses are not guaranteed to be JSON (e.g. an HTML error page);
    # fall back to the raw text instead of raising from the error path.
    try:
        print(response.json())
    except ValueError:
        print(response.text)
    print(f'Request failed with status code {response.status_code}')
    return None
	
def download(url, filename, header, block_size=1024 * 1024):
    """Stream ``url`` into ``filename`` while showing a progress bar.

    Args:
        url: Address of the file to fetch.
        filename: Path the response body is written to.
        header: Value sent as the ``Authorization`` header.
        block_size: Number of bytes requested per chunk.

    Returns:
        ``True`` when the server answered with status 200 and, if it sent a
        ``content-length``, exactly that many bytes were written; ``False``
        otherwise.

    Note:
        ``filename`` is created (empty) even when the server reports an
        error, so code that deletes the file unconditionally after the call
        keeps working.
    """
    bytes_written = 0
    with requests.get(url, headers={'Authorization': header}, stream=True) as r:
        status_code = r.status_code
        total_size = int(r.headers.get('content-length', 0))

        with open(filename, "wb") as f:
            # Do not store an error body (e.g. an "unauthorized" message) as
            # if it were the requested file.
            if status_code == 200:
                # The context manager closes the bar even if the transfer raises.
                with tqdm(total=total_size, unit="iB", unit_scale=True) as progress_bar:
                    for chunk in r.iter_content(block_size):
                        f.write(chunk)
                        bytes_written += len(chunk)
                        progress_bar.update(len(chunk))

    if status_code != 200:
        print(f'Request failed with status code {status_code}')
        return False
    if total_size != 0 and bytes_written != total_size:
        print("ERROR, something went wrong")
        return False
    return True

# Fetch the dataset's object listing; request() returns None when the call fails.
data = request(url, header)

In [30]:
total_downloads = 0

api_base_url = 'https://data-proxy.ebrains.eu/api/v1/datasets/8fc108ab-e2b4-406f-8999-60269dc1f994/'

if data is None:
    raise RuntimeError('No dataset listing available: the listing request failed')

for obj in data['objects']:
    # Name of the object on the server, relative to the 'v1.0/' prefix.
    remote_name = obj['name'].replace('v1.0/', '')
    if remote_name == 'annotation.csv':
        continue

    # Local name: this one label is spelled differently on the server, so it
    # is mapped to the spelling expected locally. The download below must
    # keep using the server's spelling.
    name = remote_name.replace('Embryonal tumour with multilayered rosette, C19MC-altered', 'Embryonal tumour with multilayered rosettes, C19MC-altered')

    if '/' not in name:
        print('Skipping unexpected object', name)
        continue
    # Split on the last slash only: labels themselves may contain '/'
    # (e.g. "... 1p/19q codeleted").
    label, file = name.rsplit('/', 1)

    # Same folder naming as compare(): '/' in a label becomes '-'.
    data_folder_path = os.path.join(folder_path, label.replace('/', '-'))
    os.makedirs(data_folder_path, exist_ok=True)

    file_path = os.path.join(data_folder_path, file)
    png_path = os.path.splitext(file_path)[0] + '.png'
    if os.path.exists(png_path):
        # Already converted in an earlier run.
        continue

    n_files = len(glob.glob(os.path.join(data_folder_path, '*.png')))
    # .get() so that a label missing from the annotation table only affects
    # the progress message instead of aborting the whole run.
    expected = diagnosis_counts.get(label, '?')
    print(f'{n_files+1}/{expected}: Starting download of', name)
    try:
        if download(api_base_url + 'v1.0/' + remote_name, file_path, header):
            save_image(data_folder_path, file)
            total_downloads += 1
    finally:
        # Always drop the (large) source slide, even when the conversion raised.
        if os.path.exists(file_path):
            os.remove(file_path)

print('Finished downloading', total_downloads, 'files')

KeyError: 'Adamantinomatous craniopharyngioma/a1976886-357f-11eb-b250-001a7dda7111.ndpi'

Deprecated: the cells below were used to compare the output folders of two versions of the downloader, to see which one contains more data.

In [11]:


# get_image_with_min_size() returns an (image, metadata) pair, or nothing at
# all when every level failed, so pick the image out before saving it.
result = get_image_with_min_size(file_path)
img = result[0] if result is not None else None
if img is None:
    print(f'Could not load an image from {file_path}')
else:
    img.save("test.png")

Trying again with mag 3
Trying again with mag 2


ArgumentError: argument 1: <class 'ValueError'>: Passing closed slide object

In [1]:
def compare(compare_path):
    """Split the diagnosis labels by whether their folder exists.

    Each entry of the global ``diagnosis_labels`` is looked up as
    ``<compare_path>/<label>``.

    Note: this redefines ``compare`` from an earlier cell with a different
    return value.

    Returns:
        ``(missing, present)``: two alphabetically sorted lists of labels.
    """
    by_existence = {True: [], False: []}
    for label in diagnosis_labels:
        label_dir = os.path.join(compare_path, label)
        by_existence[os.path.exists(label_dir)].append(label)
    return sorted(by_existence[False]), sorted(by_existence[True])

# The two downloader output folders being compared.
path1 = "C:\\Users\\Kontor\\Github Repos\\Notebooks\\biomedical\\processed"
path2 = "C:\\Users\\Kontor\\Github Repos\\Notebooks\\biomedical\\data"

missing1, present1 = compare(path1)
missing2, present2 = compare(path2)

# Labels without a folder, per location.
for absent in (missing1, missing2):
    print(len(absent), absent)

# Labels that have a folder in neither location.
overlapping_labels = sorted(set(missing1) & set(missing2))

print(len(overlapping_labels), overlapping_labels)

def get_file_overlap(path1, path2):
    """Compare the ``.png`` file names found directly inside two folders.

    Returns:
        ``(common, files1, files2)``: the sorted names present in both
        folders, followed by the sorted names of each folder.
    """
    def png_names(folder):
        return sorted(entry for entry in os.listdir(folder) if entry.endswith('.png'))

    first = png_names(path1)
    second = png_names(path2)
    return sorted(set(first) & set(second)), first, second

# Report every label folder, present in both locations, whose PNG files are
# not exactly the same set: label, common count, count in path1, count in path2.
shared_folders = (entry for entry in os.listdir(path1) if entry in present1 and entry in present2)
for folder in shared_folders:
    overlap, files1, files2 = get_file_overlap(os.path.join(path1, folder), os.path.join(path2, folder))
    if not (len(overlap) == len(files1) == len(files2)):
        print(folder, len(overlap), len(files1), len(files2))

NameError: name 'diagnosis_labels' is not defined