In [15]:
import pandas as pd
import numpy as np
def get_samples_per_domain(metadata_path):
	"""
	Counts the training images that belong to each domain (experiment).

	The metadata CSV is split by its 'dataset' column into 'train', 'test' and
	'val' rows. The size of every split is printed, but only the 'train' rows
	contribute to the returned counts.

	Domains are indexed by the order in which their 'experiment' label first
	appears among the training rows. The full label is used as the key, so
	X-01 and Y-01 (same number, different cell type) get different indices.

	Args:
		metadata_path: path to the metadata CSV file; it must contain the
			'dataset' and 'experiment' columns.
	Returns:
		spd: 1-D float array of length N, where N is the number of distinct
			training domains; spd[i] is the number of training images in the
			domain whose index is i.
		mapping: dict that maps each experiment label to its index in spd.
	"""
	print("Computing samples per domain...")
	dataset = pd.read_csv(metadata_path)
	train = dataset[dataset['dataset'] == 'train']
	test = dataset[dataset['dataset'] == 'test']
	val = dataset[dataset['dataset'] == 'val']
	print(f"Train set: found {len(train)} images")
	print(f"Test set: found {len(test)} images")
	print(f"Validation set: found {len(val)} images")

	# Compute the distinct labels once; their first-appearance order defines the indices.
	domains = train["experiment"].unique()
	mapping = {k: i for i, k in enumerate(domains)}
	spd = np.zeros(len(domains))

	# value_counts() skips missing labels, so a NaN domain (if present) keeps a count of 0.
	for domain, count in train['experiment'].value_counts().items():
		spd[mapping[domain]] = count
	return spd, mapping

In [None]:
# Absolute path to the metadata CSV consumed by get_samples_per_domain.
# NOTE(review): hardcoded, machine-specific location — adjust it (or move it to a config cell) before running elsewhere.
meta = "/work/h2020deciderficarra_shared/rxrx1/rxrx1_orig/metadata/meta.csv"
# spd: per-domain training image counts; mapping: experiment label -> index into spd.
spd, mapping = get_samples_per_domain(meta)

Computing samples per domain...
Train set: found 81224 images
Test set: found 34432 images
Validation set: found 9854 images


{'HEPG2-01': 0,
 'HEPG2-02': 1,
 'HEPG2-03': 2,
 'HEPG2-04': 3,
 'HEPG2-05': 4,
 'HEPG2-06': 5,
 'HEPG2-07': 6,
 'HUVEC-01': 7,
 'HUVEC-02': 8,
 'HUVEC-03': 9,
 'HUVEC-04': 10,
 'HUVEC-05': 11,
 'HUVEC-06': 12,
 'HUVEC-07': 13,
 'HUVEC-08': 14,
 'HUVEC-09': 15,
 'HUVEC-10': 16,
 'HUVEC-11': 17,
 'HUVEC-12': 18,
 'HUVEC-13': 19,
 'HUVEC-14': 20,
 'HUVEC-15': 21,
 'HUVEC-16': 22,
 'RPE-01': 23,
 'RPE-02': 24,
 'RPE-03': 25,
 'RPE-04': 26,
 'RPE-05': 27,
 'RPE-06': 28,
 'RPE-07': 29,
 'U2OS-01': 30,
 'U2OS-02': 31,
 'U2OS-03': 32}