# Measure dataset sizes

Use information from `measure_dandi.py` and `measure_openneuro.py` to estimate the total
size of records available on DANDI hub and openneuro. Gather metadata from the Dandi API. Add to that information recordings
scraped on iEEG.org. Clean up the records in cases where the data is nonsensical. 

In [47]:
from dandi.dandiapi import DandiAPIClient
from tqdm import tqdm
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression

# Load up all relevant metadata from DANDI archive and aggregate it.
metadata = []
for i in tqdm(range(730)):
    dandiset_id = f"{i:06d}"

    client = DandiAPIClient()
    try:
        dandiset = client.get_dandiset(dandiset_id)
        next(dandiset.get_assets())
    except:
        #print(f"Error getting {dandiset_id}")
        continue
    
    m_ = dandiset.get_raw_metadata()
    try:
        data_format = m_['assetsSummary']['dataStandard'][0]['identifier']
    except:
        data_format = 'RRID:SCR_015242'
    
    if data_format != 'RRID:SCR_015242':
        print(data_format)

    metadata.append(
        m_
    )

  0%|          | 0/730 [00:00<?, ?it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000000/: {"detail":"Not found."}
  0%|          | 1/730 [00:00<02:00,  6.04it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000001/: {"detail":"Not found."}
  0%|          | 2/730 [00:00<01:34,  7.67it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000002/: {"detail":"Not found."}
  2%|▏         | 14/730 [00:02<02:33,  4.67it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000014/: {"detail":"Not found."}
  4%|▍         | 28/730 [00:05<03:33,  3.29it/s]

RRID:SCR_016124


  7%|▋         | 53/730 [00:10<02:13,  5.08it/s]

RRID:SCR_016124


  8%|▊         | 57/730 [00:11<02:18,  4.85it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000057/: {"detail":"Not found."}
  8%|▊         | 59/730 [00:11<01:55,  5.82it/s]

RRID:SCR_016124


  8%|▊         | 62/730 [00:12<02:09,  5.17it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000062/: {"detail":"Not found."}
  9%|▉         | 68/730 [00:13<01:58,  5.59it/s]

RRID:SCR_016124


  9%|▉         | 69/730 [00:13<01:57,  5.61it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000069/: {"detail":"Not found."}
 10%|█         | 73/730 [00:14<01:39,  6.62it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000073/: {"detail":"Not found."}
 10%|█         | 74/730 [00:14<01:30,  7.26it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000074/: {"detail":"Not found."}
Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000075/: {"detail":"Not found."}
 10%|█         | 76/730 [00:14<01:17,  8.46it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000076/: {"detail":"Not found."}
Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000077/: {"detail":"Not found."}
 11%|█         | 78/730 [00:14<01:11,  9.15it/s]Error 404 while sending GET request to https://api.dandiarchive.org/ap

RRID:SCR_016124


 15%|█▍        | 109/730 [00:17<01:36,  6.42it/s]

RRID:SCR_016124


 15%|█▌        | 110/730 [00:18<01:43,  5.97it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000110/: {"detail":"Not found."}
 16%|█▋        | 119/730 [00:19<01:43,  5.89it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000119/: {"detail":"Not found."}
 20%|█▉        | 145/730 [00:23<01:49,  5.34it/s]

RRID:SCR_016124


 21%|██        | 152/730 [00:24<01:30,  6.37it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000152/: {"detail":"Not found."}
 24%|██▍       | 174/730 [00:28<01:36,  5.78it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000174/: {"detail":"Not found."}
 24%|██▍       | 175/730 [00:28<01:24,  6.54it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000175/: {"detail":"Not found."}
Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000176/: {"detail":"Not found."}
 24%|██▍       | 177/730 [00:28<01:10,  7.89it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000177/: {"detail":"Not found."}
Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000178/: {"detail":"Not found."}
 25%|██▍       | 179/730 [00:28<01:03,  8.70it/s]Error 404 while sending GET request to https://api.dandiarchive.o

RRID:SCR_016124


 34%|███▍      | 248/730 [00:39<01:37,  4.95it/s]Error 401 while sending GET request to https://api.dandiarchive.org/api/dandisets/000248/: {"detail":"Authentication credentials were not provided."}
 35%|███▍      | 253/730 [00:40<01:34,  5.07it/s]Error 401 while sending GET request to https://api.dandiarchive.org/api/dandisets/000253/: {"detail":"Authentication credentials were not provided."}
Error 401 while sending GET request to https://api.dandiarchive.org/api/dandisets/000254/: {"detail":"Authentication credentials were not provided."}
 35%|███▌      | 256/730 [00:40<01:11,  6.66it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000256/: {"detail":"Not found."}
Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000257/: {"detail":"Not found."}
 35%|███▌      | 258/730 [00:40<01:00,  7.80it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000258/: {"detail":"Not found."}
Error 404

RRID:SCR_016124


 78%|███████▊  | 573/730 [01:20<00:32,  4.89it/s]Error 401 while sending GET request to https://api.dandiarchive.org/api/dandisets/000573/: {"detail":"Authentication credentials were not provided."}
 79%|███████▉  | 578/730 [01:21<00:26,  5.80it/s]Error 401 while sending GET request to https://api.dandiarchive.org/api/dandisets/000578/: {"detail":"Authentication credentials were not provided."}
 79%|███████▉  | 580/730 [01:21<00:24,  6.08it/s]Error 401 while sending GET request to https://api.dandiarchive.org/api/dandisets/000580/: {"detail":"Authentication credentials were not provided."}
Error 401 while sending GET request to https://api.dandiarchive.org/api/dandisets/000581/: {"detail":"Authentication credentials were not provided."}
 80%|███████▉  | 583/730 [01:21<00:22,  6.54it/s]Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000583/: {"detail":"Not found."}
Error 404 while sending GET request to https://api.dandiarchive.org/api/dandisets/000584/

In [48]:
# For each dataset, take the 3 last digits of the identifier.
# Then from ../scripts/results, load dandiset-{digits}.jsonl as a json lines file via pandas.
# Deduplicate the rows by asset_id, picking the last element. Then, aggregate the results 
# via a sum and cast the results back to a dict. 
# Get all the results into a list.
file_metadata = []
for dandiset in tqdm(metadata):
    dandiset_id = dandiset["identifier"][-3:]
    try:
        df = pd.read_json(f"../scripts/results/dandiset-{dandiset_id}.jsonl", lines=True)
    except:
        continue
    df = df.drop_duplicates(subset="asset_id", keep="last")
    files = df.sum().to_dict()
    files["nfiles"] = df.shape[0]
    dandiset.update(files)

  files = df.sum().to_dict()
  files = df.sum().to_dict()
100%|██████████| 220/220 [00:02<00:00, 94.66it/s] 


In [83]:
openneuro_metadata = []
for dataset in Path('../scripts/results').glob('openneuro*.jsonl'):
    try:
        df = pd.read_json(dataset, lines=True)
    except:
        continue
    df = df.drop_duplicates(subset=["dataset_id", "subject"], keep="last")
    files = df.groupby(['dataset_id']).sum().reset_index()
    files['subjects'] = df.shape[0]
    files['modality'] = df.data_type.iloc[0]
    openneuro_metadata.append(files.to_dict(orient='records')[0])

df_openneuro = pd.DataFrame(openneuro_metadata)

df_openneuro['species'] = 'Homo sapiens - Human'
df_openneuro = df_openneuro.rename(columns={'dataset_id': 'identifier',
                                            'total_duration': 'recording_length'})

  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_id']).sum().reset_index()
  files = df.groupby(['dataset_

In [84]:
species_map = {'DANDI:000052': 'Unknown',
               'DANDI:000058': 'Unknown',
               'DANDI:000105': 'Unknown',
               'DANDI:000117': 'Unknown',
               'DANDI:000683': 'Unknown',
               'DANDI:000724': 'Homo sapiens - Human'}

clean_results = []

normalized_species = {
    'Macaca nemestrina': 'Macaca nemestrina - Pigtail macaque',
    'Macaca nemestrina - Pig-tailed macaque': 'Macaca nemestrina - Pigtail macaque',
    'Human': 'Homo sapiens - Human',
    'Rhesus monkey': 'Macaca mulatta - Rhesus monkey',
    'House mouse': 'Mus musculus - House mouse',
    'Rat': 'Rattus norvegicus - Norway rat',
    'Rat; norway rat; rats; brown rat': 'Rattus norvegicus - Norway rat',
    'Brown rat': 'Rattus norvegicus - Norway rat',
}

blacklist = [8, 20, 23, 26, 45, 109, 142, 209, 212, 288, 341, 489, 537]
blacklisted = [f"{x:03}" for x in blacklist]
all_species = []
for row in metadata:
    try:
        species = row['assetsSummary']['species'][0]['name']
    except (IndexError, KeyError):
        # Look in the name, description, and keywords, in that order
        if ' rat' in row['name'].lower():
            species = 'Rat'
        elif ' mouse' in row['name'].lower():
            species = 'House mouse'
        elif ' human' in row['name'].lower():
            species = 'Human'
        if row['name'].lower().startswith('rat'):
            species = 'Rat'
        elif row['name'].lower().startswith('mouse'):
            species = 'House mouse'
        elif row['name'].lower().startswith('human'):
            species = 'Human'
        elif ' rat' in row['description'].lower():
            species = 'Rat'
        elif ' mouse' in row['description'].lower():
            species = 'House mouse'
        elif ' mice' in row['description'].lower():
            species = 'House mouse'
        elif ' human' in row['description'].lower():
            species = 'Human'
        elif [kw for kw in row.get('keywords', []) if kw.lower().startswith('rat')]:
            species = 'Rat'
        elif [kw for kw in row.get('keywords', []) if kw.lower().startswith('mouse')]:
            species = 'House mouse'
        elif [kw for kw in row.get('keywords', []) if kw.lower().startswith('human')]:
            species = 'Human'
        elif 'simulation' in row['name'].lower():
            species = 'Simulation'
        elif 'test' in row['name'].lower():
            species = 'Unknown'
        elif 'stimulus' in row['name'].lower():
            species = 'Unknown'
        elif 'organoid' in row['name'].lower():
            species = 'Organoid'
        elif 'elegans' in row['name'].lower():
            species = 'Caenorhabditis elegans'
        else:
            species = species_map[row['identifier']]
    refined_species = normalized_species.get(species, species)
    recording_length = max([row.get('spike_recording_length', 0),
                            row.get('lfp_recording_length', 0),
                            row.get('ca_recording_length', 0)])
    row['recording_length'] = recording_length
    row['canonical_species'] = refined_species

    # Figure out the recording modality from context
    modalities = []
    if row.get('lfp_recording_length', 0) > 0:
        if refined_species == 'Homo sapiens - Human':
            modalities.append('iEEG')
        else:
            modalities.append('lfp')
    if row.get('nspikes', 0) > 0:
        modalities.append('spikes')
    if row.get('ca_recording_length', 0) > 0:
        modalities.append('calcium')
    if row.get('recording_type', "") == "icephys":
        modalities.append("icephys")
        
    if len(modalities) == 0:
        # Check for a clamp indicator
        if [x for x in row['assetsSummary'].get('variableMeasured', []) if 'Clamp' in x]:
            modalities.append('icephys')
        elif row['identifier'][-3:] in blacklisted:
            modalities.append("blacklisted")
        else:
            modalities.append('unknown')

    for modality in modalities:
        nframes_samples_or_spikes = 0
        if modality in ('lfp', 'iEEG'):
            nframes_samples_or_spikes = row['lfp_samples']
        elif modality == 'spikes':
            nframes_samples_or_spikes = row['nspikes']
        elif modality == 'calcium':
            nframes_samples_or_spikes = row['ca_samples']

        clean_results.append(
            {'identifier': row['identifier'],
             'species': refined_species,
             'recording_length': recording_length,
             'modality': modality,
             'nframes_samples_or_spikes': nframes_samples_or_spikes,
             'bytes': row['assetsSummary'].get("numberOfBytes", 0), 
             'subjects': row['assetsSummary'].get("numberOfSubjects", 0), 
             'files': row['assetsSummary'].get("numberOfFiles", 0), 
            }
        )

    print(row['identifier'],
          refined_species, 
          recording_length, 
          modalities, 
          row['assetsSummary'].get("numberOfBytes", 0), 
          row['assetsSummary'].get("numberOfSubjects", 0), 
          row['assetsSummary'].get("numberOfFiles", 0)
          )
    
clean_results += df_openneuro.to_dict(orient='rows')

DANDI:000003 Mus musculus - House mouse 3665367.9255999997 ['lfp', 'spikes'] 2559248010229 16 101
DANDI:000004 Homo sapiens - Human 172911.3089485524 ['spikes'] 6197474020 59 87
DANDI:000005 Mus musculus - House mouse 31132.45 ['spikes'] 46436686324 55 148
DANDI:000006 Mus musculus - House mouse 34789.238391000006 ['spikes'] 139600500 12 53
DANDI:000007 Mus musculus - House mouse 110743.41024337358 ['spikes'] 199439472 13 54
DANDI:000008 Mus musculus - House mouse 0 ['blacklisted'] 11922334254 266 1328
DANDI:000009 Mus musculus - House mouse 159106.66765316797 ['spikes'] 12919706852 31 173
DANDI:000010 Mus musculus - House mouse 224628.36568132794 ['spikes', 'calcium'] 40006570644 23 158
DANDI:000011 Mus musculus - House mouse 253327.97210887514 ['spikes'] 32435325542 19 92
DANDI:000012 Homo sapiens - Human 0 ['icephys'] 487524911 4 297
DANDI:000013 Mus musculus - House mouse 0 ['icephys'] 11408735292 23 52
DANDI:000015 Mus musculus - House mouse 280013.2054845643 ['calcium'] 171597277

  clean_results += df_openneuro.to_dict(orient='rows')


Add iEEG data from iEEG.org and from the ETH archives.

In [85]:
clean_results.append(
    {'identifier': 'iEEG.org:all',
     'modality': 'iEEG',
     'species': 'Homo sapiens - Human',
     'recording_length': 98103600,
     'nframes_samples_or_spikes': 49051800000,  # Estimated at 500Hz
     'bytes': 49051800000 * 2 * 8, # Assuming 16-bit precision, 8 contacts
     'subjects': 607,
     'files': 607,
     }
)

clean_results.append(
    {'identifier': 'SWEC-ETHZ',
     'modality': 'iEEG',
     'species': 'Homo sapiens - Human',
     'recording_length': 9000000,
     'nframes_samples_or_spikes': 9000000 * 512,  # Estimated at 500Hz
     'bytes': 9000000 * 512 * 4 * 64, # Assuming 32-bit precision, 64 contacts
     'subjects': 18,
     'files': 2500,
     }
)

In [171]:
df = pd.DataFrame(clean_results)
df.modality = df.modality.str.lower()
df = df[~df.identifier.isin(['ds004091', 'ds004271', 'ds001607', 'ds003972', 'ds003974', 'ds003872', 'ds003988', 'ds004312'])]

to_lower = ['ds004278', 'ds004011', 'ds003343', 'ds004802', 'ds003620', 'ds004019', 
            'ds004018', 'ds003751', 'ds004151', 'ds003887', 'ds003885', 'ds002721', 
            'ds004018', 'ds003751', 'ds004043', 'ds003420', 'ds004816', 'ds004817', 
            'ds004252', 'ds003694']
idx = df.identifier.isin(to_lower)
df.loc[idx, 'recording_length'] = df.loc[idx, 'recording_length'] / 1e3

# Engage in extrapolation for ieeg, eeg, task-fmri, rs-fmri
def extrapolate(df, modality):
    """
    Extrapolate the duration of datasets with missing time information from their
    size. Find those which have a recording length which is non-zero. Run a linear 
    regression between the size and the duration of datasets, Then apply that to the 
    zero elements.
    """
    df_ = df[(df.modality == modality) & (df.recording_length != 0)]
    fit = LinearRegression(fit_intercept=False)
    fit.fit(df_.bytes.values.reshape((-1, 1)), df_.recording_length)

    idx = (df.modality == modality) & (df.recording_length == 0)
    df_ = df[idx]
    df.loc[idx, 'recording_length'] = fit.predict(df[idx].bytes.values.reshape((-1, 1)))
    return fit.coef_

extrapolate(df, 'eeg')
extrapolate(df, 'ieeg')
coef = extrapolate(df, 'task-fmri')
idx = df.modality == 'rs-fmri'
df.loc[idx, 'recording_length'] = df[idx, 'bytes'] * coef

df = df[df['modality'].isin(['calcium', 'ieeg', 'lfp', 'spikes', 'task-fmri', 'eeg', 'meg', 'rs-fmri'])]
df = df[df['species'].isin(['Drosophila melanogaster - Fruit fly', 
                           'Homo sapiens - Human', 
                           'Macaca mulatta - Rhesus monkey',
                           'Mus musculus - House mouse',
                           'Rattus norvegicus - Norway rat'])]
df.groupby(['species', 'modality']).sum()

  df.groupby(['species', 'modality']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,recording_length,nframes_samples_or_spikes,bytes,subjects,files
species,modality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Drosophila melanogaster - Fruit fly,calcium,5636158.0,16001420.0,776243300000.0,1347,1473.0
Homo sapiens - Human,eeg,9025890.0,0.0,15425020000000.0,9888,21467.0
Homo sapiens - Human,ieeg,214195200.0,105944400000.0,5331130000000.0,2086,5645.0
Homo sapiens - Human,meg,793541.2,0.0,1028498000000.0,294,1672.0
Homo sapiens - Human,rs-fmri,792012.3,0.0,4819255000000.0,9275,38362.0
Homo sapiens - Human,spikes,364549.9,34411530.0,235278100000.0,149,287.0
Homo sapiens - Human,task-fmri,34287350.0,0.0,20484820000000.0,14702,95914.0
Macaca mulatta - Rhesus monkey,lfp,69164.94,138329900.0,187535000000.0,2,15.0
Macaca mulatta - Rhesus monkey,spikes,3390446.0,5114969000.0,290112300000.0,31,2097.0
Mus musculus - House mouse,calcium,11578000.0,423047400.0,14609580000000.0,902,11917.0


In [169]:
df_ = df.groupby(['modality']).sum()
df_['recording_length_hours'] = df_.recording_length * (1/ 3600)
df_['equiv_tokens'] = df_.recording_length * 100
df_['terabytes'] = df_.bytes / (1024 * 1024 * 1024 * 1024)
df_[['recording_length', 'recording_length_hours', 'equiv_tokens', 'nframes_samples_or_spikes', 'terabytes', 'subjects', 'files']]

  df_ = df.groupby(['modality']).sum()


Unnamed: 0_level_0,recording_length,recording_length_hours,equiv_tokens,nframes_samples_or_spikes,terabytes,subjects,files
modality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
calcium,17217680.0,4782.688367,1721768000.0,439080700.0,14.069975,2259,13410.0
eeg,9025890.0,2507.191722,902589000.0,0.0,14.028972,9888,21467.0
ieeg,214195200.0,59498.671914,21419520000.0,105944400000.0,4.848635,2086,5645.0
lfp,44940920.0,12483.58954,4494092000.0,233200700000.0,162.35187,1074,14830.0
meg,793541.2,220.42811,79354120.0,0.0,0.935413,294,1672.0
spikes,38535150.0,10704.208613,3853515000.0,52122680000.0,116.061051,1262,18627.0
task-fmri,34287350.0,9524.262609,3428735000.0,0.0,18.630835,14702,95914.0
