In [3]:
import os
import sys
os.chdir('/home/mhoerold/entrack')
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
#os.environ["CUDA_VISIBLE_DEVICES"]="4"
#sys.path.append('/local/home/dhaziza/entrack/')

import src.data.streaming.features as ft_def
import csv
import re
import glob
import pickle
import time
import xml.etree.ElementTree as ET

class NoSuchXMLElementException(Exception):
    pass

def intvals(s):
    vals = re.findall(r'\d+', s)
    return [int(i) for i in vals]

def intval(s):
    vals = intvals(s)
    return vals[0]

def boolval(cond):
    return 1 if cond else 0

def print_stats(dataset, name, field, cond):
    subject_ids = set()
    num = 0.0
    total_value = 0.0
    for row in dataset:
        if cond(row):
            num += 1
            total_value += row[field]
            subject_ids.add(row[ft_def.STUDY_PATIENT_ID])

    if num == 0:
        print('!! NO DATA FOR %s' % name)
        return
    value_mean = total_value/num

    value_std = 0.0
    for row in dataset:
        if cond(row):
            value_std += (row[field] - value_mean)*(row[field] - value_mean)
    value_std /= num
    print('%s Mean %s %f, variance %f [%d images / %d unique patients]' % (
        name, field, value_mean, value_std, num, len(subject_ids)))

In [2]:
# Koln data
csv_orig = 'data/raw/csv/orig/koln.csv'
csv_output = 'data/raw/csv/koln.csv'

data = []
with open(csv_orig) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data.append({
            ft_def.AGE: int(row['Alter']),
            'health_pd': 1,
            ft_def.HEALTHY: 0,
            ft_def.SEX: int(row['Geschlecht (male =1; female =2)'])-1,
            ft_def.STUDY_PATIENT_ID: intval(row['ID'])
        })

with open(csv_output, "wb+") as csvfile:
    writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
    writer.writeheader()
    for row in data:
        writer.writerow(row)

print_stats(data, 'KOLN', 'age', lambda r: True)

KOLN Mean age 63.546875, variance 83.904053 [128 images / 128 unique patients]


In [30]:
# Load ADNI/AIBL features from XML
def load_image_features_from_xml(path, image_id_prefix=''):
    image_id_to_feature = {}
    all_files = glob.glob(path)
    print('Loading %d XML files... Please be patient! :)' % len(all_files))
    begin_time = time.time()
    milestone = 100
    for i, f in enumerate(all_files):
        if i >= milestone:
            milestone *= 2
            print('  .. done %d in %ds' % (i, time.time() - begin_time))
        tree = ET.parse(f)
        root = tree.getroot()
        def elem_unique(path):
            e = root.findall(path)
            if len(e) == 0:
                raise NoSuchXMLElementException()
            assert(len(e) == 1)
            return e[0]
        image_id_text = elem_unique("./project/subject/study/imagingProtocol/imageUID").text
        sex_text = elem_unique("./project/subject/subjectSex").text
        age_text = elem_unique("./project/subject/study/subjectAge").text
        research_group = elem_unique("./project/subject/researchGroup").text
        image_id = int(elem_unique("./project/subject/study/imagingProtocol/imageUID").text)
        subject_id_text = elem_unique("./project/subject/subjectIdentifier").text
        manufacturer = elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Manufacturer']").text
        field_strength = elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Field Strength']").text
        try:
            weighting = elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Weighting']").text
        except NoSuchXMLElementException:
            weighting = 'DTI'
        key = '%s%s' % (image_id_prefix, image_id)
        assert key not in image_id_to_feature
        image_id_to_feature[key] = {
            ft_def.AGE: float(age_text),
            ft_def.SEX: 0 if sex_text == 'M' else 1,
            ft_def.STUDY_IMAGE_ID: int(image_id_text),
            ft_def.SUBJECT_LABEL: subject_id_text,
            ft_def.MRI_MANUFACTURER: manufacturer,
            ft_def.MRI_FIELD_STRENGTH: field_strength,
            ft_def.WEIGHTING: weighting,
        }
    return image_id_to_feature

adni_aibl_features_from_xml = load_image_features_from_xml('/home/mhoerold/Downloads/T1/ADNI/*.xml')
adni_aibl_features_from_xml.update(load_image_features_from_xml(
    '/home/mhoerold/Downloads/AIBL_T1/AIBL/*.xml', image_id_prefix='A'))

Loading 28504 XML files... Please be patient! :)
  .. done 100 in 0s
  .. done 200 in 0s
  .. done 400 in 0s
  .. done 800 in 0s
  .. done 1600 in 0s
  .. done 3200 in 0s
  .. done 6400 in 0s
  .. done 12800 in 1s
  .. done 25600 in 3s
Loading 1615 XML files... Please be patient! :)
  .. done 100 in 0s
  .. done 200 in 0s
  .. done 400 in 0s
  .. done 800 in 0s
  .. done 1600 in 0s


In [4]:
# ADNI AIBL
csv_orig = 'data/raw/csv/orig/loni_adni_aibl.csv'
adni_aibl_labels = pickle.load(open('/local/ADNI_AIBL/ADNI_AIBL_T1_normalized/py2/AIBL_ADNI_class_labels_T1.pkl', 'r'))
adni_aibl_valid_set = pickle.load(open('/local/ADNI_AIBL/ADNI_AIBL_T1_normalized/py2/AIBL_ADNI_valid_T1_NC_AD.pkl', 'r'))

ADNI_AIBL_GROUPS = ['Normal', 'MCI', 'AD', 'EMCI', 'LMCI', 'SMC']
def convert_adni_aibl(csv_output, cond):
    print('ADNI_AIBL: Converting to %s' % (csv_output))
    data = []
    with open(csv_orig) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['Project'] not in ['ADNI', 'AIBL']:
                continue
            image_id_sai = image_id = str(row['Image ID'])
            image_id = int(image_id)
            study_id = 0
            if row['Project'] == 'AIBL':
                image_id_sai = 'A' + image_id_sai
                study_id = 1
            if image_id_sai not in adni_aibl_labels:
                continue
            label_id = adni_aibl_labels[image_id_sai]
            group = ADNI_AIBL_GROUPS[label_id]
            if group not in ADNI_AIBL_GROUPS:
                continue
            cur_data = {
                ft_def.SUBJECT_LABEL: row['Subject ID'],
                ft_def.IMAGE_LABEL: image_id_sai,
                ft_def.AGE: intval(row['Age']),
                'health_ad': boolval(group == 'AD'),
                'health_emci': boolval(group == 'EMCI'),
                'health_lmci': boolval(group == 'LMCI'),
                'health_mci': boolval(group == 'MCI'),
                'health_smc': boolval(group == 'SMC'),
                ft_def.HEALTHY: boolval(group == 'Normal'),
                ft_def.SEX: 0 if row['Sex'] == 'M' else 1,
                ft_def.STUDY_IMAGE_ID: image_id,
                ft_def.STUDY_PATIENT_ID: hash(row['Subject ID']) % 100000000,
                ft_def.STUDY_ID: study_id,
                ft_def.DATASET: 'test' if image_id_sai in adni_aibl_valid_set else 'train',
            }
            cur_data.update(adni_aibl_features_from_xml[image_id_sai])
            if not cond(row, cur_data):
                continue
            data.append(cur_data)

    with open(csv_output, "wb+") as csvfile:
        writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    name = os.path.basename(csv_output).split('.csv')[0]
    print_stats(data, '*', 'age', lambda x: True)
    for ft in ['health_ad', ft_def.HEALTHY]:
        print_stats(data, ft, 'age', lambda row: row[ft] == 1)
        print_stats(data, 'SAI_TEST/%s' % ft, 'age', lambda row: row[ft] == 1 and row['dataset'] == 'test')
        print_stats(data, 'SAI_TRAIN/%s' % ft, 'age', lambda row: row[ft] == 1 and row['dataset'] == 'train')
        print_stats(data, 'ADNI/%s' % ft, 'age', lambda row: row[ft] == 1 and row['study_id'] == 0)
        print_stats(data, 'AIBL/%s' % ft, 'age', lambda row: row[ft] == 1 and row['study_id'] == 1)
    print('')

def is_ad_or_hc(f):
    return f['health_ad'] == 1 or f[ft_def.HEALTHY] == 1

convert_adni_aibl('data/raw/csv/adni_aibl.csv', lambda r, f: True)
convert_adni_aibl('data/raw/csv/adni_aibl__ad_hc.csv', lambda r, f: is_ad_or_hc(f))

ADNI_AIBL: Converting to data/raw/csv/adni_aibl.csv
* Mean age 74.851736, variance 53.252787 [19762 images / 2449 unique patients]
health_ad Mean age 75.643432, variance 58.140956 [4476 images / 742 unique patients]
SAI_TEST/health_ad Mean age 76.192500, variance 51.725444 [400 images / 70 unique patients]
SAI_TRAIN/health_ad Mean age 75.589549, variance 58.738056 [4076 images / 672 unique patients]
ADNI/health_ad Mean age 75.687428, variance 57.892841 [4335 images / 646 unique patients]
AIBL/health_ad Mean age 74.290780, variance 63.879986 [141 images / 96 unique patients]
healthy Mean age 75.355394, variance 39.633441 [6618 images / 1048 unique patients]
SAI_TEST/healthy Mean age 75.210000, variance 39.330900 [400 images / 62 unique patients]
SAI_TRAIN/healthy Mean age 75.364748, variance 39.651456 [6218 images / 986 unique patients]
ADNI/healthy Mean age 75.694186, variance 38.329251 [5814 images / 591 unique patients]
AIBL/healthy Mean age 72.905473, variance 42.232358 [804 images 

In [5]:
# Erasmus data
csv_orig = '/local/ERSM/ADNI_Diagnosis_mciconv.csv'

ERASMUS_ADNI_GROUPS = ['CN', 'AD', 'MCI']
def convert_erasmus_adni(csv_output, cond):
    print('ERASMUS_ADNI: Converting to %s' % (csv_output))
    data = []
    i = 0
    with open(csv_orig) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # Reconstruct a single int subject id from something like '002_S_0295'
            subject_label = row['PTID']
            group = row['Diagnosis']
            if group not in ERASMUS_ADNI_GROUPS:
                continue
            cur_data = {
                'health_ad': boolval(group == 'AD'),
                'health_mci': boolval(group == 'MCI'),
                ft_def.HEALTHY: boolval(group == 'CN'),
                ft_def.IMAGE_LABEL: subject_label,
                ft_def.SUBJECT_LABEL: subject_label,
                ft_def.STUDY_PATIENT_ID: i,
            }
            i += 1
            if not cond(row, cur_data):
                continue
            data.append(cur_data)

    with open(csv_output, "wb+") as csvfile:
        writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    name = os.path.basename(csv_output).split('.csv')[0]
    print_stats(data, '*', 'healthy', lambda x: True)
    for ft in ['health_ad', 'health_mci', ft_def.HEALTHY]:
        print_stats(data, ft, ft, lambda row: row[ft] == 1)
    print('')

convert_erasmus_adni('data/raw/csv/erasmus_adni.csv', lambda r, d: True)

ERASMUS_ADNI: Converting to data/raw/csv/erasmus_adni.csv
* Mean healthy 0.240069, variance 0.182436 [1737 images / 1737 unique patients]
health_ad Mean health_ad 1.000000, variance 0.000000 [342 images / 342 unique patients]
health_mci Mean health_mci 1.000000, variance 0.000000 [978 images / 978 unique patients]
healthy Mean healthy 1.000000, variance 0.000000 [417 images / 417 unique patients]



In [6]:
# ADNI/AIBL: sai dataset
train_ad_nc = pickle.load(open('/local/ADNI_AIBL/ADNI_AIBL_T1_normalized/py2/AIBL_ADNI_train_T1_NC_AD.pkl', 'r'))
valid_ad_nc = pickle.load(open('/local/ADNI_AIBL/ADNI_AIBL_T1_normalized/py2/AIBL_ADNI_valid_T1_NC_AD.pkl', 'r'))
csv_output = 'data/raw/csv/adni_aibl__ad_hc__sai.csv'

print('ADNI_AIBL[sai_ad_nc]: Converting to %s' % (csv_output))
data = []
for image_id_sai, is_ad in train_ad_nc.items():
    cur_data = adni_aibl_features_from_xml[image_id_sai.split('_')[0]]
    cur_data.update({
        ft_def.IMAGE_LABEL: image_id_sai,
        'health_ad': is_ad,
        ft_def.HEALTHY: boolval(is_ad == 0),
        ft_def.DATASET: 'train',
    })
    data.append(cur_data)
for image_id_sai, is_ad in valid_ad_nc.items():
    cur_data = adni_aibl_features_from_xml[image_id_sai.split('_')[0]]
    cur_data.update({
        ft_def.IMAGE_LABEL: image_id_sai,
        'health_ad': is_ad,
        ft_def.HEALTHY: boolval(is_ad == 0),
        ft_def.DATASET: 'test',
    })
    data.append(cur_data)

with open(csv_output, "wb+") as csvfile:
    writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
    writer.writeheader()
    for row in data:
        writer.writerow(row)

ADNI_AIBL[sai_ad_nc]: Converting to data/raw/csv/adni_aibl__ad_hc__sai.csv


In [7]:
# PPMI - datakey is image_id
# Example:
# <subject_id>/Axial_PD-T2_TSE_FS/2013-04-09_09_24_46.0/S<serie_id>/
# ... PPMI_4139_MR_Axial_PD-T2_TSE_FS_br_raw_20130625124532171_76_S<serie_id>_I<image_id>.nii

csv_output = 'data/raw/csv/ppmi.csv'

image_sizes = {}
ppmi_path = '/local/PPMI/raw/'
data = []
for f in glob.glob(ppmi_path + '*.xml'):
    tree = ET.parse(f)
    root = tree.getroot()
    def elem_unique(path):
        e = root.findall(path)
        if len(e) == 0:
            raise NoSuchXMLElementException()
        assert(len(e) == 1)
        return e[0]
    sex_text = elem_unique("./project/subject/subjectSex").text
    age_text = elem_unique("./project/subject/study/subjectAge").text
    research_group = elem_unique("./project/subject/researchGroup").text
    image_id_text = elem_unique("./project/subject/study/imagingProtocol/imageUID").text
    subject_id_text = elem_unique("./project/subject/subjectIdentifier").text
    post_mortem_text = elem_unique("./project/subject/study/postMortem").text
    try:
        weighting = elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Weighting']").text
    except NoSuchXMLElementException:
        weighting = 'DTI'
    assert(post_mortem_text == 'F')
    assert(weighting in ['T1', 'T2', 'DTI'])
    if research_group not in ['Control', 'PD', 'GenCohort PD', 'GenCohort Unaff', 'Prodromal', 'SWEDD']:
        continue
    data.append({
        ft_def.AGE: intval(age_text),
        'health_pd': boolval(research_group == 'PD'),
        'health_prodromal': boolval(research_group == 'Prodromal'),
        'health_swedd': boolval(research_group == 'SWEDD'),
        'health_gencohort_unaff': boolval(research_group == 'GenCohort Unaff'),
        'health_gencohort_pd': boolval(research_group == 'GenCohort PD'),
        ft_def.HEALTHY: boolval(research_group == 'Control'),
        ft_def.SEX: 0 if sex_text == 'M' else 1,
        ft_def.STUDY_IMAGE_ID: int(image_id_text),
        ft_def.STUDY_PATIENT_ID: int(subject_id_text),
        ft_def.SUBJECT_LABEL: subject_id_text,
    })
    # Also count how many samples per image size
    img_size = (
        elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Matrix X']").text,
        elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Matrix Y']").text,
        elem_unique("./project/subject/study/imagingProtocol/protocolTerm/protocol[@term='Matrix Z']").text,
    )
    if not img_size in image_sizes:
        image_sizes[img_size] = 0
    image_sizes[img_size] += 1

with open(csv_output, "wb+") as csvfile:
    writer = csv.DictWriter(csvfile, data[0].keys(), dialect='excel')
    writer.writeheader()
    for row in data:
        writer.writerow(row)

print('%s different sizes found!' % (len(image_sizes)))
print_stats(data, 'PPMI', 'age', lambda x: True)
print_stats(data, 'PPMI/PD', 'age', lambda x: x['health_pd'])
print_stats(data, 'PPMI/Control', 'age', lambda x: x[ft_def.HEALTHY])

194 different sizes found!
PPMI Mean age 61.572225, variance 96.216637 [3766 images / 990 unique patients]
PPMI/PD Mean age 61.389093, variance 94.107652 [1907 images / 402 unique patients]
PPMI/Control Mean age 60.088195, variance 126.102127 [737 images / 183 unique patients]


In [5]:
dic = adni_aibl_features_from_xml

In [6]:
len(dic)

30119

In [8]:
dic['143220']

{'age': 79.3945,
 'sex': 1,
 'study_image_id': 143220,
 'patient_label': '002_S_0413',
 'mri_manufacturer': 'GE MEDICAL SYSTEMS',
 'mri_field_strength': '1.5'}

In [8]:
dic['A444850']

{'age': 81.0,
 'sex': 0,
 'study_image_id': 444850,
 'patient_label': '902',
 'mri_manufacturer': 'SIEMENS',
 'mri_field_strength': '3.0',
 'weighting': 'T1'}

In [9]:
import pandas as pd
df = pd.DataFrame.from_dict(dic, orient='index')

In [10]:
df

Unnamed: 0,age,sex,study_image_id,patient_label,mri_manufacturer,mri_field_strength,weighting
100003,74.3973,0,100003,094_S_1267,GE MEDICAL SYSTEMS,3.0,T1
100004,74.3973,0,100004,094_S_1267,GE MEDICAL SYSTEMS,3.0,T1
10002,75.8904,1,10002,099_S_0111,SIEMENS,1.494,T1
100020,62.8575,0,100020,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1
100021,62.8575,0,100021,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1
100022,62.8575,0,100022,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1
100023,62.8575,0,100023,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1
100029,86.0411,0,100029,941_S_1194,SIEMENS,1.5,T1
10003,75.8904,1,10003,099_S_0111,SIEMENS,1.494,T1
100030,86.0411,0,100030,941_S_1194,SIEMENS,1.5,T1


In [11]:
df['image_label'] = df.index

In [12]:
df

Unnamed: 0,age,sex,study_image_id,patient_label,mri_manufacturer,mri_field_strength,weighting,image_label
100003,74.3973,0,100003,094_S_1267,GE MEDICAL SYSTEMS,3.0,T1,100003
100004,74.3973,0,100004,094_S_1267,GE MEDICAL SYSTEMS,3.0,T1,100004
10002,75.8904,1,10002,099_S_0111,SIEMENS,1.494,T1,10002
100020,62.8575,0,100020,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1,100020
100021,62.8575,0,100021,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1,100021
100022,62.8575,0,100022,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1,100022
100023,62.8575,0,100023,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1,100023
100029,86.0411,0,100029,941_S_1194,SIEMENS,1.5,T1,100029
10003,75.8904,1,10003,099_S_0111,SIEMENS,1.494,T1,10003
100030,86.0411,0,100030,941_S_1194,SIEMENS,1.5,T1,100030


In [13]:
df_all = pd.read_csv('data/raw/csv/adni_aibl.csv')

In [14]:
l1 = set(df.image_label.unique())
l2 = set(df_all.image_label.unique())

In [15]:
inter = l1.intersection(l2)

In [16]:
len(inter)

19760

In [17]:
len(l1.difference(l2))

10359

In [18]:
l2.difference(l1)

{'269070', '269074'}

In [19]:
df

Unnamed: 0,age,sex,study_image_id,patient_label,mri_manufacturer,mri_field_strength,weighting,image_label
100003,74.3973,0,100003,094_S_1267,GE MEDICAL SYSTEMS,3.0,T1,100003
100004,74.3973,0,100004,094_S_1267,GE MEDICAL SYSTEMS,3.0,T1,100004
10002,75.8904,1,10002,099_S_0111,SIEMENS,1.494,T1,10002
100020,62.8575,0,100020,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1,100020
100021,62.8575,0,100021,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1,100021
100022,62.8575,0,100022,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1,100022
100023,62.8575,0,100023,094_S_1330,GE MEDICAL SYSTEMS,1.5,T1,100023
100029,86.0411,0,100029,941_S_1194,SIEMENS,1.5,T1,100029
10003,75.8904,1,10003,099_S_0111,SIEMENS,1.494,T1,10003
100030,86.0411,0,100030,941_S_1194,SIEMENS,1.5,T1,100030


In [37]:
sub_df = df[['age', 'image_label', 'weighting']]
sub_df = sub_df.rename(columns={'age': 'age_exact'})
#sub_df_all = df_all.drop(['age'], axis=1)
sub_df_all = df_all

In [38]:
m = pd.merge(sub_df_all, sub_df, how='inner')

In [39]:
m

Unnamed: 0,health_ad,patient_label,sex,health_emci,mri_manufacturer,health_lmci,health_smc,dataset,health_mci,study_patient_id,healthy,age,study_id,mri_field_strength,study_image_id,image_label,age_exact,weighting
0,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,84,0,1.5,13721,13721,84.9041,T1
1,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,84,0,1.5,13722,13722,84.9041,T1
2,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,85,0,1.5,28560,28560,85.4466,T1
3,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,85,0,1.5,28561,28561,85.4466,T1
4,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,86,0,1.5,55275,55275,86.0055,T1
5,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,86,0,1.5,55276,55276,86.0055,T1
6,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,87,0,1.5,114209,114209,87.1699,T1
7,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,87,0,1.5,114210,114210,87.1699,T1
8,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,88,0,1.5,144446,144446,88.0000,T1
9,0,002_S_0295,0,0,GE MEDICAL SYSTEMS,0,0,train,0,93801557,1,88,0,1.5,144447,144447,88.0000,T1


In [40]:
len(m)

19760

In [41]:
len(l2)

19762

In [42]:
dic

{'82064': {'age': 82.137,
  'sex': 1,
  'study_image_id': 82064,
  'patient_label': '023_S_0030',
  'mri_manufacturer': 'SIEMENS',
  'mri_field_strength': '1.5',
  'weighting': 'T1'},
 '113318': {'age': 65.6822,
  'sex': 0,
  'study_image_id': 113318,
  'patient_label': '012_S_0689',
  'mri_manufacturer': 'Philips Medical Systems',
  'mri_field_strength': '1.5',
  'weighting': 'T1'},
 '20544': {'age': 72.5808,
  'sex': 0,
  'study_image_id': 20544,
  'patient_label': '129_S_0778',
  'mri_manufacturer': 'GE MEDICAL SYSTEMS',
  'mri_field_strength': '1.5',
  'weighting': 'T1'},
 '244093': {'age': 67.4,
  'sex': 0,
  'study_image_id': 244093,
  'patient_label': '141_S_2333',
  'mri_manufacturer': 'SIEMENS',
  'mri_field_strength': '3.0',
  'weighting': 'T1'},
 '119877': {'age': 89.2685,
  'sex': 0,
  'study_image_id': 119877,
  'patient_label': '011_S_0861',
  'mri_manufacturer': 'SIEMENS',
  'mri_field_strength': '1.5',
  'weighting': 'T1'},
 '486895': {'age': 77.0877,
  'sex': 1,
  'stu

In [31]:
len(m.loc[m['weighting'] == 'T2'])

0

In [43]:
m.to_csv('adni_aibl_exact_age.csv', index=False)

In [44]:
records = m.to_dict('records')

In [47]:
for r in records:
    assert int(r['age_exact']) == r['age']

In [48]:
len(records)

19760

In [49]:
all_records = df_all.to_dict('records')

In [50]:
len(all_records)

19762

In [55]:
for i in range(15000):
    r = all_records[i]
    for k in r:
        if r[k] != records[i][k]:
            print(r)
        assert r[k] == records[i][k]

{'health_ad': 0, 'patient_label': '033_S_1016', 'sex': 1, 'health_emci': 0, 'mri_manufacturer': 'GE MEDICAL SYSTEMS', 'health_lmci': 0, 'health_smc': 0, 'dataset': 'train', 'health_mci': 0, 'study_patient_id': 97254427, 'healthy': 1, 'age': 83, 'study_id': 0, 'mri_field_strength': 1.5, 'study_image_id': 269070, 'image_label': '269070'}


AssertionError: 