# Preparing datasets of size (9, 64, 64)

#### Saving the training, validation, test and synthetic datasets at reduced size (the original synthetic size)
The HDF5 version of the synthetic dataset with labels is the version of the data shared with the code.

In [1]:
import sys
import os
import torch
import h5py
import pickle
import umap

import numpy as np
import matplotlib.pyplot as plt

from scipy.spatial.distance import cdist
from torch.nn.functional import interpolate
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

In [2]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU.
gpu = torch.cuda.is_available()
if gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# NOTE(review): this climbs two directories whenever the current folder is not named
# 'BDI-imaging', so re-running the cell from a checkout with a different folder name
# keeps moving further up. Restart the kernel before re-running this cell.
if os.path.basename(os.getcwd()) != 'BDI-imaging':  # change dir to ROOT
    os.chdir("../../")

# Keep "src" at the front of the import path, but do not stack duplicates on re-run.
if sys.path[:1] != ["src"]:
    sys.path.insert(0, "src")

print('============ Demo information ============')
# os.path.basename matches the check above and also works with non-'/' separators.
print('- Working directory: /{}'.format(os.path.basename(os.getcwd())))
print('- Cuda device: {}'.format(device))
print('==========================================')

- Working directory: /src
- Cuda device: cpu


In [3]:
# load batchers for each dataset
from manuscript.Train.restricted.train_dataset import F2305Dataset, spineNet_split
from manuscript.Train.restricted.test_dataset import A2209Dataset
from manuscript.Train.restricted.synthetic_dataset import SynthDataset
from manuscript.Train.batchers import F2305Batcher, A2209Batcher, SynthBatcher

In [4]:
# Shape every image is prepared at; passed as `shape=` to the dataset classes.
image_shape = 9, 64, 64

# One sample per batch: the export loops store one loader item per array row.
batch_size = 1

# Train

### F2305 training fold
Reading images from the training set with the batcher and saving them, together with their labels, as an HDF5 file. F2305 is split into training and validation folds.

In [5]:
# Restrict F2305 to the images listed under the 'train' key returned by spineNet_split().
image_list = spineNet_split()['train']
print("Training using F2305...")

traindata_f = F2305Dataset(shape=image_shape)
traindata_f.prepare(
    label_by="reader2",
    types=('T1',),
    subset_fraction=1.0,
    image_list=image_list,
)

# NOTE(review): shuffle=True with no fixed seed, so the row order of the exported
# arrays (and hence the last-N rows later taken for the candidate set) varies per run.
train_batcher = F2305Batcher(traindata_f.dataset, traindata_f.scan_path)
vu_loader_f = DataLoader(
    train_batcher,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)

Training using F2305...


In [7]:
# Export buffers: one row per loader batch (the loader is built with batch_size=1).
# The image buffer follows `image_shape` instead of repeating the literal 9, 64, 64.
n_train = len(vu_loader_f)
train_dataset = np.zeros((n_train,) + tuple(image_shape))
train_region = np.zeros((n_train, 1))

# Wrap the loader itself (not `enumerate`) so tqdm can read its length and show
# real progress instead of an indeterminate bar.
for index, sample in enumerate(tqdm(vu_loader_f, desc='train')):
    im = sample['im']
    region = sample['region']
    train_dataset[index, :, :, :] = im
    train_region[index, :] = region

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [8]:
# Persist the training images and their region labels. The context manager closes
# the HDF5 file even if one of the writes raises.
with h5py.File('./manuscript/Privacy/privacy_saves/train_set.h5', 'w') as h5f:
    h5f.create_dataset('images', data=train_dataset)
    h5f.create_dataset('regions', data=train_region)

# Val

### F2305 validation fold
Reading images from the F2305 dataset with the batcher and saving them, together with their labels, as an HDF5 file.
The validation and test folds are taken from prior work. There is no overlap between the patients present in each fold: the validation set has no images from patients in the training set.

In [9]:
# The validation export uses both the 'val' and 'test' entries returned by
# spineNet_split(), concatenated in that order.
val_fold = spineNet_split()['val']
test_fold = spineNet_split()['test']
image_list = val_fold + test_fold
print("Val using F2305...")

valdata = F2305Dataset(shape=image_shape)
valdata.prepare(
    label_by="reader2",
    types=('T1',),
    subset_fraction=1.0,
    image_list=image_list,
)

# NOTE(review): shuffle=True with no fixed seed, so the row order of the exported
# arrays (and hence the last-N rows later taken for the candidate set) varies per run.
val_batcher = F2305Batcher(valdata.dataset, valdata.scan_path)
vu_loader_val = DataLoader(
    val_batcher,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)

Val using F2305...


In [10]:
# Export buffers: one row per loader batch (the loader is built with batch_size=1).
# The image buffer follows `image_shape` instead of repeating the literal 9, 64, 64.
n_val = len(vu_loader_val)
val_dataset = np.zeros((n_val,) + tuple(image_shape))
val_region = np.zeros((n_val, 1))

# Wrap the loader itself (not `enumerate`) so tqdm can read its length and show
# real progress instead of an indeterminate bar.
for index, sample in enumerate(tqdm(vu_loader_val, desc='val')):
    im = sample['im']
    region = sample['region']
    val_dataset[index, :, :, :] = im
    val_region[index, :] = region

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [11]:
# Persist the validation images and their region labels. The context manager closes
# the HDF5 file even if one of the writes raises.
with h5py.File('./manuscript/Privacy/privacy_saves/val_set.h5', 'w') as h5f:
    h5f.create_dataset('images', data=val_dataset)
    h5f.create_dataset('regions', data=val_region)

# Test

### Full A2209 dataset as test set
Reading images from A2209 using the batcher and saving them, together with their labels, in HDF5 format.

In [12]:
# The whole A2209 dataset serves as the test set (no image_list restriction is passed).
testdata = A2209Dataset(shape=image_shape)
print("Test using A2209...")
testdata.prepare(
    label_by="berlin_clinical",
    types=('T1',),
    subset_fraction=1.0,
)

# NOTE(review): shuffle=True with no fixed seed, so the row order of the exported
# arrays (and hence the last-N rows later taken for the candidate set) varies per run.
test_batcher = A2209Batcher(testdata.dataset, testdata.scan_path)
vu_loader_test = DataLoader(
    test_batcher,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)

Test using A2209...


In [13]:
# Export buffers: one row per loader batch (the loader is built with batch_size=1).
# The image buffer follows `image_shape` instead of repeating the literal 9, 64, 64.
n_test = len(vu_loader_test)
test_dataset = np.zeros((n_test,) + tuple(image_shape))
test_region = np.zeros((n_test, 1))

# Wrap the loader itself (not `enumerate`) so tqdm can read its length and show
# real progress instead of an indeterminate bar.
for index, sample in enumerate(tqdm(vu_loader_test, desc='test')):
    im = sample['im']
    region = sample['region']
    test_dataset[index, :, :, :] = im
    test_region[index, :] = region

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [14]:
# Persist the test images and their region labels. The context manager closes
# the HDF5 file even if one of the writes raises.
with h5py.File('./manuscript/Privacy/privacy_saves/test_set.h5', 'w') as h5f:
    h5f.create_dataset('images', data=test_dataset)
    h5f.create_dataset('regions', data=test_region)

# Synth

### Synthetic dataset
Reading images from the generated synthetic dataset using the batcher and saving them, together with their labels, in HDF5 format.

In [15]:
print("Synth dataset...")
traindata_synth = SynthDataset()

# NOTE(review): shuffle=True with no fixed seed, so the row order of the exported
# synthetic arrays varies between runs.
synth_batcher = SynthBatcher(traindata_synth.dataset, traindata_synth.scan_path)
vu_loader_synth = DataLoader(
    synth_batcher,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)

Synth dataset...


In [16]:
# Export buffers: one row per loader batch (the loader is built with batch_size=1).
# The image buffer follows `image_shape` instead of repeating the literal 9, 64, 64.
n_synth = len(vu_loader_synth)
synth_dataset = np.zeros((n_synth,) + tuple(image_shape))
synth_region = np.zeros((n_synth, 1))

# Wrap the loader itself (not `enumerate`) so tqdm can read its length and show
# real progress instead of an indeterminate bar.
for index, sample in enumerate(tqdm(vu_loader_synth, desc='synth')):
    im = sample['im']
    # Resample the last two dimensions to the target size before storing.
    im = interpolate(im, size=image_shape[1:], mode='bicubic')
    region = sample['region']
    synth_dataset[index, :, :, :] = im
    synth_region[index, :] = region

In [17]:
# Persist the synthetic images and their region labels. The context manager closes
# the HDF5 file even if one of the writes raises.
with h5py.File('./manuscript/Privacy/privacy_saves/synth_set.h5', 'w') as h5f:
    h5f.create_dataset('images', data=synth_dataset)
    h5f.create_dataset('regions', data=synth_region)

# Candidate dataset

We simulate an attack on the synthetic dataset. For this purpose we create a "candidate dataset" composed of 334 images from the training set (F2305, used in GAN training), 333 from the validation dataset (F2305, not used in GAN training) and 333 from the testing dataset (A2209, not used in training).
The goal is to assess whether it is possible to tell if a candidate sample comes from the training set when compared with the synthetic data.

In [18]:
# 334 samples from the training set (the last 334 rows of the saved file).
# The slice is read into memory inside the `with` block, which closes the file
# even if the read raises.
with h5py.File('./manuscript/Privacy/privacy_saves/train_set.h5', 'r') as h5f:
    candidate_train = h5f['images'][-334:]

In [19]:
# 333 samples from the validation set (the last 333 rows of the saved file).
# The slice is read into memory inside the `with` block, which closes the file
# even if the read raises.
with h5py.File('./manuscript/Privacy/privacy_saves/val_set.h5', 'r') as h5f:
    candidate_val = h5f['images'][-333:]

In [20]:
# 333 samples from the test set (the last 333 rows of the saved file).
# The slice is read into memory inside the `with` block, which closes the file
# even if the read raises.
with h5py.File('./manuscript/Privacy/privacy_saves/test_set.h5', 'r') as h5f:
    candidate_test = h5f['images'][-333:]

In [21]:
# Stack the three candidate groups along the sample axis, in the order
# train -> validation -> test, and write the result to disk as a .npy file.
candidate_parts = [candidate_train, candidate_val, candidate_test]
candidate = np.concatenate(candidate_parts, axis=0)
np.save('./manuscript/Privacy/privacy_saves/candidate.npy', candidate)

In [22]:
# Reload the candidate set written by the previous cell. The file holds a plain
# numeric array, so pickle support is not needed; leaving allow_pickle at its
# default (False) avoids running arbitrary code if the .npy file is ever replaced.
candidate = np.load('./manuscript/Privacy/privacy_saves/candidate.npy')