## Data Preparation

You should prepare the following things before running this step. 
1. **A patient list** that enumerates the fixed CT scans.  
   - We have 100 fixed CT scans in this study.
   - Each fixed CT has two simulations of type 1 noise (Poisson noise) and two simulations of type 2 noise (Gaussian noise) (see step1.ipynb for the noise definitions).
---
## Task: Split simulated dataset into batches

- 5 batches (0-4) for training and validation, 1 batch (5) for testing

---

### Docker environment
Please use `docker/docker_pytorch`; it builds a PyTorch Docker environment.


In [2]:
import sys
sys.path.append('/workspace/Documents')
import os
import numpy as np
import pandas as pd
import nibabel as nb
import CTDenoising_Diffusion_N2N.functions_collection as ff

# Base directory; the patient-list spreadsheets read and written by this
# notebook live in its 'Patient_lists' subfolder.
main_path = '/mnt/camca_NAS/denoising/'

### step 1: read original patient list

In [3]:
# Load the original patient list. Both ID columns are forced to `str` so that
# pandas does not infer a numeric type for them.
patient_sheet = pd.read_excel(
    os.path.join(main_path, 'Patient_lists', 'fixedCT_static.xlsx'),
    dtype={'Patient_ID': str, 'Patient_subID': str},
)
print('patient sheet len: ', len(patient_sheet))

patient sheet len:  100


### step 2: split into 6 batches

In [4]:
# Randomly assign every patient to one of `num_batches` batches and save the
# shuffled, batch-labelled list. The split is only generated when the output
# spreadsheet does not exist yet, so re-running this cell never overwrites an
# existing (unseeded, hence non-reproducible) random assignment.
batched_list_file = os.path.join(main_path, 'Patient_lists', 'fixedCT_static_shuffled_batched.xlsx')
if os.path.isfile(batched_list_file):
    print('already split')
else:
    # Shuffle the row positions of the sheet that was actually loaded rather
    # than assuming it has exactly 100 rows.
    A = np.arange(0, len(patient_sheet))
    np.random.shuffle(A)
    num_batches = 6
    # array_split allows a row count that is not a multiple of num_batches;
    # the leading batches simply receive one extra row each.
    batches = np.array_split(A, num_batches)

    # batch_of_row[i] is the index of the batch whose row positions contain i.
    batch_of_row = np.empty(len(patient_sheet), dtype=int)
    for j, rows_in_batch in enumerate(batches):
        batch_of_row[rows_in_batch] = j
    batch_column = batch_of_row.tolist()
    patient_sheet['batch'] = batch_column

    # Reorder the rows into shuffled order, which places the members of each
    # batch in one contiguous run (batch 0 first, then batch 1, ...).
    patient_sheet_shuffled = patient_sheet.iloc[A].reset_index(drop=True)
    patient_sheet_shuffled.to_excel(batched_list_file, index=False)


### step 3: make a patient list for model training and testing

In [5]:
# For each noise type, build one spreadsheet with a row per
# (patient, simulation index) holding the batch, the path of the simulated
# noisy reconstruction and the path of the matching ground-truth image.
patient_sheet = pd.read_excel(
    os.path.join(main_path, 'Patient_lists', 'fixedCT_static_shuffled_batched.xlsx'),
    dtype={'Patient_ID': str, 'Patient_subID': str},
)
noise_types = ['gaussian','poisson']
simulation_num = 2

link = 'local' # 'local' or 'NAS'
if link == 'local':
    data_path1 = '/workspace/Documents/Data/denoising'
elif link == 'NAS':
    data_path1 = '/mnt/camca_NAS/denoising/Data'
else:
    # Fail here with a clear message instead of a NameError on data_path1 later.
    raise ValueError(f"link must be 'local' or 'NAS', got {link!r}")

for noise_type in noise_types:
    # The simulation path uses the spelling 'possion' for Poisson noise
    # (presumably matching how the simulation folders are named on disk --
    # TODO confirm). The output spreadsheet name below uses `noise_type`, i.e.
    # the correct spelling.
    n_type = 'gaussian' if noise_type == 'gaussian' else 'possion'

    results = []
    for patient_id, patient_subid, batch in zip(
        patient_sheet['Patient_ID'], patient_sheet['Patient_subID'], patient_sheet['batch']
    ):
        print(f"Processing patient {patient_id} {patient_subid}...")

        # Every simulation of a patient shares the same ground-truth image.
        ground_truth_file = os.path.join(data_path1,'fixedCT/',patient_id,patient_subid,'img_thinslice_partial.nii.gz')

        for n in range(0, simulation_num):
            simulation_file = os.path.join(data_path1,'simulation/',patient_id,patient_subid,n_type+'_random_'+str(n), 'recon.nii.gz')
            results.append([batch, patient_id, patient_subid, n, simulation_file, ground_truth_file])

    # Write the list once per noise type, after all patients have been
    # collected, rather than rewriting the file after every patient.
    df = pd.DataFrame(results, columns=['batch','Patient_ID', 'Patient_subID', 'random_num', 'noise_file', 'ground_truth_file'])
    df.to_excel(os.path.join(main_path, 'Patient_lists', 'fixedCT_static_simulation_train_test_' + noise_type + '_' + link +'.xlsx'), index=False)