# Preprocessing

In [None]:
# Mount Google Drive at /content/drive so the following cells can read and
# write the dataset folders that live on the Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import os
import zipfile
import matplotlib
from matplotlib import pyplot as plt
import nibabel as nib
from nibabel.testing import data_path
from PIL import Image
%matplotlib inline
%matplotlib notebook

In [None]:
%cd drive
%cd MyDrive

In [None]:
# normalise pixels to 0 to 255
def truncated_range(img, min_hu=-384, max_hu=384):
    """Clip intensities to ``[min_hu, max_hu]`` and rescale linearly to 0-255.

    Parameters
    ----------
    img : array-like
        Input intensities. The input is NOT modified; a clipped copy is used.
    min_hu, max_hu : number, optional
        Lower and upper bounds of the window. Values outside the window are
        clamped to the nearest bound. Defaults are -384 and 384.

    Returns
    -------
    numpy.ndarray
        Floating-point array of the same shape as ``img`` where ``min_hu``
        maps to 0.0 and ``max_hu`` maps to 255.0.
    """
    # np.clip returns a new array, so the caller's data is left untouched
    # (the previous boolean-mask assignment overwrote the input in place).
    clipped = np.clip(np.asarray(img), min_hu, max_hu)
    return (clipped - min_hu) / (max_hu - min_hu) * 255.


In [None]:
# have folders set up as follows (relative to the current working directory)
# GoogleDrive
# --test
# --train
# -----ct_images
# -----labels
# --train_nii_files
# --train_npy_files
# --test_nii_files
# --test_npy_files

# os.makedirs(..., exist_ok=True) creates missing folders (including the
# nested train/ct_images and train/labels) and is a no-op for folders that
# already exist, so this cell can be re-run safely and never changes the
# working directory.
for folder in (
    'test',
    'train',
    'train_nii_files',
    'train_npy_files',
    'test_nii_files',
    'test_npy_files',
    os.path.join('train', 'ct_images'),
    os.path.join('train', 'labels'),
):
    os.makedirs(folder, exist_ok=True)


In [None]:
# dataset came as zip file of test and train; unzip each archive into the
# folder that the preprocessing cells read from.
#   test.zip  -> test_nii_files   (files named Patient_number.nii.gz)
#   train.zip -> train_nii_files  (one folder per patient, each holding
#                                  GT.nii.gz and Patient_number.nii.gz)
# NOTE(review): the archives are looked up inside 'nii_files', which is where
# this cell previously changed directory to before opening them - confirm that
# is where the zips live, and that they do not wrap their content in an extra
# top-level folder.
archive_dir = 'nii_files'
archive_targets = {
    'test.zip': 'test_nii_files',
    'train.zip': 'train_nii_files',
}

# Explicit paths are used instead of `%cd` so that a missing folder raises a
# clear error rather than silently shifting the working directory.
for archive_name, target_dir in archive_targets.items():
    os.makedirs(target_dir, exist_ok=True)
    with zipfile.ZipFile(os.path.join(archive_dir, archive_name), 'r') as zip_ref:
        zip_ref.extractall(target_dir)

In [None]:
# Root folder of the project on the mounted Drive. `path` is used by the
# cells below but was never defined, which raised NameError on a fresh kernel.
path = '/content/drive/MyDrive'

# Output folders for the exported PNG slices.
train_filepath = os.path.join(path, 'train')
test_filepath = os.path.join(path, 'test')

In [None]:
# training data is 1 to 40
# Expected input layout: train_nii_files/<patient>/<patient>.nii.gz together
# with train_nii_files/<patient>/GT.nii.gz (the ground-truth label volume).
train_nii_filepath = os.path.join(path, 'train_nii_files')
train_npy_filepath = os.path.join(path, 'train_npy_files')

# PNG exports go into the train/ct_images and train/labels sub-folders.
train_image_dir = os.path.join(train_filepath, 'ct_images')
train_label_dir = os.path.join(train_filepath, 'labels')
os.makedirs(train_image_dir, exist_ok=True)
os.makedirs(train_label_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for reproducible runs.
files = sorted(os.listdir(train_nii_filepath))

# All paths below are built explicitly with os.path.join. The previous
# `%cd patient` / `!mkdir patient` magics used the literal word "patient"
# rather than the loop variable and moved the working directory every pass.
for patient in files:
    # patient is the folder name, i.e. Patient_01
    current_patient = os.path.join(train_nii_filepath, patient)
    if not os.path.isdir(current_patient):
        continue  # ignore stray files (e.g. a leftover archive)

    img = nib.load(os.path.join(current_patient, patient + '.nii.gz'))
    img = np.array(img.get_fdata())
    label = nib.load(os.path.join(current_patient, 'GT.nii.gz'))
    label = np.array(label.get_fdata())
    assert img.shape == label.shape, (
        'image/label shape mismatch for ' + patient)
    img = truncated_range(img)

    patient_npy_dir = os.path.join(train_npy_filepath, patient)
    os.makedirs(patient_npy_dir, exist_ok=True)

    # Each sample is a 3-slice stack (previous, current, next) centred on idx,
    # paired with the label of the centre slice. The first and last slices
    # have no neighbour on one side, so they are skipped.
    for idx in range(1, img.shape[2] - 1):
        cur_img = img[:, :, idx - 1:idx + 2].astype('uint8')
        cur_label = label[:, :, idx].astype('uint8')
        stem = patient + '_' + str(idx)

        np.save(os.path.join(patient_npy_dir, stem + '_image.npy'), cur_img)
        np.save(os.path.join(patient_npy_dir, stem + '_label.npy'), cur_label)

        # Save through PIL so pixel values are written unchanged: the 3-slice
        # stack becomes an RGB image and the label a single-channel image.
        # (matplotlib's imsave would colour-map and rescale the 2-D label.)
        Image.fromarray(cur_img).save(
            os.path.join(train_image_dir, stem + '_image.png'))
        Image.fromarray(cur_label).save(
            os.path.join(train_label_dir, stem + '_label.png'))



In [None]:
# test data is 41 to 60
# Expected input layout: test_nii_files/<patient>.nii.gz. The test archive is
# described as containing only the patient volumes, so no label is exported
# here (previously this loop reused the `label` array left over from the last
# training patient).
test_nii_filepath = os.path.join(path, 'test_nii_files')
test_npy_filepath = os.path.join(path, 'test_npy_files')
os.makedirs(test_filepath, exist_ok=True)

nii_suffix = '.nii.gz'

# os.listdir returns entries in arbitrary order; sort for reproducible runs.
files = sorted(os.listdir(test_nii_filepath))

# All paths are built explicitly with os.path.join instead of `%cd` /
# `!mkdir` magics, which used literal names rather than the loop variables
# and moved the working directory on every iteration.
for patient in files:
    # patient is the file name, i.e. Patient_41.nii.gz
    if not patient.endswith(nii_suffix):
        continue  # ignore anything that is not a NIfTI volume
    current_patient = os.path.join(test_nii_filepath, patient)
    patient_number = patient[:-len(nii_suffix)]  # getting rid of the .nii.gz part

    img = nib.load(current_patient)
    img = np.array(img.get_fdata())
    img = truncated_range(img)

    patient_npy_dir = os.path.join(test_npy_filepath, patient_number)
    os.makedirs(patient_npy_dir, exist_ok=True)

    # Each sample is a 3-slice stack (previous, current, next) centred on idx.
    # The first and last slices have no neighbour on one side and are skipped.
    for idx in range(1, img.shape[2] - 1):
        cur_img = img[:, :, idx - 1:idx + 2].astype('uint8')
        stem = patient_number + '_' + str(idx)

        np.save(os.path.join(patient_npy_dir, stem + '_image.npy'), cur_img)

        # Save through PIL so the 3-slice stack is written unchanged as RGB.
        Image.fromarray(cur_img).save(
            os.path.join(test_filepath, stem + '_image.png'))