In [None]:
# Installs the Kaggle API client, which lets us download Kaggle datasets from the notebook.
# Before running: go to Kaggle > Settings > API > Create New Token and download the kaggle.json file.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q kaggle

# Colab helper for moving files between the local machine and the notebook runtime
from google.colab import files

# Prompts for a file upload; upload the kaggle.json token downloaded above
uploaded = files.upload()

# Copies kaggle.json into ~/.kaggle and restricts it to owner read/write only.
# -p makes mkdir a no-op when the directory already exists, so this cell can be re-run safely.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Lists Kaggle datasets (a quick check that the API token is picked up)
!kaggle datasets list

# Downloads the dataset as a .zip into the current working directory.
# The command can be copied from a dataset's page: click the three dots near the download button and copy the API command.
!kaggle datasets download -d navoneel/brain-mri-images-for-brain-tumor-detection

In [7]:
# Kaggle delivers the dataset as a .zip archive, so we need zipfile to unpack it
import zipfile

# Location of the downloaded archive
data_zip = "/content/brain-mri-images-for-brain-tumor-detection.zip"

# Directory the archive contents are extracted into (relative to the current working directory)
data_dir = "./data"

# Open the archive read-only and extract everything into data_dir.
# The with-block guarantees the file handle is closed even if extraction fails.
with zipfile.ZipFile(data_zip, "r") as data_zip_ref:
    data_zip_ref.extractall(data_dir)

In [44]:
import tensorflow as tf
import os

# Fixed seed so the shuffle below (and every split derived from it) is reproducible
SEED = 42

# Number of images per batch; reduce it (e.g. to 16) if limited by computational capabilities
BATCH_SIZE = 32

# Loads the images from the given directory, one class per sub-directory, resized to the default 256 x 256.
# batch_size=None / shuffle=False returns single images in a fixed order; shuffling and batching are done
# explicitly below. With the default shuffle=True the dataset is reshuffled on every pass, so the
# take()/skip() partitions created later would contain different images each time they are iterated and
# training images would leak into the validation and testing partitions.
images = tf.keras.utils.image_dataset_from_directory(
    '/content/data/brain_tumor_dataset',
    batch_size=None,
    shuffle=False,
)

# Shuffle exactly once (buffer covers the whole dataset, order frozen across iterations), then batch
data = images.shuffle(len(images), seed=SEED, reshuffle_each_iteration=False).batch(BATCH_SIZE)

# shuffle()/batch() return new dataset objects, so carry the class names over
data.class_names = images.class_names

Found 253 files belonging to 2 classes.


In [31]:
# Exploratory cell: a closer look at what 'data' yields and how to work with it

# as_numpy_iterator() lets us pull batches out of the data pipeline as NumPy arrays.
# NOTE: the name 'iter' shadows Python's built-in iter() for the rest of the notebook.
iter = data.as_numpy_iterator()

# Pull one batch from the pipeline:
#   batch[0] -> image data (each image is a 2-d grid of pixels with color channels)
#   batch[1] -> the labels belonging to those images
batch = next(iter)

In [32]:
# Pixel color-channel values currently range from 0 to 255; we rescale them to lie between 0 and 1.
# Dataset.map applies a function to every element as it flows through the data pipeline.
# NOTE: this cell reassigns 'data', so running it more than once rescales the images again.

def _rescale(image, label):
    """Divide the image values by 255 and pass the label through unchanged."""
    return image / 255, label

data = data.map(_rescale)

In [33]:
# We want to split our batches into training, validation, and testing partitions.
# To do that we first need the total number of batches; len() of the dataset gives that count.

len(data)

8

In [34]:
# The batches allocated to training, validation, and testing must add up to the total number of batches.
# Validation and testing each get roughly 10% of the batches (but always at least one batch);
# training gets whatever remains, so the three sizes sum to len(data) for any number of batches.

n_batches = len(data)
validation_size = max(1, int(n_batches * .1))
test_size = max(1, int(n_batches * .1))
train_size = n_batches - validation_size - test_size

# With fewer than 3 batches there is nothing left to train on
assert train_size > 0, "Not enough batches to form training, validation, and testing partitions"

In [35]:
# Sanity check: the three partition sizes must add up to the total number of batches.
# The assert enforces this instead of relying on comparing the displayed numbers by eye.
assert train_size + validation_size + test_size == len(data), "Partition sizes do not add up to the number of batches"

# Displayed for reference; should equal len(data)
train_size + validation_size + test_size

8

In [36]:
# Training data is what the model learns from.
# Validation data checks the model's performance during training and is used for fine tuning.
# Testing data is held back to evaluate the model at the end.

# The data pipeline provides take() and skip() to carve out batches for each partition.
# NOTE: the partitions are only disjoint if 'data' yields its batches in the same order on every pass;
# a dataset that reshuffles on each iteration will mix samples across the partitions.

# Training: the first [train_size] batches
train = data.take(train_size)

# Everything after the training batches
held_out = data.skip(train_size)

# Validation: the first [validation_size] batches of what is left over
validation = held_out.take(validation_size)

# Testing: skip the validation batches as well and take [test_size] batches from the rest
test = held_out.skip(validation_size).take(test_size)