In [None]:
# Installs Kaggle API; allows us to easily use Kaggle datasets
# Want to go to Kaggle > Settings > API > Create New Token (and then download kaggle.json file)
!pip install kaggle

# Allows us to access local files
from google.colab import files

# Prompts you to upload files; want to upload kaggle.json
uploaded = files.upload()

# Creates a new directory and copies kaggle.json file to directory; specify read/write permissions only to owner
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Lists Kaggle datasets
!kaggle datasets list

# Downloads specified dataset
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

In [None]:
# Kaggle delivers the dataset as a .zip archive, so we need the standard zipfile module
import zipfile

# Location of the downloaded archive
data_zip = "/content/chest-xray-pneumonia.zip"

# Where the extracted contents go. Absolute (like data_zip) so that it always matches the
# /content/data/chest_xray/... paths the image generators read from, whatever the working directory is.
data_dir = "/content/data"

# Open the archive read-only and extract everything into data_dir.
# The with-block closes the file handle once extraction finishes (or fails).
with zipfile.ZipFile(data_zip, "r") as data_zip_ref:
    data_zip_ref.extractall(data_dir)

In [42]:
# We will use tensorflow to train our neural network
import tensorflow as tf

# ImageDataGenerator allows us to augment our data
from keras.preprocessing.image import ImageDataGenerator

In [43]:
# Paths to our training, validation, and testing data
train_dir = '/content/data/chest_xray/train'
val_dir = '/content/data/chest_xray/val'
test_dir = '/content/data/chest_xray/test'

# Image size and batch size (number of samples that make up one iteration of training).
# These are the flow_from_directory defaults, spelled out so they are easy to find and tune.
# Batch size can be varied based on computational capabilities; a lower batch size is less expensive.
IMAGE_SIZE = (256, 256)
BATCH_SIZE = 32

# Transformations and augmentations applied to the TRAINING data only:
# - rescale normalizes by multiplying color values by 1/255
# - rotation_range randomly rotates images by up to 30 degrees
# - width_shift_range randomly shifts images left or right by up to 20% of the width
# - height_shift_range randomly shifts images up or down by up to 20% of the height
# - zoom_range randomly zooms images in or out by up to 20%
# - brightness_range randomly varies brightness from 25% to 100% of the actual image brightness
# Augmentation gives us more diverse training data, which is useful since real-world input can vary.
datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    brightness_range=(0.25, 1),
)

# Validation and test images are only normalized, never augmented: random distortions at evaluation
# time would make the reported metrics noisy and not representative of the real images.
eval_datagen = ImageDataGenerator(rescale=1.0 / 255)

# Builds one generator per split; class_mode='binary' marks this as a binary classification problem.
# Training fits the model, validation evaluates it during training so we can tune it, and testing is
# for after training is done.
train_generator = datagen.flow_from_directory(
    train_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode='binary'
)
# shuffle=False keeps evaluation batches in directory order so predictions line up with generator.classes
test_generator = eval_datagen.flow_from_directory(
    test_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode='binary', shuffle=False
)
val_generator = eval_datagen.flow_from_directory(
    val_dir, target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, class_mode='binary', shuffle=False
)

# The data is now resized and normalized (and the training set augmented), so preprocessing is done
# and we can proceed with training a model.


Found 5216 images belonging to 2 classes.
Found 624 images belonging to 2 classes.
Found 16 images belonging to 2 classes.
