In [None]:
# Installs Kaggle API; allows us to easily use Kaggle datasets
# Want to go to Kaggle > Settings > API > Create New Token (and then download kaggle.json file)
!pip install kaggle

# Allows us to access local files
from google.colab import files

# Prompts you to upload files; want to upload kaggle.json
uploaded = files.upload()

# Creates a new directory and copies kaggle.json file to directory; specify read/write permissions only to owner
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Lists Kaggle datasets
!kaggle datasets list

# Downloads specified dataset; can obtain this command by copying the API command of a dataset. This can be found by clicking the three dots on a given dataset near the download button.
!kaggle datasets download -d navoneel/brain-mri-images-for-brain-tumor-detection

In [2]:
# Allows us to work with zip files since Kaggle will give our dataset in .zip format
import zipfile

# Location of the zip archive downloaded from Kaggle
data_zip = "/content/brain-mri-images-for-brain-tumor-detection.zip"

# Directory that will receive the unzipped contents
data_dir = "./data"

# Opens the archive read-only and extracts everything into data_dir ('./data').
# The with-block guarantees the file handle is closed, even if extraction fails.
with zipfile.ZipFile(data_zip, "r") as data_zip_ref:
  data_zip_ref.extractall(data_dir)

In [13]:
import os
import cv2 as cv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Class names; these match the image folder names, and a class's position in the
# list is used as its integer label (0 = 'no', 1 = 'yes')
categories = ['no', 'yes']

# Directory that contains one sub-folder per category.
# NOTE(review): the archive is extracted to "./data" in an earlier cell; this absolute
# path presumably matches it because Colab's working directory is /content - confirm.
data_path = '/content/data'

# Fixed seed so the train/test partition is the same on every run
random_seed = 42

# Lists that will hold the transformed images (X) and their labels (y)
X = []
y = []

# Loops through the categories, keeping track of the category index and name
for i, category in enumerate(categories):
  category_dir = os.path.join(data_path, category)
  # os.listdir returns entries in arbitrary order; sorting makes the sample order
  # (and therefore the seeded split below) reproducible
  for file in sorted(os.listdir(category_dir)):
    # Full path to the image file
    image_path = os.path.join(category_dir, file)
    # Loads the image as a numpy array; imread returns None instead of raising
    # when the file cannot be decoded (e.g. a stray non-image file)
    image = cv.imread(image_path)
    if image is None:
      print(f"Skipping unreadable file: {image_path}")
      continue
    # Resizes the image to 256 x 256 since the model needs inputs of identical size
    image = cv.resize(image, (256, 256))
    # Flattens the image into a 1-D feature vector (one row per sample in X);
    # this changes the shape only, not the amount of data
    image = image.flatten()
    # Scales pixel values from [0, 255] to [0, 1]
    image = image / 255
    # Adds the image to the feature list
    X.append(image)
    # Adds the label associated with the image to the target list
    y.append(i)

# Converts the lists to numpy arrays for more efficient operations
X = np.array(X)
y = np.array(y)

# Splits the data into 80% training and 20% testing; shuffle randomly rearranges the
# samples before partitioning and random_state makes that shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = random_seed)