In [None]:
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle as sk_shuffle
from google.colab import drive
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import random
from tqdm import tqdm

In [None]:
def read_data(DATA_DIR, skip_classes):
  """Read every image below DATA_DIR, labelling images by their class folder.

  The entries of DATA_DIR are listed in sorted order and each one is treated
  as a class folder. Classes that are not skipped receive consecutive integer
  labels starting at 0, in that sorted order.

  Args:
    DATA_DIR: Path of the directory whose entries are the class folders.
    skip_classes: Iterable of positions (indices into the sorted listing of
      DATA_DIR) to leave out. Duplicates and out-of-range indices are
      harmless. None is treated as "skip nothing".

  Returns:
    A tuple (X, y, num_classes):
      X: np.array built from the per-image arrays, ordered by file name
        (ties between equally named files broken by class folder name).
        NOTE(review): this only stacks into one regular array if every
        image has the same shape; nothing here enforces that.
      y: np.array of integer labels aligned with X.
      num_classes: number of class folders actually loaded, i.e. the number
        of distinct labels that were assigned.
  """
  skipped = set() if skip_classes is None else set(skip_classes)
  data_dict = dict()
  label = 0

  for i, class_folder in enumerate(sorted(os.listdir(DATA_DIR))):
    if i in skipped:
      continue
    # Announce only the folders that are really read.
    print(f"Loading {class_folder}...")
    class_path = os.path.join(DATA_DIR, class_folder)
    for img in tqdm(os.listdir(class_path)):
      img_path = os.path.join(class_path, img)
      # Close the file handle as soon as the pixels are copied into an array.
      with Image.open(img_path) as im:
        im_arr = np.asarray(im)
      # Key on (file name, class folder): keying on the file name alone lets
      # equally named images from different classes overwrite each other.
      # Putting the file name first keeps the final ordering by file name.
      data_dict[(img, class_folder)] = (im_arr, label)
    label += 1

  X = []
  y = []
  for key in sorted(data_dict):
    X_n, y_n = data_dict[key]
    X.append(X_n)
    y.append(y_n)
  print("Done")
  # `label` now equals the count of loaded classes, which stays correct even
  # when skip_classes contains duplicates or indices past the last folder.
  return np.array(X), np.array(y), label

In [None]:
def load_data(DATA_DIR, skip_classes, test_split=0.2, random_state=42, shuffle=True):
  """Read the dataset via read_data and optionally split it into train/test.

  Args:
    DATA_DIR: Directory passed through to read_data.
    skip_classes: Class indices passed through to read_data.
    test_split: Forwarded to train_test_split as test_size. A falsy value
      (e.g. 0 or None) disables splitting.
    random_state: Seed forwarded to the shuffle / split call.
    shuffle: Whether the data is shuffled (before returning, or as part of
      the split).

  Returns:
    (X, y, num_classes) when test_split is falsy, otherwise
    (X_train, X_test, y_train, y_test, num_classes).
  """
  images, labels, num_classes = read_data(DATA_DIR, skip_classes)

  if test_split:
    train_images, test_images, train_labels, test_labels = train_test_split(
        images, labels,
        test_size=test_split,
        shuffle=shuffle,
        random_state=random_state)
    return train_images, test_images, train_labels, test_labels, num_classes

  # No split requested: hand back the whole set, shuffled only if asked.
  if shuffle:
    images, labels = sk_shuffle(images, labels, random_state=random_state)
  return images, labels, num_classes

In [None]:
if __name__ == "__main__":
  # Driver cell: mount Google Drive (Colab) and load the dataset stored there.
  drive.mount('/content/drive/')
  data_dir = '/content/drive/MyDrive/FYP/Kather_decomposed'
  # Alternative call with a train/test split on a subset of classes. With a
  # non-zero test_split, load_data returns five values (num_classes is last):
  # X_train, X_test, y_train, y_test, num_classes = load_data(data_dir, skip_classes = [1,2,3,4,5,7], test_split=0.2)
  # Load all classes without splitting; test_split=0 makes load_data return
  # (X, y, num_classes), shuffled with its default shuffle/random_state settings.
  X, y, num_classes = load_data(data_dir, skip_classes = [], test_split=0)