In [1]:
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from torchvision import datasets, transforms
import urllib
import zipfile
from tensorboardX import SummaryWriter
from datetime import datetime

In [2]:
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Setup

In [3]:
# INSERT YOUR DATA HERE
# Expected format: One folder per class, e.g.
# train
# --- dogs
# |   +-- lassie.jpg
# |   +-- komissar-rex.png
# --- cats
# |   +-- garfield.png
# |   +-- smelly-cat.png
#
# Example: https://github.com/jrieke/traingenerator/tree/main/data/image-data
train_data = "train/"  # required
val_data = "train/"    # optional
test_data = "test/"              # optional

In [4]:
# Set up logging.
experiment_id = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
writer = SummaryWriter(logdir=f"logs/{experiment_id}")

# Preprocessing

In [5]:
# Set up scaler.
scaler = sklearn.preprocessing.StandardScaler()

In [6]:
def preprocess(data, name):
    if data is None:  # val/test can be empty
        return None
    # Read image files to pytorch dataset (only temporary).
    transform = transforms.Compose([
        transforms.Resize(28), 
        transforms.CenterCrop(28), 
        transforms.ToTensor()
    ])
    data = datasets.ImageFolder(data, transform=transform)

    # Convert to numpy arrays.
    images_shape = (len(data), *data[0][0].shape)
    images = np.zeros(images_shape)
    labels = np.zeros(len(data))
    for i, (image, label) in enumerate(data):
        images[i] = image
        labels[i] = label
    
    # Flatten.
    images = images.reshape(len(images), -1)

    # Scale to mean 0 and std 1.
    if name == "train":
        scaler.fit(images)
    images = scaler.transform(images)

    # Shuffle train set.
    if name == "train":
        images, labels = sklearn.utils.shuffle(images, labels)

    return [images, labels]

In [7]:
processed_train_data = preprocess(train_data, "train")
processed_val_data = preprocess(val_data, "val")
processed_test_data = preprocess(test_data, "test")

# Model

In [8]:
model = DecisionTreeClassifier()

# Training

In [9]:
def evaluate(data, name):
    if data is None:  # val/test can be empty
        return

    images, labels = data
    acc = model.score(images, labels)
    print(f"{name + ':':6} accuracy: {acc}")
    writer.add_scalar(f"{name}_accuracy", acc)

In [10]:
# Train on train_data.
model.fit(*processed_train_data)

DecisionTreeClassifier()

In [11]:
# Evaluate on all datasets.
evaluate(processed_train_data, "train")
evaluate(processed_val_data, "val")
evaluate(processed_test_data, "test")

train: accuracy: 1.0
val:   accuracy: 1.0
test:  accuracy: 0.8085106382978723


In [12]:
import pickle
filename = 'DecisionTree.sav'
pickle.dump(model, open(filename, 'wb'))