# Retrieve Dataset

In [93]:
import tensorflow_datasets as tfds
import tensorflow as tf
import numpy as np;

raw_ds = tfds.load('deep_weeds', split="train", as_supervised=True)


# Pre-process Dataset

In [None]:
def predicate(x, y):
  return tf.not_equal(y, 8)

full_ds = raw_ds.filter(predicate) # remove instances from negative class
full_ds_size = full_ds.reduce(0, lambda x,_: x + 1).numpy()
full_ds = full_ds.shuffle(full_ds_size) # shuffle dataset

In [106]:
# train_size = int(0.7 * full_ds_size)
# test_size = int(0.3 * full_ds_size)

train_size = 1000
test_size = 1000


train_ds = full_ds.take(train_size)
test_ds = full_ds.skip(train_size)

train_ds_numpy = tfds.as_numpy(train_ds)
test_ds_numpy = tfds.as_numpy(test_ds)

print(f'Total number of all instances = {full_ds_size}')
print(f'Total number of training instances = {train_size}')
print(f'Total number of testing instances = {test_size}')

# TODO: Reduce dimension
# TODO: Abstract function

Total number of all instances = 8403
Total number of training instances = 1000
Total number of testing instances = 1000


In [107]:
train_X = []
train_Y = []
for ex in train_ds_numpy:
  pixels = ex[0] # 256 x 256 x 3 (RGB)
  label = ex[1]
  row = pixels.flatten()
  train_X.append(row)
  train_Y.append(label)
print(f'Sample train_X: {train_X[0]}')
print(f'Sample train_Y: {train_Y[0]}')

Sample train_X: [255 243 236 ... 124 139 134]
Sample train_Y: 5


In [108]:
test_X = []
test_Y = []
for ex in test_ds_numpy:
  pixels = ex[0] # 256 x 256 x 3 (RGB)
  label = ex[1]
  row = pixels.flatten()
  test_X.append(row)
  test_Y.append(label)
print(f'Sample test_X: {test_X[0]}')
print(f'Sample test_Y: {test_Y[0]}')

2021-10-10 01:41:56.511766: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 6760 of 8403
2021-10-10 01:41:58.675614: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:228] Shuffle buffer filled.


Sample test_X: [145 152 160 ...  71 144  89]
Sample test_Y: 5


# Train Model (Decision Tree)

In [111]:
from sklearn import tree

# ~170s for 1000 training instances max_depth=None

# ~89.5s for 1000 training instances max_depth=5

clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(train_X, train_Y)

# Evaluate Model

In [112]:
# 0.29 ~85.3s for 1000 training instances max_depth=None

# 0.216 ~68.4s for 1000 training instances max_depth=5

correct_prediction_count = 0
predicted_test_Y = clf.predict(test_X)
for i in range(test_size):
  if predicted_test_Y[i] == test_Y[i]:
    correct_prediction_count += 1
accuracy = correct_prediction_count / test_size
print(f'Accuracy: {accuracy}')

Accuracy: 0.216
