# Setup

In [124]:
import numpy as np
import pandas as pd
import mnist
from tensorflow import keras

from tensorflow.keras.utils import load_img, img_to_array

from itertools import permutations
from glob import glob

# Preparing the Data

In [5]:
from glob import glob
image_paths = glob("/Users/neuropunk/Documents/_source/_datawrangler/TAMUDatathon2022/imagepuzzle/train/*/*")

In [25]:
# I want 10% of the data to be test
train_image_paths = [x[1] for x in filter(lambda x: x[0] % 10 != 0, enumerate(image_paths))]
test_image_paths = [x[1] for x in filter(lambda x: x[0] % 10 == 0, enumerate(image_paths))]

In [35]:
def get_4d_image_array(image_paths):
    images = []
    for image_path in image_paths:
        img = load_img(f'{image_path}', target_size=(128, 128))
        img_array = img_to_array(img)
        images.append(img_array)
    return np.stack(images, axis=0)

In [36]:
train_images = get_4d_image_array(train_image_paths)
test_images = get_4d_image_array(test_image_paths)

In [45]:
combs = [''.join(str(x) for x in comb) for comb in list(permutations(range(0, 4)))]
idx_to_comb_map = dict(enumerate(combs))
comb_to_idx_map = dict([(v, k) for k,v in idx_to_comb_map.items()])     

In [53]:
def get_labels(image_paths):
    return np.array([comb_to_idx_map[x.split("/")[-2]] for x in image_paths])

train_labels = get_labels(train_image_paths)
test_labels = get_labels(test_image_paths)

In [57]:
print(train_images.shape) # (60000, 28, 28, 1)
print(test_images.shape)  # (10000, 28, 28, 1)

(44755, 128, 128, 3)
(4973, 128, 128, 3)


# Building Model

In [158]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, ZeroPadding2D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

# TOY MODEL
num_filters = 50
filter_size = 3
pool_size = 2

model = Sequential([
    Conv2D(num_filters, filter_size, input_shape=(128, 128, 3)),
    MaxPooling2D(pool_size=pool_size),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(24, activation='softmax')
])

# LESS TOY MODEL
model = Sequential([
    ZeroPadding2D(5, input_shape=(128,128,3)),  # extra padding

    Conv2D(64, kernel_size=(5,5), padding='same', activation='relu', strides=2),
    BatchNormalization(),
    MaxPooling2D(),
    
    Conv2D(128, kernel_size=(5,5), padding='same', activation='relu', strides=2),
    BatchNormalization(),
    Dropout(0.3),
    
    Conv2D(128, kernel_size=(5,5), padding='same', activation='relu', strides=2),
    BatchNormalization(),
    Dropout(0.3),    
    
    Conv2D(128, kernel_size=(3,3), padding='same', activation='relu', strides=2),
    BatchNormalization(),
    Dropout(0.3),
    
    Conv2D(128, kernel_size=(3,3), padding='same', activation='relu', strides=1),
    BatchNormalization(),
    Dropout(0.3),       

    Flatten(), # combining all features
    
    Dense(1024, activation='relu'),
    BatchNormalization(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(24, activation='softmax')
])


model.compile(
    'adam',
    loss='categorical_crossentropy',
    metrics='accuracy'
)

filepath1 = "best_model.h5"
checkpoint = ModelCheckpoint(filepath1, 
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True, 
                             mode='max')
callbacks_list = [checkpoint]

# Fitting

In [148]:
model.fit(
    train_images,
    to_categorical(train_labels),
    epochs=10,
    batch_size=64,
    validation_data=(test_images, to_categorical(test_labels)),
    callbacks=callbacks_list
)

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.73316, saving model to best_model.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.73316
Epoch 3/10
Epoch 3: val_accuracy improved from 0.73316 to 0.82345, saving model to best_model.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.82345 to 0.87231, saving model to best_model.h5
Epoch 5/10
Epoch 5: val_accuracy improved from 0.87231 to 0.89443, saving model to best_model.h5
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.89443
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.89443
Epoch 8/10
Epoch 8: val_accuracy improved from 0.89443 to 0.90227, saving model to best_model.h5
Epoch 9/10
Epoch 9: val_accuracy improved from 0.90227 to 0.91755, saving model to best_model.h5
Epoch 10/10
Epoch 10: val_accuracy improved from 0.91755 to 0.91776, saving model to best_model.h5


<keras.callbacks.History at 0x13f3bdf30>

# Saving Model

In [161]:
#model.save("bleh.h5")
reconstructed_model = keras.models.load_model("best_model.h5")

In [149]:
baseline_model = keras.models.load_model('example_model.h5')

# Sanity Check of Model

In [162]:
testIdx = 3005
predCombIdx = np.argmax(model.predict(train_images[testIdx:(testIdx+1),:,:,:]))
print("prediction:", idx_to_comb_map[predCombIdx], "actual:", idx_to_comb_map[test_labels[testIdx]])

prediction: 3021 actual: 3120


In [163]:
predictions = reconstructed_model.predict(test_images)

df = pd.concat([
    pd.Series(np.argmax(predictions, axis=1) == test_labels, name="matched"),
    pd.Series([idx_to_comb_map[x] for x in test_labels], name="comb"),

], axis=1)

df.matched.mean()



0.9177558817615121

In [164]:
predictions = baseline_model.predict(test_images)

df = pd.concat([
    pd.Series(np.argmax(predictions, axis=1) == test_labels, name="matched"),
    pd.Series([idx_to_comb_map[x] for x in test_labels], name="comb"),

], axis=1)

df.matched.mean()



0.06635833500904886

# Accuracy over Validation Set Partitioned by Combinations

In [165]:
predictions = reconstructed_model.predict(test_images)

df = pd.concat([
    pd.Series(np.argmax(predictions, axis=1) == test_labels, name="matched"),
    pd.Series([idx_to_comb_map[x] for x in test_labels], name="comb"),

], axis=1)



In [166]:
df.groupby("comb").matched.mean()

comb
0123    0.980676
0132    0.961353
0213    0.937198
0231    0.937198
0312    0.961353
0321    0.985577
1023    0.966184
1032    0.951691
1203    0.985507
1230    0.995169
1302    0.811594
1320    0.888889
2013    0.754808
2031    0.826923
2103    0.927536
2130    0.855072
2301    0.946860
2310    0.879808
3012    0.792271
3021    0.888889
3102    0.961353
3120    0.859903
3201    0.990338
3210    0.980769
Name: matched, dtype: float64