## This notebook contains all CNN architectures for the Siamese comparison, as well as the K-Nearest-Neighbors code, since that just uses the pre-trained SCNN.


In [None]:
!pip install tensorflow
import tensorflow as tf
!pip install tensorflow_hub
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
import random
import datetime
import requests

%load_ext tensorboard

np.set_printoptions(precision=4)

### Loading Data

In [None]:
"""
!pip install google.colab
"""
# Colab-only helper for attaching Google Drive to the runtime's filesystem.
from google.colab import drive
# Mount Drive at /content/drive; later cells read the CSV and the images from
# paths under '/content/drive/My Drive/'.
drive.mount('/content/drive')

Mounted at /content/drive


Create a dictionary in which keys are all the artists and values are all of the images by that artist:

In [None]:

# Map each artist to the list of image filenames attributed to them.
# Column 0 of train_info.csv is used as the image filename and column 1 as the
# artist; the first row is treated as a header and skipped.
artistdict = {}

# newline='' lets the csv module do its own line-ending handling, so a quoted
# field containing an embedded newline is parsed as part of one row instead of
# splitting the row in two.
with open('/content/drive/My Drive/train_info.csv', 'r', newline='') as csvfile:
  table = csv.reader(csvfile)
  c = 0  # rows consumed so far, header included (this is the number printed)
  for row in table:
    c += 1
    if c == 1:
      continue  # header row
    artistdict.setdefault(row[1], []).append(row[0])

print(c)


79434


Create a balanced set of labeled image pairs:

In [None]:
# Build a balanced list of [left, right, label] pairs: label 1 when both
# paintings are by the same artist, label 0 when they are by different artists.
# Only filenames starting with '9' are used; later cells read the images from
# the train_9 folder.

# Seed once, up front, so the pair list is reproducible. Re-seeding inside the
# loops restarted the generator from one of a handful of states before every
# draw, so the "random" negatives kept coming from the same few positions.
random.seed(0)

datapoints = []
badcount=0  # nothing increments this; it is only printed below
all_artists = list(artistdict)  # built once instead of once per random draw
reusable = []  # paintings already placed in a negative pair; pool for later negatives


def _painting_by_other_artist(artist):
  """Return a random '9'-prefixed painting whose artist is not `artist`.

  Loops until one is found, so it does not terminate if no other artist has a
  '9'-prefixed painting.
  """
  while True:
    otherartist = random.choice(all_artists)
    if otherartist == artist:
      continue
    candidate = random.choice(artistdict[otherartist])
    if candidate[0] == '9':
      return candidate


for artist in artistdict.keys():
  paintings = artistdict[artist]
  if len(paintings) < 31:
    continue
  own = set(paintings)  # everything by this artist, for the negative-pair check
  for i in range(30):
    left = paintings[i]
    if left[0] != '9':
      continue
    for j in range(i+1, 30):
      right = paintings[j]
      if right[0] != '9':
        continue
      datapoints.append([left, right, 1])

      if len(datapoints) <= 60:
        # Bootstrap: while the pair list is still short, draw the negative
        # partner from the full artist table.
        otherpainting = _painting_by_other_artist(artist)
      else:
        # Afterwards reuse paintings that are already in the dataset, which
        # keeps the number of distinct images small. Candidates by the current
        # artist are rejected: pairing them with `left` under label 0 would be
        # a mislabelled same-artist pair. The pool always holds a usable
        # candidate because every bootstrap partner is by another artist
        # (assumes each filename is listed under a single artist).
        while True:
          otherpainting = random.choice(reusable)
          if otherpainting not in own:
            break

      datapoints.append([left, otherpainting, 0])
      reusable.append(left)
      reusable.append(otherpainting)

random.shuffle(datapoints)

print(len(datapoints))
print(badcount)

# Collect every distinct painting that appears in a pair, in order of first
# appearance (the left image of a pair is considered before the right one).
# dict.fromkeys de-duplicates in a single pass while keeping insertion order;
# the previous `name not in list` test rescanned the whole list per painting.
uniquepaintings = list(dict.fromkeys(
    name for datapoint in datapoints for name in datapoint[:2]))
print(len(uniquepaintings))

6708
0
2089


### Creating Dataset

In [None]:
import matplotlib.pyplot as plt
!pip install pillow
from PIL import Image



In [None]:
# Write a 224x224 RGB JPEG copy of every painting used in a pair to local disk.
# Image.save does not create missing directories, so make sure the target exists.
os.makedirs('/content/train9unique/', exist_ok=True)

fails = 0
count = 0
for painting in uniquepaintings:
  imagename = "/content/drive/My Drive/kaggle/train_9/" + painting
  destname = '/content/train9unique/' + painting
  try:
    # The context manager closes the source file once the resized copy exists.
    with Image.open(imagename) as source:
      image = source.convert('RGB').resize((224, 224))
  except Exception as err:
    # Count and skip unreadable files. Previously execution fell through to
    # save(), which wrote the previous painting's pixels under this name (or
    # raised NameError when the very first file failed).
    fails += 1
    image = None
    print('could not read ' + imagename + ': ' + repr(err))
  if image is not None:
    image.save(destname, 'JPEG')
  if count % 50 == 0:
    print(count)  # progress marker
  count += 1
print(fails)

Process the datapoints into numpy array format:

In [None]:
from numpy import asarray

In [None]:
def names_to_images(datapoint, image_dir='/content/drive/My Drive/kaggle/train_9/', size=(224, 224)):
  """Load the two images named by a datapoint as RGB arrays.

  Args:
    datapoint: sequence of (left filename, right filename, label).
    image_dir: directory prefix that the filenames are appended to.
    size: (width, height) both images are resized to.

  Returns:
    (leftimage, rightimage, label), where each image is the array produced by
    numpy.asarray on the resized RGB image and label is datapoint[2] unchanged.

  Raises:
    Whatever PIL raises when a file is missing or cannot be decoded.
  """
  def load(filename):
    # convert('RGB') forces three channels. Without it a grayscale file yields
    # a 2-D array and an RGBA file a 4-channel one, and neither fits the
    # (224, 224, 3) buffers these arrays are copied into.
    with Image.open(image_dir + filename) as source:
      return asarray(source.convert('RGB').resize(size))

  leftimage = load(datapoint[0])
  rightimage = load(datapoint[1])
  label = datapoint[2]

  return leftimage, rightimage, label

m = len(datapoints)
#m = 1400 #for Colab run

print(len(datapoints))

dataset_size = m

# float32 halves the memory of NumPy's float64 default for the image buffers.
leftimages = np.empty((dataset_size, 224, 224, 3), dtype=np.float32)
rightimages = np.empty((dataset_size, 224, 224, 3), dtype=np.float32)
labels = np.empty((dataset_size))

fail = 0
i=0  # number of rows filled so far

for datapoint in datapoints:
  if i == dataset_size:
    break
  try:
    leftimage, rightimage, label = names_to_images(datapoint)
  except Exception as err:
    # Skip pairs whose images cannot be loaded. Previously execution fell
    # through and stored the previous pair's pixels and label in this row.
    fail += 1
    print('skipping ' + str(datapoint) + ': ' + repr(err))
    continue

  # Scale pixels from 0-255 to [0, 1]. knearest() feeds the trained network
  # images converted with tf.image.convert_image_dtype(..., tf.float32), which
  # are in [0, 1], so training data has to be on the same scale.
  leftimages[i,:,:,:] = leftimage[:,:,:] / 255.0
  rightimages[i,:,:,:] = rightimage[:,:,:] / 255.0

  labels[i] = label
  i+=1
  if i % 50 == 0 and i < dataset_size:
    print(i)  # progress marker

# Drop the unfilled tail left by skipped pairs so that no uninitialised rows
# from np.empty reach the train/val/test split.
if i < dataset_size:
  leftimages = leftimages[:i]
  rightimages = rightimages[:i]
  labels = labels[:i]
  dataset_size = i

print(fail)


Define train, val, and test sets:

In [None]:



# 70 % / 15 % / remainder split along the first axis. The arrays are cut at the
# same two indices so left image, right image and label stay aligned.
# test_size is computed for reference only; the test slice is whatever remains
# after the train and validation slices.
train_size = int(0.7 * dataset_size)
val_size = int(0.15 * dataset_size)
test_size = int(0.15 * dataset_size)

leftimg_train, leftimg_val, leftimg_test = np.split(
    leftimages, [train_size, train_size + val_size])

rightimg_train, rightimg_val, rightimg_test = np.split(
    rightimages, [train_size, train_size + val_size])

label_train, label_val, label_test = np.split(
    labels, [train_size, train_size + val_size])

print(leftimages.shape)
print(leftimg_train.shape)

(1400, 224, 224, 3)
(979, 224, 224, 3)


# Creating Model:

(Define inputs)

In [None]:
# Both branches of the Siamese network take a 224x224 three-channel image.
input_shape = [224, 224, 3]
left_input = tf.keras.Input(shape=input_shape)
right_input = tf.keras.Input(shape=input_shape)

Model -1. Multilayer Perceptron

In [None]:
# Model -1: flatten the image and pass it through three dense ReLU layers,
# ending in a 128-unit embedding. Layers are trainable by default.
cnn = tf.keras.Sequential()
cnn.add(layers.Flatten())
cnn.add(layers.Dense(1200, activation="relu"))
cnn.add(layers.Dense(800, activation="relu"))
cnn.add(layers.Dense(128, activation="relu"))

Model 0. Vanilla CNN

In [None]:
# Model 0: two conv/max-pool stages followed by a dense head that ends in a
# 128-unit embedding.
cnn = tf.keras.Sequential([
    layers.Conv2D(32, kernel_size=(5, 5), strides=(1, 1), activation='relu',
                  input_shape=input_shape),
    layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    layers.Conv2D(64, (5, 5), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Flatten(),
    layers.Dense(1000, activation='relu'),
    layers.Dense(128, activation='relu'),
])

Model 0.5. Bigger Vanilla

In [None]:
# Model 0.5: five conv/max-pool stages with 8, 16, 32, 64 and 128 filters,
# followed by the same dense head as Model 0 (1000 units, then a 128-unit
# embedding). Only the first conv layer declares the input shape.
cnn = tf.keras.Sequential(
    [
        layers.Conv2D(8, kernel_size=(5, 5), strides=(1, 1), activation='relu',
                      input_shape=input_shape),
        layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    ]
    + [
        stage_layer
        for filters in (16, 32, 64, 128)
        for stage_layer in (
            layers.Conv2D(filters, (5, 5), activation='relu'),
            layers.MaxPooling2D(pool_size=(2, 2)),
        )
    ]
    + [
        layers.Flatten(),
        layers.Dense(1000, activation='relu'),
        layers.Dense(128, activation='relu'),
    ]
)

Model 1. Resnet-50, NOT trainable

In [None]:


# Model 1: ResNet-50 feature extractor from TF Hub with its weights frozen,
# followed by two trainable dense layers.
# inputs size (None, 224, 224, 3) ; outputs size (128)
cnn = tf.keras.Sequential()
cnn.add(hub.KerasLayer("https://tfhub.dev/tensorflow/resnet_50/feature_vector/1", trainable=False))
cnn.add(layers.Dense(800, activation="relu"))
cnn.add(layers.Dense(128, activation="relu"))



Model 2. Resnet-50, Trainable

In [None]:


# Model 2: the same ResNet-50 TF Hub feature extractor, but with its weights
# left trainable, followed by two trainable dense layers.
# inputs size (None, 224, 224, 3) ; outputs size (128)
cnn = tf.keras.Sequential()
cnn.add(hub.KerasLayer("https://tfhub.dev/tensorflow/resnet_50/feature_vector/1", trainable=True))
cnn.add(layers.Dense(800, activation="relu"))
cnn.add(layers.Dense(128, activation="relu"))


Model 3. Inception V3, NOT trainable

In [None]:

# Model 3: Inception V3 feature extractor from TF Hub with its weights frozen,
# followed by two trainable dense layers.
# inputs size (None, 224, 224, 3) ; outputs size (128)
# NOTE(review): the module's documented default input size may not be 224x224
# -- confirm against its TF Hub page.
cnn = tf.keras.Sequential()
cnn.add(hub.KerasLayer("https://tfhub.dev/google/imagenet/inception_v3/feature_vector/4", trainable=False))
cnn.add(layers.Dense(800, activation="relu"))
cnn.add(layers.Dense(128, activation="relu"))


Model 4. EfficientNet, NOT trainable

In [None]:
# Model 4: EfficientNet-B7 feature extractor from TF Hub with its weights
# frozen, followed by two trainable dense layers.
# inputs size (None, 224, 224, 3) ; outputs size (128)
# NOTE(review): the module's documented default input size may not be 224x224
# -- confirm against its TF Hub page.
cnn = tf.keras.Sequential()
cnn.add(hub.KerasLayer("https://tfhub.dev/tensorflow/efficientnet/b7/feature-vector/1", trainable=False))
cnn.add(layers.Dense(800, activation="relu"))
cnn.add(layers.Dense(128, activation="relu"))

Model 5. EfficientNet, Trainable

In [None]:
# Model 5: the same EfficientNet-B7 TF Hub feature extractor, but with its
# weights left trainable, followed by two trainable dense layers.
# inputs size (None, 224, 224, 3) ; outputs size (128)
# NOTE(review): the module's documented default input size may not be 224x224
# -- confirm against its TF Hub page.
cnn = tf.keras.Sequential()
cnn.add(hub.KerasLayer("https://tfhub.dev/tensorflow/efficientnet/b7/feature-vector/1", trainable=True))
cnn.add(layers.Dense(800, activation="relu"))
cnn.add(layers.Dense(128, activation="relu"))

(The rest of the model)

In [None]:
# Both inputs go through the same `cnn` object, so the two branches share one
# set of weights. `cnn` is whichever model cell above was run last.
encoded_l, encoded_r = cnn(left_input), cnn(right_input)

# Element-wise absolute difference between the two embeddings.
L1_layer = layers.Lambda(lambda pair: tf.math.abs(pair[0] - pair[1]))
L1_distance = L1_layer([encoded_l, encoded_r])

# A single sigmoid unit turns the difference vector into one score in (0, 1).
prediction = layers.Dense(1, activation="sigmoid")(L1_distance)
siamese_net = tf.keras.Model(inputs=[left_input, right_input], outputs=prediction)

Training the model

In [None]:
# Binary cross-entropy on the same-artist / different-artist label.
siamese_net.compile(loss="binary_crossentropy", metrics=['acc'], optimizer='adam')
siamese_net.summary()

# The inputs are in-memory NumPy arrays, so Keras derives the number of batches
# per epoch from the array length and batch_size. The previous hard-coded
# steps_per_epoch=150 only matched one particular dataset size; with fewer
# samples (e.g. the reduced Colab run) it asks for more batches than exist.
# batch_size=32 is the Keras default, spelled out here for clarity.
history = siamese_net.fit([leftimg_train, rightimg_train], label_train, epochs=8, batch_size=32)

Evaluating the model

In [None]:
# Loss and accuracy on the validation split, left as the cell's last
# expression so the notebook displays the result.
siamese_net.evaluate(x=[leftimg_val, rightimg_val], y=label_val)



[0.9329984188079834, 0.5476190447807312]

Fine-tuning the transfer learning models

In [None]:
# Unfreeze the embedding network, then recompile so the change takes effect.
# Fine-tuning uses Adam with a learning rate of 1e-5.
cnn.trainable = True
siamese_net.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss="binary_crossentropy",
    metrics=['acc'],
)
siamese_net.summary()

In [None]:
# Fine-tuning pass over the same training arrays. As with the first fit, the
# number of batches per epoch is derived from the array length and batch_size
# rather than a hard-coded steps_per_epoch (previously 120), which only fits
# one dataset size. batch_size=32 is the Keras default.
history = siamese_net.fit([leftimg_train, rightimg_train], label_train, epochs=4, batch_size=32)

In [None]:
# Validation loss and accuracy after fine-tuning, left as the cell's last
# expression so the notebook displays the result.
siamese_net.evaluate(x=[leftimg_val, rightimg_val], y=label_val)

## Price Prediction: K-Nearest-Neighbors

In [None]:
# Read the auction lots: one dict per CSV row, keyed by the header names.
# newline='' lets the csv module do its own line-ending handling, so quoted
# fields containing embedded newlines are not split across rows.
with open('phillips2.csv', newline='') as f:
    lots = [dict(row) for row in csv.DictReader(f, skipinitialspace=True)]



In [None]:
# Keep only the lots whose day, month and year all parse as integers; later
# cells sort on int(lot['day']) etc. and would fail on anything else.
# A new list is built instead of calling lots.remove() inside the loop:
# removing from a list while iterating over it makes the iterator skip the
# element after each removal, so some bad lots were never checked.
fails = 0
valid_lots = []
for lot in lots:
  try:
    int(lot['day'])
    int(lot['month'])
    int(lot['year'])
  except (KeyError, TypeError, ValueError):
    # KeyError: column absent; TypeError: value is None (short row);
    # ValueError: text that is not an integer.
    print(lot)
    fails += 1
  else:
    valid_lots.append(lot)
lots = valid_lots
print(fails)
print(len(lots))

0
23825


### K-Nearest-Neighbors

In [None]:
  # Order the lots chronologically. One stable sort on (year, month, day)
  # yields the same order as stable-sorting by day, then month, then year.
  lots = sorted(
      lots,
      key=lambda lot: (int(lot['year']), int(lot['month']), int(lot['day'])),
  )

In [None]:
def _fetch_image_batch(url, timeout=10):
  """Download a JPEG and return it as a (1, 224, 224, 3) array, or None.

  The image is converted to float32 with tf.image.convert_image_dtype and then
  resized to 224x224. None is returned when the URL is empty or when the
  download or the JPEG decode fails.
  """
  if not url:
    return None
  try:
    # The timeout keeps one unresponsive host from hanging the whole run.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    image = tf.image.decode_jpeg(response.content, channels=3)
  except Exception:
    return None
  image = tf.image.convert_image_dtype(image, tf.float32)
  image = tf.image.resize(image, [224, 224])
  batch = np.zeros((1, 224, 224, 3))
  batch[0,:,:,:] = image[:,:,:]
  return batch


def _parse_price(lot):
  """Return lot['price'] as a float, or None when it is missing or not numeric."""
  try:
    return float(lot['price'])
  except (KeyError, TypeError, ValueError):
    return None


def knearest(k, n, lots):
  """Estimate a randomly chosen lot's price from visually similar earlier lots.

  A lot is picked at random from index n+100 onwards. The n+1 lots that
  immediately precede its sale date are each scored against it with the global
  `siamese_net`, and for every neighbour count in `k` the median price of the
  highest-scoring lots is compared with the actual price.

  Args:
    k: iterable of neighbour counts, e.g. [10, 30, 50, 70, 90].
    n: size of the window of earlier lots to search.
    lots: list of dicts with 'day', 'month', 'year', 'price' and 'imageurl',
      sorted chronologically.

  Returns:
    dict mapping str(count) to the absolute percentage error of that count's
    median price, or None when no prediction could be made (target image or
    price unusable, no earlier lots, or no neighbour could be scored).

  Raises:
    ValueError: if `lots` has no element at index n+100 or beyond.
  """
  #MIGHT RETURN NONE!

  if len(lots) <= n + 100:
    raise ValueError('need more than n + 100 lots, got ' + str(len(lots)))
  # randrange excludes its upper bound. randint(n+100, len(lots)) could return
  # len(lots) itself, which is one past the last valid index.
  idx = random.randrange(n+100, len(lots))
  topredict = lots[idx]

  actual_price = _parse_price(topredict)
  if not actual_price:
    return None  # missing or zero price: a percentage error is undefined

  # Step back to the last lot sold on an earlier date than the target. The year
  # is compared too, so a lot from the same calendar day of another year is not
  # mistaken for part of the same auction.
  def sale_date(lot):
    return (lot['year'], lot['month'], lot['day'])

  last_before_auction_idx = idx - 1
  while (last_before_auction_idx >= 0
         and sale_date(lots[last_before_auction_idx]) == sale_date(topredict)):
    last_before_auction_idx -= 1
  if last_before_auction_idx < 0:
    return None  # nothing was sold before this auction
  lastn = lots[max(0, last_before_auction_idx-n) : last_before_auction_idx+1]

  inputtopred = _fetch_image_batch(topredict.get("imageurl"))
  if inputtopred is None:
    return None

  priceandprob = []
  for lot in lastn:
    if _parse_price(lot) is None:
      continue  # cannot contribute to a median price
    # Lots whose image cannot be fetched are skipped rather than scored with
    # some other lot's image.
    inputlotimg = _fetch_image_batch(lot.get("imageurl"))
    if inputlotimg is None:
      continue

    # Compare this lot's image with the target image. (The lot image was
    # previously overwritten with the target image, so every neighbour was
    # scored as target-vs-target.)
    prob = siamese_net.predict([inputlotimg, inputtopred])
    # Store the score on a copy so the caller's lot dicts are left untouched.
    priceandprob.append({**lot, 'prob': float(np.ravel(prob)[0])})

  if not priceandprob:
    return None
  priceandprob.sort(key=lambda lot: lot['prob'], reverse=True)

  errors = {}
  medians = {}
  for i in k:
    # Sort numerically: the prices are strings read from the CSV, and sorting
    # them as text would put '1000' before '200'.
    topi = sorted(priceandprob[:i], key=_parse_price)
    if not topi:
      continue
    # Index by the actual length so a count larger than the number of scored
    # lots still yields a median instead of an IndexError.
    median = float(topi[len(topi)//2]['price'])
    medians[str(i)] = median
    mape = 100 * abs(actual_price - median) / actual_price
    errors[str(i)] = mape

  print('to predict: '+ str(topredict['price']))
  if '70' in medians:
    print('prediction: '+ str(medians['70']))
  else:
    # 70 is not among the requested counts; show every median instead of
    # failing with a KeyError.
    print('prediction: '+ str(medians))
  return errors


Messing around with K Nearest: try with very big n, smallish k

In [None]:
# NOTE(review): the original assignment had no value (`m =`), which is a
# SyntaxError. 10 is a placeholder; set it to the intended number of attempts.
m = 10  #number of attempts to average over
ks = [10, 30, 50, 70, 90]  # neighbour counts evaluated on every attempt

sums = {str(count): 0 for count in ks}
successes = 0  # attempts for which knearest returned a result
for attempt in range(m):
  errors = knearest(ks, 400, lots)
  if errors is None:
    continue  # knearest could not make a prediction for this draw
  print(errors)
  print(attempt)
  for key, value in errors.items():
    sums[key] += value
  successes += 1

# Average over the successful attempts only. This stays empty when every
# attempt failed, instead of dividing by zero.
avgerrors = {}
if successes:
  avgerrors = {key: total / successes for key, total in sums.items()}
print(avgerrors)