# This notebook contains an implementation of classification using GloVe and ELMo embeddings
Some setup is required depending on which embedding you want to use; it is described below.

Run the cells below if you want to use the GloVe embedding.


In [None]:
## Download and unzip glove pretrained embeddings
## (the next cell reads glove.6B.300d.txt, which is expected to come out of this archive)
!wget http://nlp.stanford.edu/data/glove.6B.zip
# Quietly install the `unzip` tool before extracting the archive.
!apt-get -qq install unzip
!unzip glove.6B.zip

In [None]:
from gensim.models import word2vec
from gensim.scripts.glove2word2vec import glove2word2vec
# Convert the raw GloVe text file to word2vec text format and write it to gensim_glove_vectors.txt.
glove2word2vec(glove_input_file="glove.6B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")
from gensim.models.keyedvectors import KeyedVectors
# Load the converted vectors; `glove_model` is later looked up word by word to build the embedding matrix.
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)

Run the cells below if you want to use the ELMo embedding.


In [None]:
# Select the TensorFlow 1.x runtime; the ELMo cell below uses the TF1 session API (tf.Session).
%tensorflow_version 1.x

In [None]:
# Here is for ELMo embedding
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

# Create ONE session, hand it to Keras, and run the initialisers in that same
# session. Initialising inside a second `with tf.Session()` block would set up a
# throw-away session (closed at the end of the block) and leave the session that
# Keras actually uses without initialised variables and lookup tables.
sess = tf.Session()
K.set_session(sess)
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())
# x = sess.run(embeddings)

# elmo_model = hub.load("https://tfhub.dev/google/elmo/2")


# IMPORTANT
Note that you cannot switch from the GloVe embedding to the ELMo embedding within the same session, because ELMo does not support TensorFlow 2. If you were previously using GloVe, you need to reset the runtime before you can use ELMo.

In terms of training speed, ELMo is much slower to train, since it uses TensorFlow 1.

If you would like to train with both embeddings in parallel, I recommend making a copy of this notebook and running each copy separately.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read one CSV per object category and tag every row with the category it came from
# (the 'Object' column is appended as the last column of each frame).
mug_csv = pd.read_csv('mug.csv').assign(Object='mug')
kettle_csv = pd.read_csv('kettle.csv').assign(Object='kettle')
bottle_csv = pd.read_csv('bottle.csv').assign(Object='bottle')
bakingtray_csv = pd.read_csv('bakingtray.csv').assign(Object='bakingtray')
hammer_csv = pd.read_csv('hammer.csv').assign(Object='hammer')
pan_csv = pd.read_csv('pan.csv').assign(Object='pan')
wrench_csv = pd.read_csv('wrench.csv').assign(Object='wrench')
# mug_csv.head()

In [None]:
# Stack the per-object frames into one dataset with a fresh 0..n-1 index.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in pandas 2.0, and concat copies the data only once.
all_df = pd.concat(
    [mug_csv, kettle_csv, bottle_csv, bakingtray_csv, hammer_csv, pan_csv, wrench_csv],
    ignore_index=True,
)

print(len(all_df))
all_df.head()

In [None]:
import string # for preprocess_text()
import re
# NLTK library to remove the stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

'''
Want to try two methods
1) change the digit to word
2) split the number into single digit
'''
# Digit character -> spelled-out word, padded with spaces so it can be spliced into text.
num_to_word = {'1' : ' one ',
               '2' : ' two ',
               '3' : ' three ',
               '4' : ' four ',
               '5' : ' five ',
               '6' : ' six ',
               '7' : ' seven ',
               '8' : ' eight ',
               '9' : ' nine ',
               '0' : ' zero '}

# A number immediately followed by letters, e.g. "12oz", "5g", "1.5l".
# Group 1 is the number (integer or decimal), group 2 the trailing letters.
# The fractional part is optional as a whole, so single-digit values such as
# "5g" match too (the earlier pattern required at least two digits).
regex = r'([0-9]+(?:\.[0-9]+)?)([a-zA-Z]+)'
r = re.compile(regex)

# unit conversion: multiply a value in the given unit by the factor to get the
# unit used for 'ml' values in preprocess_volume
convert_vol = {'oz'     : 29.5735,
               'cl'     : 10.0,
               'gallon' : 4546.09,
               'L'      : 1000.0}
# Characters kept by InputPreprocess.remove_out_of_char; anything else becomes a space.
char_set = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

class InputPreprocess():
  """Stateless helpers that clean the raw object-description DataFrame.

  Every method is a ``@staticmethod``. Methods that take a DataFrame work on a
  copy and return it, leaving the caller's frame untouched. Several methods read
  module-level names defined next to this class: ``r`` (number+unit regex),
  ``char_set`` (allowed characters), ``convert_vol`` (volume factors) and NLTK's
  ``stopwords``.
  """

  @staticmethod
  def quantize(val, to_values):
    """Quantize a value with regards to a set of allowed values.

    Examples:
        quantize(49.513, [0, 45, 90]) -> 45
        quantize(43, [0, 10, 20, 30]) -> 30

    Note: function doesn't assume to_values to be sorted and
    iterates over all values (i.e. is rather slow). When two allowed
    values are equally close, the one that comes later in to_values wins.

    Args:
        val        The value to quantize
        to_values  The allowed values
    Returns:
        Closest value among allowed values (None if to_values is empty).
    """
    best_match = None
    best_match_diff = None
    for other_val in to_values:
      diff = abs(other_val - val)
      if best_match is None or diff <= best_match_diff:
        best_match = other_val
        best_match_diff = diff
    return best_match

  # Preprocess weights --> e.g. 990g -> 990 <int>
  @staticmethod
  def preprocess_weights(df):
    """Convert the string 'Weight' column to integer grams.

    Recognised suffixes are 'kg', 'g', 'pounds' and 'oz'. The label 'other',
    any unrecognised unit, and any weight above 5000 g are stored as 0.

    Args:
        df  DataFrame with a string 'Weight' column
    Returns:
        (copy of df with integer weights, list of row positions whose weight
        was given in plain grams)
    """
    tmp = df.copy()
    grams_per_row = []
    # Want to extract rows with 'g' only
    row_g = []
    for i, raw in enumerate(tmp['Weight'].to_list()):
      # 'kg' must be tested before 'g', since every 'kg' string also ends in 'g'.
      if raw.endswith('kg'):
        grams = float(raw[:-2]) * 1000
      elif raw.endswith('g'):
        grams = float(raw[:-1])
        row_g.append(i)
      elif raw.endswith('pounds'):
        grams = float(raw[:-6]) * 453.592
      elif raw.endswith('oz'):
        grams = float(raw[:-2]) * 28.3495
      else:
        # 'other' or a unit we do not know: treat as unknown instead of
        # failing later on a str/int comparison.
        grams = 0
      if grams > 5000:
        grams = 0
      grams_per_row.append(int(grams))
    tmp['Weight'] = grams_per_row
    return tmp, row_g

  @staticmethod
  def preprocess_volume(df):
    """Convert the string 'Volume' column to integers.

    Only the first ';'-separated entry of each cell is used. Values ending in
    'ml' or 'g' are taken as-is; 'oz', 'cl', 'L' and 'gallon' are multiplied by
    the matching factor in ``convert_vol``; 'other' becomes 0. A value with no
    recognised suffix is passed to int() unchanged, so it must be a plain
    integer string.

    Args:
        df  DataFrame with a string 'Volume' column
    Returns:
        Copy of df with an integer 'Volume' column.
    """
    tmp = df.copy()
    volume = tmp['Volume'].to_list()
    for i in range(len(volume)):
      curr_volume = volume[i].split(';')[0] # now only consider to first element in the labels

      if curr_volume.endswith('ml'):
        curr_volume = float(curr_volume[0:-2])
      elif curr_volume.endswith('oz'):
        curr_volume = float(curr_volume[0:-2]) * convert_vol['oz']
      elif curr_volume.endswith('g'):
        curr_volume = float(curr_volume[0:-1])
      elif curr_volume.endswith('cl'):
        curr_volume = float(curr_volume[0:-2]) * convert_vol['cl']
      elif curr_volume.endswith('L'):
        curr_volume = float(curr_volume[0:-1]) * convert_vol['L']
      elif curr_volume.endswith('gallon'):
        curr_volume = float(curr_volume[0:-6]) * convert_vol['gallon']
      elif curr_volume == 'other':
        curr_volume = 0
      volume[i] = int(curr_volume)
      # if volume[i] > 5000:
      #   volume[i] = 0
    tmp['Volume'] = volume
    return tmp

  @staticmethod
  def convert_num2words(word_list):
    """Replace numeric tokens with their spelled-out form.

    A token is treated as numeric when its first and last characters are
    digits; if float() rejects it, the token is kept unchanged.

    NOTE(review): `num2words` is not imported in the cells visible here; it
    must be imported before this helper is used.

    Args:
        word_list  List of string tokens
    Returns:
        New list with numeric tokens converted.
    """
    tmp_word_list = []
    for word in word_list:
      # `word and ...` guards against empty tokens, which have no first character.
      if word and word[0].isdigit() and word[-1].isdigit():
        try:
          tmp_word_list.append(num2words(float(word)))
        except ValueError:
          tmp_word_list.append(word)
      else:
        tmp_word_list.append(word)
    return tmp_word_list

  @staticmethod
  def separate_num_unit(word_list):
    """Split tokens such as '12oz' into '12', 'oz' and drop empty tokens.

    A token that matches the module-level regex ``r`` at its start is replaced
    by the regex groups; any other non-empty token is kept as-is.

    Args:
        word_list  List of string tokens
    Returns:
        New list of tokens.
    """
    tmp_word_list = []
    for word in word_list:
      if word == '':
        continue
      m = r.match(word)
      if m is not None:
        tmp_word_list.extend(list(m.groups()))
      else:
        tmp_word_list.append(word)
    return tmp_word_list

  @staticmethod
  def remove_stopwords(text):
    """Return the tokens of `text` that are not NLTK English stopwords.

    Args:
        text  Iterable of string tokens
    Returns:
        List of the remaining tokens, in their original order.
    """
    # Load the stopword list once per call (not once per token) and use a set
    # so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in text if w not in english_stopwords]

  @staticmethod
  def remove_out_of_char(sentence):
    """Replace every character that is not in ``char_set`` with a space."""
    return ''.join(ch if ch in char_set else ' ' for ch in sentence)

  @staticmethod
  def _clean_text(text):
    """Turn one raw string into a list of cleaned tokens (see preprocess_text)."""
    s = text.lower()
    s = s.replace('\n', ' ')
    s = s.replace('\t', ' ')
    s = InputPreprocess.remove_out_of_char(s)
    # handle cases like 12oz -> 12 oz (separate the value and the unit)
    s = ' '.join(InputPreprocess.separate_num_unit(s.split(' ')))

    kept = []
    last = len(s) - 1
    for j, ch in enumerate(s):
      # A '.' survives only as a decimal point, i.e. with a digit on both sides.
      # `0 < j` (rather than `1 < j`) also keeps the point in a leading "1.5".
      is_decimal_point = (ch == '.' and 0 < j < last
                          and s[j-1].isdigit() and s[j+1].isdigit())
      if is_decimal_point or ch not in string.punctuation:
        kept.append(ch)
    # split() without an argument drops the empty tokens that a removed
    # stand-alone punctuation mark would otherwise leave behind.
    return ''.join(kept).split()

  @staticmethod
  def preprocess_text(df, column):
    """Clean and tokenise a text column.

    Each cell is processed with the following rules
    - Lowercase all text
    - Convert newline \\n and tab \\t to white space
    - Replace characters outside ``char_set`` with white space
    - Separate a number from a unit glued to it (12oz -> 12 oz)
    - Remove punctuation, except a '.' between two digits
    - Split on white space into a list of tokens (no empty tokens)

    Args:
        df      DataFrame holding the text
        column  Name of the string column to clean
    Returns:
        Copy of df in which `column` holds a list of tokens per row.
    """
    tmp = df.copy()
    token_lists = [InputPreprocess._clean_text(text) for text in tmp[column]]
    # dtype=object keeps each token list as a single cell value.
    tmp[column] = pd.Series(token_lists, index=tmp.index, dtype=object)
    return tmp

  @staticmethod
  def _dimension_to_int(value):
    """Map one dimension cell to an int: 'other' -> 0, else the rounded number."""
    if value == 'other':
      return 0
    return int(round(float(value)))

  @staticmethod
  def preprocess_dimensions(df):
    """Convert 'Length', 'Width' and 'Height' to rounded integers.

    The label 'other' becomes 0; every other cell is parsed with float() and
    rounded to the nearest integer.

    Args:
        df  DataFrame with 'Length', 'Width' and 'Height' columns
    Returns:
        Copy of df with the three columns converted.
    """
    tmp = df.copy()
    for dimension in ('Length', 'Width', 'Height'):
      tmp[dimension] = [InputPreprocess._dimension_to_int(value) for value in tmp[dimension]]
    return tmp

  @staticmethod
  def parts_into_list(df):
    """Split the ';'-separated 'Parts' string of every row into a list.

    Args:
        df  DataFrame with a string 'Parts' column
    Returns:
        Copy of df in which 'Parts' holds a list of part names per row.
    """
    tmp = df.copy()
    part_lists = [parts.split(';') for parts in tmp['Parts']]
    tmp['Parts'] = pd.Series(part_lists, index=tmp.index, dtype=object)
    return tmp

In [None]:
# Pre-process on input text: tokenise each text column, then drop English stopwords
obj_df = InputPreprocess.preprocess_text(all_df, 'Input')
obj_df = InputPreprocess.preprocess_text(obj_df, 'UnstructuredText')
obj_df = InputPreprocess.preprocess_text(obj_df, 'StructuredText')
obj_df['Input'] = obj_df['Input'].apply(InputPreprocess.remove_stopwords)
obj_df['UnstructuredText'] = obj_df['UnstructuredText'].apply(InputPreprocess.remove_stopwords)
obj_df['StructuredText'] = obj_df['StructuredText'].apply(InputPreprocess.remove_stopwords)


# Pre-process on weight and volume

obj_df, row_g = InputPreprocess.preprocess_weights(obj_df)
obj_df = InputPreprocess.preprocess_volume(obj_df)

# np.arange excludes its stop value. The columns hold integers, so a stop of
# max + 1 lets the maximum itself become a bin whenever it lies on the grid,
# never creates a bin above the maximum, and still yields one bin when every
# row has the same value (min == max).
weight_bins = np.arange(obj_df['Weight'].min(), obj_df['Weight'].max() + 1, 10)
volume_bins = np.arange(obj_df['Volume'].min(), obj_df['Volume'].max() + 1, 10)


obj_df['Weight'] = obj_df['Weight'].apply(lambda x : InputPreprocess.quantize(x, weight_bins))
obj_df['Volume'] = obj_df['Volume'].apply(lambda x : InputPreprocess.quantize(x, volume_bins))

# Pre-process on dimensions (1-unit bins, so every integer from min to max inclusive is a bin)
obj_df = InputPreprocess.preprocess_dimensions(obj_df)
length_bins = np.arange(obj_df['Length'].min(), obj_df['Length'].max() + 1, 1)
width_bins = np.arange(obj_df['Width'].min(), obj_df['Width'].max() + 1, 1)
height_bins = np.arange(obj_df['Height'].min(), obj_df['Height'].max() + 1, 1)
obj_df['Length'] = obj_df['Length'].apply(lambda x : InputPreprocess.quantize(x, length_bins))
obj_df['Width'] = obj_df['Width'].apply(lambda x : InputPreprocess.quantize(x, width_bins))
obj_df['Height'] = obj_df['Height'].apply(lambda x : InputPreprocess.quantize(x, height_bins))

# Pre-process on object parts
obj_df = InputPreprocess.parts_into_list(obj_df)
obj_df.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Multi-hot encode the per-row list of parts; each cell ends up holding one row of the indicator matrix.
mlb = MultiLabelBinarizer()
obj_df['Parts'] = [indicator_row for indicator_row in mlb.fit_transform(obj_df['Parts'].tolist())]
obj_df.head()

In [None]:
'''
Show distribution of the quantised length, width and height
'''
plt.style.use('ggplot')
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 3), dpi=200)

ax1.hist(pd.to_numeric(obj_df['Length']), bins = 70)
ax2.hist(pd.to_numeric(obj_df['Width']), bins = 70)
ax3.hist(pd.to_numeric(obj_df['Height']), bins = 70)

# Label each panel so the figure can be read on its own.
ax1.set_title('Length')
ax2.set_title('Width')
ax3.set_title('Height')
plt.show()

In [None]:
# Compare the number of candidate weight bins with the number of distinct weights left after quantisation.
print('Number of bins for quantize: {}.  Number of actual bins after quantise: {}'.format(
    len(weight_bins),
    len(obj_df['Weight'].unique())))

In [None]:
'''
Show distribution on weight and volume
'''
# Two side-by-side panels: quantised weight on the left, quantised volume on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(7, 3), dpi=200)

ax1.hist(obj_df['Weight'], bins = 50)
ax2.hist(obj_df['Volume'], bins = 50)



In [None]:
'''
Run this if we only want gram only
''' 
# Optional step, disabled by default: un-comment the lines below to keep only the
# rows listed in `row_g` (returned by InputPreprocess.preprocess_weights) and to
# continue with that subset as `obj_df`.
# gram_only_df = obj_df.copy().iloc[row_g].reset_index(drop=True)
# gram_only_df.head()
# obj_df = gram_only_df

In [None]:
# Distinct values of every target column; the position of a value in its list
# becomes its class index later on. The lists are sorted so that those indices
# are the same on every run: iterating a bare set of strings can produce a
# different order after each kernel restart.
# key=str keeps the sort well-defined for the text columns even if a non-string
# value (e.g. a missing entry) is present.
material_list = sorted(set(obj_df["Material"]), key=str)
print(len(material_list), material_list)

colour_list = sorted(set(obj_df['Colour']), key=str)
print(len(colour_list), colour_list)

weight_list = sorted(set(obj_df['Weight']))
print(len(weight_list), weight_list)

volume_list = sorted(set(obj_df['Volume']))
print(len(volume_list), volume_list)

object_list = sorted(set(obj_df['Object']))
print(len(object_list), object_list)

length_list = sorted(set(obj_df['Length']))
print(len(length_list), length_list)

width_list = sorted(set(obj_df['Width']))
print(len(width_list), width_list)

height_list = sorted(set(obj_df['Height']))
print(len(height_list), height_list)

In [None]:
# Look at the maximum input length
# (column 0 is read by position; each cell is measured with len()).
lennn = [len(obj_df.iloc[row, 0]) for row in range(len(obj_df))]
print(min(lennn), max(lennn))

In [None]:
'''
Plot input text length
'''
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
fig = plt.figure(figsize=(15, 4), dpi=200)
ax1 = fig.add_subplot(1, 3, 1)
ax2 = fig.add_subplot(1, 3, 2)
ax3 = fig.add_subplot(1, 3, 3)
ax1.hist([len(sen) for sen in obj_df["Input"]], bins=50)
ax2.hist([len(sen) for sen in obj_df["UnstructuredText"]], bins=50)
ax3.hist([len(sen) for sen in obj_df["StructuredText"]], bins=50)

ax1.tick_params(axis='both', which='major', labelsize=12)
ax1.tick_params(axis='both', which='minor', labelsize=12)
ax1.set_xlabel('Length of sentence', fontsize=14)
ax1.set_ylabel('Number of sentences', fontsize=14)
ax1.set_title('All Text')

ax2.tick_params(axis='both', which='major', labelsize=12)
ax2.tick_params(axis='both', which='minor', labelsize=12)
ax2.set_xlabel('Length of sentence', fontsize=14)
ax2.set_ylabel('Number of sentences', fontsize=14)
ax2.set_title('Unstructured Text')

ax3.tick_params(axis='both', which='major', labelsize=12)
ax3.tick_params(axis='both', which='minor', labelsize=12)
ax3.set_xlabel('Length of sentence', fontsize=14)
ax3.set_ylabel('Number of sentences', fontsize=14)
ax3.set_title('Structured Text')

# plt.savefig('text_length_distribution.png')
plt.show()

In [None]:
# Create the value -> class-index dictionary for every target property
material_to_idx = {x : i for i, x in enumerate(material_list)}
colour_to_idx = {x : i for i, x in enumerate(colour_list)}
weight_to_idx = {x : i for i, x in enumerate(weight_list)}
volume_to_idx = {x : i for i, x in enumerate(volume_list)}
object_to_idx = {x : i for i, x in enumerate(object_list)}
length_to_idx = {x : i for i, x in enumerate(length_list)}
width_to_idx = {x : i for i, x in enumerate(width_list)}
height_to_idx = {x : i for i, x in enumerate(height_list)}

# Now update the label to integer form.
# Columns are addressed by name instead of by position, so the encoding keeps
# working if the CSV column order changes. An unknown value still raises
# KeyError. Note: this rewrites obj_df in place, so run the cell only once per
# freshly preprocessed obj_df.
label_encoders = {
  'Material': material_to_idx,
  'Colour': colour_to_idx,
  'Weight': weight_to_idx,
  'Volume': volume_to_idx,
  'Length': length_to_idx,
  'Width': width_to_idx,
  'Height': height_to_idx,
  'Object': object_to_idx,
}
for label_column, to_idx in label_encoders.items():
  obj_df[label_column] = [to_idx[value] for value in obj_df[label_column]]
obj_df.head()

In [None]:
# Generate the set of word and get input sequence
def get_wordset(df, text_type):
  """Collect the vocabulary and per-row token counts of one text column.

  Rows are read by the labels 0 .. len(df)-1. A one-line summary (vocabulary
  size and longest row) is printed.

  Args:
      df         DataFrame whose `text_type` column holds a list of tokens per row
      text_type  Name of the column to scan
  Returns:
      (set of distinct tokens, list with the token count of every row)
  """
  column = df[text_type]
  word_set = set()
  sequence_len = []
  for row in range(len(df)):
    tokens = column[row]
    word_set |= set(tokens)
    sequence_len.append(len(tokens))
  summary = "Number of words in {}: {}  and maximum sequence length: {}"
  print(summary.format(text_type, len(word_set), max(sequence_len)))
  return word_set, sequence_len

# Vocabulary and row lengths for the combined, unstructured-only and structured-only text.
word_set, sequence_len = get_wordset(obj_df, 'Input')
unstruct_word_set, unstruct_sequence_len = get_wordset(obj_df, 'UnstructuredText')
struct_word_set, struct_sequence_len = get_wordset(obj_df, 'StructuredText')

# Plain Python lists of token lists, one entry per row.
input_sequences = obj_df['Input'].tolist()
unstruct_input_sequences = obj_df['UnstructuredText'].tolist()
struct_input_sequences = obj_df['StructuredText'].tolist()

In [None]:
"""
Convert the material labels column to categorical
Pad sequence
Find number of unique tokens (words)
"""
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical


def tok_and_pad_seq(word_set, sequence_len, input_sequences):
  """Fit a Keras Tokenizer on the token lists and pad them to a common length.

  Args:
      word_set         Set of distinct tokens (its size is the vocabulary size)
      sequence_len     Token count of every row (its maximum is the padded length)
      input_sequences  List of token lists, one per row
  Returns:
      (MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH, word_index, padded_sequences, tokenizer)
  """
  MAX_NUM_WORDS = len(word_set)
  MAX_SEQUENCE_LENGTH = max(sequence_len)
  # Tokenizer numbers words from 1 and only keeps indices strictly below
  # num_words when converting texts, so num_words must be vocabulary size + 1;
  # otherwise the least frequent word is silently dropped from every sequence.
  tokenizer = Tokenizer(num_words=MAX_NUM_WORDS + 1)
  tokenizer.fit_on_texts(input_sequences)
  sequences = tokenizer.texts_to_sequences(input_sequences)
  word_index = tokenizer.word_index
  print(f'Found {len(word_index)} unique tokens.')
  padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
  return MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH, word_index, padded_sequences, tokenizer

######################################################
# For unstructured + structured input
(MAX_NUM_WORDS,
 MAX_SEQUENCE_LENGTH,
 word_index,
 padded_sequences,
 tokenizer) = tok_and_pad_seq(word_set, sequence_len, input_sequences)

######################################################
# For unstructured input
unstruct_preproc = tok_and_pad_seq(unstruct_word_set, unstruct_sequence_len, unstruct_input_sequences)
(UNSTRUCT_MAX_NUM_WORDS,
 UNSTRUCT_MAX_SEQUENCE_LENGTH,
 unstruct_word_index,
 unstruct_padded_sequences,
 unstruct_tokenizer) = unstruct_preproc

# For structured input
struct_preproc = tok_and_pad_seq(struct_word_set, struct_sequence_len, struct_input_sequences)
(STRUCT_MAX_NUM_WORDS,
 STRUCT_MAX_SEQUENCE_LENGTH,
 struct_word_index,
 struct_padded_sequences,
 struct_tokenizer) = struct_preproc
######################################################


# One-hot encode every integer-coded target column.
material_labels = tf.keras.utils.to_categorical(obj_df['Material'].to_numpy())
colour_labels = tf.keras.utils.to_categorical(obj_df['Colour'].to_numpy())
weight_labels = tf.keras.utils.to_categorical(obj_df['Weight'].to_numpy())
volume_labels = tf.keras.utils.to_categorical(obj_df['Volume'].to_numpy())
object_labels = tf.keras.utils.to_categorical(obj_df['Object'].to_numpy())
length_labels = tf.keras.utils.to_categorical(obj_df['Length'].to_numpy())
width_labels = tf.keras.utils.to_categorical(obj_df['Width'].to_numpy())
height_labels = tf.keras.utils.to_categorical(obj_df['Height'].to_numpy())

# Print the shape of the padded input and of every label matrix on one line.
print(*(array.shape for array in (padded_sequences,
                                  material_labels,
                                  colour_labels,
                                  weight_labels,
                                  volume_labels,
                                  object_labels,
                                  length_labels,
                                  width_labels,
                                  height_labels)))



In [None]:
'''
Run this block only if using ELMo embedding
'''

batch_size = 32
def ElmoEmbedding(x):
  """Embed a batch of token strings with the ELMo hub module.

  Args:
      x  String tensor of tokens fed by the Keras Lambda layer
  Returns:
      The "elmo" output of the module's "tokens" signature.
  """
  tokens = tf.squeeze(tf.cast(x, tf.string))
  # One length entry per row actually present in the batch. Deriving the row
  # count from the tensor (instead of hard-coding `batch_size`) keeps a final
  # partial batch, or a different batch size at predict time, from failing
  # with a shape mismatch.
  sequence_len = tf.fill(tf.shape(x)[:1], MAX_SEQUENCE_LENGTH)
  return elmo_model(inputs={"tokens": tokens, "sequence_len": sequence_len},
                    signature="tokens",
                    as_dict=True)["elmo"]

In [None]:
'''
Runn this block only if using GloVe embedding
'''
# now prepare the embedding matrix
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Flatten, Add, Lambda
from keras.layers import LSTM
from keras.utils import np_utils

def get_embedding_matrix(MAX_NUM_WORDS, word_index, embedding_dim=300):
  """Build the initial weight matrix for an Embedding layer from GloVe vectors.

  Row i holds the `glove_model` vector of the word whose index is i. Row 0 and
  the rows of words missing from `glove_model` stay all-zero.

  Args:
      MAX_NUM_WORDS  Largest word index to include; the matrix has MAX_NUM_WORDS + 1 rows
      word_index     Dict mapping word -> integer index
      embedding_dim  Vector length; defaults to 300 as we're using glove.6B.300d
  Returns:
      numpy array of shape (MAX_NUM_WORDS + 1, embedding_dim).
  """
  embedding_matrix = np.zeros((MAX_NUM_WORDS + 1, embedding_dim))
  for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
      continue
    # words not found in embedding index will be all-zeros
    if word in glove_model:
      embedding_matrix[i] = glove_model[word]
  return embedding_matrix
# One GloVe weight matrix per text variant: combined input, unstructured only, structured only.
embedding_matrix = get_embedding_matrix(MAX_NUM_WORDS, word_index)
unstruct_embedding_matrix = get_embedding_matrix(UNSTRUCT_MAX_NUM_WORDS, unstruct_word_index)
struct_embedding_matrix = get_embedding_matrix(STRUCT_MAX_NUM_WORDS, struct_word_index)

In [None]:
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Flatten, Add, Lambda
from keras.layers import LSTM
from keras.utils import np_utils
from keras.layers.wrappers import Bidirectional
from keras.layers import GlobalMaxPool1D, concatenate
from keras.layers import LSTM
from keras.models import Model
from keras.layers import Input
from keras.layers import BatchNormalization
from keras.layers.merge import add


class ObjectPropertiesExtractionModel():
  """
    We will have different branch that represents the prediction of 
    different properties of the object

    A shared text encoder (embedding followed by a bidirectional LSTM) feeds
    one small dense head per property.
  """

  def __init__(self,
               material_labels,
               colour_labels,
               weight_labels,
               volume_labels,
               object_labels,
               length_labels, width_labels, height_labels, nb_parts, prop):
    """Store the output size of every head.

    Despite their names, the `*_labels` arguments are used as the NUMBER of
    classes of each property (they become the unit count of the output layer).

    Args:
        material_labels .. height_labels  Number of classes per property
        nb_parts  Number of units of the parts head
        prop      Stored as `self.prop`; not read by any method of this class
    """
    self.nb_material_classes = material_labels
    self.nb_colour_classes = colour_labels
    self.nb_weight_classes = weight_labels
    self.nb_volume_classes = volume_labels
    self.nb_object_classes = object_labels
    self.nb_length_classes = length_labels
    self.nb_width_classes = width_labels
    self.nb_height_classes = height_labels
    self.nb_parts = nb_parts
    self.prop = prop

  def make_default_hidden_layers_elmo(self, inputs, MAX_SEQUENCE_LENGTH):
    """Encoder for ELMo: ElmoEmbedding in a Lambda layer, then a bidirectional LSTM (256 units)."""
    embedding = Lambda(ElmoEmbedding, output_shape=(MAX_SEQUENCE_LENGTH, 1024))(inputs)
    x = Bidirectional(LSTM(units=256))(embedding)
    return x

  def make_default_hidden_layers(self, inputs, embedding_matrix, embedding_dim, MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH):
    """Encoder for GloVe: trainable Embedding initialised from `embedding_matrix`, then a bidirectional LSTM (256 units)."""
    embedding = Embedding(MAX_NUM_WORDS + 1,
                    embedding_dim,
                    input_length=MAX_SEQUENCE_LENGTH,
                    weights=[embedding_matrix],
                            trainable=True)(inputs) #Configure the trainable param
    x = Bidirectional(LSTM(256))(embedding)
    return x

  def _make_branch(self, input, nb_outputs, activation, name):
    """Shared head: BatchNorm -> Dense(256, relu) -> Dropout(0.2) -> Dense(nb_outputs) named `name`."""
    x = BatchNormalization()(input)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(nb_outputs, activation=activation, name=name)(x)
    return x

  def make_material_branch(self, input):
    """Softmax head named 'material_output'."""
    return self._make_branch(input, self.nb_material_classes, 'softmax', 'material_output')

  def make_colour_branch(self, input):
    """Softmax head named 'colour_output'."""
    return self._make_branch(input, self.nb_colour_classes, 'softmax', 'colour_output')

  def make_weight_branch(self, input):
    """Softmax head named 'weight_output'."""
    return self._make_branch(input, self.nb_weight_classes, 'softmax', 'weight_output')

  def make_volume_branch(self, input):
    """Softmax head named 'volume_output'."""
    return self._make_branch(input, self.nb_volume_classes, 'softmax', 'volume_output')

  def make_object_branch(self, input):
    """Softmax head named 'object_output'."""
    return self._make_branch(input, self.nb_object_classes, 'softmax', 'object_output')

  def make_length_branch(self, input):
    """Softmax head named 'length_output'."""
    return self._make_branch(input, self.nb_length_classes, 'softmax', 'length_output')

  def make_width_branch(self, input):
    """Softmax head named 'width_output'."""
    return self._make_branch(input, self.nb_width_classes, 'softmax', 'width_output')

  def make_height_branch(self, input):
    """Softmax head named 'height_output'."""
    return self._make_branch(input, self.nb_height_classes, 'softmax', 'height_output')

  def make_obj_parts_branch(self, input):
    """Sigmoid (multi-label) head named 'parts_output'."""
    return self._make_branch(input, self.nb_parts, 'sigmoid', 'parts_output')

  def _make_property_branches(self, features):
    """Build the eight softmax heads on `features`, in the model's output order."""
    return [self.make_material_branch(features),
            self.make_colour_branch(features),
            self.make_weight_branch(features),
            self.make_volume_branch(features),
            self.make_object_branch(features),
            self.make_length_branch(features),
            self.make_width_branch(features),
            self.make_height_branch(features)]

  def assemble_full_model_multi_input(self, unstruct_embedding_matrix, struct_embedding_matrix, embedding_dim,
                                      STRUCT_MAX_NUM_WORDS, STRUCT_MAX_SEQUENCE_LENGTH,
                                      UNSTRUCT_MAX_NUM_WORDS, UNSTRUCT_MAX_SEQUENCE_LENGTH):
    """Two-input GloVe model: structured and unstructured text are encoded separately and concatenated.

    The model's inputs are ordered [unstructured, structured]; its outputs are
    the eight softmax heads (material, colour, weight, volume, object, length,
    width, height). The parts head is not part of this model.
    """
    struct_input = Input(shape=(STRUCT_MAX_SEQUENCE_LENGTH, ))
    unstruct_input = Input(shape=(UNSTRUCT_MAX_SEQUENCE_LENGTH, ))

    struct_hidden = self.make_default_hidden_layers(struct_input, struct_embedding_matrix, embedding_dim, STRUCT_MAX_NUM_WORDS, STRUCT_MAX_SEQUENCE_LENGTH)
    unstruct_hidden = self.make_default_hidden_layers(unstruct_input, unstruct_embedding_matrix, embedding_dim, UNSTRUCT_MAX_NUM_WORDS, UNSTRUCT_MAX_SEQUENCE_LENGTH)

    combined = concatenate([struct_hidden, unstruct_hidden])
    model = Model(inputs=[unstruct_input, struct_input],
                  outputs=self._make_property_branches(combined))
    return model

  def assemble_full_model(self, embedding_matrix, embedding_dim, MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH, embed):
    """Single-input model with nine outputs (the eight softmax heads plus the parts head).

    Args:
        embedding_matrix, embedding_dim, MAX_NUM_WORDS  Used for 'glove' only; may be None for 'elmo'
        MAX_SEQUENCE_LENGTH  Length of the (padded) input sequence
        embed  'glove' (integer token ids) or 'elmo' (string tokens)
    Returns:
        Keras Model whose outputs are ordered material, colour, weight, volume,
        object, length, width, height, parts.
    Raises:
        ValueError  If `embed` is neither 'glove' nor 'elmo'.
    """
    input_shape = (MAX_SEQUENCE_LENGTH,)
    if embed == 'glove':
      inputs = Input(shape=input_shape)
      x = self.make_default_hidden_layers(inputs, embedding_matrix, embedding_dim, MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH)
    elif embed == 'elmo':
      inputs = Input(shape=input_shape, dtype=tf.string)
      x = self.make_default_hidden_layers_elmo(inputs, MAX_SEQUENCE_LENGTH)
    else:
      # Fail with a clear message instead of an UnboundLocalError further down.
      raise ValueError("embed must be 'glove' or 'elmo', got {!r}".format(embed))

    branches = self._make_property_branches(x)
    branches.append(self.make_obj_parts_branch(x))

    # Multiple-output model. For a single-output model, pass only the wanted
    # branch as `outputs` (e.g. outputs=branches[-1] for the parts head);
    # `self.prop` is stored for that purpose but is not consulted here.
    model = Model(inputs=inputs, outputs=branches)
    return model



In [None]:
'''
Change the embedding to glove/elmo
'''
embedding_dim = 300
embed = 'glove'

def get_single_output_model_elmo(embed):
  """Build one model per categorical property for the chosen embedding.

  Args:
    embed: 'elmo' or 'glove'. For 'glove' the structured-text embedding
      matrix and sequence settings are used.

  Returns:
    A list of models in the order material, colour, weight, volume, object,
    length, width, height.

  Raises:
    ValueError: if `embed` is neither 'elmo' nor 'glove' (previously this
      surfaced as an unbound `model` variable inside the loop).

  NOTE(review): the constructor and `assemble_full_model` are called here with
  the same argument layout as the multi-output cell of this notebook (parts
  class count + property name, and the embedding name as last argument).
  Confirm against the class definition.
  """
  prop_list = ['material', 'colour', 'weight', 'volume', 'object', 'length', 'width', 'height']
  # Fail fast before any model is built.
  if embed not in ('elmo', 'glove'):
    raise ValueError(f"Unknown embedding {embed!r}; expected 'elmo' or 'glove'")

  models = []
  for prop in prop_list:
    helper = ObjectPropertiesExtractionModel(len(material_list),
                                             len(colour_list),
                                             len(weight_list),
                                             len(volume_list),
                                             len(object_list),
                                             len(length_list),
                                             len(width_list),
                                             len(height_list),
                                             len(list(mlb.classes_)),
                                             prop)
    if embed == 'elmo':
      # ELMo needs no embedding matrix / vocabulary size.
      model = helper.assemble_full_model(None, None, None, MAX_SEQUENCE_LENGTH, embed)
    else:
      model = helper.assemble_full_model(struct_embedding_matrix, embedding_dim,
                                         STRUCT_MAX_NUM_WORDS, STRUCT_MAX_SEQUENCE_LENGTH, embed)
    models.append(model)
  return models

# Builder for the multi-output network: one class count per categorical
# property, then the number of multi-label part classes.
# The trailing None is presumably the single-property selector (unused here) - TODO confirm.
class_counts = [len(labels) for labels in (material_list, colour_list, weight_list,
                                           volume_list, object_list, length_list,
                                           width_list, height_list)]
helper = ObjectPropertiesExtractionModel(*class_counts, len(list(mlb.classes_)), None)

# ---- Pick ONE model construction --------------------------------------
# ELMo:
# model = helper.assemble_full_model(None, None, None, MAX_SEQUENCE_LENGTH, 'elmo')
# GloVe, structured text only:
# model = helper.assemble_full_model(struct_embedding_matrix, embedding_dim, STRUCT_MAX_NUM_WORDS, STRUCT_MAX_SEQUENCE_LENGTH, 'glove')
# GloVe with multiple input (unstructured + structured):
# model = helper.assemble_full_model_multi_input(unstruct_embedding_matrix, struct_embedding_matrix, embedding_dim,
#                                                STRUCT_MAX_NUM_WORDS, STRUCT_MAX_SEQUENCE_LENGTH,
#                                                UNSTRUCT_MAX_NUM_WORDS, UNSTRUCT_MAX_SEQUENCE_LENGTH, 'glove')
# GloVe using all of the text (active):
model = helper.assemble_full_model(embedding_matrix, embedding_dim, MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH, 'glove')

# ---- ELMo, single output (alternative) --------------------------------
# models = get_single_output_model_elmo(embed)
# (material_model, colour_model, weight_model, volume_model,
#  object_model, length_model, width_model, height_model) = models
# for single_model in models:
#   single_model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

# ---- Multiple output (active) ------------------------------------------
# Every categorical head uses categorical cross-entropy; the multi-label
# parts head uses binary cross-entropy.
categorical_heads = ("material", "colour", "weight", "volume",
                     "object", "length", "width", "height")
losses = {f"{head}_output": "categorical_crossentropy" for head in categorical_heads}
losses['parts_output'] = 'binary_crossentropy'

model.compile(optimizer='adam', loss=losses, metrics=['accuracy'])

# Single categorical output instead:
# model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
# To predict object parts only:
# model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

model.summary()
# plot_model(model, to_file="MultipleOutputModel.png")

In [None]:
'''
Plot the model
'''
# Draw the layer graph of `model`; show_shapes=True includes tensor shapes in the diagram.
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# ELMo embedding only.
# Collect the raw inputs and left-pad each one with the literal token
# "PADword" until it holds MAX_SEQUENCE_LENGTH entries.
input_list = obj_df['Input'].tolist()

new_input_list = []
for tokens in input_list:
  missing = MAX_SEQUENCE_LENGTH - len(tokens)
  # A non-positive count adds no padding; longer inputs are kept unchanged.
  new_input_list.append(["PADword"] * missing + list(tokens))


In [None]:
# ELMo embedding only.
# One 80/20 split over the padded token sequences and every label array, so
# that all of them are partitioned with the same random_state.
from sklearn.model_selection import train_test_split

elmo_arrays = (
    np.asarray(new_input_list),
    material_labels, colour_labels, weight_labels, volume_labels,
    object_labels, length_labels, width_labels, height_labels,
    np.asarray(list(obj_df['Parts'])),
)
split_data = train_test_split(*elmo_arrays, test_size=0.20, random_state=33)

# The result holds a train/test pair for each input array, in input order.
(x_train, x_test,
 y_train_material, y_test_material, y_train_colour, y_test_colour,
 y_train_weight, y_test_weight, y_train_volume, y_test_volume,
 y_train_object, y_test_object, y_train_length, y_test_length,
 y_train_width, y_test_width, y_train_height, y_test_height,
 y_train_parts, y_test_parts) = split_data

In [None]:
# GloVe embedding only.
# One 80/20 split over the three padded input variants (all text,
# unstructured, structured) and every label array.
from sklearn.model_selection import train_test_split

glove_arrays = (
    padded_sequences, unstruct_padded_sequences, struct_padded_sequences,
    material_labels, colour_labels, weight_labels, volume_labels,
    object_labels, length_labels, width_labels, height_labels,
    np.asarray(list(obj_df['Parts'])),
)
split_data = train_test_split(*glove_arrays, test_size=0.20, random_state=33)

# The result holds a train/test pair for each input array, in input order.
(x_train, x_test,
 unstruct_x_train, unstruct_x_test, struct_x_train, struct_x_test,
 y_train_material, y_test_material, y_train_colour, y_test_colour,
 y_train_weight, y_test_weight, y_train_volume, y_test_volume,
 y_train_object, y_test_object, y_train_length, y_test_length,
 y_train_width, y_test_width, y_train_height, y_test_height,
 y_train_parts, y_test_parts) = split_data

In [None]:
# For multiple input - GloVe
# The model is fed two inputs: unstructured and structured padded sequences.
EPOCHS = 50

# Single-output variant (weight only):
# history = model.fit(x=[unstruct_x_train, struct_x_train], y=y_train_weight, epochs=EPOCHS, validation_data=([unstruct_x_test, struct_x_test], y_test_weight), verbose=2)

# Targets keyed by output-layer name.
# NOTE(review): no 'parts_output' target is supplied here although the compile
# cell registers a loss under that name - confirm the multi-input model has no
# parts head.
output_names = ("material_output", "colour_output", 'weight_output', 'volume_output',
                'object_output', 'length_output', 'width_output', 'height_output')
train_targets = dict(zip(output_names, (y_train_material, y_train_colour, y_train_weight,
                                        y_train_volume, y_train_object, y_train_length,
                                        y_train_width, y_train_height)))
val_targets = dict(zip(output_names, (y_test_material, y_test_colour, y_test_weight,
                                      y_test_volume, y_test_object, y_test_length,
                                      y_test_width, y_test_height)))

history = model.fit(x=[unstruct_x_train, struct_x_train],
                    y=train_targets,
                    batch_size=32,
                    validation_data=([unstruct_x_test, struct_x_test], val_targets),
                    epochs=EPOCHS,
                    verbose=2)

In [None]:
# Set up directory to save best model
# NOTE: TF1 does not support saving best model
# Recreate ./best_model from scratch so nothing from an earlier run is left in it.
!rm -rf best_model
!mkdir best_model
path = './best_model'
# Checkpoint callback writing to `path`; it monitors the training 'loss' and,
# with save_best_only=True, saves only when that value improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(path, monitor='loss', save_best_only=True, verbose=1)


In [None]:
# NOTE (training with ELMo embedding):
# Fitting raised an error when the validation set size was not divisible by
# the batch size (possibly a TF1 issue). For example, with 200 test samples and
# a batch size of 32, training only ran without error on 192 test samples
# (192 % 32 == 0). The same trimming should be applied to x_train as well.

from sklearn.model_selection import train_test_split
path = './best_model'

# Used by the ELMo alternative below and by the single-output cells further down.
EPOCHS = 50

# Targets keyed by output-layer name, shared by the multi-output runs below.
output_names = ("material_output", "colour_output", 'weight_output', 'volume_output',
                'object_output', 'length_output', 'width_output', 'height_output',
                'parts_output')
train_targets = dict(zip(output_names, (y_train_material, y_train_colour, y_train_weight,
                                        y_train_volume, y_train_object, y_train_length,
                                        y_train_width, y_train_height, y_train_parts)))
val_targets = dict(zip(output_names, (y_test_material, y_test_colour, y_test_weight,
                                      y_test_volume, y_test_object, y_test_length,
                                      y_test_width, y_test_height, y_test_parts)))

# ######################### For ELMo embedding
# Single output:
# history = model.fit(x_train, y_train_parts,
#                     epochs=50, batch_size=batch_size,
#                     validation_data=(x_test[0:batch_size*9], y_test_parts[0:batch_size*9]), verbose=1)
#
# Multiple output (validation arrays trimmed to the first batch_size*9 rows):
# history = model.fit(x_train,
#                     y=train_targets,
#                     validation_data=(x_test[0:batch_size*9],
#                                      {name: labels[0:batch_size*9] for name, labels in val_targets.items()}),
#                     epochs=EPOCHS,
#                     batch_size=batch_size,
#                     verbose=1)

# ######################### For Glove embedding
# Single output:
# history = model.fit(x_train, y_train_parts, epochs=100, validation_data=(x_test, y_test_parts), verbose=2)

# Multiple output (active):
history = model.fit(x=x_train,
                    y=train_targets,
                    batch_size=32,
                    validation_data=(x_test, val_targets),
                    epochs=200,
                    # callbacks=[checkpoint],
                    verbose=1)


In [None]:
# Classification report for the volume head (output index 3 of the
# multi-output model); one-hot rows are reduced to class ids with argmax.
from sklearn.metrics import classification_report

pred = model.predict(x_test)
volume_true_ids = np.argmax(y_test_volume, axis=1)
volume_pred_ids = np.argmax(pred[3], axis=1)
report = classification_report(volume_true_ids, volume_pred_ids)
print(report)



# The cells below form the pipeline for training the single-output models with the different embeddings

In [None]:
# Single-output colour model. Validation uses only the first batch_size*9
# test rows.
val_rows = slice(0, batch_size * 9)
colour_history = colour_model.fit(
    x_train, y_train_colour,
    epochs=EPOCHS,
    batch_size=batch_size,
    validation_data=(x_test[val_rows], y_test_colour[val_rows]),
    verbose=1)

In [None]:
# ELMo alternative (validation trimmed to the first batch_size*9 rows):
# weight_history = weight_model.fit(x_train, y_train_weight,
#                     epochs=5, batch_size=batch_size,
#                     validation_data=(x_test[0:batch_size*9], y_test_weight[0:batch_size*9]), verbose=1)

# GloVe on structured text. A fresh ModelCheckpoint is created for this model:
# a callback instance keeps its best-so-far loss between fit() calls, so one
# shared across models could skip saving this model and leave another model's
# checkpoint at `path`.
weight_checkpoint = tf.keras.callbacks.ModelCheckpoint(path, monitor='loss', save_best_only=True, verbose=1)
weight_history = weight_model.fit(struct_x_train, y_train_weight,
                                  epochs=40,
                                  validation_data=(struct_x_test, y_test_weight),
                                  callbacks=[weight_checkpoint],
                                  verbose=2)


In [None]:
# Restore into weight_model the weights stored at the checkpoint location `path`.
weight_model.load_weights(path)

In [None]:
# ELMo alternative (validation trimmed to the first batch_size*9 rows):
# volume_history = volume_model.fit(x_train, y_train_volume,
#                     epochs=1, batch_size=batch_size,
#                     validation_data=(x_test[0:batch_size*9], y_test_volume[0:batch_size*9]), verbose=1)

# GloVe on structured text. A fresh ModelCheckpoint is created for this model:
# a callback instance keeps its best-so-far loss between fit() calls, so one
# shared across models could skip saving this model and leave another model's
# checkpoint at `path`.
volume_checkpoint = tf.keras.callbacks.ModelCheckpoint(path, monitor='loss', save_best_only=True, verbose=1)
volume_history = volume_model.fit(struct_x_train, y_train_volume,
                                  epochs=40,
                                  validation_data=(struct_x_test, y_test_volume),
                                  callbacks=[volume_checkpoint],
                                  verbose=2)


In [None]:
# Restore into volume_model the weights stored at the checkpoint location `path`.
volume_model.load_weights(path)

In [None]:
# Single-output object model. Validation uses only the first batch_size*9
# test rows.
val_rows = slice(0, batch_size * 9)
object_history = object_model.fit(
    x_train, y_train_object,
    epochs=EPOCHS,
    batch_size=batch_size,
    validation_data=(x_test[val_rows], y_test_object[val_rows]),
    verbose=1)

In [None]:
# ELMo alternative (validation trimmed to the first batch_size*9 rows):
# length_history = length_model.fit(x_train, y_train_length,
#                     epochs=1, batch_size=batch_size,
#                     validation_data=(x_test[0:batch_size*9], y_test_length[0:batch_size*9]), verbose=1)

# GloVe on structured text. A fresh ModelCheckpoint is created for this model:
# a callback instance keeps its best-so-far loss between fit() calls, so one
# shared across models could skip saving this model and leave another model's
# checkpoint at `path`.
length_checkpoint = tf.keras.callbacks.ModelCheckpoint(path, monitor='loss', save_best_only=True, verbose=1)
length_history = length_model.fit(struct_x_train, y_train_length,
                                  epochs=40,
                                  validation_data=(struct_x_test, y_test_length),
                                  callbacks=[length_checkpoint],
                                  verbose=2)


In [None]:
# Restore into length_model the weights stored at the checkpoint location `path`.
length_model.load_weights(path)

In [None]:
# ELMo alternative (validation trimmed to the first batch_size*9 rows):
# width_history = width_model.fit(x_train, y_train_width,
#                                 epochs=1, batch_size=batch_size,
#                                 validation_data=(x_test[0:batch_size*9], y_test_width[0:batch_size*9]), verbose=1)

# GloVe on structured text. A fresh ModelCheckpoint is created for this model:
# a callback instance keeps its best-so-far loss between fit() calls, so one
# shared across models could skip saving this model and leave another model's
# checkpoint at `path`.
width_checkpoint = tf.keras.callbacks.ModelCheckpoint(path, monitor='loss', save_best_only=True, verbose=1)
width_history = width_model.fit(struct_x_train, y_train_width,
                                epochs=40,
                                validation_data=(struct_x_test, y_test_width),
                                callbacks=[width_checkpoint],
                                verbose=2)


In [None]:
# Restore into width_model the weights stored at the checkpoint location `path`.
width_model.load_weights(path)

In [None]:
# ELMo alternative (validation trimmed to the first batch_size*9 rows):
# height_history = height_model.fit(x_train, y_train_height,
#                     epochs=5, batch_size=batch_size,
#                     validation_data=(x_test[0:batch_size*9], y_test_height[0:batch_size*9]), verbose=1)

# GloVe on structured text. A fresh ModelCheckpoint is created for this model:
# a callback instance keeps its best-so-far loss between fit() calls, so one
# shared across models could skip saving this model and leave another model's
# checkpoint at `path`.
height_checkpoint = tf.keras.callbacks.ModelCheckpoint(path, monitor='loss', save_best_only=True, verbose=1)
height_history = height_model.fit(struct_x_train, y_train_height,
                                  epochs=40,
                                  validation_data=(struct_x_test, y_test_height),
                                  callbacks=[height_checkpoint],
                                  verbose=2)


In [None]:
# Restore into height_model the weights stored at the checkpoint location `path`.
height_model.load_weights(path)

In [None]:
'''
Load the best model, this can only be used if you use GloVe embedding (TF2)
'''
# NOTE(review): in the multi-output GloVe fit cell `callbacks=[checkpoint]` is
# commented out, so `path` may not contain weights saved for `model` - confirm
# the checkpoint was enabled before running this cell.
model.load_weights(path)

In [None]:
# Predictions for the held-out inputs; the cells below index `pred` per output head.
pred = model.predict(x_test)

In [None]:
# Invert each label->index table so a class index can be mapped back to its
# label (index becomes the key, label the value).
idx_to_weight = dict(zip(weight_to_idx.values(), weight_to_idx.keys()))
idx_to_volume = dict(zip(volume_to_idx.values(), volume_to_idx.keys()))
idx_to_length = dict(zip(length_to_idx.values(), length_to_idx.keys()))
idx_to_width = dict(zip(width_to_idx.values(), width_to_idx.keys()))
idx_to_height = dict(zip(height_to_idx.values(), height_to_idx.keys()))

In [None]:
# Prepare ground truth (from onehot back to actual measurement)
def get_numeric_true_metric(onehot, idx_dict):
  """Decode one-hot rows into integer measurements.

  For every row of `onehot`, the position of its maximum is looked up in
  `idx_dict` and the stored label is converted with int().

  Returns:
    A list with one int per row, in row order.
  """
  return [int(idx_dict[np.argmax(row)]) for row in onehot]
# Decode the one-hot test labels of every numeric property, pairing each label
# array with its index->label table.
true_weight, true_volume, true_length, true_width, true_height = (
    get_numeric_true_metric(labels, lookup)
    for labels, lookup in ((y_test_weight, idx_to_weight),
                           (y_test_volume, idx_to_volume),
                           (y_test_length, idx_to_length),
                           (y_test_width, idx_to_width),
                           (y_test_height, idx_to_height))
)

In [None]:
# Now look at the predicted values.
# Output order of the multi-output model:
#   Material -> Colour -> Weight -> Volume -> Object -> Length -> Width -> Height
# so the numeric properties sit at indices 2, 3, 5, 6 and 7 of `pred`.
#
# NOTE: this needs to be changed if the model is single output; `pred` is then
# the head itself, for example
#   pred_weight, pred_weight_val = pred, []
#   pred_volume, pred_volume_val = pred, []
#   pred_length, pred_length_val = pred, []
#   pred_width, pred_width_val = pred, []
#   pred_height, pred_height_val = pred, []
pred_weight, pred_volume, pred_length, pred_width, pred_height = (
    pred[head] for head in (2, 3, 5, 6, 7)
)
# Empty lists that convert_to_number fills with the decoded values.
pred_weight_val, pred_volume_val, pred_length_val = [], [], []
pred_width_val, pred_height_val = [], []

def convert_to_number(pred, val, idx_dict):
  """Append the decoded integer value of every prediction row to `val`.

  For each row of `pred`, the position of its maximum is looked up in
  `idx_dict` and the label is converted with int(). `val` is modified in
  place; nothing is returned.
  """
  val.extend(int(idx_dict[np.argmax(scores)]) for scores in pred)
# Fill each *_val list with the decoded predictions of its property.
for scores, sink, lookup in ((pred_weight, pred_weight_val, idx_to_weight),
                             (pred_volume, pred_volume_val, idx_to_volume),
                             (pred_length, pred_length_val, idx_to_length),
                             (pred_width, pred_width_val, idx_to_width),
                             (pred_height, pred_height_val, idx_to_height)):
  convert_to_number(scores, sink, lookup)


In [None]:
def get_abs_error(true, pred):
  """Return the element-wise absolute differences |true[i] - pred[i]|.

  The result has one entry per element of `true`; `pred` is indexed by
  position, so it must be at least as long as `true`.
  """
  return [abs(actual - pred[pos]) for pos, actual in enumerate(true)]

# Absolute error per test sample, as an array, for every numeric property.
(weight_abs_error, volume_abs_error, length_abs_error,
 width_abs_error, height_abs_error) = (
    np.asarray(get_abs_error(truth, guess))
    for truth, guess in ((true_weight, pred_weight_val),
                         (true_volume, pred_volume_val),
                         (true_length, pred_length_val),
                         (true_width, pred_width_val),
                         (true_height, pred_height_val))
)

# Print the mean of each error array with its unit suffix.
for template, errors in (('Mean absolute error of weight: {}g', weight_abs_error),
                         ('Mean absolute error of volume: {}ml', volume_abs_error),
                         ('Mean absolute error of length: {}cm', length_abs_error),
                         ('Mean absolute error of width : {}cm', width_abs_error),
                         ('Mean absolute error of height: {}cm', height_abs_error)):
  print(template.format(np.mean(errors)))

In [None]:
!mkdir experiment4_glove

In [None]:
# Apply the "ggplot" style sheet to the figures created after this cell.
plt.style.use("ggplot")


In [None]:
def plot_abs_error(abs_error, property, out_dir='./experiment4_glove'):
  """Plot and save a normalized histogram of the non-zero absolute errors.

  Args:
    abs_error: absolute errors, one per sample; zero entries are excluded
      from the histogram.
    property: property name used in the title and as the PNG file name.
    out_dir: directory the figure is written to. It is created if missing;
      the default is the location that was previously hard-coded.

  The figure is saved as `<out_dir>/<property>.png`.
  """
  import os  # local import so the cell does not depend on a notebook-level import

  abs_error = np.asarray(abs_error)
  nonzero_error = abs_error[abs_error != 0]
  # Equal per-sample weights that sum to 1, so bar heights are fractions of
  # the non-zero errors rather than raw counts.
  weights = np.ones_like(nonzero_error) / len(nonzero_error)

  fig = plt.figure(figsize=(10, 4), dpi=200)
  ax1 = fig.add_subplot(1, 1, 1)
  ax1.hist(nonzero_error, weights=weights, bins=200)
  ax1.set_xlabel('Absolute Error')
  ax1.set_ylabel('Normalized Frequency')
  ax1.set_title(f'Absolute error on {property} prediction')

  # savefig fails when the target directory is missing, so create it first.
  os.makedirs(out_dir, exist_ok=True)
  fig.savefig(os.path.join(out_dir, f'{property}.png'))

# One histogram per numeric property, saved under the property's name.
for errors, label in ((weight_abs_error, 'weight'),
                      (volume_abs_error, 'volume'),
                      (length_abs_error, 'length'),
                      (width_abs_error, 'width'),
                      (height_abs_error, 'height')):
  plot_abs_error(errors, label)


In [None]:
!zip -r download6.zip ./experiment6_glove