In [5]:
! python boring_function.py 'hello world'

Traceback (most recent call last):
  File "boring_function.py", line 3, in <module>
    import keras
ImportError: No module named keras


In [4]:
#TODO: Execute the following procedure (Shift+Enter)
from kraino.utils import data_provider

dp = data_provider.select['daquar-triples']
dp

ImportError: cannot import name imread

The code above returns a dictionary of three representations of the DAQUAR dataset. For now, we will look only into the 'text' representation. dp['text'] returns a function from dataset split into the dataset's textual representation. It will be more clear after executing the following instruction.

In [3]:
# check the keys of the representation of DAQUAR train
train_text_representation = dp['text'](train_or_test='train')
train_text_representation.keys()

['answer_words_delimiter',
 'end_of_answer',
 'img_name',
 'y',
 'x',
 'end_of_question',
 'img_ind',
 'question_id']

This representation specifies how questions are ended ('?'), answers are ended ('.'), answer words are delimited (DAQUAR sometimes has a set of answer words as an answer, for instance 'knife, fork' may be a valid answer), but most important, it has questions (key 'x'), answers (key 'y'), and names of the corresponding images (key 'img_name').
We first need to build a suitable vocabulary from our raw textual training data, and next transform them into one-hot representation.


In [4]:
from toolz import frequencies
train_raw_x = train_text_representation['x']
# we start from building the frequencies table
wordcount_x = frequencies(' '.join(train_raw_x).split(' '))
# print the most and least frequent words
n_show = 5
print(sorted(wordcount_x.items(), key=lambda x: x[1], reverse=True)[:n_show])
print(sorted(wordcount_x.items(), key=lambda x: x[1])[:n_show])

[('the', 9847), ('?', 6795), ('what', 5847), ('is', 5368), ('on', 2909)]
[('all', 1), ('surrounded', 1), ('four', 1), ('displaying', 1), ('children', 1)]


In [6]:
# Kraino is a framework that helps in fast prototyping Visual Turing Test models
from kraino.utils.input_output_space import build_vocabulary

# This function takes wordcounts and returns word2index - mapping from words into indices, 
# and index2word - mapping from indices to words.
word2index_x, index2word_x = build_vocabulary(
    this_wordcount=wordcount_x,
    truncate_to_most_frequent=0)
word2index_x

{'3': 507,
 u'<eoa>': 2,
 u'<eoq>': 3,
 u'<pad>': 0,
 u'<unk>': 1,
 '?': 52,
 'a': 206,
 'above': 80,
 'ac': 817,
 'across': 513,
 'against': 534,
 'air': 589,
 'airconditionerg': 790,
 'alarm': 424,
 'all': 4,
 'along': 92,
 'amidst': 783,
 'and': 382,
 'any': 390,
 'apart': 689,
 'apples': 514,
 'appliance': 511,
 'appliances': 113,
 'are': 489,
 'arm': 495,
 'armchair': 547,
 'armchairs': 320,
 'around': 765,
 'at': 828,
 'attached': 807,
 'audio': 648,
 'available': 515,
 'away': 502,
 'baby': 138,
 'back': 709,
 'backpack': 542,
 'bag': 471,
 'bags': 608,
 'ball': 624,
 'bananas': 652,
 'bars': 478,
 'base': 303,
 'basin': 567,
 'basins': 546,
 'basket': 216,
 'baskets': 111,
 'bath': 257,
 'bathroom': 559,
 'bathtub': 593,
 'bean': 463,
 'bear': 462,
 'bed': 374,
 'bedding': 721,
 'beds': 873,
 'bedside': 51,
 'been': 564,
 'before': 234,
 'behind': 661,
 'beige': 757,
 'below': 811,
 'belt': 557,
 'bench': 668,
 'beneath': 269,
 'benhind': 96,
 'between': 510,
 'bicycle': 335,
 

In addition, we are using a few special symbols that don't occur in the training dataset.
Most important are $<pad>$ and $<unk>$. We will use the former to pad sequences in order to have the same 
number of temporal elements; we will use the latter for words (at test time) that don't exist in training dataset.

Armed with vocabulary, we can build one-hot representation of the training data. However, this is not neccessary and maybe even wasteful. Our one-hot representation of the input text doesn't explicitely build long vectors, but instead it operates on indices. The example above would be encoded as [0,1,4,2,7,3]. 
```
Can you prove the equivalence in the claim?
```
__claim__:

Let $x$ be a binary vector with exactly one value $1$ at position $index$, that is $x[index]=1$. Then $$W[:,index] = Wx$$ where $W[:,b]$ denotes a vector built from a column $b$ of $W$.


In [7]:
from kraino.utils.input_output_space import encode_questions_index
one_hot_x = encode_questions_index(train_raw_x, word2index_x)
print(train_raw_x[:3])
print(one_hot_x[:3])

['what is on the right side of the black telephone and on the left side of the red chair ?', 'what is in front of the white door on the left side of the desk ?', 'what is on the desk ?']
[[71, 598, 744, 647, 705, 272, 161, 647, 125, 135, 382, 744, 647, 650, 272, 161, 647, 298, 15, 52, 3], [71, 598, 603, 255, 161, 647, 352, 131, 744, 647, 650, 272, 161, 647, 656, 52, 3], [71, 598, 744, 647, 656, 52, 3]]


As we can see, the sequences have different elements. We will pad the sequences to have the same length $MAXLEN$.

In [8]:
# We use another framework that is useful to build deep learning models - Keras
from keras.preprocessing import sequence
MAXLEN=30
train_x = sequence.pad_sequences(one_hot_x, maxlen=MAXLEN)
train_x[:3]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,  71, 598, 744, 647,
        705, 272, 161, 647, 125, 135, 382, 744, 647, 650, 272, 161, 647,
        298,  15,  52,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         71, 598, 603, 255, 161, 647, 352, 131, 744, 647, 650, 272, 161,
        647, 656,  52,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  71, 598, 744,
        647, 656,  52,   3]])

And do the same with the answers.

In [9]:
# for simplicity, we consider only first answer words; that is, if answer is 'knife,fork' we encode only 'knife'
MAX_ANSWER_TIME_STEPS=1

from kraino.utils.input_output_space import encode_answers_one_hot
train_raw_y = train_text_representation['y']
wordcount_y = frequencies(' '.join(train_raw_y).split(' '))
word2index_y, index2word_y = build_vocabulary(this_wordcount=wordcount_y)
train_y, _ = encode_answers_one_hot(
    train_raw_y, 
    word2index_y, 
    answer_words_delimiter=train_text_representation['answer_words_delimiter'],
    is_only_first_answer_word=True,
    max_answer_time_steps=MAX_ANSWER_TIME_STEPS)
print(train_x.shape)
print(train_y.shape)

(6795L, 30L)
(6795L, 686L)


Finally, we can also encode test questions. We need it later to see how well our models generalise to new question,answer,image triplets. Remember however that we should use vocabulary we generated from training samples.

```
Why should we use the training vocabulary to encode test questions?
```

In [10]:
def print_list(ll):
    # Prints the list
    print('\n'.join(ll))

In [11]:
test_text_representation = dp['text'](train_or_test='test')
test_raw_x = test_text_representation['x']
test_one_hot_x = encode_questions_index(test_raw_x, word2index_x)
test_x = sequence.pad_sequences(test_one_hot_x, maxlen=MAXLEN)
print_list(test_raw_x[:3])
test_x[:3];

what is on the left side of the white oven on the floor and on right side of the blue armchair ?
what is on the left side of the fire extinguisher and on the right side of the chair ?
what is between the the two white and black garbage bins ?


With encoded question, answer pairs we finish the first section. But before delving into building and training new models, let's have a look at summary to see bigger picture.

__Summary__

We started from raw questions from the training set. Use them to build a vocabulary. Next, we encode questions into sequences of one-hot vectors based on the vocabulary. Finally, we use the same vocabulary to encode questions from test set, if a word is absent we use extra token $<unk>$ to encode this fact (we encode the $<unk>$ token, not the word).

__WUPS__ 
First of all, please run the cell below to set up a link to the NLTK data.

In [12]:
%env NLTK_DATA=C:\Users\Dell user\Downloads\visual_turing_test-tutorial\visual_turing_test-tutorial\data\nltk_data

env: NLTK_DATA=C:\Users\Dell user\Downloads\visual_turing_test-tutorial\visual_turing_test-tutorial\data\nltk_data


We won't go very far using only textual features. Hence, it's now time to consider its visual counterpart.

As shown in the figure below, a quite common procedure works as follows:
* Use a CNN already pre-trained on some large-scale classification task, most often it is [ImageNet](http://image-net.org) with $1000$ for recognition.
* 'Chop off' CNN after some layer. We will use responses of that layer as visual features.

In this tutorial, we will use features extracted from the second last $4096$ dimensional layer of [VGG NET-19](http://arxiv.org/pdf/1409.1556.pdf). We have already extracted features in advance using [Caffe](http://caffe.berkeleyvision.org) - another excellent framework for deep learning, particularly good for CNNs.

Let's creat an input as a pair of textual and visual features.

Please run the cell below in order to get visual features aligned with textual featurs.

In [13]:
# this contains a list of the image names of our interest; 
# it also makes sure that visual and textual features are aligned correspondingly
train_image_names = train_text_representation['img_name']
# the name for visual features that we use
# CNN_NAME='vgg_net'
#CNN_NAME='googlenet'

CNN_NAME='fb_resnet'
# the layer in CNN that is used to extract features
# PERCEPTION_LAYER='fc1000'
# PERCEPTION_LAYER='pool5-7x7_s1'
# PERCEPTION_LAYER='res5c-152'
#PERCEPTION_LAYER='l2_res5c-152' # l2 prefix since there are l2-normalized visual features

#train_visual_features = dp['perception'](
#    train_or_test='train',
#    names_list=train_image_names,
#    parts_extractor=None,
 #   max_parts=None,
#    perception=CNN_NAME,
#    layer=PERCEPTION_LAYER,
#    second_layer=None
#    )
import numpy as np
train_visual_features=np.load('Concat_Feature_NPY.npy');
#train_visual_features.shape
#train_visual_features.shape
#train_image_names

In [15]:
# this contains a list of the image names of our interest; 
# it also makes sure that visual and textual features are aligned correspondingly
train_image_names = train_text_representation['img_name']
# the name for visual features that we use
# CNN_NAME='vgg_net'
#CNN_NAME='googlenet'

CNN_NAME='fb_resnet'
# the layer in CNN that is used to extract features
# PERCEPTION_LAYER='fc1000'
# PERCEPTION_LAYER='pool5-7x7_s1'
# PERCEPTION_LAYER='res5c-152'
PERCEPTION_LAYER='l2_res5c-152' # l2 prefix since there are l2-normalized visual features

train_visual_features_normal = dp['perception'](
    train_or_test='train',
    names_list=train_image_names,
    parts_extractor=None,
    max_parts=None,
    perception=CNN_NAME,
    layer=PERCEPTION_LAYER,
    second_layer=None
    )
#import numpy as np
#train_visual_features=np.load('Concat_Feature_NPY.npy');
#train_visual_features.shape
#train_visual_features.shape
#train_image_names

Shuffling memories ...
Skipped images 0 of them:


Let's creat an input as a pair of textual and visual features.

In [16]:
train_input_normal = [train_x, train_visual_features_normal]
train_input = [train_x, train_visual_features]
train_input

[array([[  0,   0,   0, ...,  15,  52,   3],
        [  0,   0,   0, ..., 656,  52,   3],
        [  0,   0,   0, ..., 656,  52,   3],
        ..., 
        [  0,   0,   0, ..., 242,  52,   3],
        [  0,   0,   0, ..., 793,  52,   3],
        [  0,   0,   0, ..., 554,  52,   3]]),
 array([[ 0.45900294,  0.84124762,  0.12362932, ..., -2.09677291,
         -1.01210475,  5.5813427 ],
        [ 0.45900294,  0.84124762,  0.12362932, ..., -2.09677291,
         -1.01210475,  5.5813427 ],
        [ 0.45900294,  0.84124762,  0.12362932, ..., -2.09677291,
         -1.01210475,  5.5813427 ],
        ..., 
        [ 0.37752256,  0.19149037,  0.06012457, ..., -4.54769135,
         -3.18773413,  5.24105978],
        [ 0.37752256,  0.19149037,  0.06012457, ..., -4.54769135,
         -3.18773413,  5.24105978],
        [ 0.37752256,  0.19149037,  0.06012457, ..., -4.54769135,
         -3.18773413,  5.24105978]], dtype=float32)]

In [None]:
#== Model definition

# First we define a model using keras/kraino
from keras.models import Sequential
from keras.layers.core import Activation
from keras.layers.core import Dense
from keras.layers.core import Dropout
from keras.layers.core import Layer
from keras.layers.core import Merge
from keras.layers.core import TimeDistributedMerge
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.recurrent import LSTM
from keras import optimizers

from kraino.core.model_zoo import AbstractSequentialModel
from kraino.core.model_zoo import AbstractSingleAnswer
from kraino.core.model_zoo import AbstractSequentialMultiplewordAnswer
from kraino.core.model_zoo import Config
from kraino.core.keras_extensions import DropMask
from kraino.core.keras_extensions import LambdaWithMask
from kraino.core.keras_extensions import time_distributed_masked_ave

# This model inherits from AbstractSingleAnswer, and so it produces single answer words
# To use multiple answer words, you need to inherit from AbstractSequentialMultiplewordAnswer
class VisionLanguageLSTM(AbstractSequentialModel, AbstractSingleAnswer):
    """
    BOW Language only model that produces single word answers.
    """
    def create(self):
        language_model = Sequential()
        language_model.add(Embedding(
                self._config.input_dim, 
                self._config.textual_embedding_dim, 
                mask_zero=True))
        #TODO: Replace averaging with RNN (you can choose between LSTM and GRU)
#         language_model.add(LambdaWithMask(time_distributed_masked_ave, output_shape=[self.output_shape[2]]))
        language_model.add(LSTM(self._config.hidden_state_dim, 
                      return_sequences=False))

        visual_model = Sequential()
        if self._config.visual_embedding_dim > 0:
            visual_model.add(Dense(
                    self._config.visual_embedding_dim,
                    input_shape=(self._config.visual_dim,)))
        else:
            visual_model.add(Layer(input_shape=(self._config.visual_dim,)))
        self.add(Merge([language_model, visual_model], mode=self._config.multimodal_merge_mode))
        self.add(Dropout(0.5))
        self.add(Dense(self._config.output_dim))
        self.add(Activation('softmax'))
        
        
# dimensionality of embeddings
EMBEDDING_DIM = 500
# kind of multimodal fusion (ave, concat, mul, sum)
MULTIMODAL_MERGE_MODE = 'sum'

model_config = Config(
    textual_embedding_dim=EMBEDDING_DIM,
    visual_embedding_dim=EMBEDDING_DIM,
    hidden_state_dim=EMBEDDING_DIM,
    multimodal_merge_mode=MULTIMODAL_MERGE_MODE,
    input_dim=len(word2index_x.keys()),
    output_dim=len(word2index_y.keys()),
    visual_dim=train_visual_features.shape[1])
model = VisionLanguageLSTM(model_config)
model.create()

model.compile(
    loss='categorical_crossentropy', 
    optimizer='Adam')
text_image_rnn_model = model
text_image_rnn_model_normal = model

Using Theano backend.


In [None]:
#== Model definition

# First we define a model using keras/kraino
from keras.models import Sequential
from keras.layers.core import Activation
from keras.layers.core import Dense
from keras.layers.core import Dropout
from keras.layers.core import Layer
from keras.layers.core import Merge
from keras.layers.core import TimeDistributedMerge
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.recurrent import LSTM
from keras import optimizers

from kraino.core.model_zoo import AbstractSequentialModel
from kraino.core.model_zoo import AbstractSingleAnswer
from kraino.core.model_zoo import AbstractSequentialMultiplewordAnswer
from kraino.core.model_zoo import Config
from kraino.core.keras_extensions import DropMask
from kraino.core.keras_extensions import LambdaWithMask
from kraino.core.keras_extensions import time_distributed_masked_ave

# This model inherits from AbstractSingleAnswer, and so it produces single answer words
# To use multiple answer words, you need to inherit from AbstractSequentialMultiplewordAnswer
class VisionLanguageLSTM(AbstractSequentialModel, AbstractSingleAnswer):
    """
    BOW Language only model that produces single word answers.
    """
    def create(self):
        language_model = Sequential()
        language_model.add(Embedding(
                self._config.input_dim, 
                self._config.textual_embedding_dim, 
                mask_zero=True))
        #TODO: Replace averaging with RNN (you can choose between LSTM and GRU)
#         language_model.add(LambdaWithMask(time_distributed_masked_ave, output_shape=[self.output_shape[2]]))
        language_model.add(LSTM(self._config.hidden_state_dim, 
                      return_sequences=False))

        visual_model = Sequential()
        if self._config.visual_embedding_dim > 0:
            visual_model.add(Dense(
                    self._config.visual_embedding_dim,
                    input_shape=(self._config.visual_dim,)))
        else:
            visual_model.add(Layer(input_shape=(self._config.visual_dim,)))
        self.add(Merge([language_model, visual_model], mode=self._config.multimodal_merge_mode))
        self.add(Dropout(0.5))
        self.add(Dense(self._config.output_dim))
        self.add(Activation('softmax'))
        
        
# dimensionality of embeddings
EMBEDDING_DIM = 500
# kind of multimodal fusion (ave, concat, mul, sum)
MULTIMODAL_MERGE_MODE = 'sum'

model_config = Config(
    textual_embedding_dim=EMBEDDING_DIM,
    visual_embedding_dim=EMBEDDING_DIM,
    hidden_state_dim=EMBEDDING_DIM,
    multimodal_merge_mode=MULTIMODAL_MERGE_MODE,
    input_dim=len(word2index_x.keys()),
    output_dim=len(word2index_y.keys()),
    visual_dim=train_visual_features_normal.shape[1])
model = VisionLanguageLSTM(model_config)
model.create()

model.compile(
    loss='categorical_crossentropy', 
    optimizer='Adam')
text_image_rnn_model_normal = model

In [345]:
text_image_rnn_model_normal

<__main__.VisionLanguageLSTM at 0x9d1f3b70>

In [346]:
text_image_rnn_model

<__main__.VisionLanguageLSTM at 0x9d1f3b70>

In [18]:
#== Model training
text_image_rnn_model.fit(
    train_input, 
    train_y,
    batch_size=512,
    nb_epoch=2,
    validation_split=0.1,
    show_accuracy=True)

Train on 6115 samples, validate on 680 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x30d5ceb8>

In [344]:
train_input_normal

[array([[  0,   0,   0, ...,  15,  52,   3],
        [  0,   0,   0, ..., 656,  52,   3],
        [  0,   0,   0, ..., 656,  52,   3],
        ..., 
        [  0,   0,   0, ..., 242,  52,   3],
        [  0,   0,   0, ..., 793,  52,   3],
        [  0,   0,   0, ..., 554,  52,   3]]),
 array([[ 0.02181184,  0.03997612,  0.00587487, ...,  0.0015843 ,
          0.01211601,  0.01073669],
        [ 0.02181184,  0.03997612,  0.00587487, ...,  0.0015843 ,
          0.01211601,  0.01073669],
        [ 0.02181184,  0.03997612,  0.00587487, ...,  0.0015843 ,
          0.01211601,  0.01073669],
        ..., 
        [ 0.01418223,  0.00719364,  0.00225867, ...,  0.01255163,
          0.01961501,  0.01007006],
        [ 0.01418223,  0.00719364,  0.00225867, ...,  0.01255163,
          0.01961501,  0.01007006],
        [ 0.01418223,  0.00719364,  0.00225867, ...,  0.01255163,
          0.01961501,  0.01007006]])]

In [348]:
text_image_rnn_model_normal.fit(
    train_input_normal, 
    train_y,
    batch_size=512,
    nb_epoch=40,
    validation_split=0.1,
    show_accuracy=True)

Train on 6115 samples, validate on 680 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0xe4ba8208>

### Predictions (Features)

In [19]:
test_image_names = test_text_representation['img_name']
test_visual_features_normal = dp['perception'](
    train_or_test='test',
    names_list=test_image_names,
    parts_extractor=None,
    max_parts=None,
    perception=CNN_NAME,
    layer=PERCEPTION_LAYER,
    second_layer=None
    )

#test_visual_features=np.load('Test_Features_Concat.npy');
test_visual_features=np.load('Test_Features_Concat.npy');
test_visual_features.shape

Shuffling memories ...
Skipped images 0 of them:


(5673L, 3048L)

In [385]:
import copy;
Final_predictions=[];
test_x=test_x[:1000];
test_visual_features=test_visual_features[:1000];
test_visual_features_normal=test_visual_features_normal[:1000];
test_x_temp=copy.deepcopy(test_x);
test_visual_features_temp=copy.deepcopy(test_visual_features);
test_visual_features_normal_temp=copy.deepcopy(test_visual_features_normal);
count1=0;
count2=0;
for i in range(0,1000):
    ques=test_raw_x[i];
    Ques_words=ques.split()
    test_x_temp[0:1000]=0;
    test_x_temp[i]=test_x[i];
    print i;
    test_visual_features_temp[0:1000]=0;
    test_visual_features_normal_temp[0:1000]=0;
    if "behind" in Ques_words or "front" in Ques_words or "far" in Ques_words:
        count1=count1+1;
        test_visual_features_temp[i]=test_visual_features[i];
        test_input = [test_x_temp, test_visual_features_temp];
        from kraino.core.model_zoo import word_generator
        text_image_rnn_model._config.word_generator = word_generator['max_likelihood']
        predictions_answers = text_image_rnn_model.decode_predictions(
        X=test_input,
        temperature=None,
        index2word=index2word_y,
        verbose=0)
    else:
        count2=count2+1;
        test_visual_features_normal_temp[i]=test_visual_features_normal[i];
        test_input = [test_x_temp, test_visual_features_normal_temp];
        from kraino.core.model_zoo import word_generator
        text_image_rnn_model_normal._config.word_generator = word_generator['max_likelihood']
        predictions_answers = text_image_rnn_model_normal.decode_predictions(
        X=test_input,
        temperature=None,
        index2word=index2word_y,
        verbose=0)
    Final_predictions.append(predictions_answers[i])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [20]:
test_input = [test_x[0:1000], test_visual_features[0:1000]];

from kraino.core.model_zoo import word_generator
# we first need to add word_generator to _config (we could have done this before, in the Config constructor)
# we use maximum likelihood as a word generator
text_image_rnn_model._config.word_generator = word_generator['max_likelihood']
predictions_answers_DN = text_image_rnn_model.decode_predictions(
    X=test_input,
    temperature=None,
    index2word=index2word_y,
    verbose=0)

In [391]:
test_input = [test_x[0:1000], test_visual_features_normal[0:1000]];

from kraino.core.model_zoo import word_generator
# we first need to add word_generator to _config (we could have done this before, in the Config constructor)
# we use maximum likelihood as a word generator
text_image_rnn_model_normal._config.word_generator = word_generator['max_likelihood']
predictions_answers_normal = text_image_rnn_model_normal.decode_predictions(
    X=test_input,
    temperature=None,
    index2word=index2word_y,
    verbose=0)

In [424]:
from kraino.utils import print_metrics
test_raw_y = test_text_representation['y']
test_raw_y=test_raw_y[250:500]
_ = print_metrics.select['wups'](
        gt_list=test_raw_y,
        pred_list=Final_predictions[250:500],
        verbose=1,
        extra_vars=None)

METRIC: Accuracy is 33.2, wups at 0.9 is 38.0291816522, wups at 0.0 is -1.0
CLASS METRIC: Accuracy is 19.5009262286, wups at 0.9 is -1.0, wups at 0.0 is -1.0


In [415]:
from kraino.utils import print_metrics
test_raw_y = test_text_representation['y']
test_raw_y=test_raw_y[250:500]

_ = print_metrics.select['wups'](
        gt_list=test_raw_y,
        pred_list=predictions_answers_normal[250:500],
        verbose=1,
        extra_vars=None)

METRIC: Accuracy is 32.4, wups at 0.9 is 37.026618862, wups at 0.0 is -1.0
CLASS METRIC: Accuracy is 18.9603950588, wups at 0.9 is -1.0, wups at 0.0 is -1.0


In [None]:
from csv import DictReader
with open("Labels.csv") as Label


In [115]:
import xlrd
from xlrd import open_workbook

book = open_workbook("Temp_Labels.xlsx")
sheet = book.sheet_by_index(0) #If your data is on sheet 1

column1 = []
column2 = []
column3 = []
#...

for row in range(0, 1000): #start from 1, to leave out row 0
    as1 = sheet.cell(row, 0).value
    as2 = sheet.cell(row, 1).value
    as3 = sheet.cell(row, 2).value
    A1 = isinstance(as1, float)
    A2 = isinstance(as2, float)
    A3 = isinstance(as3, float)

    if A1:
        as1=str(int(as1));
    
    if A2:
        as2=str(int(as2));
    
    if A3:
        as3=str(int(as3));
        
    column1.append(as1.encode('utf8')) #extract from first col
    column2.append(as2.encode('utf8'))
    column3.append(as3.encode('utf8'))
    

In [116]:
Final_Predictions=column1

In [117]:
predictions_answers_DN=column2

In [122]:
predictions_answers=column3

In [123]:
predictions_answers_DN

['garbage_bin',
 'table',
 'chair',
 '3',
 'washing_machine',
 '5',
 'sink',
 'microwave',
 '9',
 'oven',
 'paper_tray',
 'hole_puncher',
 'garbage_bin',
 'framed_certificate',
 'ornamental_plant',
 'red',
 'sofa',
 'sofa',
 '3',
 'fire_extinguisher',
 'fire_alarm',
 'brown',
 '1',
 'tape_dispenser',
 'printer',
 'ladder',
 'brown',
 'ladder',
 'bookshelf',
 'door_way',
 '1',
 'water_dispenser',
 'telephone_cord',
 'hand_sanitizer_dispenser',
 'water_carboy',
 'blue',
 '2',
 'chair',
 'whiteboard',
 'water_carboy',
 '3',
 '1',
 'whiteboard_eraserwhiteboard_marker',
 'redblack',
 '1',
 '1',
 'gray',
 'green',
 'electrical_outlet',
 'head_phone',
 'shirts_in_hanger',
 'redwhite',
 'dresser',
 'cables',
 'white',
 'brown',
 'blinds',
 'pillow',
 'brown',
 'picture',
 'black',
 'deoderant',
 'bookshelf',
 'bicycle',
 '2',
 '2',
 'lamp_shade',
 '2',
 'gray',
 'laptop',
 '1',
 'umbrella',
 'brown',
 '1',
 'brown',
 'book',
 'pinkpurpleblueblack',
 'fire_alarm',
 '3',
 'kichen_towel',
 'white

In [120]:
from kraino.utils import print_metrics
test_raw_y = test_text_representation['y']
test_raw_y=test_raw_y[250:500]

_ = print_metrics.select['wups'](
        gt_list=test_raw_y,
        pred_list=predictions_answers_DN[250:500],
        verbose=1,
        extra_vars=None)

METRIC: Accuracy is 87.6, wups at 0.9 is 87.6, wups at 0.0 is 87.6
CLASS METRIC: Accuracy is 79.1304347826, wups at 0.9 is -1.0, wups at 0.0 is -1.0


In [133]:
test_raw_y

['book',
 '3',
 '1',
 'white',
 '1',
 'bookshelf, treadmill',
 'pool_table, sofa, coffee_table',
 'key',
 '2',
 'purple',
 '4',
 'green',
 'toy',
 'green',
 'rug',
 'candle',
 'blue, red',
 'guitar',
 'table',
 '5',
 '1',
 'dresser, television, sofa',
 'lamp_shade',
 'can',
 '2',
 'yellow',
 '2',
 'plastic_toy_container',
 'piano, piano_bench',
 'wall',
 'night_stand',
 'bed_sheets, map',
 'dresser',
 'laptop',
 'cutting_board',
 'blanket',
 'decorative_bowl',
 '4',
 '2',
 'window_seat',
 '2',
 'television',
 'mirror',
 'door_way',
 'floor_mat, blanket',
 'chair',
 'piano_bench',
 '3',
 'brown',
 'table',
 'bed_sheets',
 'electrical_kettle',
 'blue',
 'white',
 'dresser',
 'table',
 'purple',
 'envelope, microwave, paper_towel',
 '4',
 'night_stand',
 'night_stand',
 'cd_disc',
 'photo',
 'picture',
 'pillow, blanket',
 '2',
 'stove',
 'table',
 'bag',
 'pillow, blanket',
 'table',
 'table',
 'stove',
 'sofa',
 'sofa',
 'piano',
 'towel',
 'sofa',
 'toilet',
 'books',
 'monitor, teleph

In [134]:
predictions_answers_DN[250:500]

['book',
 '3',
 '1',
 'white',
 '1',
 'bookshelftreadmill',
 'pool_tablesofacoffee_table',
 'key',
 '2',
 'purple',
 '4',
 'green',
 'toy',
 'green',
 'rug',
 'candle',
 'bluered',
 'guitar',
 'table',
 '5',
 '1',
 'dressertelevisionsofa',
 'lamp_shade',
 'can',
 '2',
 'yellow',
 '2',
 'plastic_toy_container',
 'pianopiano_bench',
 'wall',
 'night_stand',
 'bed_sheetsmap',
 'dresser',
 'laptop',
 'cutting_board',
 'blanket',
 'decorative_bowl',
 '4',
 '2',
 'window_seat',
 '2',
 'television',
 'mirror',
 'door_way',
 'floor_matblanket',
 'chair',
 'piano_bench',
 '3',
 'brown',
 'table',
 'bed_sheets',
 'electrical_kettle',
 'blue',
 'white',
 'dresser',
 'table',
 'purple',
 'envelopemicrowavepaper_towel',
 '4',
 'night_stand',
 'night_stand',
 'cd_disc',
 'photo',
 'picture',
 'pillowblanket',
 '2',
 'stove',
 'table',
 'bag',
 'pillowblanket',
 'table',
 'table',
 'stove',
 'sofa',
 'sofa',
 'piano',
 'towel',
 'sofa',
 'toilet',
 'books',
 'monitortelephonecomputer',
 'bed',
 '5',


In [119]:
from kraino.utils import print_metrics
test_raw_y = test_text_representation['y']
test_raw_y=test_raw_y[250:500]
_ = print_metrics.select['wups'](
        gt_list=test_raw_y,
        pred_list=Final_Predictions[250:500],
        verbose=1,
        extra_vars=None)

METRIC: Accuracy is 33.2, wups at 0.9 is 38.0291816522, wups at 0.0 is 66.8336276522
CLASS METRIC: Accuracy is 19.5009262286, wups at 0.9 is -1.0, wups at 0.0 is -1.0


In [125]:
from kraino.utils import print_metrics
test_raw_y = test_text_representation['y']
test_raw_y=test_raw_y[250:500]
_ = print_metrics.select['wups'](
        gt_list=test_raw_y,
        pred_list=predictions_answers[250:500],
        verbose=1,
        extra_vars=None)

METRIC: Accuracy is 32.4, wups at 0.9 is 37.026618862, wups at 0.0 is 67.0603376536
CLASS METRIC: Accuracy is 18.9603950588, wups at 0.9 is -1.0, wups at 0.0 is -1.0


In [358]:
test_input = [test_x, test_visual_features]


In [367]:
len(test_x)

100

In [368]:
test_text_representation = dp['text'](train_or_test='test')
test_raw_x = test_text_representation['x']
test_one_hot_x = encode_questions_index(test_raw_x, word2index_x)
test_x = sequence.pad_sequences(test_one_hot_x, maxlen=MAXLEN)
#print_list(test_raw_x[:3])
#test_x[:3]

In [375]:
len(test_visual_features)

5673

In [372]:
test_visual_features[0:100]

array([[ 0.35031411,  0.70939934,  0.29002979, ..., -4.89574718,
        -3.02242517,  4.07691669],
       [ 0.35031411,  0.70939934,  0.29002979, ..., -4.89574718,
        -3.02242517,  4.07691669],
       [ 0.35031411,  0.70939934,  0.29002979, ..., -4.89574718,
        -3.02242517,  4.07691669],
       ..., 
       [ 0.34003553,  0.30489182,  0.4074102 , ..., -3.66248107,
        -0.22230528,  5.85822916],
       [ 0.46625227,  0.31800053,  0.41182289, ..., -5.65062904,
        -0.79568577,  6.39906883],
       [ 0.46625227,  0.31800053,  0.41182289, ..., -5.65062904,
        -0.79568577,  6.39906883]], dtype=float32)

In [376]:
test_input = [test_x[0:100], test_visual_features[0:100]];


In [401]:
len(predictions_answers_DN)

1000

In [394]:
len(test_x)

1000

In [403]:
count1

111

In [404]:
count2

889

In [81]:
Final_predictions

NameError: name 'Final_predictions' is not defined

In [406]:
test_raw_y

['garbage_bin',
 'table',
 'chair',
 '3',
 'washing_machine',
 '5',
 'sink',
 'microwave',
 '9',
 'oven',
 'paper_tray',
 'hole_puncher',
 'garbage_bin',
 'framed_certificate',
 'ornamental_plant',
 'red',
 'sofa',
 'sofa',
 '3',
 'fire_extinguisher',
 'fire_alarm',
 'brown',
 '1',
 'tape_dispenser',
 'printer',
 'ladder',
 'brown',
 'ladder',
 'bookshelf',
 'door_way',
 '1',
 'water_dispenser',
 'telephone_cord',
 'hand_sanitizer_dispenser',
 'water_carboy',
 'blue',
 '2',
 'chair',
 'whiteboard',
 'water_carboy',
 '3',
 '1',
 'whiteboard_eraser, whiteboard_marker',
 'red, black',
 '1',
 '1',
 'gray',
 'green',
 'electrical_outlet',
 'head_phone',
 'shirts_in_hanger',
 'red, white',
 'dresser',
 'cables',
 'white',
 'brown',
 'blinds',
 'pillow',
 'brown',
 'picture',
 'black',
 'deoderant',
 'bookshelf',
 'bicycle',
 '2',
 '2',
 'lamp_shade',
 '2',
 'gray',
 'laptop',
 '1',
 'umbrella',
 'brown',
 '1',
 'brown',
 'book',
 'pink, purple, blue, black',
 'fire_alarm',
 '3',
 'kichen_tow

In [407]:
predictions_answers_DN

['modem',
 'night_stand',
 'refridgerator',
 '3',
 'refridgerator',
 '4',
 'tea_kettle',
 'tissue_box',
 '4',
 'garbage_bin',
 'brown',
 'remote_control',
 'table',
 'wall_decoration',
 'plant',
 'red',
 'ornamental_plant',
 'plant',
 '1',
 'vase',
 'telephone',
 'white',
 '3',
 'books',
 'tape_dispenser',
 'monitor',
 'white',
 'ornamental_plant',
 'wall_decoration',
 'table',
 '3',
 'telephone',
 'basket',
 'bottle',
 'table',
 'brown',
 '2',
 'box',
 'table',
 'lamp',
 '1',
 '1',
 'excercise_ball',
 'red',
 '3',
 '3',
 'black',
 'black',
 'plant',
 'garbage_bin',
 'box',
 'black',
 'photo',
 'stool',
 'brown',
 'red',
 'pillow',
 'blanket',
 'brown',
 'picture',
 'white',
 'lamp',
 'lamp',
 'tissue_box',
 '1',
 '7',
 'books',
 '2',
 'red',
 'night_stand',
 '1',
 'books',
 'brown',
 '7',
 'white',
 'decorative_item',
 'blue',
 'jacket',
 '3',
 'wall_decoration',
 'brown',
 'photo',
 'red',
 'paper',
 '3',
 'tissue_box',
 'photo',
 'books',
 'books',
 'brown',
 'photo',
 'lamp',
 'bro

In [408]:
predictions_answers_normal

['desk',
 'printer',
 'fireplace',
 '3',
 'table',
 '4',
 'globe',
 'bottle_of_liquid',
 '1',
 'garbage_bin',
 'pink',
 'fax_machine',
 'table',
 'telephone',
 'lamp',
 'black',
 'map',
 'lamp_shade',
 '3',
 'ladder',
 'whiteboard',
 'blue',
 '2',
 'book',
 'printer',
 'printer',
 'black',
 'projector_screen',
 'photo',
 'door',
 '1',
 'whiteboard',
 'desk',
 'lamp',
 'water_carboy',
 'blue',
 '2',
 'garbage_bin',
 'table',
 'printer',
 '1',
 '1',
 'books',
 'yellow',
 '2',
 '3',
 'black',
 'red',
 'tape',
 'garbage_bin',
 'clothes',
 'black',
 'telephone',
 'stool',
 'brown',
 'red',
 'pillow',
 'blanket',
 'brown',
 'picture',
 'purple',
 'remote_control',
 'hockey_stick',
 'garbage_bin',
 '1',
 '8',
 'book',
 '3',
 'blue',
 'bed_sheets',
 '2',
 'books',
 'blue',
 '1',
 'brown',
 'books',
 'blue',
 'deoderant',
 '4',
 'bottle_of_liquid',
 'white',
 'tablecloth',
 'red',
 'magnet',
 '3',
 'lamp',
 'table',
 'chair',
 'alarm_clock',
 'blue',
 'pillow',
 'lamp',
 'blue',
 'black',
 'cha