In [1]:
! python boring_function.py 'hello world'

'hello
1.0.1
0.3.3




In [2]:
#TODO: Execute the following procedure (Shift+Enter)
from kraino.utils import data_provider

dp = data_provider.select['daquar-triples']
dp

ImportError: cannot import name imread

The code above returns a dictionary of three representations of the DAQUAR dataset. For now, we will look only into the 'text' representation. dp['text'] returns a function from dataset split into the dataset's textual representation. It will be more clear after executing the following instruction.

In [3]:
# check the keys of the representation of DAQUAR train
train_text_representation = dp['text'](train_or_test='train')
train_text_representation.keys()

['answer_words_delimiter',
 'end_of_answer',
 'img_name',
 'y',
 'x',
 'end_of_question',
 'img_ind',
 'question_id']

This representation specifies how questions are ended ('?'), answers are ended ('.'), answer words are delimited (DAQUAR sometimes has a set of answer words as an answer, for instance 'knife, fork' may be a valid answer), but most important, it has questions (key 'x'), answers (key 'y'), and names of the corresponding images (key 'img_name').
We first need to build a suitable vocabulary from our raw textual training data, and next transform them into one-hot representation.


In [4]:
from toolz import frequencies
train_raw_x = train_text_representation['x']
# we start from building the frequencies table
wordcount_x = frequencies(' '.join(train_raw_x).split(' '))
# print the most and least frequent words
n_show = 5
print(sorted(wordcount_x.items(), key=lambda x: x[1], reverse=True)[:n_show])
print(sorted(wordcount_x.items(), key=lambda x: x[1])[:n_show])

[('the', 9847), ('?', 6795), ('what', 5847), ('is', 5368), ('on', 2909)]
[('all', 1), ('surrounded', 1), ('four', 1), ('displaying', 1), ('children', 1)]


In [5]:
# Kraino is a framework that helps in fast prototyping Visual Turing Test models
from kraino.utils.input_output_space import build_vocabulary

# This function takes wordcounts and returns word2index - mapping from words into indices, 
# and index2word - mapping from indices to words.
word2index_x, index2word_x = build_vocabulary(
    this_wordcount=wordcount_x,
    truncate_to_most_frequent=0)
word2index_x

{'3': 507,
 u'<eoa>': 2,
 u'<eoq>': 3,
 u'<pad>': 0,
 u'<unk>': 1,
 '?': 52,
 'a': 206,
 'above': 80,
 'ac': 817,
 'across': 513,
 'against': 534,
 'air': 589,
 'airconditionerg': 790,
 'alarm': 424,
 'all': 4,
 'along': 92,
 'amidst': 783,
 'and': 382,
 'any': 390,
 'apart': 689,
 'apples': 514,
 'appliance': 511,
 'appliances': 113,
 'are': 489,
 'arm': 495,
 'armchair': 547,
 'armchairs': 320,
 'around': 765,
 'at': 828,
 'attached': 807,
 'audio': 648,
 'available': 515,
 'away': 502,
 'baby': 138,
 'back': 709,
 'backpack': 542,
 'bag': 471,
 'bags': 608,
 'ball': 624,
 'bananas': 652,
 'bars': 478,
 'base': 303,
 'basin': 567,
 'basins': 546,
 'basket': 216,
 'baskets': 111,
 'bath': 257,
 'bathroom': 559,
 'bathtub': 593,
 'bean': 463,
 'bear': 462,
 'bed': 374,
 'bedding': 721,
 'beds': 873,
 'bedside': 51,
 'been': 564,
 'before': 234,
 'behind': 661,
 'beige': 757,
 'below': 811,
 'belt': 557,
 'bench': 668,
 'beneath': 269,
 'benhind': 96,
 'between': 510,
 'bicycle': 335,
 

In addition, we are using a few special symbols that don't occur in the training dataset.
Most important are $<pad>$ and $<unk>$. We will use the former to pad sequences in order to have the same 
number of temporal elements; we will use the latter for words (at test time) that don't exist in training dataset.

Armed with vocabulary, we can build one-hot representation of the training data. However, this is not neccessary and maybe even wasteful. Our one-hot representation of the input text doesn't explicitely build long vectors, but instead it operates on indices. The example above would be encoded as [0,1,4,2,7,3]. 
```
Can you prove the equivalence in the claim?
```
__claim__:

Let $x$ be a binary vector with exactly one value $1$ at position $index$, that is $x[index]=1$. Then $$W[:,index] = Wx$$ where $W[:,b]$ denotes a vector built from a column $b$ of $W$.


In [6]:
from kraino.utils.input_output_space import encode_questions_index
one_hot_x = encode_questions_index(train_raw_x, word2index_x)
print(train_raw_x[:3])
print(one_hot_x[:3])

['what is on the right side of the black telephone and on the left side of the red chair ?', 'what is in front of the white door on the left side of the desk ?', 'what is on the desk ?']
[[71, 598, 744, 647, 705, 272, 161, 647, 125, 135, 382, 744, 647, 650, 272, 161, 647, 298, 15, 52, 3], [71, 598, 603, 255, 161, 647, 352, 131, 744, 647, 650, 272, 161, 647, 656, 52, 3], [71, 598, 744, 647, 656, 52, 3]]


As we can see, the sequences have different elements. We will pad the sequences to have the same length $MAXLEN$.

In [7]:
# We use another framework that is useful to build deep learning models - Keras
from keras.preprocessing import sequence
MAXLEN=30
train_x = sequence.pad_sequences(one_hot_x, maxlen=MAXLEN)
train_x[:3]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,  71, 598, 744, 647,
        705, 272, 161, 647, 125, 135, 382, 744, 647, 650, 272, 161, 647,
        298,  15,  52,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         71, 598, 603, 255, 161, 647, 352, 131, 744, 647, 650, 272, 161,
        647, 656,  52,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  71, 598, 744,
        647, 656,  52,   3]])

And do the same with the answers.

In [8]:
# for simplicity, we consider only first answer words; that is, if answer is 'knife,fork' we encode only 'knife'
MAX_ANSWER_TIME_STEPS=1

from kraino.utils.input_output_space import encode_answers_one_hot
train_raw_y = train_text_representation['y']
wordcount_y = frequencies(' '.join(train_raw_y).split(' '))
word2index_y, index2word_y = build_vocabulary(this_wordcount=wordcount_y)
train_y, _ = encode_answers_one_hot(
    train_raw_y, 
    word2index_y, 
    answer_words_delimiter=train_text_representation['answer_words_delimiter'],
    is_only_first_answer_word=True,
    max_answer_time_steps=MAX_ANSWER_TIME_STEPS)
print(train_x.shape)
print(train_y.shape)

(6795L, 30L)
(6795L, 686L)


Finally, we can also encode test questions. We need it later to see how well our models generalise to new question,answer,image triplets. Remember however that we should use vocabulary we generated from training samples.

```
Why should we use the training vocabulary to encode test questions?
```

In [9]:
def print_list(ll):
    # Prints the list
    print('\n'.join(ll))
    
print_list(['Visual Turing Test', 'Summer School', 'Dr. Mario Fritz', 'Mateusz Malinowski'])

Visual Turing Test
Summer School
Dr. Mario Fritz
Mateusz Malinowski


In [10]:
test_text_representation = dp['text'](train_or_test='test')
test_raw_x = test_text_representation['x']
test_one_hot_x = encode_questions_index(test_raw_x, word2index_x)
test_x = sequence.pad_sequences(test_one_hot_x, maxlen=MAXLEN)
print_list(test_raw_x[:3])
test_x[:3]

what is on the left side of the white oven on the floor and on right side of the blue armchair ?
what is on the left side of the fire extinguisher and on the right side of the chair ?
what is between the the two white and black garbage bins ?


array([[  0,   0,   0,   0,   0,   0,   0,  71, 598, 744, 647, 650, 272,
        161, 647, 352, 229, 744, 647, 640, 382, 744, 705, 272, 161, 647,
         70, 547,  52,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  71, 598, 744,
        647, 650, 272, 161, 647, 573, 174, 382, 744, 647, 705, 272, 161,
        647,  15,  52,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,  71, 598, 510, 647, 647, 120, 352, 382, 125,
        764, 220,  52,   3]])

With encoded question, answer pairs we finish the first section. But before delving into building and training new models, let's have a look at summary to see bigger picture.

__Summary__

We started from raw questions from the training set. Use them to build a vocabulary. Next, we encode questions into sequences of one-hot vectors based on the vocabulary. Finally, we use the same vocabulary to encode questions from test set, if a word is absent we use extra token $<unk>$ to encode this fact (we encode the $<unk>$ token, not the word).

__WUPS__ 
First of all, please run the cell below to set up a link to the NLTK data.

In [11]:
%env NLTK_DATA=C:\Users\Dell user\Downloads\visual_turing_test-tutorial\visual_turing_test-tutorial\data\nltk_data

env: NLTK_DATA=C:\Users\Dell user\Downloads\visual_turing_test-tutorial\visual_turing_test-tutorial\data\nltk_data


We won't go very far using only textual features. Hence, it's now time to consider its visual counterpart.

As shown in the figure below, a quite common procedure works as follows:
* Use a CNN already pre-trained on some large-scale classification task, most often it is [ImageNet](http://image-net.org) with $1000$ for recognition.
* 'Chop off' CNN after some layer. We will use responses of that layer as visual features.

In this tutorial, we will use features extracted from the second last $4096$ dimensional layer of [VGG NET-19](http://arxiv.org/pdf/1409.1556.pdf). We have already extracted features in advance using [Caffe](http://caffe.berkeleyvision.org) - another excellent framework for deep learning, particularly good for CNNs.

Let's creat an input as a pair of textual and visual features.

Please run the cell below in order to get visual features aligned with textual featurs.

In [13]:
# this contains a list of the image names of our interest; 
# it also makes sure that visual and textual features are aligned correspondingly
train_image_names = train_text_representation['img_name']
# the name for visual features that we use
# CNN_NAME='vgg_net'
#CNN_NAME='googlenet'

CNN_NAME='fb_resnet'
# the layer in CNN that is used to extract features
# PERCEPTION_LAYER='fc1000'
# PERCEPTION_LAYER='pool5-7x7_s1'
# PERCEPTION_LAYER='res5c-152'
PERCEPTION_LAYER='l2_res5c-152' # l2 prefix since there are l2-normalized visual features

train_visual_features = dp['perception'](
    train_or_test='train',
    names_list=train_image_names,
    parts_extractor=None,
    max_parts=None,
    perception=CNN_NAME,
    layer=PERCEPTION_LAYER,
    second_layer=None
    )
#import numpy as np
#train_visual_features=np.load('Concat_Feature_NPY.npy');
#train_visual_features.shape
#train_visual_features.shape
#train_image_names

Shuffling memories ...
Skipped images 0 of them:


In [51]:
import numpy as np
train_image_names
np.savetxt("images_name.csv",train_image_names )

TypeError: Mismatch between array dtype ('|S9') and format specifier ('%.18e')

In [52]:
import pandas as pd 
df = pd.DataFrame(train_image_names)
df.to_csv("file_path.csv")

Let's creat an input as a pair of textual and visual features.

In [82]:
train_input.shape

AttributeError: 'list' object has no attribute 'shape'

In [14]:
train_input = [train_x, train_visual_features]
train_input

[array([[  0,   0,   0, ...,  15,  52,   3],
        [  0,   0,   0, ..., 656,  52,   3],
        [  0,   0,   0, ..., 656,  52,   3],
        ..., 
        [  0,   0,   0, ..., 242,  52,   3],
        [  0,   0,   0, ..., 793,  52,   3],
        [  0,   0,   0, ..., 554,  52,   3]]),
 array([[ 0.02181184,  0.03997612,  0.00587487, ...,  0.0015843 ,
          0.01211601,  0.01073669],
        [ 0.02181184,  0.03997612,  0.00587487, ...,  0.0015843 ,
          0.01211601,  0.01073669],
        [ 0.02181184,  0.03997612,  0.00587487, ...,  0.0015843 ,
          0.01211601,  0.01073669],
        ..., 
        [ 0.01418223,  0.00719364,  0.00225867, ...,  0.01255163,
          0.01961501,  0.01007006],
        [ 0.01418223,  0.00719364,  0.00225867, ...,  0.01255163,
          0.01961501,  0.01007006],
        [ 0.01418223,  0.00719364,  0.00225867, ...,  0.01255163,
          0.01961501,  0.01007006]])]

In [15]:
#== Model definition

# First we define a model using keras/kraino
from keras.models import Sequential
from keras.layers.core import Activation
from keras.layers.core import Dense
from keras.layers.core import Dropout
from keras.layers.core import Layer
from keras.layers.core import Merge
from keras.layers.core import TimeDistributedMerge
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.recurrent import LSTM
from keras import optimizers

from kraino.core.model_zoo import AbstractSequentialModel
from kraino.core.model_zoo import AbstractSingleAnswer
from kraino.core.model_zoo import AbstractSequentialMultiplewordAnswer
from kraino.core.model_zoo import Config
from kraino.core.keras_extensions import DropMask
from kraino.core.keras_extensions import LambdaWithMask
from kraino.core.keras_extensions import time_distributed_masked_ave

# This model inherits from AbstractSingleAnswer, and so it produces single answer words
# To use multiple answer words, you need to inherit from AbstractSequentialMultiplewordAnswer
class VisionLanguageLSTM(AbstractSequentialModel, AbstractSingleAnswer):
    """
    BOW Language only model that produces single word answers.
    """
    def create(self):
        language_model = Sequential()
        language_model.add(Embedding(
                self._config.input_dim, 
                self._config.textual_embedding_dim, 
                mask_zero=True))
        #TODO: Replace averaging with RNN (you can choose between LSTM and GRU)
#         language_model.add(LambdaWithMask(time_distributed_masked_ave, output_shape=[self.output_shape[2]]))
        language_model.add(LSTM(self._config.hidden_state_dim, 
                      return_sequences=False))

        visual_model = Sequential()
        if self._config.visual_embedding_dim > 0:
            visual_model.add(Dense(
                    self._config.visual_embedding_dim,
                    input_shape=(self._config.visual_dim,)))
        else:
            visual_model.add(Layer(input_shape=(self._config.visual_dim,)))
        self.add(Merge([language_model, visual_model], mode=self._config.multimodal_merge_mode))
        self.add(Dropout(0.5))
        self.add(Dense(self._config.output_dim))
        self.add(Activation('softmax'))
        
        
# dimensionality of embeddings
EMBEDDING_DIM = 500
# kind of multimodal fusion (ave, concat, mul, sum)
MULTIMODAL_MERGE_MODE = 'sum'

model_config = Config(
    textual_embedding_dim=EMBEDDING_DIM,
    visual_embedding_dim=EMBEDDING_DIM,
    hidden_state_dim=EMBEDDING_DIM,
    multimodal_merge_mode=MULTIMODAL_MERGE_MODE,
    input_dim=len(word2index_x.keys()),
    output_dim=len(word2index_y.keys()),
    visual_dim=train_visual_features.shape[1])
model = VisionLanguageLSTM(model_config)
model.create()

model.compile(
    loss='categorical_crossentropy', 
    optimizer='Adam')
text_image_rnn_model = model

Using Theano backend.


In [16]:
#== Model training
text_image_rnn_model.fit(
    train_input, 
    train_y,
    batch_size=512,
    nb_epoch=40,
    validation_split=0.1,
    show_accuracy=True)

Train on 6115 samples, validate on 680 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x92efa20>

### Predictions (Features)

In [17]:
test_image_names = test_text_representation['img_name']
test_visual_features = dp['perception'](
    train_or_test='test',
    names_list=test_image_names,
    parts_extractor=None,
    max_parts=None,
    perception=CNN_NAME,
    layer=PERCEPTION_LAYER,
    second_layer=None
    )

#test_visual_features=np.load('Test_Features_Concat.npy');
#test_visual_features=np.load('Test_Features_Concat.npy');
test_visual_features.shape

Shuffling memories ...
Skipped images 0 of them:


(5673L, 2048L)

In [18]:
test_input = [test_x, test_visual_features]

In [19]:
from kraino.core.model_zoo import word_generator
# we first need to add word_generator to _config (we could have done this before, in the Config constructor)
# we use maximum likelihood as a word generator
text_image_rnn_model._config.word_generator = word_generator['max_likelihood']
predictions_answers = text_image_rnn_model.decode_predictions(
    X=test_input,
    temperature=None,
    index2word=index2word_y,
    verbose=0)

In [21]:
from kraino.utils import print_metrics
test_raw_y = test_text_representation['y']
_ = print_metrics.select['wups'](
        gt_list=test_raw_y,
        pred_list=predictions_answers,
        verbose=1,
        extra_vars=None)

METRIC: Accuracy is 23.0742111757, wups at 0.9 is 28.6059567877, wups at 0.0 is -1.0
CLASS METRIC: Accuracy is 3.7312106184, wups at 0.9 is -1.0, wups at 0.0 is -1.0


In [22]:
predictions_answers

['napkin_dispenser',
 'basket',
 'hockey_stick',
 '3',
 'sofa',
 '19',
 'electrical_kettle',
 'glass',
 '1',
 'dishwasher',
 'blue',
 'pen_cup',
 'table',
 'telephone',
 'ornamental_plant',
 'blue',
 'ornamental_plant',
 'lamp_shade',
 '3',
 'lamp',
 'telephone',
 'brown',
 '2',
 'book',
 'printer',
 'stacked_chairs',
 'black',
 'whiteboard',
 'clock',
 'door',
 '1',
 'stacked_chairs',
 'stapler',
 'lamp',
 'water_carboy',
 'blue',
 '2',
 'dish_rack',
 'table',
 'paper_tray',
 '1',
 '1',
 'stapler',
 'yellow',
 '2',
 '7',
 'white',
 'red',
 'whiteboard',
 'basket',
 'box',
 'black',
 'mirror',
 'chair',
 'brown',
 'red',
 'pillow',
 'blanket',
 'brown',
 'picture',
 'black',
 'ladder',
 'hockey_stick',
 'telephone',
 '1',
 '8',
 'book',
 '2',
 'blue',
 'baby_chair',
 '2',
 'wooden_planks',
 'blue',
 '1',
 'blue',
 'book',
 'blue',
 'pillow',
 '2',
 'bottle_of_liquid',
 'white',
 '2',
 'white',
 'door_knob',
 '2',
 'clock',
 'ornamental_plant',
 'books',
 'remote_control',
 'brown',
 'n

In [29]:
import scipy.io as scio
mat = scio.savemat('Answer_Predict_Concat_Concat_DnNnT.mat',predictions_answers)

AttributeError: 'list' object has no attribute 'items'

In [32]:
len(predictions_answers)

5673

In [23]:

import csv
with open("Predictions_A_C_S_C_NnT.csv",'wb') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    wr.writerow(predictions_answers)