In [1]:
import numpy as np
import numpy as np
from numpy.random import shuffle
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data.sampler import SubsetRandomSampler
import time
import copy
import pickle
import os
from torch.utils.data import Dataset, DataLoader
import PIL
from tqdm import tqdm
from cider.cider import Cider

# Read in data

In [2]:
with open('Flickr8k_CNN/train_data.p','r') as f:
    train_data = pickle.load(f)
with open('Flickr8k_CNN/valid_data.p','r') as f:
    valid_data = pickle.load(f)
with open('Flickr8k_CNN/test_data.p','r') as f:
    test_data = pickle.load(f)
with open('Flickr8k_CNN/all_captions.p','r') as f:
    all_captions = pickle.load(f)

# Hyperparameter

In [12]:
USE_GPU = torch.cuda.is_available()
PAD_LENGTH = 36
BATCH_SIZE = 128    #16
EMBED_SIZE = 512   # 128
HIDDEN_SIZE = 1024  #216
TEMPERATURE = 0.5
LR_RNN = 1e-3
LR_CNN = 1e-3
MOMENTUM = 0.9
WEIGHT_DECAY = 0.0
model_num = 'model17_bi_0322'

# Transform to tensor and data loader

In [5]:
'''encoder for word to idx can be found here'''
# encode description for embedding layer
class make_vocab_list:
    def __init__(self, captions):
#         self.all_captions = str(captions.values()).translate(None, '\[],')
#         self.vocabulary = set(''.join(self.all_captions).split())
        self.vocabulary = {word for i, caption in enumerate(all_captions.values()[1:]) 
                                   for j in xrange(len(caption)) for word in caption[j].split()}
        self.vocabulary.add('<start>')
        self.vocabulary.add('<end>')
        self.vocabulary.add('^')
        self.word_to_idx = {word:i for i, word in enumerate(self.vocabulary)}
        self.idx_to_word = {i:word for i, word in enumerate(self.vocabulary)}
    def encode(self, line):
        ''' input      line string of words
            output     list of idx
        '''
        output = []
        for word in line.split():
            output.append(self.word_to_idx[word])
        return output
    
    def decode(self, encoded):
        ''' input       list of idx
            output      string of words
        '''
        output = []
        for idx in encoded:
            output.append(self.idx_to_word[idx])
        return ' '.join(output)

class ImageCaptioningData(Dataset):
    def __init__(self, data_set, encoder, transform = None, pad_length = PAD_LENGTH):
#         self.images = torch.from_numpy(data_set[:,0])
#         self.images = self.images.permute(0, 3, 1, 2)
#         self.labels = torch.Tensor(data_set[:,1])
        self.data = data_set
        self.transform = transform
        self.encoder = encoder
        self.pad_length = pad_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        pad_length = self.pad_length
        img,caption, ID = self.data[index]
        length = len(caption.split())+2
        caption_x = '<start> ' + caption + ' <end>' + ' ^'*(pad_length-len(caption.split()))
        caption_y = caption + ' <end>' + ' ^' +' ^'*(pad_length-len(caption.split()))
        x_encode, y_encode, = self.encoder.encode(caption_x), self.encoder.encode(caption_y)
        pad_idx = self.encoder.encode('^')
#         for i in xrange(pad_length-len(caption)):
#             x_encode.append(pad_idx)
#             y_encode.append(pad_idx)
        x_encode, y_encode = torch.LongTensor(x_encode), torch.LongTensor(y_encode)
#         if self.transform:
#             img = self.transform(img)
#             img = img.view(2048,7,7)
        return (img, x_encode, y_encode, length, ID)

In [6]:
encoder = make_vocab_list(all_captions)
transform = transforms.Compose([transforms.Scale((224,224)),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])])
train_data_tensor = ImageCaptioningData(train_data,encoder,transform)
valid_data_tensor = ImageCaptioningData(valid_data,encoder,transform)
test_data_tensor = ImageCaptioningData(test_data,encoder,transform)

In [7]:
''' Batch loaders using pytorch data loads'''
train_data_loader = DataLoader(
    dataset = train_data_tensor,
    shuffle = False,
    batch_size = BATCH_SIZE,
    num_workers = 4
)
valid_data_loader = DataLoader(
    dataset = valid_data_tensor,
    shuffle = False,
    batch_size = BATCH_SIZE,
    num_workers = 4
)
test_data_loader = DataLoader(
    dataset = test_data_tensor,
    shuffle = False,
    batch_size = BATCH_SIZE,
    num_workers = 4
)

# Model

In [8]:
class AttentiveCNN( nn.Module ):
    def __init__( self, embed_size = EMBED_SIZE, hidden_size = HIDDEN_SIZE ):
        super(AttentiveCNN, self ).__init__()
        resnet = models.resnet152( pretrained=True )
        modules = list(resnet.children())[7] # delete the last fc layer and avg pool.
        self.resnet_conv = nn.Sequential(*modules) # last conv feature

        self.avgpool = nn.AvgPool2d(7)
        self.local_layer = nn.Linear(2048, hidden_size) # v_i = W_a * A
        self.global_layer = nn.Linear(2048, embed_size)  # v_g = W_b * a^g
        
        self.init_weights()
        
    def init_weights( self ):
        """Initialize the weights."""
        nn.init.xavier_normal(self.local_layer.weight)
        nn.init.xavier_normal(self.global_layer.weight)
        
        
    def forward( self, images ):
        '''
        Input: images
        Output: V=[v_1, ..., v_n], v_g
        '''
        
        # Last conv layer feature map
        A = self.resnet_conv( images )
        A_global = self.avgpool(A)
        A_global = A_global.view( A_global.size(0), -1 )
        
        # V = [ v_1, v_2, ..., v_49 ]
        # V = 49 x 2048 ie each kernel from conv layer is retained
        v_local = A.view( A.size( 0 ), A.size( 1 ), -1 ).transpose( 1,2 )
        v_local = self.local_layer( v_local ) 
        
        v_global = self.global_layer( A_global ) 
        
        # return V, v_g, A
        return v_local, v_global

# Attention Block for C_hat calculation
class Atten( nn.Module ):
    def __init__( self, hidden_size = HIDDEN_SIZE ):
        super( Atten, self ).__init__()

        self.v = nn.Linear(hidden_size, 49, bias=False) # W_v
        self.g = nn.Linear(hidden_size, 49, bias=False) # W_g
        self.s = nn.Linear(hidden_size, 49, bias=False) # W_s
        self.h = nn.Linear(49, 1, bias=False) # w_h
        
        self.dropout = nn.Dropout(p = 0.5)
        self.init_weights()
        
    def init_weights( self ):
        """Initialize the weights."""
        nn.init.xavier_normal( self.v.weight )
        nn.init.xavier_normal( self.g.weight )
        nn.init.xavier_normal( self.h.weight )
        nn.init.xavier_normal( self.s.weight )
        
    def forward( self, V, h_t, s_t ):
        '''
        Input: v_local=[v_1, v_2, ... v_k], h_t, s_t from LSTM
        Output: c_hat_t, attention feature map
        '''
        
        # W_v * V + W_g * h_t * 1^T
        content_v = self.v( V ).unsqueeze( 1 ) + self.g( self.dropout( h_t ) ).unsqueeze( 2 )
        
        # z_t = W_h * tanh( content_v )
        z_t = self.h(self.dropout(F.tanh(content_v))).squeeze( 3 )
        alpha_t = F.softmax(z_t.view(-1, z_t.size(2))).view(z_t.size(0), z_t.size(1), -1)
        
        # Construct c_t: B x seq x hidden_size
        c_t = torch.bmm(alpha_t, V).squeeze(2)
        
        # W_s * s_t + W_g * h_t
        content_s = self.s(self.dropout(s_t)) + self.g(self.dropout(h_t))
        # w_t * tanh( content_s )
        z_t_extended = self.h(self.dropout(F.tanh(content_s)))
        
        # Attention score between sentinel and image content
        extended = torch.cat((z_t, z_t_extended), dim=2 )
        alpha_hat_t = F.softmax(extended.view(-1, extended.size(2))).view(extended.size(0), extended.size(1), -1)
        beta_t = alpha_hat_t[ :, :, -1 ]
        
        # c_hat_t = beta * s_t + ( 1 - beta ) * c_t
        beta_t = beta_t.unsqueeze(2)
        c_hat_t = beta_t * s_t + (1 - beta_t)  * c_t

        return c_hat_t, alpha_t, beta_t

    # Sentinel BLock    
class Sentinel( nn.Module ):
    def __init__( self, input_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE):
        super(Sentinel, self).__init__()

        self.x = nn.Linear(input_size, hidden_size, bias=False)
        self.h = nn.Linear(hidden_size, hidden_size, bias=False)
        
        # Dropout applied before affine transformation
        self.dropout = nn.Dropout(p = 0.5)
        
        self.init_weights()
        
    def init_weights(self):
        nn.init.xavier_normal(self.x.weight)
        nn.init.xavier_normal(self.h.weight)
        
    def forward(self, x_t, h_t_1, cell_t):
        
        # g_t = sigmoid( W_x * x_t + W_h * h_(t-1) )        
        gate_t = self.x(self.dropout(x_t)) + self.h(self.dropout(h_t_1))
        gate_t = F.sigmoid( gate_t )
        
        # Sentinel embedding
        s_t =  gate_t * F.tanh( cell_t )
        
        return s_t

# Adaptive Attention Block: C_t, Spatial Attention Weights, Sentinel embedding    
class AdaptiveBlock( nn.Module ):
    
    def __init__( self, embed_size = EMBED_SIZE, hidden_size = HIDDEN_SIZE, vocab_size = len(encoder.vocabulary)):
        super( AdaptiveBlock, self ).__init__()

        # Sentinel block
        self.sentinel = Sentinel( embed_size * 2, hidden_size )
        
        # Image Spatial Attention Block
        self.atten = Atten( hidden_size )
        
        # Final Caption generator
        self.mlp = nn.Linear( hidden_size, vocab_size )
        
        # Dropout layer inside Affine Transformation
        self.dropout = nn.Dropout( p = 0.5 )
        
        self.hidden_size = hidden_size
        self.init_weights()
        
    def init_weights( self ):
        '''
        Initialize final classifier weights
        '''
        nn.init.xavier_normal(self.mlp.weight)
        
        
    def forward( self, x, hiddens, cells, V ):
        
        # hidden for sentinel should be h0-ht-1
        h0 = self.init_hidden( x.size(0) )[0].transpose( 0,1 )
        
        # h_(t-1): B x seq x hidden_size ( 0 - t-1 )
        if hiddens.size( 1 ) > 1:
            hiddens_t_1 = torch.cat( ( h0, hiddens[ :, :-1, : ] ), dim=1 )
        else:
            hiddens_t_1 = h0

        # Get Sentinel embedding, it's calculated blockly    
        sentinel = self.sentinel( x, hiddens_t_1, cells )
        
        # Get C_t, Spatial attention, sentinel score
        c_hat, atten_weights, beta = self.atten( V, hiddens, sentinel )
        
        # Final score along vocabulary
        scores = self.mlp( self.dropout( c_hat + hiddens ) )
        
        return scores, atten_weights, beta
    
    def init_hidden( self, bsz ):
        '''
        Hidden_0 & Cell_0 initialization
        '''
        weight = next( self.parameters() ).data
        
        if torch.cuda.is_available():
            return ( Variable( weight.new( 1 , bsz, self.hidden_size ).zero_().cuda() ),
                    Variable( weight.new( 1,  bsz, self.hidden_size ).zero_().cuda() ) ) 
        else: 
            return ( Variable( weight.new( 1 , bsz, self.hidden_size ).zero_() ),
                    Variable( weight.new( 1,  bsz, self.hidden_size ).zero_() ) ) 


# Caption Decoder
class Decoder( nn.Module ):
    def __init__( self, embed_size = EMBED_SIZE, vocab_size = len(encoder.vocabulary), hidden_size = HIDDEN_SIZE ):
        super( Decoder, self ).__init__()
        self.embed = nn.Embedding( vocab_size, embed_size ) 
        self.dropout = nn.Dropout(p=0.5)
        # LSTM decoder: input = [ w_t; v_g ] => 2 x word_embed_size;
        self.LSTM_Forward = nn.LSTM( embed_size * 2, hidden_size, 1, batch_first=True )
        self.LSTM_Backward = nn.LSTM( embed_size * 2, hidden_size, 1, batch_first=True )
        # Save hidden_size for hidden and cell variable 
        self.hidden_size = hidden_size
        
        # Adaptive Attention Block: Sentinel + C_hat + Final scores for caption sampling
        self.adaptive_forward = AdaptiveBlock( embed_size, hidden_size, vocab_size )
        self.adaptive_backward = AdaptiveBlock( embed_size, hidden_size, vocab_size )
        
    def forward( self, V, v_g , captions, length, states=None, mode = 'train'):
        if mode == 'train':
            captions_for = captions
            captions_rev = torch.zeros((captions.size())).type(torch.cuda.LongTensor)
            for i, cap_temp in enumerate(captions.data):
                inv_index = torch.arange(length[i]-1,-1,-1,out = torch.cuda.LongTensor()) 
                captions_rev[i,:length[i]] = cap_temp.unsqueeze(0).unsqueeze(2)[:,inv_index,:]#[0].view(length[i])
                if length.numpy()[i] != PAD_LENGTH+2:
                    captions_rev[i,length[i]:] = captions.data[i,length[i]:]
            
            captions_rev = Variable(captions_rev)
        else:
            captions_rev = captions
            captions_for = captions
    
        # Word Embedding
#         embeddings_for = self.embed_for(captions_for)
#         embeddings_rev = self.embed_rev(captions_rev)
        embeddings_for = self.dropout(self.embed(captions_for))
        embeddings_rev = self.dropout(self.embed(captions_rev))
#         print embeddings
        # x_t = [w_t;v_g]
        x_for = torch.cat( ( embeddings_for, v_g.unsqueeze( 1 ).expand_as( embeddings_for ) ), dim=2 )
        x_rev = torch.cat( ( embeddings_rev, v_g.unsqueeze( 1 ).expand_as( embeddings_rev ) ), dim=2 )
        
        x = x_for
        
        if torch.cuda.is_available():
#             hiddens = Variable( torch.zeros( x.size(0), x.size(1), self.hidden_size ).cuda() )
            hiddens = Variable( torch.zeros( x.size(0), x.size(1), self.hidden_size ).cuda() )
            hiddens_reverse = Variable( torch.zeros( x.size(0), x.size(1), self.hidden_size ).cuda() )
#             hiddens_bi = Variable( torch.zeros( x.size(0), 2*x.size(1), self.hidden_size ).cuda() )
            cells = Variable( torch.zeros( x.size(1), x.size(0), self.hidden_size ).cuda() )
            cells_reverse = Variable( torch.zeros( x.size(1), x.size(0), self.hidden_size ).cuda() )
#             cells_bi = Variable( torch.zeros( 2*x.size(1), x.size(0), self.hidden_size ).cuda() )
        else:
            hiddens = Variable( torch.zeros( x.size(0), x.size(1), self.hidden_size ) )
            hiddens_reverse = Variable( torch.zeros( x.size(0), x.size(1), self.hidden_size ))
#             hiddens_bi = Variable( torch.zeros( x.size(0), 2*x.size(1), self.hidden_size ))
            cells = Variable( torch.zeros( x.size(1), x.size(0), self.hidden_size ) )
            cells_reverse = Variable( torch.zeros( x.size(1), x.size(0), self.hidden_size ) )
#             cells_bi = Variable( torch.zeros( 2*x.size(1), x.size(0), self.hidden_size ) )                 
  
        states_rev = states
        for time_step in range( x.size( 1 ) ):
            # Feed in x_t one at a time
            x_t = x[ :, time_step, : ]
            x_t = x_t.unsqueeze( 1 )
            h_t, states = self.LSTM_Forward(x_t, states )
            cells[ time_step, :, : ] = states[1]
            hiddens[:,time_step,:] = h_t
#         cells = cells.transpose(0,1)
            
            x_t_rev = x_rev[ :, time_step, : ]
            x_t_rev = x_t_rev.unsqueeze( 1 )
            h_t_rev, states_rev = self.LSTM_Backward(x_t_rev, states_rev )
            cells_reverse[ time_step, :, : ] = states_rev[1]
            hiddens_reverse[:,time_step,:] = h_t_rev
            
#         inv_index = torch.arange(x.size(1)-1,-1,-1,out = torch.cuda.LongTensor())
#         hiddens_forward = hiddens_reverse[:,inv_index,:]

        
#         inv_index = torch.arange(x.size(1)-1,-1,-1, out = torch.cuda.LongTensor())    
#         cells_forward = cells_reverse[inv_index,:,:] 
        cells = cells.transpose(0,1)
        cells_reverse = cells_reverse.transpose(0,1)
#         cells_forward = cells_forward.transpose(0,1)

        
            
#         if torch.cuda.device_count() > 1:
#             print 'hi'
#             device_ids = range( torch.cuda.device_count() )
#             adaptive_block_parallel = nn.DataParallel( self.adaptive_forward, device_ids=device_ids )
            
#             scores_forward, atten_weights, beta = adaptive_block_parallel( inputs, hiddens, cells, V )
#         else:
#             print 'hello'
        scores_forward, atten_weights, beta = self.adaptive_forward( x, hiddens, cells, V )
        scores_backward, atten_weights, beta = self.adaptive_backward( x_rev, hiddens_reverse, cells_reverse, V )

        # Return states for Caption Sampling purpose
        return scores_forward, scores_backward, states_rev, atten_weights, beta
#         return scores_forward, states, atten_weights, beta
    


# Whole Architecture with Image Encoder and Caption decoder        
class Encoder2Decoder( nn.Module ):
    def __init__( self, embed_size = EMBED_SIZE, vocab_size=len(encoder.vocabulary), hidden_size=HIDDEN_SIZE, mode = 'train' ):
        super( Encoder2Decoder, self ).__init__()
        
        # Image CNN encoder and Adaptive Attention Decoder
        self.encoder = AttentiveCNN( embed_size, hidden_size )
        self.decoder = Decoder( embed_size, vocab_size, hidden_size )   
        
    def forward(self, images, captions, length):
        
        # Data parallelism for V v_g encoder if multiple GPUs are available
        # V=[ v_1, ..., v_k ], v_g in the original paper
        if torch.cuda.device_count() > 1:
            device_ids = range( torch.cuda.device_count() )
            encoder_parallel = torch.nn.DataParallel( self.encoder, device_ids=device_ids )
            V, v_g = encoder_parallel( images ) 
        else:
            V, v_g = self.encoder( images )
        
        scores_forward, scores_backward, _, _,_ = self.decoder( V, v_g, captions, length)
#         scores_forward, _, _,_ = self.decoder( V, v_g, captions)
        
        
        return scores_forward, scores_backward
#         return scores_forward


# Train and Generation

In [9]:
def calc_loss_acc(model, data_loader, objective, mode):
    num_correct = Variable(torch.cuda.FloatTensor(1).zero_())
    loss_temp = 0
    total_data = 0
    total_data_acc = 0
    model.eval()
    if mode == 0:
        func = 'Calculating Train Loss/Acc'
    elif mode == 1:
        func = 'Calculating Validation Loss/Acc'
    else:
        func = 'Calculating Test Loss/Acc'
    for img, x, y, length, ID in tqdm(data_loader, desc = func):
        #reverse label
        y_rev = torch.zeros(y.size()).type(torch.LongTensor)
        for i,y_temp in enumerate(y):
            y_rev[i,length[i]-2] = x[i,0]
            inv_index = torch.arange(length[i]-3,-1,-1,out = torch.LongTensor()) 
            y_rev[i,:length[i]-2] = y_temp[inv_index]
            if length.numpy()[i] != PAD_LENGTH+2:
                y_rev[i,length[i]-1:] = y[i,length[i]-1:]
            
        if USE_GPU:
            img, x, y, y_rev = Variable(img.cuda(), volatile = True), \
                        Variable(x.cuda(), volatile = True), \
                        Variable(y.cuda(), volatile = True),\
                        Variable(y_rev.cuda(), volatile = True)
        output_forward, output_backward = model.forward(img, x, length)
        
        y = y.view(-1)
        y_rev = y_rev.view(-1)
        output_forward = output_forward.view((output_forward.size()[0]*output_forward.size()[1],output_forward.size()[-1]))
        output_backward = output_backward.view((output_backward.size()[0]*output_backward.size()[1],output_backward.size()[-1]))
        loss_temp += objective(output_forward,y)*x.size()[0] + objective(output_backward,y_rev)*x.size()[0]

        total_data += x.size()[0]
    return loss_temp/float(total_data)

def sample(x, temperature = TEMPERATURE):
    x = x.data[0].numpy()
    prob = np.exp(np.log(x)/TEMPERATURE)/np.sum(np.exp(np.log(x)/TEMPERATURE))
    #prob = np.exp((x)/TEMPERATURE)/np.sum(np.exp((x)/TEMPERATURE))
    pred_y = np.random.choice(range(len(encoder.vocabulary)),p = prob)
    return pred_y

def generate_caption_forward(model, data_loader, mode_caption = 'train', threshold = 50):
    start_time = time.time()
    model.eval()
    predDict = {}
    refDict = {}
    Beta = {}
    imgSet = set()
    prime = torch.LongTensor(encoder.encode('<start>'))
    for img, x, y, length, ID in tqdm(data_loader, desc = 'Generating Caption'):
        img = Variable(img.cuda())
        V, v_g = model.encoder(img)
        for batch in xrange(V.size()[0]):
            V_t, v_g_t = V[batch].unsqueeze(0), v_g[batch].unsqueeze(0)
            states = None
            temp_result = []
            temp_beta = []
            end_found = False
            inp = Variable(prime.cuda(), volatile = True).unsqueeze(0)
            current = 0
            if ID[batch] not in imgSet:
                while current <= threshold:
                    scores,_, states, atten_weights, beta = model.decoder( V_t, v_g_t, inp, length, states, 
                                                                          mode = 'generate')
                    if mode_caption == 'train':
                        _, pred_y = torch.max(F.softmax(scores[0]),1)
                        if pred_y.data[0] == encoder.encode('<end>')[0]:
                            end_found = True
                            break
                        current += 1
                        pred_y_word = encoder.decode([pred_y.data[0]])
                        temp_result.append(pred_y_word)
                        inp = pred_y.unsqueeze(0)
                        temp_beta.append(beta)
                    else:
                        pred_y = sample(F.softmax(scores[0].cpu()))
                        if pred_y == encoder.encode('<end>'):
                            end_found = True
                            break
                        current += 1
                        pred_y_word = encoder.decode([pred_y])
                        temp_result.append(pred_y_word)
                        pred_y = torch.LongTensor([pred_y])
                        inp = Variable(pred_y.cuda(), volatile = True).unsqueeze(0)
                        temp_beta.append(beta)
                imgSet.add(ID[batch])
                predDict[ID[batch]] = [' '.join(temp_result)]
                Beta[ID[batch]] = temp_beta
                refDict[ID[batch]] = all_captions[ID[batch]]
            else:
                continue
    #             _, pred_y = torch.max(F.softmax(scores[0]),1)
    #             if pred_y.data[0] == encoder.encode('<end>')[0]:
    #                 end_found = True
    #                 break
    #             current += 1
    #             pred_y_word = encoder.decode([pred_y.data[0]])
    #             temp_result.append(pred_y_word)
    #             inp = pred_y.unsqueeze(0)
    return predDict, refDict, Beta

def generate_caption_backward(model, data_loader, mode_caption = 'train', threshold = 50):
    start_time = time.time()
    model.eval()
    predDict = {}
    refDict = {}
    Beta = {}
    imgSet = set()
    prime = torch.LongTensor(encoder.encode('<end>'))
    for img, x, y, length, ID in tqdm(data_loader, desc = 'Generating Caption'):
        img = Variable(img.cuda())
        V, v_g = model.encoder(img)
        for batch in xrange(V.size()[0]):
            V_t, v_g_t = V[batch].unsqueeze(0), v_g[batch].unsqueeze(0)
            states = None
            temp_result = []
            temp_beta = []
            end_found = False
            inp = Variable(prime.cuda(), volatile = True).unsqueeze(0)
            current = 0
            if ID[batch] not in imgSet:
                while current <= threshold:
                    _,scores, states, atten_weights, beta = model.decoder( V_t, v_g_t, inp, length, states, 
                                                                          mode = 'generate')
                    if mode_caption == 'train':
                        _, pred_y = torch.max(F.softmax(scores[0]),1)
                        if pred_y.data[0] == encoder.encode('<start>')[0]:
                            end_found = True
                            break
                        current += 1
                        pred_y_word = encoder.decode([pred_y.data[0]])
                        temp_result.append(pred_y_word)
                        inp = pred_y.unsqueeze(0)
                        temp_beta.append(beta)
                    else:
                        pred_y = sample(F.softmax(scores[0].cpu()))
                        if pred_y == encoder.encode('<start>'):
                            end_found = True
                            break
                        current += 1
                        pred_y_word = encoder.decode([pred_y])
                        temp_result.append(pred_y_word)
                        pred_y = torch.LongTensor([pred_y])
                        inp = Variable(pred_y.cuda(), volatile = True).unsqueeze(0)
                        temp_beta.append(beta)
                imgSet.add(ID[batch])
                temp_result = temp_result[::-1]
                predDict[ID[batch]] = [' '.join(temp_result)]
                Beta[ID[batch]] = temp_beta
                refDict[ID[batch]] = all_captions[ID[batch]]
            else:
                continue
    return predDict, refDict, Beta

def calc_score(pred_forward, pred_backward, ref, mode = 'cider', n=4):
    if mode == 'cider':
        final_pred = {}
        score_record = []
        for img_ID in tqdm(pred_forward.keys(), desc = 'Calculating Score'):
            temp_ref = {}
            temp_for, temp_back = {}, {}
            temp_ref[img_ID] = ref[img_ID]
            temp_for[img_ID], temp_back[img_ID] = pred_forward[img_ID], pred_backward[img_ID]
            score_for, _ = Cider(n = n).compute_score(temp_ref, temp_for)
            score_back, _ = Cider(n = n).compute_score(temp_ref, temp_back)
            if score_for >= score_back:
                final_pred[img_ID] = pred_forward[img_ID]
            else:
                final_pred[img_ID] = pred_backward[img_ID]
        final_score = Cider(n = n).compute_score(ref, final_pred)
    return final_pred, final_score
# def calc_score(pred_forward, ref, mode = 'cider', n=4):
#     if mode == 'cider':
#         final_pred = {}
#         score_record = []
#         for img_ID in tqdm(pred_forward.keys(), desc = 'Calculating Score'):
#             final_pred[img_ID] = pred_forward[img_ID]   
#         final_score, _ = Cider(n = n).compute_score(ref, final_pred)
#     return final_pred, final_score

In [10]:
def train(model, optimizer_CNN, optimizer_RNN, objective, lr_decay_CNN, lr_decay_RNN, model_num, 
          epochs = 100, early_stop = 4, verbose = True):
    start_time = time.time()
    acc_record = {}
    loss_record = {}
    loss_temp = 0
    total_data = 0
    best_acc = 0.367367
#    best_acc = last_acc = -1.0
#     best_score = -1
#     performance_record = open('./model/'+model_num+"/performance_record.txt", "a")
    havesaved = False
    if USE_GPU:
        model = model.cuda()
    for epoch in xrange(epochs):
        if verbose:
            print 'epoch %i' %(epoch + 1)
        model.train()
        #learning_decay.step()
        for img, x,y,length, ID in tqdm(train_data_loader, desc = 'Training'):
            y_rev = torch.zeros(y.size()).type(torch.LongTensor)
            for i,y_temp in enumerate(y):
                y_rev[i,length[i]-2] = x[i,0]
                inv_index = torch.arange(length[i]-3,-1,-1,out = torch.LongTensor()) 
                y_rev[i,:length[i]-2] = y_temp[inv_index]  
                if length.numpy()[i] != PAD_LENGTH+2:
                    y_rev[i,length[i]-1:] = y[i,length[i]-1:]
                
            if USE_GPU:
                img,x,y, y_rev = Variable(img.cuda()), \
                                Variable(x.cuda()), \
                                Variable(y.cuda()), \
                                Variable(y_rev.cuda())
            # set optimizer parameter gradients to zero
            #if epoch >= 5:
            optimizer_CNN.zero_grad()
            optimizer_RNN.zero_grad()
            
            #forward prop
            output_forward, output_backward = model.forward(img, x, length)
            
            y = y.view(-1)
            y_rev = y_rev.view(-1)
            output_forward = output_forward.view((output_forward.size()[0]*output_forward.size()[1],output_forward.size()[-1]))
            output_backward = output_backward.view((output_backward.size()[0]*output_backward.size()[1],output_backward.size()[-1]))
            loss = objective(output_forward,y) + objective(output_backward,y_rev)
            
            #backward prop
            loss.backward()
            #if epoch >= 5:
            optimizer_CNN.step()
            optimizer_RNN.step()

#         train_loss = calc_loss_acc(model, train_data_loader, objective,0)
#         valid_loss = calc_loss_acc(model, valid_data_loader, objective,1)
#         test_loss = calc_loss_acc(model, test_data_loader, objective,2)
#         acc_record[epoch] = [train_acc, valid_acc, test_acc]
#         loss_record[epoch] = [train_loss, valid_loss, test_loss]
#         lr_decay.step(valid_loss.data[0])
        predDict_for, refDict, _ = generate_caption_forward(model_simple.cuda(), valid_data_loader, mode_caption = 'train',\
                                                    threshold = 38)
#         if epoch >= 19: 
#             predDict_back, _, _ = generate_caption_backward(model_simple.cuda(), valid_data_loader, mode_caption = 'train',\
#                                                     threshold = 38)
#             _, score = calc_score(predDict_for, predDict_back,  refDict, mode = 'cider', n=4)
#         else:
        _, score = calc_score(predDict_for, predDict_for, refDict, mode = 'cider', n=4)
        acc = score[0]
        print 'score: %f' % acc
        #if epoch >= 5:
        lr_decay_CNN.step(acc)
        lr_decay_RNN.step(acc)
        
#         if verbose:
#             print 'train loss: %f \t valid loss: %f \t test loss: %f' %(train_loss.data[0], valid_loss.data[0], test_loss.data[0])
#         if (epoch + 1) % 10 == 0:

        if acc > best_acc:
            best_acc = acc            
            best_model = copy.deepcopy(model.state_dict())
            #if epoch >= 19:
#                 if havesaved == False:
#                     havesaved = True
            with open('./model/'+model_num+"/model_state_Adam_CNN.pkl", "wb") as output_file:
                pickle.dump(best_model, output_file)
            print 'best score: %f' % acc
#         if epoch >= 14 and havesaved == False:
#             havesaved = True
#             with open('./model/'+model_num+"/model_state_Adam_CNN.pkl", "wb") as output_file:
#                 pickle.dump(best_model, output_file)
#             print 'train acc: %f \t valid acc: %f \t test acc: %f' %(train_acc.data[0], valid_acc.data[0], test_acc.data[0])
#         if (epoch + 1) % 5 == 0:
#             predDict, refDict = generate_caption(model, valid_data_loader, threshold = 50)
#             score = calc_score(predDict, refDict)
#             lr_decay.step(score)
#             print 'score: %f' %(score)

#             predDict_for, refDict, _ = generate_caption_forward(model, valid_data_loader, mode_caption = 'train',threshold = 50)
#             predDict_back, _, _ = generate_caption_backward(model, valid_data_loader, mode_caption = 'train',threshold = 50)
#             _, score, _ = calc_score(predDict_for, predDict_back, refDict, mode = 'cider', n=4)
#             lr_decay.step(score)
        
#         performance_record = open('./model/'+model_num+"/performance_record.txt", "a")
# #                 performance_record.write('epoch: ' + str(epoch+1)+'\ttrain_loss: '+str(train_loss.data[0])+'\tvalid_loss: '+\
# #                                         str(valid_loss.data[0])+'\t test_loss: '+str(test_loss.data[0])+'\n')
#         performance_record.write('epoch: '+str(epoch+1)+ '\ttrain loss: '+str(train_loss.data[0]) +'\tscore:' + str(score)+'\n')
#         performance_record.close()
#         print 'train loss: % f \tvalid loss: %f \ttest loss: %f' %(train_loss.data[0], 
#                                                                                valid_loss.data[0],
#                                                                                test_loss.data[0]) 
                                                                               #score)
#         if verbose:
#             print '-'*80
#         if epoch != 0 and last_acc >= valid_acc.data[0]:
#         if (epoch + 1) % 5 == 0 and last_acc >= score:


#         if epoch != 0 and last_acc >= score:
#             rise_epoch +=1
#             if rise_epoch == early_stop:
#                 if verbose:
#                     print 'early stop condition met'
#                     time_elapsed = (time.time() - start_time)
#                     print 'time elapsed: %i m %f s' %(time_elapsed//60, time_elapsed%60)
#                 return model, best_model, best_acc, (time.time() - start_time)
#             last_acc = score
# #         elif (epoch+1)%5 == 0 and last_acc < score:
#         elif last_acc < score:
#             rise_epoch = 0
#             if score >= best_acc:
#                 best_acc = score
#                 best_model = copy.deepcopy(model.state_dict())
# #                 with open('./model/'+model_num+"/model_state_SGD.pkl", "wb") as output_file:
# #                     pickle.dump(best_model, output_file)
#             last_acc = score
# #             if (epoch+1)%5 == 0 and score >= best_acc:
# # #                 performance_record = open('./model/'+model_num+"/performance_record.txt", "a")
# # # #                 performance_record.write('epoch: ' + str(epoch+1)+'\ttrain_loss: '+str(train_loss.data[0])+'\tvalid_loss: '+\
# # # #                                         str(valid_loss.data[0])+'\t test_loss: '+str(test_loss.data[0])+'\n')
# # #                 performance_record.write('epoch: '+str(epoch+1)+'\tscore:' + str(score)+'\n')
# # #                 performance_record.close()
# #                 with open('./model/'+model_num+"/model_state_SGD.pkl", "wb") as output_file:
# #                     pickle.dump(best_model, output_file)
#             if verbose:
#                 print '-'*80
# #     performance_record.close()
    time_elapsed = (time.time() - start_time)
    print 'time elapsed: %i m %f s' %(time_elapsed//60, time_elapsed%60)
#     return model, best_model, best_acc, (time.time() - start_time)
    return model

In [11]:
model_simple = Encoder2Decoder()
# with open('./model/'+model_num+"/model_state_Adam.pkl", "rb") as output_file:
#     pre_model = pickle.load(output_file)
# temp = copy.deepcopy(pre_model.state_dict())
# model_simple.load_state_dict(temp)
learnable_param_CNN = list(model_simple.encoder.resnet_conv.parameters())
learnable_param_RNN = list(model_simple.encoder.global_layer.parameters()) + \
                    list(model_simple.encoder.local_layer.parameters()) +\
                    list(model_simple.decoder.parameters())
# learnable_param = list(model_simple.decoder.parameters())
# optimizer = optim.SGD(learnable_param, lr=LR, momentum=MOMENTUM, weight_decay = WEIGHT_DECAY, nesterov = True)
# optimizer = optim.Adadelta(learnable_param, lr=LR, rho=0.9, eps=1e-06, weight_decay=0)
optimizer_CNN = optim.Adam(learnable_param_CNN, lr = LR_CNN, betas = (0.8, 0.99))
optimizer_RNN = optim.Adam(learnable_param_RNN, lr = LR_RNN, betas = (0.8, 0.999))

objective = nn.CrossEntropyLoss(ignore_index = encoder.word_to_idx['^'])

lr_decay_RNN = optim.lr_scheduler.ReduceLROnPlateau(optimizer_RNN, mode='max', factor=0.5, patience=2, 
                                                      verbose=False, threshold=0.0001, threshold_mode='rel', 
                                                      cooldown=0, min_lr=0, eps=1e-08)
lr_decay_CNN = optim.lr_scheduler.ReduceLROnPlateau(optimizer_CNN, mode='max', factor=0.5, patience=2, 
                                                      verbose=False, threshold=0.0001, threshold_mode='rel', 
                                                      cooldown=0, min_lr=0, eps=1e-08)

In [13]:
with open('./model/'+model_num+"/model_state_Adam_CNN.pkl", "rb") as output_file:
    test = pickle.load(output_file)

In [14]:
model_simple.load_state_dict(test)

In [20]:
model = train(model_simple, optimizer_CNN, optimizer_RNN, objective, lr_decay_CNN, lr_decay_RNN, model_num,
                                                                epochs = 100, early_stop = 3, verbose = True)

Training:   0%|          | 0/235 [00:00<?, ?it/s]

epoch 1


Training: 100%|██████████| 235/235 [02:52<00:00,  1.36it/s]
Generating Caption: 100%|██████████| 40/40 [01:13<00:00,  1.83s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 417.16it/s]


score: 0.370216


Training:   0%|          | 0/235 [00:00<?, ?it/s]

best score: 0.370216
epoch 2


Training: 100%|██████████| 235/235 [02:50<00:00,  1.38it/s]
Generating Caption: 100%|██████████| 40/40 [01:10<00:00,  1.75s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 436.53it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.358170
epoch 3


Training: 100%|██████████| 235/235 [02:50<00:00,  1.38it/s]
Generating Caption: 100%|██████████| 40/40 [01:14<00:00,  1.86s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 420.71it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.361851
epoch 4


Training: 100%|██████████| 235/235 [02:48<00:00,  1.40it/s]
Generating Caption: 100%|██████████| 40/40 [01:12<00:00,  1.82s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 400.93it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.365534
epoch 5


Training: 100%|██████████| 235/235 [02:51<00:00,  1.37it/s]
Generating Caption: 100%|██████████| 40/40 [01:12<00:00,  1.82s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 397.72it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.366345
epoch 6


Training: 100%|██████████| 235/235 [02:49<00:00,  1.38it/s]
Generating Caption: 100%|██████████| 40/40 [01:14<00:00,  1.86s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 391.67it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.367622
epoch 7


Training: 100%|██████████| 235/235 [02:48<00:00,  1.39it/s]
Generating Caption: 100%|██████████| 40/40 [01:18<00:00,  1.95s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 377.06it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.357470
epoch 8


Training: 100%|██████████| 235/235 [02:50<00:00,  1.38it/s]
Generating Caption: 100%|██████████| 40/40 [01:16<00:00,  1.92s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 435.78it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.358917
epoch 9


Training: 100%|██████████| 235/235 [02:52<00:00,  1.37it/s]
Generating Caption: 100%|██████████| 40/40 [01:20<00:00,  2.02s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 422.29it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.358995
epoch 10


Training: 100%|██████████| 235/235 [02:51<00:00,  1.37it/s]
Generating Caption: 100%|██████████| 40/40 [01:16<00:00,  1.92s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 383.71it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.350439
epoch 11


Training: 100%|██████████| 235/235 [02:57<00:00,  1.33it/s]
Generating Caption: 100%|██████████| 40/40 [01:24<00:00,  2.10s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 393.81it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.357982
epoch 12


Training: 100%|██████████| 235/235 [02:57<00:00,  1.32it/s]
Generating Caption: 100%|██████████| 40/40 [01:24<00:00,  2.11s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 384.71it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.359846
epoch 13


Training: 100%|██████████| 235/235 [02:56<00:00,  1.33it/s]
Generating Caption: 100%|██████████| 40/40 [01:25<00:00,  2.14s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 384.19it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.362647
epoch 14


Training: 100%|██████████| 235/235 [02:54<00:00,  1.35it/s]
Generating Caption: 100%|██████████| 40/40 [01:19<00:00,  1.98s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 411.68it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.369519
epoch 15


Training: 100%|██████████| 235/235 [02:56<00:00,  1.33it/s]
Generating Caption: 100%|██████████| 40/40 [01:15<00:00,  1.90s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:03<00:00, 321.35it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.366594
epoch 16


Training: 100%|██████████| 235/235 [02:49<00:00,  1.39it/s]
Generating Caption: 100%|██████████| 40/40 [01:17<00:00,  1.94s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 405.34it/s]


score: 0.370980


Training:   0%|          | 0/235 [00:00<?, ?it/s]

best score: 0.370980
epoch 17


Training: 100%|██████████| 235/235 [02:57<00:00,  1.33it/s]
Generating Caption: 100%|██████████| 40/40 [01:17<00:00,  1.95s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 388.86it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.369973
epoch 18


Training: 100%|██████████| 235/235 [02:53<00:00,  1.35it/s]
Generating Caption: 100%|██████████| 40/40 [01:19<00:00,  1.99s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 408.80it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.364654
epoch 19


Training: 100%|██████████| 235/235 [02:53<00:00,  1.35it/s]
Generating Caption: 100%|██████████| 40/40 [01:16<00:00,  1.91s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 375.68it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.366696
epoch 20


Training: 100%|██████████| 235/235 [02:52<00:00,  1.36it/s]
Generating Caption: 100%|██████████| 40/40 [01:18<00:00,  1.95s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 385.38it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.365722
epoch 21


Training: 100%|██████████| 235/235 [02:52<00:00,  1.36it/s]
Generating Caption: 100%|██████████| 40/40 [01:14<00:00,  1.86s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 379.35it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.361400
epoch 22


Training: 100%|██████████| 235/235 [02:52<00:00,  1.36it/s]
Generating Caption: 100%|██████████| 40/40 [01:18<00:00,  1.97s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 394.23it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.362945
epoch 23


Training: 100%|██████████| 235/235 [02:52<00:00,  1.37it/s]
Generating Caption: 100%|██████████| 40/40 [01:16<00:00,  1.91s/it]
Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 366.81it/s]
Training:   0%|          | 0/235 [00:00<?, ?it/s]

score: 0.361866
epoch 24


Training:  34%|███▎      | 79/235 [00:59<01:57,  1.33it/s]Process Process-195:
Process Process-193:
Process Process-196:
Process Process-194:
Traceback (most recent call last):
  File "/opt/conda/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/opt/conda/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/opt/conda/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
    self.run()
    self.run()
  File "/opt/conda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/opt/conda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/opt/conda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/opt/conda/lib/python2.7/multiprocessing/process.py", line 114, in ru

KeyboardInterrupt: 

In [23]:
with open('./model/'+model_num+"/model_state_Adam_CNN.pkl", "rb") as output_file:
    test = pickle.load(output_file)

In [24]:
model_simple = Encoder2Decoder()
model_simple.load_state_dict(test)

In [16]:
predDict_back, refDict, _ = generate_caption_backward(model_simple.cuda(), test_data_loader)

Generating Caption: 100%|██████████| 40/40 [01:15<00:00,  1.89s/it]


In [15]:
final_pred, final_score = calc_score(predDict_for, predDict_back, refDict, mode = 'cider', n=4)

Calculating Score: 100%|██████████| 1000/1000 [00:02<00:00, 339.10it/s]


In [18]:
predDict_back

{'3110649716_c17e14670e.jpg': ['a man admiring paintings on the street'],
 '543007912_23fc735b99.jpg': ['a boy in a striped shirt is standing in front of a teddy bear'],
 '2813033949_e19fa08805.jpg': ['a black and white dog running in the grass'],
 '247704641_d883902277.jpg': ['a black and white dog walking in the water'],
 '2398605966_1d0c9e6a20.jpg': ['two dogs are playing in the snow'],
 '732468337_a37075225e.jpg': ['a little girl is laughing while lying on a bed'],
 '2522297487_57edf117f7.jpg': ['a girl is laughing with a boy in a striped shirt'],
 '2490768374_45d94fc658.jpg': ['a little girl is running outside'],
 '2101457132_69c950bc45.jpg': ['a black dog jumps into the water'],
 '3224227640_31865b3651.jpg': ['two white dogs are fighting in front of a small white dog'],
 '3498997518_c2b16f0a0e.jpg': ['a football player winds up for the ball during a game'],
 '3310067561_b92017acab.jpg': ['a brown dog on a blanket'],
 '3554634863_5f6f616639.jpg': ['many people are gathered on the 

In [17]:
Cider().compute_score(refDict, predDict_back)

(0.38036213839492766,
 array([2.05063122e-01, 2.68171093e-01, 5.39829159e-01, 4.49342502e-01,
        1.62455551e-01, 1.20212339e+00, 5.35798546e-01, 3.72397005e-01,
        1.17734148e+00, 2.53875277e-01, 1.12015905e-01, 1.33900537e-01,
        1.91114218e-01, 9.31161055e-02, 2.40619670e-01, 7.33243253e-01,
        3.21557601e-01, 9.61632772e-02, 5.12470180e-02, 2.85370385e-01,
        1.49564221e-01, 2.62014955e-01, 5.25071217e-01, 6.13965087e-02,
        2.35466800e-03, 2.96720784e-02, 1.08434823e-01, 3.10408095e-01,
        5.43901168e-01, 4.50546197e-03, 3.22448203e-01, 7.20943288e-01,
        5.58068568e-02, 1.90048443e-01, 3.33194457e-01, 2.92407523e-04,
        1.31359819e-01, 1.63644609e-01, 3.62722677e-01, 7.09221960e-04,
        2.15974400e-01, 1.87343098e-01, 7.72811131e-03, 1.23785956e-02,
        1.53353196e-01, 6.05375076e-06, 3.04411263e-01, 3.98162689e-01,
        1.10079427e-01, 1.94985861e-01, 2.91108777e-01, 7.45109325e-01,
        3.72021133e-01, 8.38598914e-03, 4.

In [40]:
refDict


{'124972799_de706b6d0b.jpg': ['a cat under the bench hissing with a dog growling and trying to get it ',
  'a dog toys with a cat hiding under a bench ',
  'a snarling brown and black dog corners a brown longhaired cat under a wooden bench ',
  'a snarling dog approaches a hissing cat ',
  'black and brown dog growling at brown cat hiding under wooden bench '],
 '2574194729_1f099647ee.jpg': ['a light colored dog runs on a narrow path ',
  'a pale dog runs down a path ',
  'a white dog is running down a path between bushes ',
  'a white dog rushes down a dirt path surrounded by grass and trees ',
  'yellow dog running through bushes on path '],
 '3429351222_17ae744daf.jpg': ['a man and a woman walk at a flea market ',
  'an asian couple walks through a market ',
  'a young couple walking together through an outdoor market ',
  'a young man and woman are walking along an outdoor market ',
  'people standing and walking in a flea market '],
 '2978409165_acc4f29a40.jpg': ['a man in a blue 

In [19]:
with open('test_max_predDict_bi_back.p', 'wb') as f:
    pickle.dump(predDict_back, f)

In [121]:
test_CNN = AttentiveCNN().cuda()
test_decoder = Decoder().cuda()

In [169]:
img, x,y,length, ID = next(iter(train_data_loader))

In [21]:
pred_forward = predDict_for
pred_backward = predDict_back
ref = refDict
mode = 'cider'
n = 4
if mode == 'cider':
    final_pred = {}
    score_record = []
    for img_ID in tqdm(pred_forward.keys(), desc = 'Calculating Score'):
        temp_ref = {}
        temp_for, temp_back = {}, {}
        temp_ref[img_ID] = ref[img_ID]
        temp_for[img_ID], temp_back[img_ID] = pred_forward[img_ID], pred_backward[img_ID]
        score_for, _ = Cider(n = n).compute_score(temp_ref, temp_for)
        score_back, _ = Cider(n = n).compute_score(temp_ref, temp_back)
        if score_for >= score_back:
            final_pred[img_ID] = pred_forward[img_ID]
        else:
            final_pred[img_ID] = pred_backward[img_ID]
    final_score = Cider(n = n).compute_score(ref, final_pred)

NameError: name 'predDict_for' is not defined

In [135]:
# captions = x
captions_rev = torch.zeros((x.size()))
for i, cap_temp in enumerate(x):
    inv_index = torch.arange(length[i]-1,-1,-1,out = torch.LongTensor()) 
    captions_rev[i,:length[i]] = cap_temp.unsqueeze(0).unsqueeze(2)[:,inv_index,:]#[0].view(length[i])
#     if length.numpy()[i] != PAD_LENGTH+2:
#         captions_rev[i,length[i]:] = captions.data[i,length[i]:]

captions_rev = Variable(captions_rev)

In [136]:
y_rev = torch.zeros(y.size())
for i,y_temp in enumerate(y):
    y_rev[i,length[i]-2] = x[i,0]
    inv_index = torch.arange(length[i]-3,-1,-1,out = torch.LongTensor()) 
    y_rev[i,:length[i]-2] = y_temp[inv_index]
    y_rev[i,length[i]-1:] = y[i,length[i]-1:]

In [137]:
encoder.decode(captions_rev.data[0])

'<end> snow the in dog white a after running is dog black a <start> raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining raining'

In [32]:
threshold = 50
model_simple.eval()
predDict = {}
refDict = {}
Beta = {}
imgSet = set()
prime = torch.LongTensor(encoder.encode('<start>'))
img, x, y, length, ID = next(iter(train_data_loader))
img = Variable(img.cuda())
V, v_g = model_simple.encoder(img)
batch = 
V_t, v_g_t = V[batch].unsqueeze(0), v_g[batch].unsqueeze(0)
states = None
temp_result = []
temp_beta = []
end_found = False
inp = Variable(prime.cuda(), volatile = True).unsqueeze(0)
current = 0
if ID[batch] not in imgSet:
    while current <= threshold:
        scores_for,scores_rev, states, atten_weights, beta = model_simple.decoder( V_t, v_g_t, inp, length, states, 
                                                              mode = 'generate')
        _, pred_y = torch.max(F.softmax(scores_for[0]),1)
        if pred_y.data[0] == encoder.encode('<end>')[0]:
            end_found = True
            break
        current += 1
        pred_y_word = encoder.decode([pred_y.data[0]])
        temp_result.append(pred_y_word)
        inp = pred_y.unsqueeze(0)
        temp_beta.append(beta)
#     imgSet.add(ID[batch])
#     temp_result = temp_result[::-1]
#     predDict[ID[batch]] = [' '.join(temp_result)]
#     Beta[ID[batch]] = temp_beta
#     refDict[ID[batch]] = all_captions[ID[batch]]

In [33]:
temp_result

['a',
 'man',
 'in',
 'a',
 'black',
 'shirt',
 'and',
 'a',
 'woman',
 'in',
 'a',
 'black',
 'shirt',
 'with',
 'a',
 'white',
 'dog',
 'in',
 'a',
 'grassy',
 'field']

In [35]:
encoder.decode(x[100])

'<start> a boy in a red shirt in front of a long blue wall raises his eyebrow at the camera <end> ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^'

In [82]:
torch.zeros((x.size())).type(torch.cuda.LongTensor)


    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.cuda.LongTensor of size 128x38 (GPU 0)]

In [89]:
img, x, y, length, ID = next(iter(train_data_loader))

In [90]:
x.size()

torch.Size([128, 38])

In [65]:
model = model.cuda()
score_for, score_back = model.forward(Variable(img.cuda()), Variable(x.cuda()),length)

In [67]:
temp_score = score_for.view((score_for.size()[0]*score_for.size()[1],score_for.size()[-1]))


In [97]:
 _, pred_y = torch.max(F.softmax(score_for[2]),1)
encoder.decode(pred_y.data)

'a dogs are each other in the grass field <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>'

In [121]:
encoder.decode(x[0])

'<start> a black dog is running after a white dog in the snow <end> ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^'

In [118]:
img, x, y, length, ID = next(iter(train_data_loader))
#reverse label
y_rev = torch.zeros(y.size()).type(torch.LongTensor)
for i,y_temp in enumerate(y):
    y_rev[i,length[i]-2] = x[i,0]
    inv_index = torch.arange(length[i]-3,-1,-1,out = torch.LongTensor()) 
    y_rev[i,:length[i]-2] = y_temp[inv_index]
    if length.numpy()[i] != PAD_LENGTH+2:
        y_rev[i,length[i]-1:] = y[i,length[i]-1:]

In [119]:
encoder.decode(y_rev[0])

'snow the in dog white a after running is dog black a <start> ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^'

In [145]:
threshold = 50
model.eval()
predDict = {}
refDict = {}
Beta = {}
imgSet = set()
prime = torch.LongTensor(encoder.encode('<start>'))
img, x, y, length, ID = next(iter(train_data_loader))
img = Variable(img.cuda())
V, v_g = model.encoder(img)
batch = 0
V_t, v_g_t = V[batch].unsqueeze(0), v_g[batch].unsqueeze(0)
states = None
temp_result = []
temp_beta = []
end_found = False
inp = Variable(prime.cuda(), volatile = True).unsqueeze(0)
current = 0
if ID[batch] not in imgSet:
    while current <= threshold:
        scores_for,scores_rev, states, atten_weights, beta = model.decoder( V_t, v_g_t, inp, length, states, 
                                                              mode = 'generate')
        _, pred_y = torch.max(F.softmax(scores_for[0]),1)
        if pred_y.data[0] == encoder.encode('<end>')[0]:
            end_found = True
#             break
        current += 1
        pred_y_word = encoder.decode([pred_y.data[0]])
        temp_result.append(pred_y_word)
        inp = pred_y.unsqueeze(0)
        temp_beta.append(beta)
#     imgSet.add(ID[batch])
#     temp_result = temp_result[::-1]
#     predDict[ID[batch]] = [' '.join(temp_result)]
#     Beta[ID[batch]] = temp_beta
#     refDict[ID[batch]] = all_captions[ID[batch]]

In [146]:
temp_result

['a',
 'man',
 'in',
 'a',
 'red',
 'shirt',
 'and',
 'a',
 'black',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt',
 'and',
 'white',
 'shirt']