In [24]:
from __future__ import division
import time
import torch 
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import cv2 
from util import *
import argparse
import glob
import os 
import os.path as osp
from darknet import Darknet
from preprocess import prep_image, inp_to_image
import pandas as pd
import random 
import pickle as pkl
import itertools
from base64 import b64encode
from os import makedirs
from os.path import join, basename
import json
import requests
import nltk
import pickle
from collections import Counter
from pycocotools.coco import COCO
from PIL import Image
import matplotlib.pyplot as plt
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
import torch.utils.data as data
import torchvision.transforms as transforms
%matplotlib inline


In [2]:
br=pd.read_csv("brands.csv")
brands=br['BRAND NAME']
brands=brands.tolist()

In [3]:
class Vocabulary(object):
    """Simple vocabulary wrapper."""
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (i+1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i+1, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

In [4]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        
    def forward(self, images):
        """Extract feature vectors from input images."""
        with torch.no_grad():
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length
        
    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True) 
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])
        return outputs
    
    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seg_length):
            hiddens, states = self.lstm(inputs, states)          # hiddens: (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))            # outputs:  (batch_size, vocab_size)
            _, predicted = outputs.max(1)                        # predicted: (batch_size)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)                       # inputs: (batch_size, embed_size)
            inputs = inputs.unsqueeze(1)                         # inputs: (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)                # sampled_ids: (batch_size, max_seq_length)
        return sampled_ids

In [5]:
def load_image(image_path, transform=None):
    image = Image.open(image_path)
    image = image.resize([224, 224], Image.LANCZOS)
    
    if transform is not None:
        image = transform(image).unsqueeze(0)
    
    return image

In [6]:

encoder_path='encoder-5-6000.ckpt'
decoder_path='decoder-5-6000.ckpt'


In [7]:
   
images = "test_data"
det ="dett"
bs = 1
confidence = 0.5
nms_thresh =0.4
cfgfile= "cfg/yolov3.cfg"
weightsfile= "yolov3.weights"
reso= "416"
scales= "1,2,3"
caption_path='data/annotations/captions_train2014.json'
vocab_path='vocab.pkl'
threshold=4

image_dir = 'data/train2014/'
output_dir = 'data/resized2014/'
resize_images= [256,256]
model_path='models/'
crop_size=224
log_step=10
save_step=1000
embed_size=256
hidden_size=512
num_layers=1
num_epochs=5
batch_size=64
num_workers=1
learning_rate=0.001

    

In [8]:
ENDPOINT_URL = 'https://vision.googleapis.com/v1/images:annotate'
RESULTS_DIR = 'jsons'
makedirs(RESULTS_DIR, exist_ok=True)


def make_image_data_list(image_filenames):
    """
    image_filenames is a list of filename strings
    Returns a list of dicts formatted as the Vision API
        needs them to be
    """
    img_requests = []
    for imgname in image_filenames:
        print(image_filenames)
        with open(imgname, 'rb') as f:
            ctxt = b64encode(f.read()).decode()
            img_requests.append({
                    'image': {'content': ctxt},
                    'features': [{
                        'type': 'TEXT_DETECTION',
                        'maxResults': 1
                    }]
            })
    return img_requests

def make_image_data(image_filenames):
    """Returns the image data lists as bytes"""
    imgdict = make_image_data_list(image_filenames)
    return json.dumps({"requests": imgdict }).encode()


def request_ocr(api_key,image_filenames):
    response = requests.post(ENDPOINT_URL,
                             data=make_image_data(image_filenames),
                             params={'key': api_key},
                             headers={'Content-Type': 'application/json'})
    return response



api_key = 'AIzaSyCnCILAt_jNYdX4Yeac4s9dXl0NLH2W6f8'

In [16]:
class test_net(nn.Module):
    def __init__(self, num_layers, input_size):
        super(test_net, self).__init__()
        self.num_layers= num_layers
        self.linear_1 = nn.Linear(input_size, 5)
        self.middle = nn.ModuleList([nn.Linear(5,5) for x in range(num_layers)])
        self.output = nn.Linear(5,2)
    
    def forward(self, x):
        x = x.view(-1)
        fwd = nn.Sequential(self.linear_1, *self.middle, self.output)
        return fwd(x)
        
def get_test_input(input_dim, CUDA):
    img = cv2.imread("dog-cycle-car.png")
    img = cv2.resize(img, (input_dim, input_dim)) 
    img_ =  img[:,:,::-1].transpose((2,0,1))
    img_ = img_[np.newaxis,:,:,:]/255.0
    img_ = torch.from_numpy(img_).float()
    img_ = Variable(img_)
    
    if CUDA:
        img_ = img_.cuda()
    num_classes
    return img_




    
    
    

    


   

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:

batch_size = int(bs)
confidence = float(confidence)
nms_thesh = float(nms_thresh)
start = 0
CUDA = torch.cuda.is_available()
num_classes = 80
classes = load_classes('data/coco.names') 

#Set up the neural network
print("Loading network.....")
model = Darknet(cfgfile)
model.load_weights(weightsfile)
print("Network successfully loaded")
 
model.net_info["height"] = reso
inp_dim = int(model.net_info["height"])
assert inp_dim % 32 == 0 
assert inp_dim > 32
#If there's a GPU availible, put the model on GPU
if CUDA:
    model.cuda()
 
    
#Set the model in evaluation mode
model.eval()
   
read_dir = time.time()
#Detection phase
try:
    imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images) if os.path.splitext(img)[1] == '.png' or os.path.splitext(img)[1] =='.jpeg' or os.path.splitext(img)[1] =='.jpg']
except NotADirectoryError:
    imlist = []
    imlist.append(osp.join(osp.realpath('.'), images))
except FileNotFoundError:
    print ("No file or directory with the name {}".format(images))
    exit()
       
if not os.path.exists(det):
    os.makedirs(det)
     
load_batch = time.time()
    
batches = list(map(prep_image, imlist, [inp_dim for x in range(len(imlist))]))
im_batches = [x[0] for x in batches]
orig_ims = [x[1] for x in batches]
im_dim_list = [x[2] for x in batches]
im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2)
   
    
    
if CUDA:
    im_dim_list = im_dim_list.cuda()
    
leftover = 0
    
if (len(im_dim_list) % batch_size):
    leftover = 1
       
        
if batch_size != 1:
    num_batches = len(imlist) // batch_size + leftover            
    im_batches = [torch.cat((im_batches[i*batch_size : min((i +  1)*batch_size,
                        len(im_batches))]))  for i in range(num_batches)]        


i = 0
    

write = False
model(get_test_input(inp_dim, CUDA), CUDA)
    
start_det_loop = time.time()
    
objs = {}
    
    
    
for batch in im_batches:
        #load the image 
    start = time.time()
    if CUDA:
        batch = batch.cuda()
        

        #Apply offsets to the result predictions
        #Tranform the predictions as described in the YOLO paper
        #flatten the prediction vector 
        # B x (bbox cord x no. of anchors) x grid_w x grid_h --> B x bbox x (all the boxes) 
        # Put every proposed box as a row.
    with torch.no_grad():
        prediction = model(Variable(batch), CUDA)
        
#        prediction = prediction[:,scale_indices]

        
        #get the boxes with object confidence > threshold
        #Convert the cordinates to absolute coordinates
        #perform NMS on these boxes, and save the results 
        #I could have done NMS and saving seperately to have a better abstraction
        #But both these operations require looping, hence 
        #clubbing these ops in one loop instead of two. 
        #loops are slower than vectorised operations. 
        
    prediction = write_results(prediction, confidence, num_classes, nms = True, nms_conf = nms_thesh)
        
        
    if type(prediction) == int:
        i += 1
        continue

    end = time.time()
        
                    
#        print(end - start)

            

    prediction[:,0] += i*batch_size
        
    
            
          
    if not write:
        output = prediction
        write = 1
    else:
        output = torch.cat((output,prediction))
           
        
        

    for im_num, image in enumerate(imlist[i*batch_size: min((i +  1)*batch_size, len(imlist))]):
        transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)
        

        brandname=""
        image_filenames=[image]
        if not api_key or not image_filenames:
            print("""
                Please supply an api key, then one or more image filenames
                $ python cloudvisreq.py api_key image1.jpg image2.png""")
        else:
            response = request_ocr(api_key, image_filenames)
            if response.status_code != 200 or response.json().get('error'):
                print(response.text)
            else:
                for idx, resp in enumerate(response.json()['responses']):
           

                # print the plaintext to screen for convenience
                    print("---------------------------------------------")
                    if bool(resp):
                        t = resp['textAnnotations'][0]
                        text=t['description'].split()
                        text=[x.lower() for x in text]
                        print(text)
                        for words in text:
                            for items in brands:
                                if len(words)>=4 and words in items.lower():
                                    brandname=items
                
        im_id = i*batch_size + im_num
        objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]
        print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size))
        print("{0:20s}"+brandname+ " {1:s}".format("Objects Detected:", " ".join(objs)))
        print("----------------------------------------------------------")
    i += 1
    
    if CUDA:
        torch.cuda.synchronize()
    
    try:
        output
    except NameError:
        print("No detections were made")
        exit()
        
im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long())
    
scaling_factor = torch.min(inp_dim/im_dim_list,1)[0].view(-1,1)
    
    
output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2
output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2
    
    
    
output[:,1:5] /= scaling_factor
    
for i in range(output.shape[0]):
    output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0])
    output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1])
        
        
output_recast = time.time()
    
    
class_load = time.time()

colors = pkl.load(open("pallete", "rb"))
    
    
draw = time.time()


def write(x, batches, results):
    c1 = tuple(x[1:3].int())
    c2 = tuple(x[3:5].int())
    img = results[int(x[0])]
    cls = int(x[-1])
    label = "{0}".format(classes[cls])
    color = random.choice(colors)
    cv2.rectangle(img, c1, c2,color, 1)
    t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0]
    c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
    cv2.rectangle(img, c1, c2,color, -1)
    cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1)
    return img
    
            
list(map(lambda x: write(x, im_batches, orig_ims), output))
      
det_names = pd.Series(imlist).apply(lambda x: "{}/det_{}".format(det,x.split("/")[-1]))
    
list(map(cv2.imwrite, det_names, orig_ims))
    
end = time.time()
    
print()
print("SUMMARY")
print("----------------------------------------------------------")
print("{:25s}: {}".format("Task", "Time Taken (in seconds)"))
print()
print("{:25s}: {:2.3f}".format("Reading addresses", load_batch - read_dir))
print("{:25s}: {:2.3f}".format("Loading batch", start_det_loop - load_batch))
print("{:25s}: {:2.3f}".format("Detection (" + str(len(imlist)) +  " images)", output_recast - start_det_loop))
print("{:25s}: {:2.3f}".format("Output Processing", class_load - output_recast))
print("{:25s}: {:2.3f}".format("Drawing Boxes", end - draw))
print("{:25s}: {:2.3f}".format("Average time_per_img", (end - load_batch)/len(imlist)))
print("----------------------------------------------------------")

    
torch.cuda.empty_cache()
    
    
        
        
    
    


Loading network.....
Network successfully loaded
['/home/nandwani_vaibhav/pytorch-yolo-v3/test_data/3.jpg']
---------------------------------------------
['olla!', 'er', 'estb.', 'lock', '1870']
3.jpg                predicted in  0.091 seconds
{0:20s} clock
----------------------------------------------------------
['/home/nandwani_vaibhav/pytorch-yolo-v3/test_data/11.jpg']
---------------------------------------------
11.jpg               predicted in  0.084 seconds
{0:20s} handbag suitcase
----------------------------------------------------------
['/home/nandwani_vaibhav/pytorch-yolo-v3/test_data/14.jpg']
---------------------------------------------
14.jpg               predicted in  0.084 seconds
{0:20s} cake
----------------------------------------------------------
['/home/nandwani_vaibhav/pytorch-yolo-v3/test_data/1.jpg']
---------------------------------------------
1.jpg                predicted in  0.084 seconds
{0:20s} keyboard
----------------------------------------------

In [None]:
try:
            encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
            decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
            encoder = encoder.to(device)
            decoder = decoder.to(device)

            encoder.load_state_dict(torch.load(encoder_path))
            decoder.load_state_dict(torch.load(decoder_path))
            from IPython.display import Image as pic
            pic(image)

            print("image:"+image)
            image2 = load_image(image, transform)
            image_tensor = image2.to(device)
   

            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
  

            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
            sentence = ' '.join(sampled_caption)
    
# Print out the image and the generated caption
            print ("sentence:"+sentence)
        except:
            print("No description")
            
        



In [31]:
transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)
# Build models
for images in glob.glob("test_dkata"):
    print(images)
    encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

# Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

# Prepare an image
    image = load_image(images, transform)
    image_tensor = image.to(device)
   
# Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
  
# Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
# Print out the image and the generated caption
    print (sentence)
