## predict

In [None]:
#encoding:utf-8
#
#created by xiongzihua
#
import torch
from torch.autograd import Variable
import torch.nn as nn

from net import vgg16, vgg16_bn
from resnet_yolo import resnet50
import torchvision.transforms as transforms
import cv2
import numpy as np

# Object category names. A name's position in this tuple is the class index:
# predict_gpu() maps a predicted index to its name via VOC_CLASSES[cls_index].
VOC_CLASSES = (
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
)

# Drawing colours, one 3-channel triple per row (21 rows in total).
# The demo script picks a row with Color[VOC_CLASSES.index(name)], i.e. rows
# 0..19, so the last row is never reached by that lookup.
# NOTE(review): the table starts with black and has one more row than there
# are classes - possibly row 0 was meant for "background"; confirm whether
# the lookup should be offset by one.
Color = [
    [0, 0, 0], [128, 0, 0], [0, 128, 0],
    [128, 128, 0], [0, 0, 128], [128, 0, 128],
    [0, 128, 128], [128, 128, 128], [64, 0, 0],
    [192, 0, 0], [64, 128, 0], [192, 128, 0],
    [64, 0, 128], [192, 0, 128], [64, 128, 128],
    [192, 128, 128], [0, 64, 0], [128, 64, 0],
    [0, 192, 0], [128, 192, 0], [0, 64, 128],
]

def decoder(pred):
    '''
    Decode the raw network output for a single image into detections.

    Every grid cell carries two boxes laid out as [cx, cy, w, h, confidence]
    in channels 0-4 and 5-9, followed by the class scores in channels 10+.
    A box is a candidate when its confidence is above 0.1 or equals the
    largest confidence of the whole map (so the best box is always
    considered). A candidate is kept when confidence * best class score is
    above 0.1. The kept boxes are finally filtered with nms().

    pred (tensor) 1xSxSx30; a leading singleton batch dimension is squeezed
        away and the grid size S is read from the tensor itself.
        The input tensor is not modified.
    return (tuple of tensors)
        boxes      [n,4] as [x1,y1,x2,y2], expressed as fractions of the
                   image width/height,
        cls_indexs [n,]  index of the best class for each box,
        probs      [n,]  confidence * best class score for each box.
        When nothing passes the thresholds a single all-zero placeholder box
        with class index 0 and score 0 is returned.
    '''
    pred = pred.data
    pred = pred.squeeze(0)  # SxSx30
    # Read the grid size from the prediction instead of hard-coding it.
    grid_num = pred.size(0)
    cell_size = 1. / grid_num

    boxes = []
    cls_indexs = []
    probs = []

    # Confidence of the two boxes predicted by every cell: (S,S,2).
    contain = torch.stack((pred[:, :, 4], pred[:, :, 9]), 2)
    # Candidates: confidence above the threshold, or equal to the global
    # maximum so that the single best box is always examined.
    mask = (contain > 0.1) | (contain == contain.max())

    for i in range(grid_num):
        for j in range(grid_num):
            for b in range(2):
                if not mask[i, j, b]:
                    continue
                # Clone so the arithmetic below never writes back into pred.
                box = pred[i, j, b*5:b*5+4].clone()
                contain_prob = pred[i, j, b*5+4]
                # Top-left corner of cell <i,j>; x follows the column index
                # j and y follows the row index i.
                cell_origin = torch.FloatTensor([j, i]) * cell_size
                # The predicted centre is an offset inside the cell; scale it
                # to the cell size and shift by the cell origin.
                center = box[:2] * cell_size + cell_origin
                half_wh = 0.5 * box[2:]
                # [cx,cy,w,h] -> [x1,y1,x2,y2]
                box_xy = torch.cat((center - half_wh, center + half_wh)).float()
                # Best class for this cell and the resulting class score.
                max_prob, cls_index = torch.max(pred[i, j, 10:], 0)
                score = contain_prob * max_prob
                if float(score) > 0.1:
                    boxes.append(box_xy.view(1, 4))
                    # view(1) gives every entry one dimension so that
                    # torch.cat below accepts them (0-dim tensors cannot be
                    # concatenated).
                    cls_indexs.append(cls_index.view(1))
                    probs.append(score.view(1))

    if len(boxes) == 0:
        # Placeholder result so callers always receive at least one row.
        boxes = torch.zeros((1, 4))
        probs = torch.zeros(1)
        cls_indexs = torch.zeros(1).long()
    else:
        boxes = torch.cat(boxes, 0)  # (n,4)
        probs = torch.cat(probs, 0)  # (n,)
        cls_indexs = torch.cat(cls_indexs, 0)  # (n,)
    keep = nms(boxes, probs)
    return boxes[keep], cls_indexs[keep], probs[keep]

def nms(bboxes,scores,threshold=0.5):
    '''
    Greedy non-maximum suppression.

    Procedure:
    1> compute the area of every box;
    2> sort the boxes by score, highest first;
    3> repeatedly take the best remaining box, keep it, and discard every
       remaining box whose IoU with it is greater than `threshold`;
    4> return the indices of the kept boxes, in the order they were picked.

    bboxes (tensor) [N,4] as [x1,y1,x2,y2]
    scores (tensor) [N,]
    threshold (float) boxes with IoU <= threshold against the picked box
        survive to the next round
    return (LongTensor) [K,] indices into bboxes/scores of the kept boxes
    '''
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    # Indices of the boxes ordered from highest to lowest score.
    _, order = scores.sort(0, descending=True)
    keep = []
    while order.numel() > 0:
        # Plain int so that `keep` ends up as a list of Python integers.
        i = int(order[0])
        keep.append(i)

        if order.numel() == 1:
            break

        rest = order[1:]
        # Corners of the intersection between box i and each remaining box.
        # The bounds are passed as Python floats because not every PyTorch
        # release accepts a tensor as the clamp limit.
        xx1 = x1[rest].clamp(min=float(x1[i]))
        yy1 = y1[rest].clamp(min=float(y1[i]))
        xx2 = x2[rest].clamp(max=float(x2[i]))
        yy2 = y2[rest].clamp(max=float(y2[i]))

        # Non-overlapping boxes give a negative extent; clamp it to zero.
        w = (xx2 - xx1).clamp(min=0)
        h = (yy2 - yy1).clamp(min=0)
        inter = w * h

        # Intersection over union of box i with each remaining box.
        ovr = inter / (areas[i] + areas[rest] - inter)
        # squeeze(1) keeps the result 1-D even when exactly one box survives;
        # a bare squeeze() would collapse it to a 0-dim tensor and break the
        # order[0] lookup on the next iteration.
        ids = (ovr <= threshold).nonzero().squeeze(1)
        if ids.numel() == 0:
            break
        # Carry only the survivors into the next round.
        order = rest[ids]
    return torch.LongTensor(keep)
#
# run prediction on a single image
#
def predict_gpu(model,image_name,root_path=''):
    '''
    Run the detector on one image file on the GPU and return its detections.

    model: network that is called on a 1x3x448x448 CUDA tensor and whose
        output is accepted by decoder()
    image_name (str): file name of the image; it is also copied into every
        returned entry
    root_path (str): prefix that is string-concatenated in front of
        image_name (include the trailing separator yourself)
    return (list): one entry per detection,
        [(x1,y1), (x2,y2), class_name, image_name, prob]
        with integer pixel coordinates in the original image
    raises OSError: if the image cannot be read
    '''
    image = cv2.imread(root_path+image_name)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('cannot read image: ' + root_path + image_name)
    h,w,_ = image.shape

    # Resize to the network input size and switch from OpenCV's BGR to RGB.
    img = cv2.resize(image,(448,448))
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    mean = (123,117,104)#RGB
    # Subtract the per-channel mean; the result is a float32 array.
    img = img - np.array(mean,dtype=np.float32)

    transform = transforms.Compose([transforms.ToTensor(),])
    img = transform(img)

    # no_grad replaces the old Variable(..., volatile=True), which current
    # PyTorch ignores; without it autograd state is recorded during inference.
    with torch.no_grad():
        pred = model(img[None,:,:,:].cuda()) #1xSxSx30
    pred = pred.cpu()
    boxes,cls_indexs,probs = decoder(pred)

    result = []
    for box,cls_index,prob in zip(boxes,cls_indexs,probs):
        # Boxes are fractions of the image size; scale back to pixels.
        x1 = int(box[0]*w)
        x2 = int(box[2]*w)
        y1 = int(box[1]*h)
        y2 = int(box[3]*h)
        result.append([(x1,y1),(x2,y2),VOC_CLASSES[int(cls_index)],image_name,float(prob)])
    return result
        



if __name__ == '__main__':
    # Build the network, restore the trained weights and move it to the GPU
    # in inference mode.
    model = resnet50()
    print('load model...')
    model.load_state_dict(torch.load('best.pth'))
    model.eval()
    model.cuda()

    # Detect objects in the demo image; a second copy is loaded here purely
    # for drawing.
    image_name = 'dog.jpg'
    image = cv2.imread(image_name)
    print('predicting...')
    result = predict_gpu(model,image_name)

    font = cv2.FONT_HERSHEY_SIMPLEX
    for (x_left, y_top), corner_br, cls_name, _, score in result:
        box_color = Color[VOC_CLASSES.index(cls_name)]
        # Outline of the detected object.
        cv2.rectangle(image, (x_left, y_top), corner_br, box_color, 2)

        # Caption: class name immediately followed by the rounded score.
        caption = cls_name + str(round(score, 2))
        (text_w, text_h), base = cv2.getTextSize(caption, font, 0.4, 1)
        text_top = y_top - text_h

        # Filled rectangle behind the caption, then the caption in white.
        bg_from = (x_left - 1, text_top - 2 - base)
        bg_to = (x_left + text_w, text_top + text_h)
        cv2.rectangle(image, bg_from, bg_to, box_color, -1)
        cv2.putText(image, caption, (x_left, text_top + base), font, 0.4, (255,255,255), 1, 8)

    cv2.imwrite('result.jpg',image)




