In [None]:
import sys
import os
import numpy as np
from numpy import loadtxt
import cv2

# Make the local pycaffe and liblinear Python bindings importable by putting
# their directories at the front of sys.path.
# NOTE(review): these are absolute, machine-specific paths; adjust them to
# your own caffe / liblinear checkout before running.
caffe_root = '/users/vijay.kumar/caffe/'
sys.path.insert(0, caffe_root + 'python')
sys.path.insert(0, '/users/vijay.kumar/tools/liblinear-2.1/python')

import caffe
from caffe import layers as L
from liblinearutil import *
from utils import *

# Run caffe computations on the GPU.
caffe.set_mode_gpu()

In [None]:
# Load the experiment parameters; later cells read params['FEATSIZE'] and pass
# `params` to the training and prediction helpers.
# read_params is not defined in this notebook (presumably it comes from
# `from utils import *` -- confirm).
params = read_params()

In [None]:
# Initialize the caffe transformer; it is passed to the feature-extraction and
# pose-weight helpers in the cells below.
transformer = define_transformer()

# Load the pose-specific nets (`nets`, one entry per model) and the pose
# estimator network (`pose_net`).
nets, pose_net = load_models()

In [None]:
import PIL.Image
from cStringIO import StringIO
import IPython.display

def showarray(a, fmt='png'):
    """Render an array inline in the notebook as an image.

    The array is cast to uint8, encoded by PIL in the format `fmt` into an
    in-memory buffer, and the encoded bytes are shown through IPython's
    rich display.
    """
    buf = StringIO()
    pixels = np.uint8(a)
    picture = PIL.Image.fromarray(pixels)
    picture.save(buf, fmt)
    encoded = buf.getvalue()
    IPython.display.display(IPython.display.Image(data=encoded))

In [None]:
# IMDB labeled training set: one annotation file per actor, named
# '<actor_id>.txt', and one image sub-directory per actor.
imdb_imgs_dir = '../data/imdb/images/'
imdb_annot_dir = '../data/imdb/annot/'

# Keep only the '.txt' annotation files: os.listdir may also return hidden or
# stray entries, which would break the int() parse below. Sort them because
# os.listdir returns entries in arbitrary order, which would otherwise make
# the training-set order differ between machines/runs.
imdb_annot_files = sorted(_f for _f in os.listdir(imdb_annot_dir) if _f.endswith('.txt'))

# The actor id is the part of the file name before the first '.'.
labeled_actors = [int(_f.split('.')[0]) for _f in imdb_annot_files]

In [None]:
def _read_imdb_annotations(annot_path):
    """Load one comma-separated annotation file as a 2-D float array.

    Each row holds five values: the image number followed by the four
    head-box values. numpy loads a single-row file as a 1-D array, so that
    case is reshaped to (1, 5) to keep row iteration uniform.
    """
    annotations = loadtxt(annot_path, dtype=float, delimiter=',')
    if annotations.ndim == 1:
        annotations = annotations.reshape((1, 5))
    return annotations


num_models = len(nets)

# Read every actor's annotation file exactly once, with the same loader for
# both sizing and extraction, so the two can never disagree.
actor_annotations = [(la, _read_imdb_annotations(imdb_annot_dir + str(la) + '.txt'))
                     for la in labeled_actors]

# Total number of annotated samples, used to pre-allocate the train set.
num_examples = sum(len(_annots) for _la, _annots in actor_annotations)

# initialize train set
train_features = np.zeros((num_examples, num_models, params['FEATSIZE']))
train_labels = np.zeros((num_examples, 1))

# extract features
count = 0

for la, annotations in actor_annotations:
    print('Extracting features for subject: %s' % la)

    for annot_sample in annotations:
        img_name = imdb_imgs_dir + str(la) + '/' + str(int(annot_sample[0])) + '.jpg'
        image = cv2.imread(img_name)
        if image is None:
            # cv2.imread does not raise on a missing/corrupt file, it returns
            # None; skip the sample instead of crashing in the cropping helper.
            print('Skipping unreadable image: %s' % img_name)
            continue

        hbox = annot_sample[1:]

        head = get_region_imdb(image, hbox, 'HEAD')
        upper_body = get_region_imdb(image, hbox, 'UB')

        # get psm features
        train_features[count] = get_pose_features(transformer, nets, head, upper_body, num_models, params['FEATSIZE'])
        train_labels[count] = la

        count = count + 1

# Drop the unused tail rows left behind by skipped samples.
train_features = train_features[:count]
train_labels = train_labels[:count]

In [None]:
# Classifier training on the extracted features.
# train_labels was allocated with shape (N, 1), so it is squeezed to a 1-D
# label vector before being handed to the trainer.
# NOTE(review): presumably this trains classifiers per pose model (num_models
# is passed in) -- confirm in train_linear_classifiers.
classifiers = train_linear_classifiers(train_features, np.squeeze(train_labels), num_models, params)

## TESTING

In [None]:
# Per-face annotations. Later cells read column 0 as the frame number,
# columns 1-4 as the face box and column 5 as the track id.
faces = loadtxt('../data/hannah/annotations/hannah_video_faces_.txt')

# Track table: only the first two columns are loaded, after skipping the two
# leading rows of the file.
track_char_map = loadtxt('../data/hannah/annotations/hannah_video_tracks.txt', usecols=(0,1), skiprows=2)

# Lookup from track id (column 0) to character id (column 1).
track_char_dict_map = dict(zip(track_char_map[:,0], track_char_map[:,1]))

In [None]:
# movie is split into four small 1GB videos..
video_names = ['../data/hannah/videos/hannah-001_1_37784.vob', '../data/hannah/videos/hannah-002_37785_75444.vob', 
             '../data/hannah/videos/hannah-003_75445_115473.vob','../data/hannah/videos/hannah-004_115474_153475.vob']

# video_frame_split holds cumulative frame counts: entry k is the number of
# frames in all segments before segment k, so entry 0 is 0 and the last entry
# is the length of the whole movie.
video_frame_split = np.zeros((len(video_names) + 1,))
for i in range(len(video_names)):
    cap = cv2.VideoCapture(video_names[i])
    video_frame_split[i+1] = video_frame_split[i] + cap.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT)
    cap.release()
print(video_frame_split)
# The boundaries are already cumulative, so the total is the last boundary;
# summing the boundaries (as was done before) over-counts the frames.
print('total_frames: %s' % video_frame_split[-1])

In [None]:
def get_frame_no(fno, vf_split, vid_objs):
    """Map a movie-wide frame number to (video object, frame number in it).

    Parameters
    ----------
    fno : number
        Frame number counted over the whole movie.
    vf_split : sequence of numbers
        Cumulative frame boundaries; segment ``i`` covers the frame numbers
        ``vf_split[i] < fno <= vf_split[i+1]``.
    vid_objs : indexable
        Per-segment objects, indexed by segment position ``i``.

    Returns
    -------
    tuple
        ``(vid_objs[i], fno - vf_split[i])`` for the segment containing
        ``fno``.

    Raises
    ------
    ValueError
        If ``fno`` does not fall inside any segment (previously this
        surfaced as an UnboundLocalError on the return statement).
    """
    # Derive the segment count from the boundaries instead of assuming 4.
    for i in range(len(vf_split) - 1):
        if vf_split[i] < fno <= vf_split[i + 1]:
            return vid_objs[i], fno - vf_split[i]
    raise ValueError('frame number %s is outside the range covered by the video segments' % fno)

# One capture per video segment, keyed by segment index so get_frame_no can
# look the capture up by position.
video_objs = {}

frame_interval = 1
track_ids = faces[:,5]
unique_track_ids = np.unique(track_ids)
num_tracks = len(unique_track_ids)

test_labels = []
test_tracks = []
pred_scores = []

# try/finally guarantees the captures are released even if a track fails
# part-way through (e.g. a missing track id or an error in a helper).
try:
    for i in range(len(video_names)):
        video_objs[i] = cv2.VideoCapture(video_names[i])
        # Kept from the original: read one frame right after opening.
        success, image = video_objs[i].read()

    for i in range(num_tracks):

        tid = unique_track_ids[i]
        print('Processing track: %s' % tid)

        # obtain track, frame and facebb details.
        track_data = faces[np.where(track_ids==tid)]
        track_data = track_data[0::frame_interval,:]
        frame_nos = track_data[:,0]
        num_frames = len(frame_nos)

        for j in range(num_frames):

            # get corresponding frame from movie
            cap, fno = get_frame_no(frame_nos[j], video_frame_split, video_objs)

            # there is a misplacement of 3 frames between the movie video I used and annotation provided..
            # adjusting it.
            cap.set(cv2.cv.CV_CAP_PROP_POS_FRAMES, fno-3)

            # read frame
            fbox = track_data[j,1:5]
            success, image = cap.read()

            # Frames that cannot be decoded are skipped silently.
            if not success:
                continue

            # Resize the frame as the annotations provided by Ozerov et al. are with respect to 996x560 frame size.
            image = cv2.resize(image,(996, 560))

            # crop head and ub
            head = get_region_hannah(image, fbox, 'HEAD')
            upper_body = get_region_hannah(image, fbox, 'UB')

            # not considering regions with <10 pixels.
            if head.shape[0] < 10 or head.shape[1] < 10 or upper_body.shape[0] < 10 or upper_body.shape[1] < 10:
                continue

            # obtain psm feature
            test_feature = get_pose_features(transformer, nets, head, upper_body, num_models, params['FEATSIZE'])

            # get pose weights
            pose_weights = get_pose_weights(transformer, pose_net, upper_body)

            # identify label
            char_id = track_char_dict_map[tid]
            pred_sample_sc = pose_aware_identity_prediction_(classifiers, test_feature, np.array([char_id]), pose_weights,  params, num_models)

            # store data
            test_labels.append(int(char_id))
            test_tracks.append(tid)
            pred_scores.append(pred_sample_sc[0])
finally:
    for opened_cap in video_objs.values():
        opened_cap.release()

In [None]:
# Turn each score vector into a label: the index of the highest score is
# looked up in the label list reported by the first classifier.
# NOTE(review): this assumes every classifier shares the label order of
# classifiers[0][0] -- confirm.
model_labels = classifiers[0][0].get_labels()
pred_labels = []
for ps in pred_scores:
    best_index = np.argmax(ps)
    pred_labels.append(model_labels[best_index])

In [None]:
# Overall accuracy including known and unknown subjects
overall_acc = 100*np.mean(np.squeeze(np.array(test_labels)) == np.squeeze(np.array(pred_labels)))
print('overall accuracy: %s' % overall_acc)
print('')

# Overall accuracy of known subjects (test samples whose true label is one of
# the actors seen during training), plus per-actor sample/correct counts.
labeled_actor_set = set(labeled_actors)  # constant-time membership per sample
correct = 0
num_examples_labeled_actors = 0
correct_per_labeled_actor = {}
num_examples_per_labeled_actor = {}
for true_label, pred_label in zip(test_labels, pred_labels):
    if true_label not in labeled_actor_set:
        continue

    num_examples_labeled_actors = num_examples_labeled_actors + 1

    # Both dicts get an entry the first time an actor is seen.
    num_examples_per_labeled_actor[true_label] = num_examples_per_labeled_actor.get(true_label, 0) + 1
    correct_per_labeled_actor.setdefault(true_label, 0)

    if true_label == pred_label:
        correct = correct + 1
        correct_per_labeled_actor[true_label] = correct_per_labeled_actor[true_label] + 1

# Guard against a test set without any known subject, which would otherwise
# raise ZeroDivisionError; the accuracy is undefined (NaN) in that case.
if num_examples_labeled_actors > 0:
    labeled_accuracy = 100*correct/float(num_examples_labeled_actors)
else:
    labeled_accuracy = float('nan')
print('Overall accuracy of Known subjects:')
print('     #correct: %s #samples %s Accuracy: %s' % (correct, num_examples_labeled_actors, labeled_accuracy))