In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
import numpy as np
import cv2
import os

import pickle

In [3]:
## data path
DATA_DIR = '../../data'
DATA_NAME = DATA_DIR + '/10vid'
DATA_FEAT = DATA_DIR + '/10vid-feature'
VIDEO_CAPS = DATA_DIR + '/msvd_video_caps.pkl'

FEAT_DIM = 2048

In [4]:
def load_caps(caps_path):
    # load video id and captions
    with open(caps_path, 'rb') as f:
        #video_ids, video_caps = pickle.load(f)
        rec = pickle.load(f)
        video_ids = set(rec[:,0]) # get only video id
    return video_ids

In [5]:
def create_resnet():
    # load pretrained model
    resnet152 = models.resnet152(pretrained=True)
    # remove the last layer
    modules=list(resnet152.children())[:-1]
    resnet152=nn.Sequential(*modules)
    for p in resnet152.parameters():
        p.requires_grad = False
    return resnet152

In [15]:
def video2tensor(video_path):
    # load avi file
    cap = cv2.VideoCapture(video_path)
    h_fps = int(cap.get(cv2.CAP_PROP_FPS)/2) # number of frame in 0.5 second
    print(h_fps)
    ret=True
    frames=[]
    count = 0
    while ret:
        ret, frame = cap.read()
        if ret==False:
            break
        if count % h_fps == 0:
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            resize_rgb = cv2.resize(rgb_frame, dsize=(224, 224), interpolation=cv2.INTER_LINEAR)
            # normalize to [0,1]
            norm_rgb = cv2.normalize(resize_rgb, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
            # transform data using (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 
            frames.append(np.array([(norm_rgb[:,:,0]-0.485)/0.229, (norm_rgb[:,:,1]-0.456)/0.224, (norm_rgb[:,:,2]-0.406)/0.225]))
        # frame counter
        count = count + 1
    
    # assign it to a variable
    # frames_var = Variable(torch.tensor(frames)) # works with torch 0.4
    frames_var = Variable(torch.FloatTensor(np.array(frames).astype(np.float64))) # works with torch 0.3.1
    print(len(frames))
    return frames_var

In [7]:
def extract_feature_resnet(resnet152, video_path):
    frames_var = video2tensor(video_path)
    features_var = resnet152(frames_var) # get the output from the last hidden layer of the pretrained resnet
    features = features_var.data # get the tensor out of the variable
    return features

In [17]:
video_ids = ["_UqnTFs1BLc_23_27.avi"]
video_path = DATA_NAME+'/'+video_id
frames_var = video2tensor(video_path)

7
9


In [8]:
## Main program

# load pre-trained model
model = create_resnet()

# load video id from caption file
#video_ids = load_caps(VIDEO_CAPS)
video_ids = ["_UqnTFs1BLc_23_27.avi"]

# extract feature for each video which is a batch of frames
for video_id in video_ids:
    video_path = DATA_NAME+'/'+video_id
    if os.path.isfile(video_path):
        feature = extract_feature_resnet(model, video_path)
        # mean pooling
        feature_mean = torch.mean(feature, dim=0)
        np.save(DATA_FEAT+"/"+video_id[:-4]+'.npy', feature_mean.numpy().reshape(FEAT_DIM))
    

In [26]:
feature.numpy().shape

(60, 2048, 1, 1)

In [23]:
feature_mean


( 0  ,.,.) = 
  0.7615

( 1  ,.,.) = 
  0.4713

( 2  ,.,.) = 
  0.7870
 ... 

(2045,.,.) = 
  0.4091

(2046,.,.) = 
  0.3742

(2047,.,.) = 
  0.4015
[torch.FloatTensor of size 2048x1x1]

In [34]:
torch.mean(feature, dim=0).data.numpy().reshape(2048)


array([0.7615013 , 0.47133332, 0.78696907, ..., 0.4090519 , 0.3741508 ,
       0.40154096], dtype=float32)

In [10]:
count

60

In [11]:
np.array(frames).shape

(60, 3, 224, 224)

In [12]:
img = torch.Tensor(1,3, 224, 224).normal_() # random image

In [18]:
img.shape

torch.Size([1, 3, 224, 224])

In [15]:
img2 = torch.tensor(frames)

In [17]:
img2.shape

torch.Size([60, 3, 224, 224])

In [12]:
a=np.arange(2*3*4).reshape(2,3,4)
a

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [13]:
a.reshape(3,2,4)

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]],

       [[16, 17, 18, 19],
        [20, 21, 22, 23]]])