In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import cv2
import time
import torch
import numpy as np
import pandas as pd
from PIL import Image
from moviepy.editor import *
from torchvision import transforms

In [3]:
df = pd.read_csv('drive/MyDrive/it416/data.csv')
df

Unnamed: 0,VideoID,Description
0,-4wsuPCjDBc_5_15,a squirrel is eating a peanut in it s shell
1,-4wsuPCjDBc_5_15,a chipmunk is eating
2,-4wsuPCjDBc_5_15,a chipmunk is eating a peanut
3,-4wsuPCjDBc_5_15,a chipmunk is eating a nut
4,-4wsuPCjDBc_5_15,a squirrel is eating a nut
...,...,...
80783,zxB4dFJhHR8_1_9,a girl riding a bicycle
80784,zxB4dFJhHR8_1_9,a smiling girl wearing backpack is riding a bike
80785,zxB4dFJhHR8_1_9,the girl rode her bike at the beach
80786,zzit5b_-ukg_5_20,a boy is doing exercise by cycle


In [4]:
videoIDs = np.array(df.VideoID.unique())
videoIDs

array(['-4wsuPCjDBc_5_15', '-7KMZQEsJW4_205_208', '-8y1Q0rA3n8_108_115',
       ..., 'zv2RIbUsnSw_335_341', 'zxB4dFJhHR8_1_9', 'zzit5b_-ukg_5_20'],
      dtype=object)

In [28]:
path = 'drive/MyDrive/it416/YouTubeClips/'
videoID = videoIDs[2]
# videoID = '0vmoZEaN_-o_4_12'
path+=videoID
path+='.avi'

clip=VideoFileClip(path)
clip.ipython_display(width=280)

100%|██████████| 211/211 [00:03<00:00, 61.31it/s]


In [29]:
def extract_frames_equally_spaced(frames, K):
        n_frames = len(frames)
        splits = np.array_split(range(n_frames), K)
        idx_taken = [s[0] for s in splits]
        sub_frames = []

        for idx in idx_taken:
          sub_frames.append(frames[idx])
        return sub_frames

def pad_frames(frames, limit, jpegs=False):
        last_frame = frames[-1]
        if jpegs:
            frames_padded = frames + [last_frame]*(limit-len(frames))
        else:
            padding = np.asarray([last_frame * 0.]*(limit-len(frames)))
            frames_padded = np.concatenate([frames, padding], axis=0)
        return frames_padded

def video_to_frames(input_loc,K):
    cap = cv2.VideoCapture(input_loc)
    
    video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
    count = 0

    frames=[]
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            continue
        frames.append(frame)
        count = count + 1
        
        if (count > (video_length-1)):
            cap.release()
            break
    if len(frames) < K:
        frames = pad_frames(frames,K)
    else:
        frames = extract_frames_equally_spaced(frames,K)
    
    return frames

K=28
input_loc = path
frames = video_to_frames(input_loc,K)
cv2.imwrite("img.jpg" , frames[0])
print("No. of frames:",len(frames))

No. of frames: 28


In [30]:
# frames[0]

In [31]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)
model.eval()

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


GoogLeNet(
  (conv1): BasicConv2d(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): BasicConv2d(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv3): BasicConv2d(
    (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception3a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track

In [32]:
children_counter = 0
for n,c in model.named_children():
    print("Children Counter: ",children_counter," Layer Name: ",n,)
    children_counter+=1

Children Counter:  0  Layer Name:  conv1
Children Counter:  1  Layer Name:  maxpool1
Children Counter:  2  Layer Name:  conv2
Children Counter:  3  Layer Name:  conv3
Children Counter:  4  Layer Name:  maxpool2
Children Counter:  5  Layer Name:  inception3a
Children Counter:  6  Layer Name:  inception3b
Children Counter:  7  Layer Name:  maxpool3
Children Counter:  8  Layer Name:  inception4a
Children Counter:  9  Layer Name:  inception4b
Children Counter:  10  Layer Name:  inception4c
Children Counter:  11  Layer Name:  inception4d
Children Counter:  12  Layer Name:  inception4e
Children Counter:  13  Layer Name:  maxpool4
Children Counter:  14  Layer Name:  inception5a
Children Counter:  15  Layer Name:  inception5b
Children Counter:  16  Layer Name:  avgpool
Children Counter:  17  Layer Name:  dropout
Children Counter:  18  Layer Name:  fc


In [39]:

# input_image = Image.open('img.jpg')

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_image = Image.fromarray(frames[0])
# input_image = cv2.cvtColor(np.float32(input_image), cv2.COLOR_BGR2RGB)
# image.save('image.jpg')
# input_image=Image.fromarray(frames[0].astype('uint8'), 'RGB')
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0) 


if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')

with torch.no_grad():
    output = model(input_batch)

print(output.shape)
print(output)

probabilities = torch.nn.functional.softmax(output[0], dim=0)
print(probabilities)

torch.Size([1, 1000])
tensor([[ 8.4543e-01, -1.7465e+00,  1.7437e-01, -9.6075e-01, -1.7993e-01,
          2.1671e-01, -1.3568e+00, -3.1005e-01,  3.4662e-01,  7.7535e-01,
          4.3689e-01, -7.4540e-01, -1.1357e+00, -9.4223e-01, -5.8694e-01,
         -1.0097e+00, -1.5689e+00, -6.6072e-01, -1.2860e+00, -1.8819e+00,
         -6.5582e-01,  1.0537e-01, -1.7450e-01, -4.9566e-01,  8.2227e-01,
          1.1320e+00, -1.1593e-02, -2.7128e-01,  5.7169e-01, -1.3878e+00,
         -1.2450e+00, -1.3026e+00, -1.0062e+00, -4.1702e-01,  2.4093e+00,
         -8.0609e-01, -4.6942e-01, -8.6303e-01, -9.1874e-01, -7.7479e-01,
         -1.6542e+00, -1.4347e+00, -5.7878e-02, -3.2421e-01, -1.1651e+00,
          2.8218e-01, -1.2898e+00, -5.3459e-01, -6.4298e-01, -1.2092e+00,
         -6.8328e-01, -8.0711e-01, -7.4473e-01, -1.2859e+00,  3.7801e-04,
         -1.8215e+00, -1.2880e+00, -3.5648e-01, -2.9726e+00, -7.2088e-01,
         -4.5221e-01, -7.4048e-01, -1.1585e-01,  2.7103e+00, -1.0537e+00,
          1.9366

In [23]:
!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt

--2022-02-09 19:54:27--  https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10472 (10K) [text/plain]
Saving to: ‘imagenet_classes.txt.9’


2022-02-09 19:54:27 (67.6 MB/s) - ‘imagenet_classes.txt.9’ saved [10472/10472]



In [40]:
# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 10)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())

binoculars 0.18653704226016998
ski mask 0.029434097930788994
flute 0.017521612346172333
Kerry blue terrier 0.01617616042494774
television 0.012757646851241589
ski 0.009573721326887608
unicycle 0.009379993192851543
Bouvier des Flandres 0.009097294881939888
assault rifle 0.008746830746531487
maraca 0.00845545344054699
