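This notebook classifies the videos stored as frame folders under ./frames/ with a pretrained I3D model and saves the top predicted class for each video to output.txt.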

In [None]:
import os
import glob

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torchvision
from collections import defaultdict

import glob
import random

import os
from torchvision.transforms._transforms_video import (
    NormalizeVideo,
)

In [None]:
# Dataset activity label -> Kinetics class name.
# Keys are matched against the video folder name with its last
# space-separated token removed; values are the only Kinetics classes that
# are ranked when the predictions are written out.
mapping = {
    "Tooth brushing": "brushing teeth",
    "Basketball": "playing basketball",
    "Tennis": "playing tennis",
    "Push up": "push up",
    "Reading": "reading book",
    "Cake eating": "eating cake",
    "Computer": "using computer",
    "Flipping pancake": "flipping pancake",
    "Shaving": "trimming or shaving beard",
    "Jogging": "jogging",
}

In [None]:
def sample_frames(videolength, segment=8, train=True, cont=False, stride=1):
    """Choose ``segment`` frame indices from a video of ``videolength`` frames.

    Two sampling strategies are supported:

    * ``cont=True`` (tubelet): a window of ``segment`` indices spaced
      ``stride`` apart, laid out as ``centre + [-segment//2, ...,
      segment - segment//2 - 1] * stride``. The centre is a random frame
      when ``train`` is true and the middle frame otherwise. Indices that
      fall outside the video are clipped to ``[0, videolength - 1]``.
    * ``cont=False`` (TSN-style sparse sampling): when ``train`` is true the
      range ``[0, videolength - 1]`` is split into ``segment`` equal chunks
      and one randomly jittered index is drawn from each; otherwise
      ``segment`` evenly spaced indices from the first to the last frame are
      returned.

    Args:
        videolength: Number of frames in the video (expected to be >= 1).
        segment: Number of indices to return.
        train: Random sampling when true, deterministic sampling when false.
        cont: Select the tubelet strategy instead of sparse sampling.
        stride: Spacing between consecutive tubelet indices (``cont`` only).

    Returns:
        A 1-D integer ``numpy`` array with ``segment`` frame indices.
    """
    # Tubelet ##########################
    if cont:
        centre = np.random.randint(videolength) if train else videolength // 2
        offsets = (np.arange(segment) - segment // 2) * stride
        return (centre + offsets).clip(0, videolength - 1)

    # TSN style ########################
    last = videolength - 1

    if train:
        step = last / segment
        jitter = np.random.random(size=segment) * step
        return (np.arange(segment) * step + jitter).round().astype(int)

    if segment == 1:
        # The evenly spaced formula below divides by (segment - 1); a single
        # deterministic sample is taken from the middle of the video instead.
        return np.array([last // 2], dtype=int)

    return (np.arange(segment) * (last / (segment - 1))).round().astype(int)

def randomcrop(video):
    """Cut a randomly placed square window out of ``video``.

    The last two axes of ``video`` are treated as height and width, and the
    input is indexed as ``video[:, :, y, x]``. The window side is 99% of the
    shorter spatial side (truncated to an integer); its top-left corner is
    drawn uniformly so that the window stays inside the frame.
    """
    height, width = video.shape[-2:]
    side = int(0.99 * min(height, width))

    # Draw the vertical offset first, then the horizontal one.
    top = random.randint(0, height - side)
    left = random.randint(0, width - side)

    rows = slice(top, top + side)
    cols = slice(left, left + side)
    return video[:, :, rows, cols]

def centercrop(video):
    """Cut the central square window out of ``video``.

    The last two axes of ``video`` are treated as height and width, and the
    input is indexed as ``video[:, :, y, x]``. The window side is 80% of the
    shorter spatial side (truncated to an integer) and the window is centred
    on the frame.
    """
    height, width = video.shape[-2:]
    side = int(0.8 * min(height, width))
    half = side // 2

    top = height // 2 - half
    left = width // 2 - half

    rows = slice(top, top + side)
    cols = slice(left, left + side)
    return video[:, :, rows, cols]
    
def randomflip(video):
    """Return ``video`` mirrored along its last axis with probability 1/2.

    When no flip is drawn the input is returned unchanged.
    """
    should_flip = random.randint(0, 1) == 0
    return torch.flip(video, [-1]) if should_flip else video
# Per-channel normalisation statistics handed to NormalizeVideo by wkdataset;
# they are applied after the pixel values have been divided by 255.
mean = [0.45] * 3
std = [0.225] * 3
        
class wkdataset(Dataset):
    """Map-style dataset of fixed-length frame clips cut from frame folders.

    Every entry of ``videos`` is a directory holding the ``*.jpg`` frames of
    one video. The frames are sorted by file name and a clip is started every
    ``stride`` frames. Each clip takes the frames at offsets
    ``0, frame_step, 2 * frame_step, ...`` below ``clip_span`` from its start;
    offsets running past the end of the video reuse the last frame.

    Args:
        videos: Iterable of frame-directory paths.
        stride: Distance in frames between the starts of consecutive clips.
        clip_span: Temporal extent in frames covered by one clip.
        frame_step: Distance in frames between sampled frames inside a clip.
        frame_size: Side length of the square frames returned to the caller.
    """

    def __init__(self, videos=(), stride=16, clip_span=64, frame_step=8,
                 frame_size=256):
        super().__init__()

        # One entry per clip: [video directory, frame paths, total frame count].
        self.videos = []
        for video_dir in videos:
            # Escape the directory so glob metacharacters in a folder name
            # (e.g. "[" or "*") are matched literally.
            pattern = os.path.join(glob.escape(video_dir), "*.jpg")
            frame_paths = sorted(glob.glob(pattern))
            last = len(frame_paths) - 1

            for start in range(0, len(frame_paths), stride):
                clip = [
                    frame_paths[min(last, start + offset)]
                    for offset in range(0, clip_span, frame_step)
                ]
                self.videos.append([video_dir, clip, len(frame_paths)])

        self.frame_size = frame_size
        self.norm = NormalizeVideo(mean, std)

    def __len__(self):
        """Return the number of clips over all videos."""
        return len(self.videos)

    def __getitem__(self, idx):
        """Load clip ``idx``.

        Returns:
            A tuple ``(video_dir, frames, video_length)``: the directory the
            clip came from, the normalised frame tensor, and the total number
            of frames in that video.
        """
        video_dir, frame_paths, video_length = self.videos[idx]

        # Stack the decoded images along a new leading (time) axis.
        # NOTE(review): assumes every frame decodes to the same shape.
        frames = torch.stack(
            [torchvision.io.read_image(path) for path in frame_paths]
        )
        frames = centercrop(frames)
        frames = torch.nn.functional.interpolate(
            frames, (self.frame_size, self.frame_size)
        )

        # Swap the first two axes before normalising.
        # NOTE(review): presumably (T, C, H, W) -> (C, T, H, W), the layout
        # NormalizeVideo expects; confirm against torchvision.
        frames = frames.transpose(0, 1)

        # Scale pixel values to [0, 1] and apply the mean/std normalisation.
        frames = self.norm(frames / 255)

        return video_dir, frames, video_length

In [None]:
import torch

# Load the pretrained `i3d_r50` model from the PyTorchVideo hub.
model = torch.hub.load('facebookresearch/pytorchvideo', 'i3d_r50', pretrained=True)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Inference only: switch to eval mode before moving to the target device.
model = model.eval()
model = model.to(device)

In [None]:
import json
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request

json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
# Download the Kinetics class-name table next to the notebook.
urllib.request.urlretrieve(json_url, json_filename)

# The file maps class name -> class id.
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping, dropping any literal double quotes
# that are part of the class names in the file.
kinetics_id_to_classname = {
    class_id: str(class_name).replace('"', "")
    for class_name, class_id in kinetics_classnames.items()
}

In [None]:
# Every sub-directory of ./frames is handed to the dataset as one video.
video_dirs = glob.glob('./frames/*/')
dataset = wkdataset(videos=video_dirs)

# Keep the final, possibly smaller batch so that no clip is skipped.
loader = DataLoader(
    dataset,
    batch_size=44,
    num_workers=48,
    drop_last=False,
)

In [None]:
import progressbar

# video directory -> list of per-clip model outputs (one numpy vector each)
results = defaultdict(list)

with torch.no_grad():
    for batch_video_dirs, batch_frames, _ in progressbar.progressbar(loader):
        # Move the batch to the device the model was placed on instead of
        # hard-coding CUDA.
        batch_frames = batch_frames.to(device)
        batch_outputs = model(batch_frames).detach().cpu().numpy()

        # Group the clip-level outputs by the video they were cut from.
        for clip_output, video_dir in zip(batch_outputs, batch_video_dirs):
            results[video_dir].append(clip_output)

In [None]:
# Only the Kinetics classes named in `mapping` take part in the ranking.
selected_kinetics_classes = list(mapping.values())
# Class names are looked up wrapped in literal double quotes, except
# 'jogging', which is looked up bare.
selected_kinetics_classes_idx = [
    kinetics_classnames[name if name == 'jogging' else f'"{name}"']
    for name in selected_kinetics_classes
]


with open("output.txt", "w", encoding="utf-8") as f:
    for video_dir, clip_outputs in results.items():
        # Folder name of the video, independent of a trailing separator or
        # of the platform's path separator.
        videoname = os.path.basename(os.path.normpath(video_dir))

        # Activity label = folder name without its last space-separated
        # token (empty when the name contains no space).
        my_name = videoname.rpartition(" ")[0]
        if my_name not in mapping:
            continue

        # Average the clip outputs of the video, keep the selected classes
        # and rank them from highest to lowest score.
        mean_scores = np.array(clip_outputs).mean(0)[selected_kinetics_classes_idx]
        ranking = mean_scores.argsort()[::-1]
        ranked_names = [selected_kinetics_classes[i] for i in ranking]

        # One line per video: name padded to 40 columns, then the best class.
        f.write(f"{videoname:40s}{ranked_names[0]}")
        f.write("\n")