In [1]:
%load_ext autoreload
%autoreload 2
# Helper libraries
import matplotlib
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
from PIL import Image
import cv2 as cv
from tqdm import tqdm
import IPython
from sklearn.metrics import confusion_matrix
from tabulate import tabulate
import os

import glob
import pandas as pd
import random
from colour.plotting import *

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib import colors
import data_utils as data_utils
import tqdm

import multiprocessing
from functools import partial

## Load data from CSV and JSON

In [2]:
# Input files: per-movie details (JSON) and the trailer index (CSV).
metadata_path = '../data/movie_details.json'
csv_path = '../data/trailers.csv'

# data_utils.load_json is a project helper; presumably it merges both files
# into one mapping keyed by video (later cells index the result by key and
# read its 'genres' and 'yt_filename' fields) -- TODO confirm against data_utils.
metadata = data_utils.load_json(metadata_path, csv_path)

# Report how many entries were loaded.
print(len(metadata))

Column names are , tmdb_id, tmdb_title, trailer_title, trailer_youtube_key
1187


In [48]:
# Index the videos by genre.
# all_genres maps a genre name (spaces removed, e.g. 'Science Fiction' ->
# 'ScienceFiction') to a list of single-key dicts of the form
# {yt_filename: [all space-stripped genre names of that video]}.
all_genres = {}
for vid_key in metadata.keys():
    video = metadata[vid_key]
    genre_names = [g['name'].replace(' ', '') for g in video['genres']]
    # A video is registered once under every genre it belongs to.
    for genre_name in genre_names:
        all_genres.setdefault(genre_name, []).append({video['yt_filename']: genre_names})
print('All genres in this dataset: ', all_genres.keys())

All genres in this dataset:  dict_keys(['Drama', 'Crime', 'History', 'War', 'Romance', 'Animation', 'Family', 'Fantasy', 'Thriller', 'Comedy', 'Action', 'Adventure', 'ScienceFiction', 'Horror', 'Western', 'Music', 'Mystery', 'TVMovie'])


In [49]:
# Read the target class names, one per line, from classes.txt.
with open('classes.txt') as f:
    class_list = list(map(str.rstrip, f))
print(class_list)

# Number of videos registered under each target class.
for class_name in class_list:
    print(len(all_genres[class_name]))

# Show one sample entry: a {yt_filename: [genre names]} dict.
print(all_genres['Action'][0])

['Action', 'Family', 'Fantasy', 'Romance', 'Thriller', 'Comedy', 'Horror', 'ScienceFiction']
448
199
234
158
334
343
117
260
{'kmJLuwP3MbY': ['Drama', 'Action', 'Crime', 'Thriller']}


## Split videos from /data dir into frames

In [5]:
# Extract every 10th frame of each trailer of one genre into <data_root>/<genre>.
genre = 'War'
data_root = '/home/bmild/comp-color/data'
save_path = os.path.join(data_root, genre)
os.makedirs(save_path, exist_ok=True)

# Each entry of all_genres[genre] is a single-key dict {yt_filename: genre_names},
# so the filename must be taken from the key. Concatenating the dict itself
# with '.mp4' raises a TypeError.
vid_paths = [os.path.join(data_root, next(iter(entry)) + '.mp4') for entry in all_genres[genre]]

# Fix the output directory and sampling step; the pool supplies the video path.
extract_frames = partial(data_utils.extract_every_n_frames, frames_dir_path=save_path, n=10)

objPool = multiprocessing.Pool(8)
try:
    # list() drains the lazy imap iterator so tqdm can report progress per video.
    r = list(tqdm.tqdm(objPool.imap(extract_frames, vid_paths), total=len(vid_paths)))
finally:
    # Always release the worker processes, even if a task raised.
    objPool.close()
    objPool.join()

100%|██████████| 35/35 [02:53<00:00,  4.96s/it]


In [None]:
# Run data_utils.clean_frames over the extracted frames of every video in
# every target class. The keyword arguments are passed through unchanged;
# their exact meaning is defined in data_utils (not shown here).
clean_video_frames = partial(data_utils.clean_frames, remove_beg_end=True, secs_to_remove=5, incr=10)

# One pool is shared by all genres instead of spawning a new one per genre.
objPool = multiprocessing.Pool(8)
try:
    for genre in class_list:
        # Entries of all_genres[genre] are single-key dicts {yt_filename: genre_names}.
        # The path must be built from the key; formatting the dict itself
        # would embed its repr in the path.
        vid_paths = ['/home/bmild/comp-color/data/{}/{}'.format(genre, next(iter(entry)))
                     for entry in all_genres[genre]]
        # list() drains the lazy imap iterator so tqdm can report progress per video.
        r = list(tqdm.tqdm(objPool.imap(clean_video_frames, vid_paths), total=len(vid_paths)))
finally:
    # Always release the worker processes, even if a task raised.
    objPool.close()
    objPool.join()

In [7]:
# Collect the keys of all videos that do not list 'Animation' among their genres.
no_anim_keys = [
    vid_key
    for vid_key in metadata.keys()
    if 'Animation' not in [g['name'] for g in metadata[vid_key]['genres']]
]

# Random 85% / 15% train/validation split at video level.
# No seed is set in this cell, so the split may differ between runs.
random.shuffle(no_anim_keys)
split = int(len(no_anim_keys)*.85)
train_vids, val_vids = no_anim_keys[:split], no_anim_keys[split:]
print(len(val_vids))
print(len(train_vids))

158
891


In [8]:
# Re-read the target class names (one per line, trailing whitespace removed).
with open('classes.txt') as f:
    class_list = list(map(str.rstrip, f))
print(class_list)

['Action', 'Family', 'Fantasy', 'Romance', 'Thriller', 'Comedy', 'Horror', 'ScienceFiction']


In [16]:
# Write the training list. Each output line holds 20 randomly sampled frame
# paths of one video followed by its multi-hot class label, comma-separated.
text_path = 'train_6.txt'
vid_list = train_vids

with open(text_path, 'w+') as f:
    for v in tqdm.tqdm(vid_list):
        # Strip spaces from genre names ('Science Fiction' -> 'ScienceFiction')
        # so they match the entries of class_list and the per-genre directory
        # names used by the other cells. With the raw names such genres never
        # match and their label bit is never set.
        genres = [g['name'].replace(' ', '') for g in metadata[v]['genres']]
        filename = metadata[v]['yt_filename']

        # The label depends only on the video, so build it once per video:
        # one 0/1 flag per entry of class_list, in class_list order.
        one_hot_str = ','.join('1' if c in genres else '0' for c in class_list)

        for g in genres:
            search_file = '/home/bmild/comp-color/data/{}/{}_*.jpg'.format(g, filename)
            vid_frames = glob.glob(search_file)
            # Videos with fewer than 20 frames in this genre directory are skipped.
            if len(vid_frames) < 20:
                continue
            # Ten independent 20-frame samples per (video, genre directory).
            for i in range(10):
                sampled_frames = random.sample(vid_frames, 20)
                f.write(','.join(sampled_frames) + ',' + one_hot_str + '\n')

100%|██████████| 891/891 [02:08<00:00,  6.94it/s]


In [17]:
# Shuffle the lines of train_5.txt in place: read everything, permute, rewrite.
text_path = 'train_5.txt'
with open(text_path) as f:
    data = f.readlines()
random.shuffle(data)
with open(text_path, 'w') as f:
    f.writelines(data)

In [35]:
# Count the paths matching the Action directory pattern. The pattern contains
# no wildcard, so it can only match the directory itself, not the files in it.
action_matches = glob.glob('/home/bmild/comp-color/data/Action/')
len(action_matches)

1

In [51]:
# Look up the genre list stored in the first 'ScienceFiction' entry. The value
# is not displayed because it is not the last expression of the cell.
first_scifi_entry = all_genres['ScienceFiction'][0]
list(first_scifi_entry.values())[0]
# Show the target class names.
print(class_list)

['Action', 'Family', 'Fantasy', 'Romance', 'Thriller', 'Comedy', 'Horror', 'ScienceFiction']


In [52]:
# Build class-balanced train/val lists. For every target class, up to 100
# non-Animation videos are visited in random order; the first 85 go to the
# training file and the rest to the validation file. Each written line holds
# 20 randomly sampled frame paths followed by the multi-hot class label.
#
# NOTE(review): every video is listed under each of its genres and each class
# is shuffled independently, so one video can land in the training file for
# one class and in the validation file for another.
train_path = 'train_6.txt'
val_path = 'val_6.txt'
from random import shuffle

with open(train_path, 'w+') as f, open(val_path, 'w+') as f2:
    for g in class_list:
        genre_vids = all_genres[g].copy()  # copy so all_genres keeps its order
        shuffle(genre_vids)
        count = 0
        idx = 0
        # Bound idx as well: a class with fewer than 100 eligible videos
        # would otherwise run past the end of the list and raise IndexError.
        while count < 100 and idx < len(genre_vids):
            # Each entry is a single-key dict {yt_filename: [genre names]}.
            # Unpack the genre list itself: wrapping .values() in list() yields
            # a nested list, which makes the 'Animation' check below always pass.
            vid_id, genres = next(iter(genre_vids[idx].items()))
            idx += 1
            if 'Animation' in genres:
                continue

            vid_frames = glob.glob('/home/bmild/comp-color/data/{}/{}_*.jpg'.format(g, vid_id))
            if len(vid_frames) >= 20:
                # One 0/1 flag per entry of class_list, in class_list order.
                one_hot_str = ','.join('1' if c in genres else '0' for c in class_list)
                out_file = f if count < 85 else f2
                # Ten independent 20-frame samples per video.
                for i in range(10):
                    sampled_frames = random.sample(vid_frames, 20)
                    out_file.write(','.join(sampled_frames) + ',' + one_hot_str + '\n')
            # Non-Animation videos count toward the quota even when they have
            # too few frames to be written (same as the original behaviour).
            count += 1
        

In [53]:
# Shuffle the lines of the training list and then the validation list in
# place: read all lines, permute them, and write them back.
for text_path in ('train_6.txt', 'val_6.txt'):
    with open(text_path) as f:
        data = f.readlines()
    random.shuffle(data)
    with open(text_path, 'w') as f:
        f.writelines(data)