# Initialize

In [0]:
import json
import os
import pickle
import urllib

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

In [4]:
# Directory layout: everything lives below the Google Drive mount point.
# root_dir is where the drive gets mounted (see drive.mount below).
root_dir = '/root/aml/'
drive_dir = root_dir + 'My Drive/AML/'
git_rep = 'Git'
git_dir = drive_dir + git_rep+'/'
dataset_dir = git_dir + 'datasets/final_dataset/'

# Sub-folders of the repository / dataset used further down.
neuralnet_dir = git_dir + 'neural-network/'
trailers_dir = dataset_dir + 'trailers/'

# Audio data is kept outside the Git folder, directly under drive_dir.
audio_dir = os.path.join(drive_dir, 'Honza', 'audio_dataset')
naive_audio_dir = os.path.join(audio_dir, 'naive')

# Colab-only import: this cell cannot run outside Google Colab.
from google.colab import drive
drive.mount(root_dir, force_remount=True) # re-run this line every time you have changed something in your drive
# NOTE: changes the working directory of the whole kernel.
os.chdir(drive_dir)

# Check mounted disk: listing fails if the mount or the path is wrong.
os.listdir(neuralnet_dir)

Mounted at /root/aml/


['examples',
 'NN_metadata',
 'template_metaInfo.ipynb',
 'template_trailers.ipynb',
 'trailer_models']

#Dataset

##Utils

In [0]:
def urlopen(url, mobile = False, max_retries = 5, retry_delay = 5):
    """Download ``url`` with browser-like request headers and return its text.

    Args:
        url: Address to request.
        mobile: When true, an iPhone ``User-Agent`` is sent instead of the
            desktop Chrome one. The flag is kept for every retry.
        max_retries: Upper bound on additional attempts after an HTTP error.
            Only relevant when retrying is enabled (see below).
        retry_delay: Seconds to wait before each retry.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.HTTPError: When the request fails and retrying is
            disabled, or when all retries have failed too.

    Retrying is enabled by a truthy module-level ``_WARNINGS`` flag. If the
    flag is not defined at all it is treated as false, so the error is simply
    propagated to the caller.
    """
    # Local imports: the notebook's import cell only has ``import urllib``,
    # which does not guarantee that the submodules used here are loaded.
    import time
    import warnings
    import urllib.request
    from urllib.error import HTTPError

    if mobile:
        user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46'
    else:
        user_agent = ('Mozilla/5.0 (X11; Linux x86_64) '
                      'AppleWebKit/537.11 (KHTML, like Gecko) '
                      'Chrome/23.0.1271.64 Safari/537.11')

    # Both variants share every header except the User-Agent.
    urlheader = {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
    }
    request = urllib.request.Request(url=url, data=None, headers=urlheader)
    retry_enabled = bool(globals().get('_WARNINGS', False))

    attempt = 0
    while True:
        try:
            # ``with`` closes the connection even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except HTTPError as e:
            if not retry_enabled or attempt >= max_retries:
                raise
            attempt += 1
            warnings.warn(str(e))
            time.sleep(retry_delay)

##Dataset representation

In [0]:
class Entry:
  """A single movie record.

  The constructor stores the identifying and revenue fields; metadata and
  trailer links are attached later through the ``set_*`` methods.
  """

  def __init__(self, **kwargs):
    """Store the four mandatory keyword fields; other keywords are ignored.

    A missing mandatory keyword raises ``KeyError``.
    """
    for field in ('movie_id', 'name', 'revenue_opening', 'revenue_total'):
      setattr(self, field, kwargs[field])

  def __repr__(self):
    """Short human-readable identification of the movie."""
    return 'Name: {}; movie ID: {}'.format(self.name, self.movie_id)

  def set_metadata(self, metadata_dict):
    """Copy the metadata fields out of ``metadata_dict`` onto the entry."""
    for field in ('year', 'genres', 'actors', 'directors', 'creators', 'duration'):
      setattr(self, field, metadata_dict[field])

  def set_trailers(self, trailer_dict):
    """Copy both trailer links out of ``trailer_dict`` onto the entry."""
    for field in ('imdb_trailer', 'youtube_trailer'):
      setattr(self, field, trailer_dict[field])
    

class Dataset:
  """Movie dataset backed by the files found in ``dataset_dir``.

  ``final_dataset.csv`` is read on construction and turned into one ``Entry``
  per row. ``load_metadata`` and ``load_trailers`` enrich those entries from
  the pickled files ``metadata.data`` and ``trailers.data``.
  """

  # IMDb encoding definitions that may be downloaded, best quality first.
  _IMDB_DEFINITIONS = ('1080p', '720p', '480p', 'SD')

  def __init__(self, dataset_dir):
    """Read ``final_dataset.csv`` from ``dataset_dir`` and build the entries."""
    self.basic_data = pd.read_csv(os.path.join(dataset_dir, 'final_dataset.csv'))
    self.dataset_dir = dataset_dir

    self.entries = list()
    self.movie_indeces = dict()
    self._create_entries()

    self.Youtube_urlroot = "https://www.youtube.com"
    self.Imdb_urlroot = "https://www.imdb.com"

  def get_entry(self, movie_id):
    """Return the entry stored for ``movie_id`` (``KeyError`` if unknown)."""
    return self.entries[self.movie_indeces[movie_id]]

  def get_revenue(self, movie_id):
    """Return the ``revenue_opening`` value of the first row for ``movie_id``.

    Raises ``IndexError`` when no row has that ``movie_id``.
    """
    tmp_df = self.basic_data[self.basic_data['movie_id'] == movie_id]
    return tmp_df['revenue_opening'].values[0]

  def load_metadata(self):
    """Attach the content of ``metadata.data`` to the dataset and its entries.

    The pickled dictionary has two keys. Every item of ``'global'`` becomes an
    attribute of the dataset, expanded into indicator vectors. Every item of
    ``'entries'`` is handed to the matching entry, whose sparse categorical
    fields are then expanded into dense vectors.
    """
    # SECURITY: pickle.load executes arbitrary code from the file; only use
    # data files from a trusted source.
    with open(os.path.join(self.dataset_dir, 'metadata.data'), 'rb') as fr:
      metadata_dict = pickle.load(fr)

    for k, v in metadata_dict['global'].items():
      setattr(self, k, self._decompress_global_categorical(v))

    for movie_id, movie_metadata in metadata_dict['entries'].items():
      entry = self.get_entry(movie_id)
      entry.set_metadata(movie_metadata)

      # decompress categorical
      entry.actors = self._decompress_categorical(entry.actors, len(self.all_actors))
      entry.creators = self._decompress_categorical(entry.creators, len(self.all_creators))
      entry.directors = self._decompress_categorical(entry.directors, len(self.all_directors))
      entry.genres = self._decompress_categorical(entry.genres, len(self.all_genres))

  def load_trailers(self):
    """Attach the trailer links stored in ``trailers.data`` to the entries."""
    # SECURITY: pickle.load executes arbitrary code from the file; only use
    # data files from a trusted source.
    with open(os.path.join(self.dataset_dir, 'trailers.data'), 'rb') as fr:
      trailers_dict = pickle.load(fr)

    for movie_id, trailers in trailers_dict.items():
      self.get_entry(movie_id).set_trailers(trailers)

  def GetTrailer(self, IMDbVideoUrl, YoutubeVideoUrl, trailers_dir, filename): # download trailer
    """Download a trailer to ``<filename>.mp4`` inside ``trailers_dir``.

    IMDb is tried first when ``IMDbVideoUrl`` is non-empty, taking the best
    available definition out of 1080p, 720p, 480p and SD. If anything goes
    wrong there, or no such encoding exists, or no IMDb URL was given, the
    trailer is fetched from ``YoutubeVideoUrl`` with ``youtube-dl``.

    Side effect (kept from the original implementation): the working
    directory of the process is changed to ``trailers_dir``.
    """
    # ``import urllib`` alone does not guarantee the submodule is loaded.
    import urllib.request

    os.chdir(trailers_dir)
    target = filename + '.mp4'

    if len(IMDbVideoUrl):
      try:
        video_url = self._find_imdb_video_url(IMDbVideoUrl)
        urllib.request.urlretrieve(video_url, target)
        return
      except Exception:
        # Best effort: any scraping or download problem falls through to
        # YouTube. KeyboardInterrupt is no longer swallowed.
        pass

    self._download_from_youtube(YoutubeVideoUrl, target)

  def _find_imdb_video_url(self, IMDbVideoUrl):
    """Scrape the IMDb video page and return the best encoding's URL."""
    # '/imdb/' is 6 characters long, so this keeps the 'vi...' identifier.
    imdb_vid = IMDbVideoUrl[IMDbVideoUrl.index('/imdb/vi')+6:]
    html = urlopen(IMDbVideoUrl)
    # The video description is the JSON argument of a ``push(...)`` call in
    # the third script tag from the end of the page.
    script = BeautifulSoup(html, 'html.parser').find_all('script')[-3].text
    load = json.loads(script[script.index('push(')+len('push('):script.index(');')])
    encodings = load['videos']['videoMetadata'][imdb_vid]['encodings']

    best = self._pick_best_encoding(encodings)
    if best is None:
      raise LookupError('no supported IMDb encoding for ' + imdb_vid)
    return best['videoUrl']

  @staticmethod
  def _pick_best_encoding(encodings):
    """Return the highest-quality supported encoding, or None if there is none."""
    by_definition = dict()
    for video in encodings:
      # Keep the first encoding listed for each definition.
      by_definition.setdefault(video.get('definition'), video)

    for definition in Dataset._IMDB_DEFINITIONS:
      if definition in by_definition:
        return by_definition[definition]
    return None

  def _download_from_youtube(self, YoutubeVideoUrl, target):
    """Run youtube-dl for ``YoutubeVideoUrl``, writing the result to ``target``."""
    import subprocess
    import warnings

    # Argument list instead of a shell string: quotes or shell metacharacters
    # in the file name or URL can neither break nor inject into the command.
    # '--' makes sure the URL is never parsed as an option.
    command = ['youtube-dl',
               '--format', 'bestvideo+bestaudio[ext=m4a]/bestvideo+bestaudio/best',
               '--merge-output-format', 'mp4',
               '-o', target,
               '--', YoutubeVideoUrl]
    try:
      # The exit status is ignored, as it was with os.system.
      subprocess.run(command)
    except OSError as e:
      # e.g. youtube-dl is not installed; stay best-effort and do not raise.
      warnings.warn('youtube-dl could not be started: ' + str(e))

  def _create_entries(self):
    """Create one ``Entry`` per CSV row and index it by ``movie_id``.

    The rows are unpacked positionally, so the CSV must consist of exactly
    four columns: movie id, name, opening revenue, total revenue.
    """
    for _, [movie_id, name, rev_open, rev_total] in self.basic_data.iterrows():
      entry = Entry(movie_id=movie_id, name=name,
                   revenue_opening = rev_open,
                   revenue_total = rev_total)
      # Store the list position, not the DataFrame index label, so lookups
      # stay correct even when the frame does not have a default index.
      self.movie_indeces[movie_id] = len(self.entries)
      self.entries.append(entry)

  def _compress_global_categorical(self, input_dict):
    """Map each key to the position of the largest value in its vector."""
    output_dict = dict()
    for k, v in input_dict.items():
      output_dict[k] = np.argmax(v)

    return output_dict

  def _decompress_global_categorical(self, input_dict):
    """Turn ``{key: position}`` into ``{key: indicator vector}``.

    Every vector has ``len(input_dict)`` elements and a single 1 at the
    key's position.
    """
    output_dict = dict()
    for k, v in input_dict.items():
      indicator = np.zeros(len(input_dict))
      indicator[v] = 1
      output_dict[k] = indicator

    return output_dict

  def _compress_categorical(self, arr):
    """Return ``{index: value}`` for the strictly positive elements of ``arr``."""
    dct = dict()
    for idx, val in enumerate(arr):
      if val > 0:
        dct[idx] = val

    return dct

  def _decompress_categorical(self, arr, total_len):
    """Expand a sparse ``{index: value}`` dict into a vector of ``total_len``."""
    indicator = np.zeros(total_len)
    for idx, val in arr.items():
      indicator[idx] = val

    return indicator

## Load dataset

In [0]:
# Build the entries from final_dataset.csv found in dataset_dir.
dataset = Dataset(dataset_dir)

# Enrich the entries from the pickled trailers.data and metadata.data files.
dataset.load_trailers()
dataset.load_metadata()

In [29]:
# Number of entries, i.e. one per row of final_dataset.csv.
len(dataset.entries)

7872

##Demo usage

In [27]:
# Opening revenue ('revenue_opening' column) looked up by movie_id.
dataset.get_revenue('tt0120338')

28638131

In [28]:
# Fields that every entry has right after the dataset was constructed.
entry = dataset.get_entry('tt4154796')
print(entry.movie_id)
print(entry.name)
print(entry.revenue_opening)
print(entry.revenue_total)

# only available if 'dataset.load_trailers()' was invoked
print(entry.imdb_trailer)
print(entry.youtube_trailer)

# only available if 'dataset.load_metadata()' was invoked
# NOTE(review): these three print the complete name -> indicator-vector
# dictionaries, which produces a very large output; consider printing only
# their sizes or a few keys instead.
print(dataset.all_actors)
print(dataset.all_creators)
print(dataset.all_directors)

# Per-movie metadata; the categorical fields are dense vectors after loading.
print(entry.year)
print(entry.duration)
print(entry.actors)
print(entry.creators)
print(entry.directors)
print(entry.genres)


tt4154796
Avengers: Endgame
357115007
833130966
https://www.imdb.com/video/imdb/vi2163260441
https://www.youtube.com/watch?v=TcMBFSGVi1c
{'Eugene Byrd': array([1., 0., 0., ..., 0., 0., 0.]), 'Anjanette Comer': array([0., 1., 0., ..., 0., 0., 0.]), 'Sophia Mitrolakis': array([0., 0., 1., ..., 0., 0., 0.]), 'Mohamad Almusari': array([0., 0., 0., ..., 0., 0., 0.]), 'Peggy Lee': array([0., 0., 0., ..., 0., 0., 0.]), 'George Tiller': array([0., 0., 0., ..., 0., 0., 0.]), 'Victoria Song': array([0., 0., 0., ..., 0., 0., 0.]), 'Clint Eastwood': array([0., 0., 0., ..., 0., 0., 0.]), 'Gwyneth Cravens': array([0., 0., 0., ..., 0., 0., 0.]), "Kieran O'Reilly": array([0., 0., 0., ..., 0., 0., 0.]), 'Leonardo Salerni': array([0., 0., 0., ..., 0., 0., 0.]), 'Walter Suskind': array([0., 0., 0., ..., 0., 0., 0.]), 'Gene Atkins': array([0., 0., 0., ..., 0., 0., 0.]), 'Dan Beene': array([0., 0., 0., ..., 0., 0., 0.]), 'Corey Haim': array([0., 0., 0., ..., 0., 0., 0.]), 'Bresha Webb': array([0., 0., 0., 